diff --git a/.circleci/config.yml b/.circleci/config.yml index 856211e280cb..1bfe5d29f7f0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -587,6 +587,7 @@ jobs: - run: pip install --upgrade pip - run: pip install .[sklearn,torch,sentencepiece,testing,torch-speech] - run: pip install -r examples/pytorch/_tests_requirements.txt + - run: pip install git+https://github.com/huggingface/accelerate - save_cache: key: v0.4-torch_examples-{{ checksum "setup.py" }} paths: diff --git a/README.md b/README.md index 024e45220129..f37b11088022 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ In Audio: ## If you are looking for custom support from the Hugging Face team - HuggingFace Expert Acceleration Program + HuggingFace Expert Acceleration Program
## Quick tour @@ -303,6 +303,7 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. +1. **[RegNet](https://huggingface.co/docs/transformers/main/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. 1. **[ResNet](https://huggingface.co/docs/transformers/main/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. @@ -317,6 +318,7 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. +1. 
**[TAPEX](https://huggingface.co/docs/transformers/main/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. diff --git a/README_ko.md b/README_ko.md index 5d813b0cc76d..71aa11474401 100644 --- a/README_ko.md +++ b/README_ko.md @@ -281,6 +281,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. +1. **[RegNet](https://huggingface.co/docs/transformers/main/model_doc/regnet)** (from META Research) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[ResNet](https://huggingface.co/docs/transformers/main/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. 
**[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. @@ -296,6 +297,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. +1. **[TAPEX](https://huggingface.co/docs/transformers/main/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. diff --git a/README_zh-hans.md b/README_zh-hans.md index 5570335d49f9..efbe0e4547c7 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -305,6 +305,7 @@ conda install -c huggingface transformers 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (来自 NVIDIA) 伴随论文 [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 由 Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius 发布。 1. 
**[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (来自 Google Research) 伴随论文 [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) 由 Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang 发布。 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (来自 Google Research) 伴随论文 [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 由 Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya 发布。 +1. **[RegNet](https://huggingface.co/docs/transformers/main/model_doc/regnet)** (from META Research) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。 1. **[ResNet](https://huggingface.co/docs/transformers/main/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (来自 Facebook), 伴随论文 [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 由 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 发布。 @@ -320,6 +321,7 @@ conda install -c huggingface transformers 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (来自 Google AI) 伴随论文 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (来自 Google AI) 伴随论文 [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。 +1. **[TAPEX](https://huggingface.co/docs/transformers/main/model_doc/tapex)** (来自 Microsoft Research) 伴随论文 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 由 Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 发布。 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。 1. 
**[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (来自 Microsoft) 伴随论文 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 由 Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 发布。 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (来自 Microsoft Research) 伴随论文 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 由 Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 88ba012d5a89..c9396e45faa6 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -317,6 +317,7 @@ conda install -c huggingface transformers 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. +1. **[RegNet](https://huggingface.co/docs/transformers/main/model_doc/regnet)** (from META Research) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[ResNet](https://huggingface.co/docs/transformers/main/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. @@ -332,6 +333,7 @@ conda install -c huggingface transformers 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. 
**[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released with the paper [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. +1. **[TAPEX](https://huggingface.co/docs/transformers/main/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 69717477e1f2..c32004ff5214 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -296,6 +296,8 @@ title: Reformer - local: model_doc/rembert title: RemBERT + - local: model_doc/regnet + title: RegNet - local: model_doc/resnet title: ResNet - local: model_doc/retribert @@ -328,6 +330,8 @@ title: T5v1.1 - local: model_doc/tapas title: TAPAS + - local: model_doc/tapex + title: TAPEX - local: model_doc/transfo-xl title: Transformer XL - local: model_doc/trocr diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 281add6e5ef4..2071e41e672f 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -124,6 +124,7 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret 1. **[REALM](model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. 1. **[Reformer](model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. 1. 
**[RemBERT](model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. +1. **[RegNet](model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. 1. **[ResNet](model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. 1. **[RoFormer](model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. @@ -138,6 +139,7 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret 1. **[T5](model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[TAPAS](model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. +1. **[TAPEX](model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. 1. **[UniSpeech](model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. 
@@ -234,6 +236,7 @@ Flax), PyTorch, and/or TensorFlow. | RAG | ✅ | ❌ | ✅ | ✅ | ❌ | | Realm | ✅ | ✅ | ✅ | ❌ | ❌ | | Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | +| RegNet | ❌ | ❌ | ✅ | ❌ | ❌ | | RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | ResNet | ❌ | ❌ | ✅ | ❌ | ❌ | | RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | @@ -250,6 +253,7 @@ Flax), PyTorch, and/or TensorFlow. | Swin | ❌ | ❌ | ✅ | ❌ | ❌ | | T5 | ✅ | ✅ | ✅ | ✅ | ✅ | | TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | +| TAPEX | ✅ | ✅ | ✅ | ✅ | ✅ | | Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | | TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | | UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/source/en/internal/generation_utils.mdx b/docs/source/en/internal/generation_utils.mdx index c8b42d91848e..5a717edb98fb 100644 --- a/docs/source/en/internal/generation_utils.mdx +++ b/docs/source/en/internal/generation_utils.mdx @@ -178,6 +178,12 @@ generation. [[autodoc]] TFRepetitionPenaltyLogitsProcessor - __call__ +[[autodoc]] TFForcedBOSTokenLogitsProcessor + - __call__ + +[[autodoc]] TFForcedEOSTokenLogitsProcessor + - __call__ + [[autodoc]] FlaxLogitsProcessor - __call__ diff --git a/docs/source/en/model_doc/regnet.mdx b/docs/source/en/model_doc/regnet.mdx new file mode 100644 index 000000000000..666a9ee39675 --- /dev/null +++ b/docs/source/en/model_doc/regnet.mdx @@ -0,0 +1,48 @@ + + +# RegNet + +## Overview + +The RegNet model was proposed in [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. + +The authors design search spaces to perform Neural Architecture Search (NAS). They first start from a high dimensional search space and iteratively reduce the search space by empirically applying constraints based on the best-performing models sampled by the current search space. + +The abstract from the paper is the following: + +*In this work, we present a new network design paradigm. Our goal is to help advance the understanding of network design and discover design principles that generalize across settings. Instead of focusing on designing individual network instances, we design network design spaces that parametrize populations of networks. The overall process is analogous to classic manual design of networks, but elevated to the design space level. Using our methodology we explore the structure aspect of network design and arrive at a low-dimensional design space consisting of simple, regular networks that we call RegNet. The core insight of the RegNet parametrization is surprisingly simple: widths and depths of good networks can be explained by a quantized linear function. We analyze the RegNet design space and arrive at interesting findings that do not match the current practice of network design. The RegNet design space provides simple and fast networks that work well across a wide range of flop regimes. Under comparable training settings and flops, the RegNet models outperform the popular EfficientNet models while being up to 5x faster on GPUs.* + +Tips: + +- One can use [`AutoFeatureExtractor`] to prepare images for the model. +- The huge 10B model from [Self-supervised Pretraining of Visual Features in the Wild](https://arxiv.org/abs/2103.01988), trained on one billion Instagram images, is available on the [hub](https://huggingface.co/facebook/regnet-y-10b-seer) + +This model was contributed by [Francesco](https://huggingface.co/Francesco). +The original code can be found [here](https://github.com/facebookresearch/pycls). 
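Below is a minimal inference sketch (not part of the original file) showing how the pieces above fit together. It assumes a RegNet image-classification checkpoint such as `facebook/regnet-y-040` is available on the hub; the pattern mirrors the image-classification example in `task_summary.mdx`, with [`AutoFeatureExtractor`] preparing the image and [`RegNetForImageClassification`] producing the logits.

```python
>>> from datasets import load_dataset
>>> from transformers import AutoFeatureExtractor, RegNetForImageClassification
>>> import torch

>>> # a small test image, from the same dataset used in the task summary examples
>>> dataset = load_dataset("huggingface/cats-image")
>>> image = dataset["test"]["image"][0]

>>> # the checkpoint name is illustrative; any RegNet classification checkpoint on the hub works the same way
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/regnet-y-040")
>>> model = RegNetForImageClassification.from_pretrained("facebook/regnet-y-040")

>>> # resize/normalize the image and run a forward pass
>>> inputs = feature_extractor(image, return_tensors="pt")
>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> # map the highest-scoring logit back to a human-readable class name
>>> predicted_label = logits.argmax(-1).item()
>>> print(model.config.id2label[predicted_label])
```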
+ + +## RegNetConfig + +[[autodoc]] RegNetConfig + + +## RegNetModel + +[[autodoc]] RegNetModel + - forward + + +## RegNetForImageClassification + +[[autodoc]] RegNetForImageClassification + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/speech_to_text.mdx b/docs/source/en/model_doc/speech_to_text.mdx index 2e86c497c057..0a3b00b1d5dd 100644 --- a/docs/source/en/model_doc/speech_to_text.mdx +++ b/docs/source/en/model_doc/speech_to_text.mdx @@ -47,25 +47,19 @@ be installed as follows: `apt install libsndfile1-dev` >>> import torch >>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration >>> from datasets import load_dataset ->>> import soundfile as sf >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr") >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr") ->>> def map_to_array(batch): -... speech, _ = sf.read(batch["file"]) -... batch["speech"] = speech -... return batch +>>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") - ->>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ->>> ds = ds.map(map_to_array) - ->>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt") ->>> generated_ids = model.generate(input_ids=inputs["input_features"], attention_mask=inputs["attention_mask"]) +>>> inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt") +>>> generated_ids = model.generate(inputs["input_features"], attention_mask=inputs["attention_mask"]) >>> transcription = processor.batch_decode(generated_ids) +>>> transcription +['mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'] ``` - Multilingual speech translation @@ -80,29 +74,22 @@ be installed as follows: `apt install libsndfile1-dev` >>> import torch >>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration >>> from datasets import load_dataset ->>> import soundfile as sf >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st") >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-mustc-multilingual-st") +>>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") ->>> def map_to_array(batch): -... speech, _ = sf.read(batch["file"]) -... batch["speech"] = speech -... return batch - - ->>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ->>> ds = ds.map(map_to_array) - ->>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt") +>>> inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt") >>> generated_ids = model.generate( -... input_ids=inputs["input_features"], +... inputs["input_features"], ... attention_mask=inputs["attention_mask"], ... forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"], ... ) >>> translation = processor.batch_decode(generated_ids) +>>> translation +[" (Vidéo) Si M. Kilder est l'apossible des classes moyennes, et nous sommes heureux d'être accueillis dans son évangile."] ``` See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look for Speech2Text checkpoints. 
diff --git a/docs/source/en/model_doc/tapex.mdx b/docs/source/en/model_doc/tapex.mdx new file mode 100644 index 000000000000..f6e65764e50d --- /dev/null +++ b/docs/source/en/model_doc/tapex.mdx @@ -0,0 +1,130 @@ + + +# TAPEX + +## Overview + +The TAPEX model was proposed in [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, +Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. TAPEX pre-trains a BART model to solve synthetic SQL queries, after +which it can be fine-tuned to answer natural language questions related to tabular data, as well as perform table fact checking. + +TAPEX has been fine-tuned on several datasets: +- [SQA](https://www.microsoft.com/en-us/download/details.aspx?id=54253) (Sequential Question Answering by Microsoft) +- [WTQ](https://github.com/ppasupat/WikiTableQuestions) (Wiki Table Questions by Stanford University) +- [WikiSQL](https://github.com/salesforce/WikiSQL) (by Salesforce) +- [TabFact](https://tabfact.github.io/) (by UCSB NLP Lab). + +The abstract from the paper is the following: + +*Recent progress in language model pre-training has achieved a great success via leveraging large-scale unstructured textual data. However, it is +still a challenge to apply pre-training on structured tabular data due to the absence of large-scale high-quality tabular data. In this paper, we +propose TAPEX to show that table pre-training can be achieved by learning a neural SQL executor over a synthetic corpus, which is obtained by automatically +synthesizing executable SQL queries and their execution outputs. TAPEX addresses the data scarcity challenge via guiding the language model to mimic a SQL +executor on the diverse, large-scale and high-quality synthetic corpus. We evaluate TAPEX on four benchmark datasets. Experimental results demonstrate that +TAPEX outperforms previous table pre-training approaches by a large margin and achieves new state-of-the-art results on all of them. This includes improvements +on the weakly-supervised WikiSQL denotation accuracy to 89.5% (+2.3%), the WikiTableQuestions denotation accuracy to 57.5% (+4.8%), the SQA denotation accuracy +to 74.5% (+3.5%), and the TabFact accuracy to 84.2% (+3.2%). To our knowledge, this is the first work to exploit table pre-training via synthetic executable programs +and to achieve new state-of-the-art results on various downstream tasks.* + +Tips: + +- TAPEX is a generative (seq2seq) model. One can directly plug the weights of TAPEX into a BART model. +- TAPEX has checkpoints on the hub that are either pre-trained only, or fine-tuned on WTQ, SQA, WikiSQL and TabFact. +- Sentences + tables are presented to the model as `sentence + " " + linearized table`. The linearized table has the following format: + `col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...`. +- TAPEX has its own tokenizer that makes it easy to prepare all data for the model. One can pass Pandas DataFrames and strings to the tokenizer, + and it will automatically create the `input_ids` and `attention_mask` (as shown in the usage examples below). + +## Usage: inference + +Below, we illustrate how to use TAPEX for table question answering. As mentioned above, one can directly plug the weights of TAPEX into a BART model. +We use the [Auto API](auto), which will automatically instantiate the appropriate tokenizer ([`TapexTokenizer`]) and model ([`BartForConditionalGeneration`]) for us, +based on the configuration file of the checkpoint on the hub.
+ +```python +>>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM +>>> import pandas as pd + +>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq") +>>> model = AutoModelForSeq2SeqLM.from_pretrained("microsoft/tapex-large-finetuned-wtq") + +>>> # prepare table + question +>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} +>>> table = pd.DataFrame.from_dict(data) +>>> question = "how many movies does Leonardo Di Caprio have?" + +>>> encoding = tokenizer(table, question, return_tensors="pt") + +>>> # let the model generate an answer autoregressively +>>> outputs = model.generate(**encoding) + +>>> # decode back to text +>>> predicted_answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0] +>>> print(predicted_answer) +53 +``` + +Note that [`TapexTokenizer`] also supports batched inference. Hence, one can provide a batch of different tables/questions, or a batch of a single table +and multiple questions, or a batch of a single query and multiple tables. Let's illustrate this: + +```python +>>> # prepare table + question +>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} +>>> table = pd.DataFrame.from_dict(data) +>>> questions = [ +... "how many movies does Leonardo Di Caprio have?", +... "which actor has 69 movies?", +... "what's the first name of the actor who has 87 movies?", +... ] +>>> encoding = tokenizer(table, questions, padding=True, return_tensors="pt") + +>>> # let the model generate an answer autoregressively +>>> outputs = model.generate(**encoding) + +>>> # decode back to text +>>> tokenizer.batch_decode(outputs, skip_special_tokens=True) +[' 53', ' george clooney', ' brad pitt'] +``` + +In case one wants to do table verification (i.e. the task of determining whether a given sentence is supported or refuted by the contents +of a table), one can instantiate a [`BartForSequenceClassification`] model. TAPEX has checkpoints on the hub fine-tuned on TabFact, an important +benchmark for table fact checking (it achieves 84% accuracy). The code example below again leverages the [Auto API](auto). 
+ +```python +>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification + +>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-tabfact") +>>> model = AutoModelForSequenceClassification.from_pretrained("microsoft/tapex-large-finetuned-tabfact") + +>>> # prepare table + sentence +>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} +>>> table = pd.DataFrame.from_dict(data) +>>> sentence = "George Clooney has 30 movies" + +>>> encoding = tokenizer(table, sentence, return_tensors="pt") + +>>> # forward pass +>>> outputs = model(**encoding) + +>>> # print prediction +>>> predicted_class_idx = outputs.logits[0].argmax(dim=0).item() +>>> print(model.config.id2label[predicted_class_idx]) +Refused +``` + + +## TapexTokenizer + +[[autodoc]] TapexTokenizer + - __call__ + - save_vocabulary \ No newline at end of file diff --git a/docs/source/en/preprocessing.mdx b/docs/source/en/preprocessing.mdx index 390acd72731d..947129b34aaf 100644 --- a/docs/source/en/preprocessing.mdx +++ b/docs/source/en/preprocessing.mdx @@ -199,22 +199,22 @@ Audio inputs are preprocessed differently than textual inputs, but the end goal pip install datasets ``` -Load the keyword spotting task from the [SUPERB](https://huggingface.co/datasets/superb) benchmark (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) for more details on how to load a dataset): +Load the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) for more details on how to load a dataset): ```py >>> from datasets import load_dataset, Audio ->>> dataset = load_dataset("superb", "ks") +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") ``` Access the first element of the `audio` column to take a look at the input. Calling the `audio` column will automatically load and resample the audio file: ```py ->>> dataset["train"][0]["audio"] -{'array': array([ 0. , 0. , 0. , ..., -0.00592041, - -0.00405884, -0.00253296], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/05734a36d88019a09725c20cc024e1c4e7982e37d7d55c0c1ca1742ea1cdd47f/_background_noise_/doing_the_dishes.wav', - 'sampling_rate': 16000} +>>> dataset[0]["audio"] +{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, + 0. , 0. ], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 8000} ``` This returns three items: @@ -227,34 +227,34 @@ This returns three items: For this tutorial, you will use the [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) model. As you can see from the model card, the Wav2Vec2 model is pretrained on 16kHz sampled speech audio. It is important your audio data's sampling rate matches the sampling rate of the dataset used to pretrain the model. If your data's sampling rate isn't the same, then you need to resample your audio data. -For example, load the [LJ Speech](https://huggingface.co/datasets/lj_speech) dataset which has a sampling rate of 22050kHz. In order to use the Wav2Vec2 model with this dataset, downsample the sampling rate to 16kHz: +For example, the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset has a sampling rate of 8000kHz. 
In order to use the Wav2Vec2 model with this dataset, upsample the sampling rate to 16kHz: ```py ->>> lj_speech = load_dataset("lj_speech", split="train") ->>> lj_speech[0]["audio"] -{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., - 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', - 'sampling_rate': 22050} +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") +>>> dataset[0]["audio"] +{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, + 0. , 0. ], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 8000} ``` -1. Use 🤗 Datasets' [`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.cast_column) method to downsample the sampling rate to 16kHz: +1. Use 🤗 Datasets' [`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.cast_column) method to upsample the sampling rate to 16kHz: ```py ->>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000)) ``` 2. Load the audio file: ```py ->>> lj_speech[0]["audio"] -{'array': array([-0.00064146, -0.00074657, -0.00068768, ..., 0.00068341, - 0.00014045, 0. ], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', +>>> dataset[0]["audio"] +{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., + 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', 'sampling_rate': 16000} ``` -As you can see, the `sampling_rate` was downsampled to 16kHz. Now that you know how resampling works, let's return to our previous example with the SUPERB dataset! +As you can see, the `sampling_rate` is now 16kHz! ### Feature extractor @@ -271,9 +271,10 @@ Load the feature extractor with [`AutoFeatureExtractor.from_pretrained`]: Pass the audio `array` to the feature extractor. We also recommend adding the `sampling_rate` argument in the feature extractor in order to better debug any silent errors that may occur. ```py ->>> audio_input = [dataset["train"][0]["audio"]["array"]] +>>> audio_input = [dataset[0]["audio"]["array"]] >>> feature_extractor(audio_input, sampling_rate=16000) -{'input_values': [array([ 0.00045439, 0.00045439, 0.00045439, ..., -0.1578519 , -0.10807519, -0.06727459], dtype=float32)]} +{'input_values': [array([ 3.8106556e-04, 2.7506407e-03, 2.8015103e-03, ..., + 5.6335266e-04, 4.6588284e-06, -1.7142107e-04], dtype=float32)]} ``` ### Pad and truncate @@ -281,11 +282,11 @@ Pass the audio `array` to the feature extractor. We also recommend adding the `s Just like the tokenizer, you can apply padding or truncation to handle variable sequences in a batch. 
Take a look at the sequence length of these two audio samples: ```py ->>> dataset["train"][0]["audio"]["array"].shape -(1522930,) +>>> dataset[0]["audio"]["array"].shape +(173398,) ->>> dataset["train"][1]["audio"]["array"].shape -(988891,) +>>> dataset[1]["audio"]["array"].shape +(106496,) ``` As you can see, the first sample has a longer sequence than the second sample. Let's create a function that will preprocess the dataset. Specify a maximum sample length, and the feature extractor will either pad or truncate the sequences to match it: @@ -297,7 +298,7 @@ As you can see, the first sample has a longer sequence than the second sample. L ... audio_arrays, ... sampling_rate=16000, ... padding=True, -... max_length=1000000, +... max_length=100000, ... truncation=True, ... ) ... return inputs @@ -306,17 +307,17 @@ As you can see, the first sample has a longer sequence than the second sample. L Apply the function to the the first few examples in the dataset: ```py ->>> processed_dataset = preprocess_function(dataset["train"][:5]) +>>> processed_dataset = preprocess_function(dataset[:5]) ``` Now take another look at the processed sample lengths: ```py >>> processed_dataset["input_values"][0].shape -(1000000,) +(100000,) >>> processed_dataset["input_values"][1].shape -(1000000,) +(100000,) ``` The lengths of the first two samples now match the maximum length you specified. diff --git a/docs/source/en/quicktour.mdx b/docs/source/en/quicktour.mdx index 1fc4f8b865dc..057196a78117 100644 --- a/docs/source/en/quicktour.mdx +++ b/docs/source/en/quicktour.mdx @@ -115,23 +115,23 @@ Create a [`pipeline`] with the task you want to solve for and the model you want >>> speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h") ``` -Next, load a dataset (see the 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) for more details) you'd like to iterate over. For example, let's load the [SUPERB](https://huggingface.co/datasets/superb) dataset: +Next, load a dataset (see the 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) for more details) you'd like to iterate over. 
For example, let's load the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset: ```py ->>> import datasets +>>> from datasets import load_dataset ->>> dataset = datasets.load_dataset("superb", name="asr", split="test") # doctest: +IGNORE_RESULT +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") # doctest: +IGNORE_RESULT ``` You can pass a whole dataset pipeline: ```py ->>> files = dataset["file"] +>>> files = dataset["path"] >>> speech_recognizer(files[:4]) -[{'text': 'HE HOPED THERE WOULD BE STEW FOR DINNER TURNIPS AND CARROTS AND BRUISED POTATOES AND FAT MUTTON PIECES TO BE LADLED OUT IN THICK PEPPERED FLOWER FAT AND SAUCE'}, - {'text': 'STUFFERED INTO YOU HIS BELLY COUNSELLED HIM'}, - {'text': 'AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS'}, - {'text': 'HO BERTIE ANY GOOD IN YOUR MIND'}] +[{'text': 'I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT'}, + {'text': "FONDERING HOW I'D SET UP A JOIN TO HELL T WITH MY WIFE AND WHERE THE AP MIGHT BE"}, + {'text': "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE APSO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AN I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS"}, + {'text': 'HOW DO I FURN A JOINA COUT'}] ``` For a larger dataset where the inputs are big (like in speech or vision), you will want to pass along a generator instead of a list that loads all the inputs in memory. See the [pipeline documentation](./main_classes/pipelines) for more information. diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index 65fb5fa5cc54..acdabb717024 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -67,6 +67,7 @@ Ready-made configurations include the following architectures: - PLBart - RoBERTa - T5 +- TAPEX - ViT - XLM-RoBERTa - XLM-RoBERTa-XL diff --git a/docs/source/en/task_summary.mdx b/docs/source/en/task_summary.mdx index 95c2d9c201a5..17be51960588 100644 --- a/docs/source/en/task_summary.mdx +++ b/docs/source/en/task_summary.mdx @@ -967,3 +967,156 @@ Here is an example of doing translation using a model and a tokenizer. The proce We get the same translation as with the pipeline example. + +## Audio classification + +Audio classification assigns a class to an audio signal. The Keyword Spotting dataset from the [SUPERB](https://huggingface.co/datasets/superb) benchmark is an example dataset that can be used for audio classification fine-tuning. This dataset contains ten classes of keywords for classification. If you'd like to fine-tune a model for audio classification, take a look at the [run_audio_classification.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/audio-classification/run_audio_classification.py) script or this [how-to guide](./tasks/audio_classification). + +The following examples demonstrate how to use a [`pipeline`] and a model and tokenizer for audio classification inference: + +```py +>>> from transformers import pipeline + +>>> audio_classifier = pipeline( +... task="audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" +... 
) +>>> audio_classifier("jfk_moon_speech.wav") +[{'label': 'calm', 'score': 0.13856211304664612}, + {'label': 'disgust', 'score': 0.13148026168346405}, + {'label': 'happy', 'score': 0.12635163962841034}, + {'label': 'angry', 'score': 0.12439591437578201}, + {'label': 'fearful', 'score': 0.12404385954141617}] +``` + +The general process for using a model and feature extractor for audio classification is: + +1. Instantiate a feature extractor and a model from the checkpoint name. +2. Process the audio signal to be classified with a feature extractor. +3. Pass the input through the model and take the `argmax` to retrieve the most likely class. +4. Convert the class id to a class name with `id2label` to return an interpretable result. + + + +```py +>>> from transformers import AutoFeatureExtractor, AutoModelForAudioClassification +>>> from datasets import load_dataset +>>> import torch + +>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") +>>> dataset = dataset.sort("id") +>>> sampling_rate = dataset.features["audio"].sampling_rate + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ks") +>>> model = AutoModelForAudioClassification.from_pretrained("superb/wav2vec2-base-superb-ks") + +>>> inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") + +>>> with torch.no_grad(): +... logits = model(**inputs).logits + +>>> predicted_class_ids = torch.argmax(logits, dim=-1).item() +>>> predicted_label = model.config.id2label[predicted_class_ids] +>>> predicted_label +``` + + + +## Automatic speech recognition + +Automatic speech recognition transcribes an audio signal to text. The [Common Voice](https://huggingface.co/datasets/common_voice) dataset is an example dataset that can be used for automatic speech recognition fine-tuning. It contains an audio file of a speaker and the corresponding sentence. If you'd like to fine-tune a model for automatic speech recognition, take a look at the [run_speech_recognition_ctc.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py) or [run_speech_recognition_seq2seq.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py) scripts or this [how-to guide](./tasks/asr). + +The following examples demonstrate how to use a [`pipeline`] and a model and tokenizer for automatic speech recognition inference: + +```py +>>> from transformers import pipeline + +>>> speech_recognizer = pipeline(task="automatic-speech-recognition", model="facebook/wav2vec2-base-960h") +>>> speech_recognizer("jfk_moon_speech.wav") +{'text': "PRESENTETE MISTER VICE PRESIDENT GOVERNOR CONGRESSMEN THOMAS SAN O TE WILAN CONGRESSMAN MILLA MISTER WEBB MSTBELL SCIENIS DISTINGUISHED GUESS AT LADIES AND GENTLEMAN I APPRECIATE TO YOUR PRESIDENT HAVING MADE ME AN HONORARY VISITING PROFESSOR AND I WILL ASSURE YOU THAT MY FIRST LECTURE WILL BE A VERY BRIEF I AM DELIGHTED TO BE HERE AND I'M PARTICULARLY DELIGHTED TO BE HERE ON THIS OCCASION WE MEED AT A COLLEGE NOTED FOR KNOWLEGE IN A CITY NOTED FOR PROGRESS IN A STATE NOTED FOR STRAINTH AN WE STAND IN NEED OF ALL THREE"} +``` + +The general process for using a model and processor for automatic speech recognition is: + +1. Instantiate a processor (which regroups a feature extractor for input processing and a tokenizer for decoding) and a model from the checkpoint name. +2. 
Process the audio signal and text with a processor. +3. Pass the input through the model and take the `argmax` to retrieve the predicted text. +4. Decode the text with a tokenizer to obtain the transcription. + + + +```py +>>> from transformers import AutoProcessor, AutoModelForCTC +>>> from datasets import load_dataset +>>> import torch + +>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") +>>> dataset = dataset.sort("id") +>>> sampling_rate = dataset.features["audio"].sampling_rate + +>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") +>>> model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h") + +>>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") +>>> with torch.no_grad(): +... logits = model(**inputs).logits +>>> predicted_ids = torch.argmax(logits, dim=-1) + +>>> transcription = processor.batch_decode(predicted_ids) +>>> transcription[0] +``` + + + +## Image classification + +Like text and audio classification, image classification assigns a class to an image. The [CIFAR-100](https://huggingface.co/datasets/cifar100) dataset is an example dataset that can be used for image classification fine-tuning. It contains an image and the corresponding class. If you'd like to fine-tune a model for image classification, take a look at the [run_image_classification.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/image-classification/run_image_classification.py) script or this [how-to guide](./tasks/image_classification). + +The following examples demonstrate how to use a [`pipeline`] and a model and tokenizer for image classification inference: + +```py +>>> from transformers import pipeline + +>>> vision_classifier = pipeline(task="image-classification") +>>> vision_classifier( +... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... ) +[{'label': 'lynx, catamount', 'score': 0.4403027892112732}, + {'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor', + 'score': 0.03433405980467796}, + {'label': 'snow leopard, ounce, Panthera uncia', + 'score': 0.032148055732250214}, + {'label': 'Egyptian cat', 'score': 0.02353910356760025}, + {'label': 'tiger cat', 'score': 0.023034192621707916}] +``` + +The general process for using a model and feature extractor for image classification is: + +1. Instantiate a feature extractor and a model from the checkpoint name. +2. Process the image to be classified with a feature extractor. +3. Pass the input through the model and take the `argmax` to retrieve the predicted class. +4. Convert the class id to a class name with `id2label` to return an interpretable result. + + + +```py +>>> from transformers import AutoFeatureExtractor, AutoModelForImageClassification +>>> import torch +>>> from datasets import load_dataset + +>>> dataset = load_dataset("huggingface/cats-image") +>>> image = dataset["test"]["image"][0] + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224") +>>> model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224") + +>>> inputs = feature_extractor(image, return_tensors="pt") + +>>> with torch.no_grad(): +... 
logits = model(**inputs).logits + +>>> predicted_label = logits.argmax(-1).item() +>>> print(model.config.id2label[predicted_label]) +Egyptian cat +``` + + diff --git a/docs/source/en/tasks/asr.mdx b/docs/source/en/tasks/asr.mdx index 6fe90e5cd7d4..dac5015bf815 100644 --- a/docs/source/en/tasks/asr.mdx +++ b/docs/source/en/tasks/asr.mdx @@ -16,7 +16,7 @@ specific language governing permissions and limitations under the License. Automatic speech recognition (ASR) converts a speech signal to text. It is an example of a sequence-to-sequence task, going from a sequence of audio inputs to textual outputs. Voice assistants like Siri and Alexa utilize ASR models to assist users. -This guide will show you how to fine-tune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [TIMIT](https://huggingface.co/datasets/timit_asr) dataset to transcribe audio to text. +This guide will show you how to fine-tune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to transcribe audio to text. @@ -24,50 +24,54 @@ See the automatic speech recognition [task page](https://huggingface.co/tasks/au -## Load TIMIT dataset +## Load MInDS-14 dataset -Load the TIMIT dataset from the 🤗 Datasets library: +Load the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) from the 🤗 Datasets library: ```py ->>> from datasets import load_dataset +>>> from datasets import load_dataset, Audio ->>> timit = load_dataset("timit_asr") +>>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train") ``` -Then take a look at an example: +Split this dataset into a train and test set: ```py ->>> timit +>>> minds = minds.train_test_split(test_size=0.2) +``` + +Then take a look at the dataset: + +```py +>>> minds DatasetDict({ train: Dataset({ - features: ['file', 'audio', 'text', 'phonetic_detail', 'word_detail', 'dialect_region', 'sentence_type', 'speaker_id', 'id'], - num_rows: 4620 + features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], + num_rows: 450 }) test: Dataset({ - features: ['file', 'audio', 'text', 'phonetic_detail', 'word_detail', 'dialect_region', 'sentence_type', 'speaker_id', 'id'], - num_rows: 1680 + features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], + num_rows: 113 }) }) ``` -While the dataset contains a lot of helpful information, like `dialect_region` and `sentence_type`, you will focus on the `audio` and `text` fields in this guide. Remove the other columns: +While the dataset contains a lot of helpful information, like `lang_id` and `intent_class`, you will focus on the `audio` and `transcription` columns in this guide. Remove the other columns: ```py ->>> timit = timit.remove_columns( -... ["phonetic_detail", "word_detail", "dialect_region", "id", "sentence_type", "speaker_id"] -... 
) +>>> minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"]) ``` Take a look at the example again: ```py ->>> timit["train"][0] -{'audio': {'array': array([-2.1362305e-04, 6.1035156e-05, 3.0517578e-05, ..., - -3.0517578e-05, -9.1552734e-05, -6.1035156e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/404950a46da14eac65eb4e2a8317b1372fb3971d980d91d5d5b221275b1fd7e0/data/TRAIN/DR4/MMDM0/SI681.WAV', - 'sampling_rate': 16000}, - 'file': '/root/.cache/huggingface/datasets/downloads/extracted/404950a46da14eac65eb4e2a8317b1372fb3971d980d91d5d5b221275b1fd7e0/data/TRAIN/DR4/MMDM0/SI681.WAV', - 'text': 'Would such an act of refusal be useful?'} +>>> minds["train"][0] +{'audio': {'array': array([-0.00024414, 0. , 0. , ..., 0.00024414, + 0.00024414, 0.00024414], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', + 'sampling_rate': 8000}, + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', + 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"} ``` The `audio` column contains a 1-dimensional `array` of the speech signal that must be called to load and resample the audio file. @@ -82,6 +86,19 @@ Load the Wav2Vec2 processor to process the audio signal and transcribed text: >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base") ``` +The [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset has a sampling rate of 8kHz. You will need to resample the dataset to use the pretrained Wav2Vec2 model: + +```py +>>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000)) +>>> minds["train"][0] +{'audio': {'array': array([-2.38064706e-04, -1.58618059e-04, -5.43987835e-06, ..., + 2.78103951e-04, 2.38446111e-04, 1.18740834e-04], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', + 'sampling_rate': 16000}, + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', + 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"} +``` + The preprocessing function needs to: 1. Call the `audio` column to load and resample the audio file. @@ -96,14 +113,14 @@ The preprocessing function needs to: ... batch["input_length"] = len(batch["input_values"]) ... with processor.as_target_processor(): -... batch["labels"] = processor(batch["text"]).input_ids +... batch["labels"] = processor(batch["transcription"]).input_ids ... return batch ``` Use 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map) function to apply the preprocessing function over the entire dataset. You can speed up the map function by increasing the number of processes with `num_proc`.
Remove the columns you don't need: ```py ->>> timit = timit.map(prepare_dataset, remove_columns=timit.column_names["train"], num_proc=4) +>>> encoded_minds = minds.map(prepare_dataset, remove_columns=minds.column_names["train"], num_proc=4) ``` 🤗 Transformers doesn't have a data collator for automatic speech recognition, so you will need to create one. You can adapt the [`DataCollatorWithPadding`] to create a batch of examples for automatic speech recognition. It will also dynamically pad your text and labels to the length of the longest element in its batch, so they are a uniform length. While it is possible to pad your text in the `tokenizer` function by setting `padding=True`, dynamic padding is more efficient. @@ -165,7 +182,7 @@ Load Wav2Vec2 with [`AutoModelForCTC`]. For `ctc_loss_reduction`, it is often be >>> from transformers import AutoModelForCTC, TrainingArguments, Trainer >>> model = AutoModelForCTC.from_pretrained( -... "facebook/wav2vec-base", +... "facebook/wav2vec2-base", ... ctc_loss_reduction="mean", ... pad_token_id=processor.tokenizer.pad_token_id, ... ) @@ -200,8 +217,8 @@ At this point, only three steps remain: >>> trainer = Trainer( ... model=model, ... args=training_args, -... train_dataset=timit["train"], -... eval_dataset=timit["test"], +... train_dataset=encoded_minds["train"], +... eval_dataset=encoded_minds["test"], ... tokenizer=processor.feature_extractor, ... data_collator=data_collator, ... ) diff --git a/docs/source/en/tasks/audio_classification.mdx b/docs/source/en/tasks/audio_classification.mdx index e239461762fc..6dee7a19dd5d 100644 --- a/docs/source/en/tasks/audio_classification.mdx +++ b/docs/source/en/tasks/audio_classification.mdx @@ -16,7 +16,7 @@ specific language governing permissions and limitations under the License. Audio classification assigns a label or class to audio data. It is similar to text classification, except an audio input is continuous and must be discretized, whereas text can be split into tokens. Some practical applications of audio classification include identifying intent, speakers, and even animal species by their sounds. -This guide will show you how to fine-tune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the Keyword Spotting subset of the [SUPERB](https://huggingface.co/datasets/superb) benchmark to classify utterances. +This guide will show you how to fine-tune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) to classify intent. @@ -24,27 +24,59 @@ See the audio classification [task page](https://huggingface.co/tasks/audio-clas -## Load SUPERB dataset +## Load MInDS-14 dataset -Load the SUPERB dataset from the 🤗 Datasets library: +Load the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) from the 🤗 Datasets library: ```py ->>> from datasets import load_dataset +>>> from datasets import load_dataset, Audio ->>> ks = load_dataset("superb", "ks") +>>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train") ``` -Then take a look at an example: +Split this dataset into a train and test set: ```py ->>> ks["train"][0] -{'audio': {'array': array([ 0. , 0. , 0. 
, ..., -0.00592041, -0.00405884, -0.00253296], dtype=float32), 'path': '/root/.cache/huggingface/datasets/downloads/extracted/05734a36d88019a09725c20cc024e1c4e7982e37d7d55c0c1ca1742ea1cdd47f/_background_noise_/doing_the_dishes.wav', 'sampling_rate': 16000}, 'file': '/root/.cache/huggingface/datasets/downloads/extracted/05734a36d88019a09725c20cc024e1c4e7982e37d7d55c0c1ca1742ea1cdd47f/_background_noise_/doing_the_dishes.wav', 'label': 10} +>>> minds = minds.train_test_split(test_size=0.2) ``` -The `audio` column contains a 1-dimensional `array` of the speech signal that must be called to load and resample the audio file. The `label` column is an integer that represents the utterance class. Create a dictionary that maps a label name to an integer and vice versa. The mapping will help the model recover the label name from the label number: +Then take a look at the dataset: ```py ->>> labels = ks["train"].features["label"].names +>>> minds +DatasetDict({ + train: Dataset({ + features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], + num_rows: 450 + }) + test: Dataset({ + features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], + num_rows: 113 + }) +}) +``` + +While the dataset contains a lot of other useful information, like `lang_id` and `english_transcription`, you will focus on the `audio` and `intent_class` columns in this guide. Remove the other columns: + +```py +>>> minds = minds.remove_columns(["path", "transcription", "english_transcription", "lang_id"]) +``` + +Take a look at an example now: + +```py +>>> minds["train"][0] +{'audio': {'array': array([ 0. , 0. , 0. , ..., -0.00048828, + -0.00024414, -0.00024414], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602b9a5fbb1e6d0fbce91f52.wav', + 'sampling_rate': 8000}, + 'intent_class': 2} +``` + +The `audio` column contains a 1-dimensional `array` of the speech signal that must be called to load and resample the audio file. The `intent_class` column is an integer that represents the intent class id. Create a dictionary that maps a label name to an integer and vice versa. The mapping will help the model recover the label name from the label number: + +```py +>>> labels = minds["train"].features["intent_class"].names >>> label2id, id2label = dict(), dict() >>> for i, label in enumerate(labels): ... label2id[label] = str(i) @@ -54,11 +86,11 @@ The `audio` column contains a 1-dimensional `array` of the speech signal that mu Now you can convert the label number to a label name for more information: ```py ->>> id2label[str(10)] -'_silence_' +>>> id2label[str(2)] +'app_error' ``` -Each keyword - or label - corresponds to a number; `10` indicates `silence` in the example above. +Each intent - or label - corresponds to a number; `2` indicates `app_error` in the example above. ## Preprocess @@ -70,6 +102,18 @@ Load the Wav2Vec2 feature extractor to process the audio signal: >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") ``` +The [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset has a sampling rate of 8kHz.
You will need to resample the dataset to use the pretrained Wav2Vec2 model: + +```py +>>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000)) +>>> minds["train"][0] +{'audio': {'array': array([ 2.2098757e-05, 4.6582241e-05, -2.2803260e-05, ..., + -2.8419291e-04, -2.3305941e-04, -1.1425107e-04], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602b9a5fbb1e6d0fbce91f52.wav', + 'sampling_rate': 16000}, + 'intent_class': 2} +``` + The preprocessing function needs to: 1. Call the `audio` column to load and if necessary resample the audio file. @@ -85,10 +129,11 @@ The preprocessing function needs to: ... return inputs ``` -Use 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map) function to apply the preprocessing function over the entire dataset. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once. Remove the columns you don't need: +Use 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map) function to apply the preprocessing function over the entire dataset. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once. Remove the columns you don't need, and rename `intent_class` to `label` because that is what the model expects: ```py ->>> encoded_ks = ks.map(preprocess_function, remove_columns=["audio", "file"], batched=True) +>>> encoded_minds = minds.map(preprocess_function, remove_columns="audio", batched=True) +>>> encoded_minds = encoded_minds.rename_column("intent_class", "label") ``` ## Train @@ -130,8 +175,8 @@ At this point, only three steps remain: >>> trainer = Trainer( ... model=model, ... args=training_args, -... train_dataset=encoded_ks["train"], -... eval_dataset=encoded_ks["validation"], +... train_dataset=encoded_minds["train"], +... eval_dataset=encoded_minds["test"], ... tokenizer=feature_extractor, ... ) diff --git a/docs/source/es/quicktour.mdx b/docs/source/es/quicktour.mdx index 8b400867099e..7b58c987b73b 100644 --- a/docs/source/es/quicktour.mdx +++ b/docs/source/es/quicktour.mdx @@ -115,23 +115,22 @@ Crea un [`pipeline`] con la tarea que deseas resolver y el modelo que quieres us >>> speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=0) ``` -A continuación, carga el dataset (ve 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) para más detalles) sobre el que quisieras iterar. Por ejemplo, vamos a cargar el dataset [SUPERB](https://huggingface.co/datasets/superb): +A continuación, carga el dataset (ve 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) para más detalles) sobre el que quisieras iterar. 
Por ejemplo, vamos a cargar el dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14): ```py >>> import datasets ->>> dataset = datasets.load_dataset("superb", name="asr", split="test") # doctest: +IGNORE_RESULT +>>> dataset = datasets.load_dataset("PolyAI/minds14", name="en-US", split="train") # doctest: +IGNORE_RESULT ``` Puedes pasar un pipeline para un dataset: ```py ->>> files = dataset["file"] +>>> files = dataset["path"] >>> speech_recognizer(files[:4]) -[{'text': 'HE HOPED THERE WOULD BE STEW FOR DINNER TURNIPS AND CARROTS AND BRUISED POTATOES AND FAT MUTTON PIECES TO BE LADLED OUT IN THICK PEPPERED FLOWER FAT AND SAUCE'}, - {'text': 'STUFFERED INTO YOU HIS BELLY COUNSELLED HIM'}, - {'text': 'AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS'}, - {'text': 'HO BERTIE ANY GOOD IN YOUR MIND'}] +[{'text': 'I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT'}, + {'text': "FONDERING HOW I'D SET UP A JOIN TO HELL T WITH MY WIFE AND WHERE THE AP MIGHT BE"}, + {'text': "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE APSO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AN I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS"}, ``` Para un dataset más grande, donde los inputs son de mayor tamaño (como en habla/audio o visión), querrás pasar un generador en lugar de una lista que carga todos los inputs en memoria. Ve la [documentación del pipeline](./main_classes/pipelines) para más información. diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index 6ab150a762b0..ac4ec706bfcf 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -60,7 +60,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index 06f9caba8943..3ff6134531db 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -53,7 +53,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index 32f0104b8929..a0e01b080275 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -53,7 +53,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index c0eb755b6a5a..5ad561ee2b85 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -44,7 +44,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index 02f20936873b..fc036f2a20fa 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index fef52c4bf5e5..aa5424a85767 100644 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index e2182ec783da..be65779fe3c8 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -43,7 +43,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index 323c38489589..ed39be7a1a15 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 3d2af72ccaf6..04a6b4c26794 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -53,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index f2f678dd8a5c..247ba09d54ab 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -23,6 +23,7 @@ # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. import argparse +import json import logging import math import os @@ -185,6 +186,23 @@ def parse_args(): "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." ) parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + required=False, + help="Whether to load in all available experiment trackers from the environment and use them for logging.", + ) args = parser.parse_args() # Sanity checks @@ -208,7 +226,8 @@ def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - accelerator = Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment + accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -427,18 +446,10 @@ def group_texts(examples): ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) - # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader - ) - # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties. if accelerator.distributed_type == DistributedType.TPU: model.tie_weights() - # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be - # shorter in multiprocess) - # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: @@ -453,6 +464,23 @@ def group_texts(examples): num_training_steps=args.max_train_steps, ) + # Prepare everything with our `accelerator`. 
+ model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # Figure out how many steps we should save the Accelerator states + if hasattr(args.checkpointing_steps, "isdigit"): + checkpointing_steps = args.checkpointing_steps + if args.checkpointing_steps.isdigit(): + checkpointing_steps = int(args.checkpointing_steps) + else: + checkpointing_steps = None + + # We need to initialize the trackers we use, and also store our configuration + if args.with_tracking: + accelerator.init_trackers("clm_no_trainer", args) + # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps @@ -467,11 +495,38 @@ def group_texts(examples): progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + resume_step = None + path = args.resume_from_checkpoint + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + if "epoch" in path: + args.num_train_epochs -= int(path.replace("epoch_", "")) + else: + resume_step = int(path.replace("step_", "")) + args.num_train_epochs -= resume_step // len(train_dataloader) + resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + for epoch in range(args.num_train_epochs): model.train() + if args.with_tracking: + total_loss = 0 for step, batch in enumerate(train_dataloader): + # We need to skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == 0 and step < resume_step: + continue outputs = model(**batch) loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: @@ -481,6 +536,13 @@ def group_texts(examples): progress_bar.update(1) completed_steps += 1 + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + output_dir = f"step_{completed_steps}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + if completed_steps >= args.max_train_steps: break @@ -502,6 +564,16 @@ def group_texts(examples): logger.info(f"epoch {epoch}: perplexity: {perplexity}") + if args.with_tracking: + accelerator.log( + { + "perplexity": perplexity, + "train_loss": total_loss, + "epoch": epoch, + }, + step=completed_steps, + ) + if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -512,6 +584,12 @@ def group_texts(examples): commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True ) + if args.checkpointing_steps == "epoch": + output_dir = f"epoch_{epoch}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + if 
args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -521,6 +599,9 @@ def group_texts(examples): if args.push_to_hub: repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True) + with open(os.path.join(args.output_dir, "all_results.json"), "w") as f: + json.dump({"perplexity": perplexity}, f) + if __name__ == "__main__": main() diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index f829e86781f1..477ccff95052 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 5d2ea0d4ea86..2634cc25e5b2 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -23,6 +23,7 @@ # You can also adapt this script on your own mlm task. Pointers for this are left as comments. import argparse +import json import logging import math import os @@ -194,6 +195,23 @@ def parse_args(): "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." ) parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + required=False, + help="Whether to load in all available experiment trackers from the environment and use them for logging.", + ) args = parser.parse_args() # Sanity checks @@ -219,7 +237,8 @@ def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - accelerator = Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment + accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -439,9 +458,11 @@ def group_texts(examples): train_dataset = tokenized_datasets["train"] eval_dataset = tokenized_datasets["validation"] - # Log a few random samples from the training set: - for index in random.sample(range(len(train_dataset)), 3): - logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") + # Conditional for small test subsets + if len(train_dataset) > 3: + # Log a few random samples from the training set: + for index in random.sample(range(len(train_dataset)), 3): + logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") # Data collator # This one will take care of randomly masking the tokens. 
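The checkpointing and resume logic added above for `run_clm_no_trainer.py` recurs in `run_mlm_no_trainer.py` and the other `*_no_trainer.py` scripts below. As a rough, self-contained sketch of that recurring pattern (not the scripts' exact code: the tiny model, data, and directory names are placeholders, and it assumes a 🤗 Accelerate release that provides `save_state`/`load_state`):

```py
import os

import torch
from accelerate import Accelerator
from torch.utils.data import DataLoader, TensorDataset


def interpret_checkpointing_steps(value):
    # None -> no intermediate checkpoints, "epoch" -> once per epoch,
    # an integer-like string such as "500" -> every N optimizer steps.
    if value is None or value == "epoch":
        return value
    return int(value) if value.isdigit() else None


def train(checkpointing_steps="2", resume_from_checkpoint=None, output_dir="toy_checkpoints"):
    accelerator = Accelerator()
    model = torch.nn.Linear(4, 1)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    dataloader = DataLoader(TensorDataset(torch.randn(32, 4), torch.randn(32, 1)), batch_size=8)

    # Everything whose state should end up in a checkpoint goes through `prepare`.
    model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

    steps = interpret_checkpointing_steps(checkpointing_steps)
    if resume_from_checkpoint is not None:
        # Restores the model, optimizer, and RNG states written by a previous `save_state`.
        accelerator.load_state(resume_from_checkpoint)

    completed_steps = 0
    for epoch in range(2):
        model.train()
        for inputs, targets in dataloader:
            loss = torch.nn.functional.mse_loss(model(inputs), targets)
            accelerator.backward(loss)
            optimizer.step()
            optimizer.zero_grad()
            completed_steps += 1
            if isinstance(steps, int) and completed_steps % steps == 0:
                accelerator.save_state(os.path.join(output_dir, f"step_{completed_steps}"))
        if steps == "epoch":
            accelerator.save_state(os.path.join(output_dir, f"epoch_{epoch}"))


if __name__ == "__main__":
    train()
```

The scripts' `hasattr(args.checkpointing_steps, "isdigit")` check is simply a guard against calling `.isdigit()` on `None`; the helper above covers the same three cases (`None`, `"epoch"`, or an integer-like string such as `"500"`) directly.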
@@ -468,11 +489,6 @@ def group_texts(examples): ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) - # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader - ) - # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties. if accelerator.distributed_type == DistributedType.TPU: model.tie_weights() @@ -494,6 +510,23 @@ def group_texts(examples): num_training_steps=args.max_train_steps, ) + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # Figure out how many steps we should save the Accelerator states + if hasattr(args.checkpointing_steps, "isdigit"): + checkpointing_steps = args.checkpointing_steps + if args.checkpointing_steps.isdigit(): + checkpointing_steps = int(args.checkpointing_steps) + else: + checkpointing_steps = None + + # We need to initialize the trackers we use, and also store our configuration + if args.with_tracking: + accelerator.init_trackers("clm_no_trainer", args) + # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps @@ -508,11 +541,38 @@ def group_texts(examples): progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + resume_step = None + path = args.resume_from_checkpoint + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + if "epoch" in path: + args.num_train_epochs -= int(path.replace("epoch_", "")) + else: + resume_step = int(path.replace("step_", "")) + args.num_train_epochs -= resume_step // len(train_dataloader) + resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + for epoch in range(args.num_train_epochs): model.train() + if args.with_tracking: + total_loss = 0 for step, batch in enumerate(train_dataloader): + # We need to skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == 0 and step < resume_step: + continue outputs = model(**batch) loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: @@ -522,6 +582,13 @@ def group_texts(examples): progress_bar.update(1) completed_steps += 1 + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + output_dir = f"step_{completed_steps}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + if completed_steps >= args.max_train_steps: break @@ -543,6 +610,16 @@ def group_texts(examples): logger.info(f"epoch {epoch}: perplexity: {perplexity}") + if args.with_tracking: + 
accelerator.log( + { + "perplexity": perplexity, + "train_loss": total_loss, + "epoch": epoch, + }, + step=completed_steps, + ) + if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -553,6 +630,12 @@ def group_texts(examples): commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True ) + if args.checkpointing_steps == "epoch": + output_dir = f"epoch_{epoch}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -562,6 +645,9 @@ def group_texts(examples): if args.push_to_hub: repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True) + with open(os.path.join(args.output_dir, "all_results.json"), "w") as f: + json.dump({"perplexity": perplexity}, f) + if __name__ == "__main__": main() diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index cc4ad602329c..8974882595ae 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index 4578e4570aa0..cd2bdd74ad2b 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 451f2fc17b89..a575644130f0 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -19,6 +19,7 @@ # You can also adapt this script on your own multiple choice task. Pointers for this are left as comments. import argparse +import json import logging import math import os @@ -177,6 +178,23 @@ def parse_args(): "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." ) parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + required=False, + help="Whether to load in all available experiment trackers from the environment and use them for logging.", + ) args = parser.parse_args() if args.push_to_hub: @@ -246,7 +264,8 @@ def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. 
- accelerator = Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment + accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -431,14 +450,6 @@ def preprocess_function(examples): device = accelerator.device model.to(device) - # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader - ) - - # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be - # shorter in multiprocess) - # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: @@ -453,6 +464,23 @@ def preprocess_function(examples): num_training_steps=args.max_train_steps, ) + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # Figure out how many steps we should save the Accelerator states + if hasattr(args.checkpointing_steps, "isdigit"): + checkpointing_steps = args.checkpointing_steps + if args.checkpointing_steps.isdigit(): + checkpointing_steps = int(args.checkpointing_steps) + else: + checkpointing_steps = None + + # We need to initialize the trackers we use, and also store our configuration + if args.with_tracking: + accelerator.init_trackers("clm_no_trainer", args) + # Metrics metric = load_metric("accuracy") @@ -470,11 +498,38 @@ def preprocess_function(examples): progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + resume_step = None + path = args.resume_from_checkpoint + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + if "epoch" in path: + args.num_train_epochs -= int(path.replace("epoch_", "")) + else: + resume_step = int(path.replace("step_", "")) + args.num_train_epochs -= resume_step // len(train_dataloader) + resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + for epoch in range(args.num_train_epochs): model.train() + if args.with_tracking: + total_loss = 0 for step, batch in enumerate(train_dataloader): + # We need to skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == 0 and step < resume_step: + continue outputs = model(**batch) loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: @@ -484,6 +539,13 @@ def preprocess_function(examples): 
progress_bar.update(1) completed_steps += 1 + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + output_dir = f"step_{completed_steps}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + if completed_steps >= args.max_train_steps: break @@ -500,6 +562,16 @@ def preprocess_function(examples): eval_metric = metric.compute() accelerator.print(f"epoch {epoch}: {eval_metric}") + if args.with_tracking: + accelerator.log( + { + "accuracy": eval_metric, + "train_loss": total_loss, + "epoch": epoch, + }, + step=completed_steps, + ) + if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -510,6 +582,12 @@ def preprocess_function(examples): commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True ) + if args.checkpointing_steps == "epoch": + output_dir = f"epoch_{epoch}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -518,6 +596,8 @@ def preprocess_function(examples): tokenizer.save_pretrained(args.output_dir) if args.push_to_hub: repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True) + with open(os.path.join(args.output_dir, "all_results.json"), "w") as f: + json.dump({"eval_accuracy": eval_metric["accuracy"]}, f) if __name__ == "__main__": diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 90d199b14d6d..242e83427389 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index 96aa07a8086b..d46e96d21043 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 939724bd60e3..541f1f784e03 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -53,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") @@ -210,6 +210,23 @@ def parse_args(): "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." 
) parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + required=False, + help="Whether to load in all available experiment trackers from the environment and use them for logging.", + ) args = parser.parse_args() # Sanity checks @@ -241,7 +258,8 @@ def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - accelerator = Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment + accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -670,14 +688,6 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) - # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader - ) - - # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be - # shorter in multiprocess) - # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: @@ -692,6 +702,23 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): num_training_steps=args.max_train_steps, ) + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # Figure out how many steps we should save the Accelerator states + if hasattr(args.checkpointing_steps, "isdigit"): + checkpointing_steps = args.checkpointing_steps + if args.checkpointing_steps.isdigit(): + checkpointing_steps = int(args.checkpointing_steps) + else: + checkpointing_steps = None + + # We need to initialize the trackers we use, and also store our configuration + if args.with_tracking: + accelerator.init_trackers("clm_no_trainer", args) + # Train! 
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps @@ -707,11 +734,38 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + resume_step = None + path = args.resume_from_checkpoint + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + if "epoch" in path: + args.num_train_epochs -= int(path.replace("epoch_", "")) + else: + resume_step = int(path.replace("step_", "")) + args.num_train_epochs -= resume_step // len(train_dataloader) + resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + for epoch in range(args.num_train_epochs): model.train() + if args.with_tracking: + total_loss = 0 for step, batch in enumerate(train_dataloader): + # We need to skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == 0 and step < resume_step: + continue outputs = model(**batch) loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: @@ -721,6 +775,10 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): progress_bar.update(1) completed_steps += 1 + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + accelerator.save_state(f"step_{completed_steps}") + if completed_steps >= args.max_train_steps: break @@ -847,6 +905,20 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): predict_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids) logger.info(f"Predict metrics: {predict_metric}") + if args.with_tracking: + log = { + "squad_v2" if args.version_2_with_negative else "squad": eval_metric, + "train_loss": total_loss, + "epoch": epoch, + } + if args.do_predict: + log["squad_v2_predict" if args.version_2_with_negative else "squad_predict"] = predict_metric + + accelerator.log(log, step=completed_steps) + + if args.checkpointing_steps == "epoch": + accelerator.save_state(f"epoch_{epoch}") + if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 964bcf63d175..6da75822398c 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -19,6 +19,7 @@ # You can also adapt this script on your own question answering task. Pointers for this are left as comments. import argparse +import json import logging import math import os @@ -55,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") @@ -239,6 +240,23 @@ def parse_args(): "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." ) parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + required=False, + help="Whether to load in all available experiment trackers from the environment and use them for logging.", + ) args = parser.parse_args() # Sanity checks @@ -270,7 +288,8 @@ def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - accelerator = Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment + accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -676,14 +695,6 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) - # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader - ) - - # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be - # shorter in multiprocess) - # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: @@ -698,6 +709,23 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): num_training_steps=args.max_train_steps, ) + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # Figure out how many steps we should save the Accelerator states + if hasattr(args.checkpointing_steps, "isdigit"): + checkpointing_steps = args.checkpointing_steps + if args.checkpointing_steps.isdigit(): + checkpointing_steps = int(args.checkpointing_steps) + else: + checkpointing_steps = None + + # We need to initialize the trackers we use, and also store our configuration + if args.with_tracking: + accelerator.init_trackers("clm_no_trainer", args) + # Train! 
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps @@ -713,11 +741,38 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + resume_step = None + path = args.resume_from_checkpoint + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + if "epoch" in path: + args.num_train_epochs -= int(path.replace("epoch_", "")) + else: + resume_step = int(path.replace("step_", "")) + args.num_train_epochs -= resume_step // len(train_dataloader) + resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + for epoch in range(args.num_train_epochs): model.train() + if args.with_tracking: + total_loss = 0 for step, batch in enumerate(train_dataloader): + # We need to skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == 0 and step < resume_step: + continue outputs = model(**batch) loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: @@ -727,9 +782,22 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): progress_bar.update(1) completed_steps += 1 + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + output_dir = f"step_{completed_steps}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + if completed_steps >= args.max_train_steps: break + if args.checkpointing_steps == "epoch": + output_dir = f"epoch_{epoch}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -810,6 +878,17 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): predict_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids) logger.info(f"Predict metrics: {predict_metric}") + if args.with_tracking: + log = { + "squad_v2" if args.version_2_with_negative else "squad": eval_metric, + "train_loss": total_loss, + "epoch": epoch, + } + if args.do_predict: + log["squad_v2_predict" if args.version_2_with_negative else "squad_predict"] = predict_metric + + accelerator.log(log, step=completed_steps) + if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -818,6 +897,8 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): tokenizer.save_pretrained(args.output_dir) if args.push_to_hub: repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True) + with open(os.path.join(args.output_dir, 
"all_results.json"), "w") as f: + json.dump({"eval_f1": eval_metric["f1"], "eval_exact": eval_metric["exact"]}, f) if __name__ == "__main__": diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index b4434102599c..cb6bd09bc40d 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -44,7 +44,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index 7c061495f325..6df37086240d 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index 46d4785fa8f8..3c368c4ae836 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index 7b39cb8e48f9..c35b636d7dd9 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -51,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index d9b8657d261b..adc9e616dda0 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -19,6 +19,7 @@ # You can also adapt this script on your own summarization task. Pointers for this are left as comments. import argparse +import json import logging import math import os @@ -262,6 +263,23 @@ def parse_args(): "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." 
) parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + required=False, + help="Whether to load in all available experiment trackers from the environment and use them for logging.", + ) args = parser.parse_args() # Sanity checks @@ -296,7 +314,8 @@ def main(): "`--source_prefix 'summarize: ' `" ) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - accelerator = Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment + accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -494,14 +513,6 @@ def postprocess_text(preds, labels): ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) - # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader - ) - - # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be - # shorter in multiprocess) - # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: @@ -516,6 +527,23 @@ def postprocess_text(preds, labels): num_training_steps=args.max_train_steps, ) + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # Figure out how many steps we should save the Accelerator states + if hasattr(args.checkpointing_steps, "isdigit"): + checkpointing_steps = args.checkpointing_steps + if args.checkpointing_steps.isdigit(): + checkpointing_steps = int(args.checkpointing_steps) + else: + checkpointing_steps = None + + # We need to initialize the trackers we use, and also store our configuration + if args.with_tracking: + accelerator.init_trackers("summarization_no_trainer", args) + # Metric metric = load_metric("rouge") @@ -532,12 +560,38 @@ def postprocess_text(preds, labels): # Only show the progress bar once on each machine. 
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + resume_step = None + path = args.resume_from_checkpoint + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + if "epoch" in path: + args.num_train_epochs -= int(path.replace("epoch_", "")) + else: + resume_step = int(path.replace("step_", "")) + args.num_train_epochs -= resume_step // len(train_dataloader) + resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step for epoch in range(args.num_train_epochs): model.train() + if args.with_tracking: + total_loss = 0 for step, batch in enumerate(train_dataloader): + # We need to skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == 0 and step < resume_step: + continue outputs = model(**batch) loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: @@ -547,6 +601,13 @@ def postprocess_text(preds, labels): progress_bar.update(1) completed_steps += 1 + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + output_dir = f"step_{completed_steps}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + if completed_steps >= args.max_train_steps: break @@ -596,6 +657,11 @@ def postprocess_text(preds, labels): logger.info(result) + if args.with_tracking: + result["train_loss"] = total_loss + result["epoch"] = epoch + accelerator.log(result, step=completed_steps) + if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -606,6 +672,12 @@ def postprocess_text(preds, labels): commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True ) + if args.checkpointing_steps == "epoch": + output_dir = f"epoch_{epoch}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -614,6 +686,16 @@ def postprocess_text(preds, labels): tokenizer.save_pretrained(args.output_dir) if args.push_to_hub: repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True) + with open(os.path.join(args.output_dir, "all_results.json"), "w") as f: + json.dump( + { + "eval_rouge1": result["rouge1"], + "eval_rouge2": result["rouge2"], + "eval_rougeL": result["rougeL"], + "eval_rougeLsum": result["rougeLsum"], + }, + f, + ) if __name__ == "__main__": diff --git a/examples/pytorch/test_accelerate_examples.py b/examples/pytorch/test_accelerate_examples.py new file mode 100644 index 000000000000..883dc434deb7 --- /dev/null +++ b/examples/pytorch/test_accelerate_examples.py @@ -0,0 +1,302 @@ +# 
coding=utf-8 +# Copyright 2018 HuggingFace Inc.. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import argparse +import json +import logging +import os +import sys +from unittest.mock import patch + +import torch + +from transformers.testing_utils import TestCasePlus, get_gpu_count, slow, torch_device +from transformers.utils import is_apex_available + + +SRC_DIRS = [ + os.path.join(os.path.dirname(__file__), dirname) + for dirname in [ + "text-generation", + "text-classification", + "token-classification", + "language-modeling", + "multiple-choice", + "question-answering", + "summarization", + "translation", + "image-classification", + "speech-recognition", + "audio-classification", + "speech-pretraining", + "image-pretraining", + ] +] +sys.path.extend(SRC_DIRS) + + +if SRC_DIRS is not None: + import run_clm_no_trainer + import run_glue_no_trainer + import run_mlm_no_trainer + import run_ner_no_trainer + import run_qa_no_trainer as run_squad_no_trainer + import run_summarization_no_trainer + import run_swag_no_trainer + import run_translation_no_trainer + +logging.basicConfig(level=logging.DEBUG) + +logger = logging.getLogger() + + +def get_setup_file(): + parser = argparse.ArgumentParser() + parser.add_argument("-f") + args = parser.parse_args() + return args.f + + +def get_results(output_dir): + results = {} + path = os.path.join(output_dir, "all_results.json") + if os.path.exists(path): + with open(path, "r") as f: + results = json.load(f) + else: + raise ValueError(f"can't find {path}") + return results + + +def is_cuda_and_apex_available(): + is_using_cuda = torch.cuda.is_available() and torch_device == "cuda" + return is_using_cuda and is_apex_available() + + +class ExamplesTestsNoTrainer(TestCasePlus): + def test_run_glue_no_trainer(self): + stream_handler = logging.StreamHandler(sys.stdout) + logger.addHandler(stream_handler) + + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_glue_no_trainer.py + --model_name_or_path distilbert-base-uncased + --output_dir {tmp_dir} + --train_file ./tests/fixtures/tests_samples/MRPC/train.csv + --validation_file ./tests/fixtures/tests_samples/MRPC/dev.csv + --per_device_train_batch_size=2 + --per_device_eval_batch_size=1 + --learning_rate=1e-4 + --seed=42 + --checkpointing_steps epoch + """.split() + + if is_cuda_and_apex_available(): + testargs.append("--fp16") + + with patch.object(sys, "argv", testargs): + run_glue_no_trainer.main() + result = get_results(tmp_dir) + self.assertGreaterEqual(result["eval_accuracy"], 0.75) + self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0"))) + + def test_run_clm_no_trainer(self): + stream_handler = logging.StreamHandler(sys.stdout) + logger.addHandler(stream_handler) + + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_clm_no_trainer.py + --model_name_or_path distilgpt2 + --train_file ./tests/fixtures/sample_text.txt + --validation_file ./tests/fixtures/sample_text.txt + --block_size 128 + --per_device_train_batch_size 5 + --per_device_eval_batch_size 5 + 
--num_train_epochs 2 + --output_dir {tmp_dir} + --checkpointing_steps epoch + """.split() + + if torch.cuda.device_count() > 1: + # Skipping because there are not enough batches to train the model + would need a drop_last to work. + return + + with patch.object(sys, "argv", testargs): + run_clm_no_trainer.main() + result = get_results(tmp_dir) + self.assertLess(result["perplexity"], 100) + self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0"))) + + def test_run_mlm_no_trainer(self): + stream_handler = logging.StreamHandler(sys.stdout) + logger.addHandler(stream_handler) + + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_mlm_no_trainer.py + --model_name_or_path distilroberta-base + --train_file ./tests/fixtures/sample_text.txt + --validation_file ./tests/fixtures/sample_text.txt + --output_dir {tmp_dir} + --num_train_epochs=1 + --checkpointing_steps epoch + """.split() + + with patch.object(sys, "argv", testargs): + run_mlm_no_trainer.main() + result = get_results(tmp_dir) + self.assertLess(result["perplexity"], 42) + self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0"))) + + def test_run_ner_no_trainer(self): + stream_handler = logging.StreamHandler(sys.stdout) + logger.addHandler(stream_handler) + + # with so little data distributed training needs more epochs to get the score on par with 0/1 gpu + epochs = 7 if get_gpu_count() > 1 else 2 + + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_ner_no_trainer.py + --model_name_or_path bert-base-uncased + --train_file tests/fixtures/tests_samples/conll/sample.json + --validation_file tests/fixtures/tests_samples/conll/sample.json + --output_dir {tmp_dir} + --learning_rate=2e-4 + --per_device_train_batch_size=2 + --per_device_eval_batch_size=2 + --num_train_epochs={epochs} + --seed 7 + --checkpointing_steps epoch + """.split() + + with patch.object(sys, "argv", testargs): + run_ner_no_trainer.main() + result = get_results(tmp_dir) + self.assertGreaterEqual(result["eval_accuracy"], 0.75) + self.assertLess(result["train_loss"], 0.5) + self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0"))) + + def test_run_squad_no_trainer(self): + stream_handler = logging.StreamHandler(sys.stdout) + logger.addHandler(stream_handler) + + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_qa_no_trainer.py + --model_name_or_path bert-base-uncased + --version_2_with_negative=False + --train_file tests/fixtures/tests_samples/SQUAD/sample.json + --validation_file tests/fixtures/tests_samples/SQUAD/sample.json + --output_dir {tmp_dir} + --max_train_steps=10 + --num_warmup_steps=2 + --learning_rate=2e-4 + --per_device_train_batch_size=2 + --per_device_eval_batch_size=1 + --checkpointing_steps epoch + """.split() + + with patch.object(sys, "argv", testargs): + run_squad_no_trainer.main() + result = get_results(tmp_dir) + self.assertGreaterEqual(result["eval_f1"], 30) + self.assertGreaterEqual(result["eval_exact"], 30) + self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0"))) + + def test_run_swag_no_trainer(self): + stream_handler = logging.StreamHandler(sys.stdout) + logger.addHandler(stream_handler) + + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_swag_no_trainer.py + --model_name_or_path bert-base-uncased + --train_file tests/fixtures/tests_samples/swag/sample.json + --validation_file tests/fixtures/tests_samples/swag/sample.json + --output_dir {tmp_dir} + --max_train_steps=20 + --num_warmup_steps=2 + --learning_rate=2e-4 + --per_device_train_batch_size=2 + 
--per_device_eval_batch_size=1 + """.split() + + with patch.object(sys, "argv", testargs): + run_swag_no_trainer.main() + result = get_results(tmp_dir) + self.assertGreaterEqual(result["eval_accuracy"], 0.8) + + @slow + def test_run_summarization_no_trainer(self): + stream_handler = logging.StreamHandler(sys.stdout) + logger.addHandler(stream_handler) + + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_summarization_no_trainer.py + --model_name_or_path t5-small + --train_file tests/fixtures/tests_samples/xsum/sample.json + --validation_file tests/fixtures/tests_samples/xsum/sample.json + --output_dir {tmp_dir} + --max_train_steps=50 + --num_warmup_steps=8 + --learning_rate=2e-4 + --per_device_train_batch_size=2 + --per_device_eval_batch_size=1 + --checkpointing_steps epoch + """.split() + + with patch.object(sys, "argv", testargs): + run_summarization_no_trainer.main() + result = get_results(tmp_dir) + self.assertGreaterEqual(result["eval_rouge1"], 10) + self.assertGreaterEqual(result["eval_rouge2"], 2) + self.assertGreaterEqual(result["eval_rougeL"], 7) + self.assertGreaterEqual(result["eval_rougeLsum"], 7) + self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0"))) + + @slow + def test_run_translation_no_trainer(self): + stream_handler = logging.StreamHandler(sys.stdout) + logger.addHandler(stream_handler) + + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_translation_no_trainer.py + --model_name_or_path sshleifer/student_marian_en_ro_6_1 + --source_lang en + --target_lang ro + --train_file tests/fixtures/tests_samples/wmt16/sample.json + --validation_file tests/fixtures/tests_samples/wmt16/sample.json + --output_dir {tmp_dir} + --max_train_steps=50 + --num_warmup_steps=8 + --learning_rate=3e-3 + --per_device_train_batch_size=2 + --per_device_eval_batch_size=1 + --source_lang en_XX + --target_lang ro_RO + --checkpointing_steps epoch + """.split() + + with patch.object(sys, "argv", testargs): + run_translation_no_trainer.main() + result = get_results(tmp_dir) + self.assertGreaterEqual(result["eval_bleu"], 30) + self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0"))) diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index a0730f609820..b15a0378ca7d 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 4dac18206b47..2c7fa186d0e0 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -14,6 +14,7 @@ # limitations under the License. """ Finetuning a 🤗 Transformers model for sequence classification on GLUE.""" import argparse +import json import logging import math import os @@ -150,6 +151,23 @@ def parse_args(): "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." 
) parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + required=False, + help="Whether to load in all available experiment trackers from the environment and use them for logging.", + ) args = parser.parse_args() # Sanity checks @@ -173,7 +191,8 @@ def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - accelerator = Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment + accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -376,14 +395,6 @@ def preprocess_function(examples): ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) - # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader - ) - - # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be - # shorter in multiprocess) - # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: @@ -398,6 +409,23 @@ def preprocess_function(examples): num_training_steps=args.max_train_steps, ) + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # Figure out how many steps we should save the Accelerator states + if hasattr(args.checkpointing_steps, "isdigit"): + checkpointing_steps = args.checkpointing_steps + if args.checkpointing_steps.isdigit(): + checkpointing_steps = int(args.checkpointing_steps) + else: + checkpointing_steps = None + + # We need to initialize the trackers we use, and also store our configuration + if args.with_tracking: + accelerator.init_trackers("glue_no_trainer", args) + # Get the metric function if args.task_name is not None: metric = load_metric("glue", args.task_name) @@ -417,12 +445,38 @@ def preprocess_function(examples): # Only show the progress bar once on each machine. 
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + resume_step = None + path = args.resume_from_checkpoint + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + if "epoch" in path: + args.num_train_epochs -= int(path.replace("epoch_", "")) + else: + resume_step = int(path.replace("step_", "")) + args.num_train_epochs -= resume_step // len(train_dataloader) + resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step for epoch in range(args.num_train_epochs): model.train() + if args.with_tracking: + total_loss = 0 for step, batch in enumerate(train_dataloader): + # We need to skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == 0 and step < resume_step: + continue outputs = model(**batch) loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: @@ -432,6 +486,13 @@ def preprocess_function(examples): progress_bar.update(1) completed_steps += 1 + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + output_dir = f"step_{completed_steps}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + if completed_steps >= args.max_train_steps: break @@ -447,6 +508,16 @@ def preprocess_function(examples): eval_metric = metric.compute() logger.info(f"epoch {epoch}: {eval_metric}") + if args.with_tracking: + accelerator.log( + { + "accuracy" if args.task_name is not None else "glue": eval_metric, + "train_loss": total_loss, + "epoch": epoch, + }, + step=completed_steps, + ) + if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -457,6 +528,12 @@ def preprocess_function(examples): commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True ) + if args.checkpointing_steps == "epoch": + output_dir = f"epoch_{epoch}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -486,6 +563,10 @@ def preprocess_function(examples): eval_metric = metric.compute() logger.info(f"mnli-mm: {eval_metric}") + if args.output_dir is not None: + with open(os.path.join(args.output_dir, "all_results.json"), "w") as f: + json.dump({"eval_accuracy": eval_metric["accuracy"]}, f) + if __name__ == "__main__": main() diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index 4a17a5d702ba..cd4d44b6a61e 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -47,7 
+47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index 5545b35862b3..fc54c77d6204 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 90b01d6deb16..ab9fcce6df95 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -19,6 +19,7 @@ """ import argparse +import json import logging import math import os @@ -204,6 +205,23 @@ def parse_args(): "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." ) parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + required=False, + help="Whether to load in all available experiment trackers from the environment and use them for logging.", + ) args = parser.parse_args() # Sanity checks @@ -227,7 +245,8 @@ def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - accelerator = Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment + accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -491,14 +510,6 @@ def tokenize_and_align_labels(examples): device = accelerator.device model.to(device) - # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader - ) - - # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be - # shorter in multiprocess) - # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: @@ -513,6 +524,23 @@ def tokenize_and_align_labels(examples): num_training_steps=args.max_train_steps, ) + # Prepare everything with our `accelerator`. 
+ model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # Figure out how many steps we should save the Accelerator states + if hasattr(args.checkpointing_steps, "isdigit"): + checkpointing_steps = args.checkpointing_steps + if args.checkpointing_steps.isdigit(): + checkpointing_steps = int(args.checkpointing_steps) + else: + checkpointing_steps = None + + # We need to initialize the trackers we use, and also store our configuration + if args.with_tracking: + accelerator.init_trackers("clm_no_trainer", args) + # Metrics metric = load_metric("seqeval") @@ -569,12 +597,38 @@ def compute_metrics(): # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + resume_step = None + path = args.resume_from_checkpoint + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + if "epoch" in path: + args.num_train_epochs -= int(path.replace("epoch_", "")) + else: + resume_step = int(path.replace("step_", "")) + args.num_train_epochs -= resume_step // len(train_dataloader) + resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step for epoch in range(args.num_train_epochs): model.train() + if args.with_tracking: + total_loss = 0 for step, batch in enumerate(train_dataloader): + # We need to skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == 0 and step < resume_step: + continue outputs = model(**batch) loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: @@ -584,6 +638,13 @@ def compute_metrics(): progress_bar.update(1) completed_steps += 1 + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + output_dir = f"step_{completed_steps}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + if completed_steps >= args.max_train_steps: break @@ -605,9 +666,17 @@ def compute_metrics(): references=refs, ) # predictions and preferences are expected to be a nested list of labels, not label_ids - # eval_metric = metric.compute() eval_metric = compute_metrics() accelerator.print(f"epoch {epoch}:", eval_metric) + if args.with_tracking: + accelerator.log( + { + "seqeval": eval_metric, + "train_loss": total_loss, + "epoch": epoch, + }, + step=completed_steps, + ) if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() @@ -619,6 +688,12 @@ def compute_metrics(): commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True ) + if args.checkpointing_steps == "epoch": + output_dir = f"epoch_{epoch}" + if args.output_dir is not None: + output_dir = 
os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -628,6 +703,9 @@ def compute_metrics(): if args.push_to_hub: repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True) + with open(os.path.join(args.output_dir, "all_results.json"), "w") as f: + json.dump({"eval_accuracy": eval_metric["accuracy"], "train_loss": float(loss.cpu().detach().numpy())}, f) + if __name__ == "__main__": main() diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index f7e98276dc7b..6f2630104f7e 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -51,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index 34c1bcd0e72f..034387582b84 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -19,6 +19,7 @@ # You can also adapt this script on your own text translation task. Pointers for this are left as comments. import argparse +import json import logging import math import os @@ -243,6 +244,23 @@ def parse_args(): "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." ) parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + required=False, + help="Whether to load in all available experiment trackers from the environment and use them for logging.", + ) args = parser.parse_args() # Sanity checks @@ -268,7 +286,8 @@ def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - accelerator = Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment + accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( @@ -472,14 +491,6 @@ def preprocess_function(examples): ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) - # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader - ) - - # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be - # shorter in multiprocess) - # Scheduler and math around the number of training steps. 
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: @@ -494,6 +505,23 @@ def preprocess_function(examples): num_training_steps=args.max_train_steps, ) + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # Figure out how many steps we should save the Accelerator states + if hasattr(args.checkpointing_steps, "isdigit"): + checkpointing_steps = args.checkpointing_steps + if args.checkpointing_steps.isdigit(): + checkpointing_steps = int(args.checkpointing_steps) + else: + checkpointing_steps = None + + # We need to initialize the trackers we use, and also store our configuration + if args.with_tracking: + accelerator.init_trackers("translation_no_trainer", args) + metric = load_metric("sacrebleu") def postprocess_text(preds, labels): @@ -516,11 +544,38 @@ def postprocess_text(preds, labels): progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + resume_step = None + path = args.resume_from_checkpoint + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + if "epoch" in path: + args.num_train_epochs -= int(path.replace("epoch_", "")) + else: + resume_step = int(path.replace("step_", "")) + args.num_train_epochs -= resume_step // len(train_dataloader) + resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + for epoch in range(args.num_train_epochs): model.train() + if args.with_tracking: + total_loss = 0 for step, batch in enumerate(train_dataloader): + # We need to skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == 0 and step < resume_step: + continue outputs = model(**batch) loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: @@ -530,6 +585,13 @@ def postprocess_text(preds, labels): progress_bar.update(1) completed_steps += 1 + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + output_dir = f"step_{completed_steps}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + if completed_steps >= args.max_train_steps: break @@ -574,6 +636,16 @@ def postprocess_text(preds, labels): eval_metric = metric.compute() logger.info({"bleu": eval_metric["score"]}) + if args.with_tracking: + accelerator.log( + { + "blue": eval_metric["score"], + "train_loss": total_loss, + "epoch": epoch, + }, + step=completed_steps, + ) + if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -584,6 +656,12 @@ def postprocess_text(preds, labels): 
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True ) + if args.checkpointing_steps == "epoch": + output_dir = f"step_{completed_steps}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -592,6 +670,8 @@ def postprocess_text(preds, labels): tokenizer.save_pretrained(args.output_dir) if args.push_to_hub: repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True) + with open(os.path.join(args.output_dir, "all_results.json"), "w") as f: + json.dump({"eval_bleu": eval_metric["score"]}, f) if __name__ == "__main__": diff --git a/examples/research_projects/tapex/README.md b/examples/research_projects/tapex/README.md new file mode 100644 index 000000000000..7d98901e281e --- /dev/null +++ b/examples/research_projects/tapex/README.md @@ -0,0 +1,288 @@ + + +# Run Table Tasks with TAPEX + +TAPEX is a table pre-training approach for table-related tasks. By learning a neural SQL executor over a synthetic corpus based on generative language models (e.g., BART), it achieves state-of-the-art performance on several table-based question answering benchmarks and table-based fact verification benchmark. More details can be found in the original paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/pdf/2107.07653.pdf). + +> If you are also familiar with [fairseq](https://github.com/pytorch/fairseq), you may also find [the official implementation](https://github.com/microsoft/Table-Pretraining) useful, which leverages the framework. + +## Table Question Answering Tasks + +### What is Table Question Answering + +![Example](https://table-pretraining.github.io/assets/tableqa_task.png) + +The task of Table Question Answering (TableQA) is to empower machines to answer users' questions over a given table. The resulting answer(s) can be a region in the table, or a number calculated by applying aggregation operators to a specific region. + +### What Questions Can be Answered + +Benefiting from the powerfulness of generative models, TAPEX can deal with almost all kinds of questions over tables (if there is training data). Below are some typical question and their answers taken from [WikiTableQuestion](https://nlp.stanford.edu/blog/wikitablequestions-a-complex-real-world-question-understanding-dataset). + +| Question | Answer | +| :---: | :---: | +| What is the years won for each team? | 2004, 2008, 2012 | +| How long did Taiki Tsuchiya last? | 4:27 | +| What is the total amount of matches drawn? | 1 | +| Besides Tiger Woods, what other player won between 2007 and 2009? | Camilo Villegas | +| What was the last Baekje Temple? | Uija | +| What is the difference between White voters and Black voters in 1948? | 0 | +| What is the average number of sailors for each country during the worlds qualification tournament? | 2 | + + +### How to Fine-tune TAPEX on TableQA + +We provide a fine-tuning script of tapex for TableQA on the WikiSQL benchmark: [WikiSQL](https://github.com/salesforce/WikiSQL). +This script is customized for tapex models, and can be easily adapted to other benchmarks such as WikiTableQuestion +(only some tweaks in the function `preprocess_tableqa_function`). 
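+
+For orientation, the tweak mentioned above mostly concerns how each raw table is rebuilt as a `pandas.DataFrame` before it is passed to `TapexTokenizer` together with the question. The following is a rough, hypothetical sketch of that step only (it is not the actual script code, and the dataset field names `table`, `header`, `rows` and `question` are assumptions that differ between WikiSQL and WikiTableQuestions):
+
+```python
+# Hypothetical sketch of the kind of tweak needed in `preprocess_tableqa_function`;
+# the example field names below are assumptions, not the real dataset schema.
+import pandas as pd
+from transformers import TapexTokenizer
+
+tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base")
+
+def preprocess_tableqa_function(examples):
+    # Rebuild each table as a pandas DataFrame, which is what TapexTokenizer consumes.
+    tables = [
+        pd.DataFrame.from_records(table["rows"], columns=table["header"])
+        for table in examples["table"]
+    ]
+    questions = examples["question"]
+    # Tokenize (table, question) pairs; truncation keeps inputs within the model limit.
+    return tokenizer(tables, questions, max_length=1024, truncation=True)
+```
+
+The real scripts additionally build the generation targets (the answers) in the same function; see `run_wikisql_with_tapex.py` for the full version.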
+ +#### TAPEX-Base on WikiSQL + +Here is how to run the script on the WikiSQL with `tapex-base`: +> The default hyper-parameter may allow you to reproduce our reported tapex-base results within the memory budget of 16GB and 1 GPU card. If you have more GPU cards, you could reduce `gradient_accumulation_steps` accordingly. + +```bash +export EXP_NAME=wikisql_tapex_base + +python run_wikisql_with_tapex.py \ + --do_train \ + --do_eval \ + --output_dir $EXP_NAME \ + --model_name_or_path microsoft/tapex-base \ + --overwrite_output_dir \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 8 \ + --per_device_eval_batch_size 4 \ + --learning_rate 3e-5 \ + --logging_steps 10 \ + --eval_steps 1000 \ + --save_steps 1000 \ + --warmup_steps 1000 \ + --evaluation_strategy steps \ + --predict_with_generate \ + --num_beams 5 \ + --weight_decay 1e-2 \ + --label_smoothing_factor 0.1 \ + --max_steps 20000 +``` + +#### TAPEX-Large on WikiSQL + +Here is how to run the script on the WikiSQL with `tapex-large`: +> The default hyper-parameter may allow you to reproduce our reported tapex-large results within the memory budget of 16GB and 1 GPU card with fp16. If you have more GPU cards, you could reduce `gradient_accumulation_steps` accordingly. If you do not install apex or other mixed-precision-training libs, you could disable the `predict_with_generate` option to save GPU memory and manually evaluate the model once the fine-tuning finished. Or just pick up the last checkpoint, which usually performs good enough on the dataset. + +```bash +export EXP_NAME=wikisql_tapex_large + +python run_wikisql_with_tapex.py \ + --do_train \ + --do_eval \ + --output_dir $EXP_NAME \ + --model_name_or_path microsoft/tapex-large \ + --overwrite_output_dir \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 32 \ + --per_device_eval_batch_size 4 \ + --learning_rate 3e-5 \ + --logging_steps 10 \ + --eval_steps 1000 \ + --save_steps 1000 \ + --warmup_steps 1000 \ + --evaluation_strategy steps \ + --predict_with_generate \ + --num_beams 5 \ + --weight_decay 1e-2 \ + --label_smoothing_factor 0.1 \ + --max_steps 20000 \ + --fp16 +``` + +#### TAPEX-Base on WikiTableQuestions + +Here is how to run the script on the WikiTableQuestions with `tapex-base`: +> The default hyper-parameter may allow you to reproduce our reported tapex-base results within the memory budget of 16GB and 1 GPU card. If you have more GPU cards, you could reduce `gradient_accumulation_steps` accordingly. + +```bash +export EXP_NAME=wikitablequestions_tapex_base + +python run_wikitablequestions_with_tapex.py \ + --do_train \ + --do_eval \ + --output_dir $EXP_NAME \ + --model_name_or_path microsoft/tapex-base \ + --overwrite_output_dir \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 8 \ + --per_device_eval_batch_size 4 \ + --learning_rate 3e-5 \ + --logging_steps 10 \ + --eval_steps 1000 \ + --save_steps 1000 \ + --warmup_steps 1000 \ + --evaluation_strategy steps \ + --predict_with_generate \ + --num_beams 5 \ + --weight_decay 1e-2 \ + --label_smoothing_factor 0.1 \ + --max_steps 20000 +``` + +#### TAPEX-Large on WikiTableQuestions + +Here is how to run the script on the WikiTableQuestions with `tapex-large`: +> The default hyper-parameter may allow you to reproduce our reported tapex-large results within the memory budget of 16GB and 1 GPU card with fp16. If you have more GPU cards, you could reduce `gradient_accumulation_steps` accordingly. 
If you do not install apex or other mixed-precision-training libs, you could reduce the `per_device_train_batch_size` and `per_device_eval_batch_size` and have another try. Or you could disable the `predict_with_generate` option to save GPU memory and manually evaluate the model once the fine-tuning finished. Or just pick up the last checkpoint, which usually performs good enough on the dataset. + +```bash +export EXP_NAME=wikitablequestions_tapex_large + +python run_wikitablequestions_with_tapex.py \ + --do_train \ + --do_eval \ + --output_dir $EXP_NAME \ + --model_name_or_path microsoft/tapex-large \ + --overwrite_output_dir \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 12 \ + --per_device_eval_batch_size 4 \ + --learning_rate 3e-5 \ + --logging_steps 10 \ + --eval_steps 1000 \ + --save_steps 1000 \ + --warmup_steps 1000 \ + --evaluation_strategy steps \ + --predict_with_generate \ + --num_beams 5 \ + --weight_decay 1e-2 \ + --label_smoothing_factor 0.1 \ + --max_steps 20000 \ + --fp16 +``` + +### How to Evaluate TAPEX Fine-tuned Models on TableQA + +We provide fine-tuned model weights to reproduce our results. You can evaluate them using the following command: +> You can also replace `microsoft/tapex-base-finetuned-wikisql` with your local directory to evaluate your fine-tuned models. Notice that if the model has a larger size, you should reduce `per_device_eval_batch_size` to fit the memory requirement. + +```bash +export EXP_NAME=wikisql_tapex_base_eval + +python run_wikisql_with_tapex.py \ + --do_eval \ + --model_name_or_path microsoft/tapex-base-finetuned-wikisql \ + --output_dir $EXP_NAME \ + --per_device_eval_batch_size 4 \ + --predict_with_generate \ + --num_beams 5 +``` + +## Table Fact Verification Tasks + +### What is Table Fact Verification + +![Example](https://table-pretraining.github.io/assets/tableft_task.png) + +The task of Table Fact Verification (TableFV) is to empower machines to justify if a statement follows facts in a given table. The result is a binary classification belonging to `1` (entailed) or `0` (refused). + +### How to Fine-tune TAPEX on TableFV + +#### TAPEX-Base on TabFact + +We provide a fine-tuning script of tapex for TableFV on the TabFact benchmark: [TabFact](https://github.com/wenhuchen/Table-Fact-Checking). + +Here is how to run the script on the TabFact: +> The default hyper-parameter may allow you to reproduce our reported tapex-base results within the memory budget of 16GB and 1 GPU card. If you have more GPU cards, you could reduce `gradient_accumulation_steps` accordingly. Note that the `eval_accumulation_steps` is necessary, otherwise GPU memory leaks will occur during the evaluation. + +```bash +export EXP_NAME=tabfact_tapex_base + +python run_tabfact_with_tapex.py \ + --do_train \ + --do_eval \ + --output_dir $EXP_NAME \ + --model_name_or_path microsoft/tapex-base \ + --overwrite_output_dir \ + --per_device_train_batch_size 3 \ + --gradient_accumulation_steps 16 \ + --per_device_eval_batch_size 12 \ + --eval_accumulation_steps 6 \ + --warm_steps 1000 \ + --logging_steps 10 \ + --learning_rate 3e-5 \ + --eval_steps 1000 \ + --save_steps 1000 \ + --evaluation_strategy steps \ + --weight_decay 1e-2 \ + --max_steps 30000 \ + --max_grad_norm 0.1 +``` + +#### TAPEX-Large on TabFact + +Here is how to run the script on the TabFact: +> The default hyper-parameter may allow you to reproduce our reported tapex-base results within the memory budget of 24GB and 1 GPU card. 
Sorry we cannot reduce the memory consumption since the model input in TabFact usually contains nearly ~1000 tokens. If you have more GPU cards, you could reduce `gradient_accumulation_steps` accordingly. Note that the `eval_accumulation_steps` is necessary, otherwise GPU memory leaks will occur during the evaluation. + +```bash +export EXP_NAME=tabfact_tapex_large + +python run_tabfact_with_tapex.py \ + --do_train \ + --do_eval \ + --output_dir $EXP_NAME \ + --model_name_or_path microsoft/tapex-large \ + --overwrite_output_dir \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 18 \ + --per_device_eval_batch_size 4 \ + --eval_accumulation_steps 12 \ + --warm_steps 1000 \ + --logging_steps 10 \ + --learning_rate 3e-5 \ + --eval_steps 1000 \ + --save_steps 1000 \ + --evaluation_strategy steps \ + --weight_decay 1e-2 \ + --max_steps 30000 \ + --max_grad_norm 0.1 +``` + +### How to Evaluate TAPEX Fine-tuned Models on TableFV + +We provide fine-tuned model weights to reproduce our results. You can evaluate them using the following command: +> You can also replace `microsoft/tapex-base-finetuned-tabfact` with your local directory to evaluate your fine-tuned models. Notice that if the model has a larger size, you should reduce `per_device_eval_batch_size` to fit the memory requirement. + +```bash +export EXP_NAME=tabfact_tapex_base_eval + +python run_tabfact_with_tapex.py \ + --do_eval \ + --model_name_or_path microsoft/tapex-base-finetuned-tabfact \ + --output_dir $EXP_NAME \ + --per_device_eval_batch_size 12 \ + --eval_accumulation_steps 6 +``` + +## Reproduced Results + +We get the following results on the dev set of the benchmark with the previous commands: + +| Task | Model Size | Metric | Result | +|:---:|:---:|:---:|:---:| +| WikiSQL (Weak) | Base | Denotation Accuracy | 88.1 | +| WikiSQL (Weak) | Large | Denotation Accuracy | 89.5 | +| WikiTableQuestion | Base | Denotation Accuracy | 47.1 | +| WikiTableQuestion | Large | Denotation Accuracy | 57.2 | +| TabFact | Base | Accuracy | 78.7 | +| TabFact | Large | Accuracy | 83.6 | diff --git a/examples/research_projects/tapex/requirements.txt b/examples/research_projects/tapex/requirements.txt new file mode 100644 index 000000000000..2379012a9b23 --- /dev/null +++ b/examples/research_projects/tapex/requirements.txt @@ -0,0 +1,4 @@ +numpy +datasets +pandas +nltk \ No newline at end of file diff --git a/examples/research_projects/tapex/run_tabfact_with_tapex.py b/examples/research_projects/tapex/run_tabfact_with_tapex.py new file mode 100644 index 000000000000..0ed573ad9c1a --- /dev/null +++ b/examples/research_projects/tapex/run_tabfact_with_tapex.py @@ -0,0 +1,459 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2022 The Microsoft and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Fine-tuning the library models for tapex on table-based fact verification tasks. 
+Adapted from script: https://github.com/huggingface/transformers/blob/master/examples/pytorch/text-classification/run_glue.py +""" + +import logging +import os +import random +import sys +from dataclasses import dataclass, field +from typing import Optional + +import datasets +import numpy as np +import pandas as pd +from datasets import load_dataset + +import transformers +from transformers import ( + AutoConfig, + BartForSequenceClassification, + DataCollatorWithPadding, + EvalPrediction, + HfArgumentParser, + TapexTokenizer, + Trainer, + TrainingArguments, + default_data_collator, + set_seed, +) +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version +from transformers.utils.versions import require_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.17.0.dev0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") + +logger = logging.getLogger(__name__) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + + Using `HfArgumentParser` we can turn this class + into argparse arguments to be able to specify them on + the command line. + """ + + dataset_name: Optional[str] = field( + default="tab_fact", metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default="tab_fact", + metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}, + ) + max_seq_length: int = field( + default=1024, + metadata={ + "help": "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} + ) + pad_to_max_length: bool = field( + default=False, + metadata={ + "help": "Whether to pad all samples to `max_seq_length`. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch." + }, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + max_predict_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " + "value if set." 
+ }, + ) + train_file: Optional[str] = field( + default=None, metadata={"help": "A csv or a json file containing the training data."} + ) + validation_file: Optional[str] = field( + default=None, metadata={"help": "A csv or a json file containing the validation data."} + ) + test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."}) + + def __post_init__(self): + if self.dataset_name is not None: + pass + elif self.train_file is None or self.validation_file is None: + raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.") + else: + train_extension = self.train_file.split(".")[-1] + assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file." + validation_extension = self.validation_file.split(".")[-1] + assert ( + validation_extension == train_extension + ), "`validation_file` should have the same extension (csv or json) as `train_file`." + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + default=None, metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. 
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Training/evaluation parameters {training_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) + # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). + # + # For JSON files, this script will use the `question` column for the input question and `table` column for the corresponding table. + # + # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this + # single column. You can easily tweak this behavior (see below) + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + ) + else: + # Loading a dataset from your local files. + # CSV/JSON training and evaluation files are needed. + data_files = {"train": data_args.train_file, "validation": data_args.validation_file} + + # Get the test dataset: you can provide your own CSV/JSON test file (see below) + # when you use `do_predict` without specifying a GLUE benchmark task. + if training_args.do_predict: + if data_args.test_file is not None: + train_extension = data_args.train_file.split(".")[-1] + test_extension = data_args.test_file.split(".")[-1] + assert ( + test_extension == train_extension + ), "`test_file` should have the same extension (csv or json) as `train_file`." 
+ data_files["test"] = data_args.test_file + else: + raise ValueError("Need either a GLUE task or a test file for `do_predict`.") + + for key in data_files.keys(): + logger.info(f"load a local file for {key}: {data_files[key]}") + + if data_args.train_file.endswith(".csv"): + # Loading a dataset from local csv files + raw_datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir) + else: + # Loading a dataset from local json files + raw_datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) + # See more about loading any type of standard or custom dataset at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Labels + label_list = raw_datasets["train"].features["label"].names + num_labels = len(label_list) + + # Load pretrained model and tokenizer + # + # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + num_labels=num_labels, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + # load tapex tokenizer + tokenizer = TapexTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + add_prefix_space=True, + ) + model = BartForSequenceClassification.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + # Padding strategy + if data_args.pad_to_max_length: + padding = "max_length" + else: + # We will pad later, dynamically at batch creation, to the max sequence length in each batch + padding = False + + # Some models have set the order of the labels to use, so let's make sure we do use it. + model.config.label2id = {"Refused": 0, "Entailed": 1} + model.config.id2label = {0: "Refused", 1: "Entailed"} + + if data_args.max_seq_length > tokenizer.model_max_length: + logger.warning( + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." + ) + max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) + + def preprocess_tabfact_function(examples): + # Tokenize the texts + def _convert_table_text_to_pandas(_table_text): + """Runs the structured pandas table object for _table_text. 
+ An example _table_text can be: round#clubs remaining\nfirst round#156\n + """ + _table_content = [_table_row.split("#") for _table_row in _table_text.strip("\n").split("\n")] + _table_pd = pd.DataFrame.from_records(_table_content[1:], columns=_table_content[0]) + return _table_pd + + questions = examples["statement"] + tables = list(map(_convert_table_text_to_pandas, examples["table_text"])) + result = tokenizer(tables, questions, padding=padding, max_length=max_seq_length, truncation=True) + + result["label"] = examples["label"] + return result + + with training_args.main_process_first(desc="dataset map pre-processing"): + raw_datasets = raw_datasets.map( + preprocess_tabfact_function, + batched=True, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + if training_args.do_train: + if "train" not in raw_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = raw_datasets["train"] + if data_args.max_train_samples is not None: + train_dataset = train_dataset.select(range(data_args.max_train_samples)) + + if training_args.do_eval: + if "validation" not in raw_datasets and "validation_matched" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = raw_datasets["validation"] + if data_args.max_eval_samples is not None: + eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) + + if training_args.do_predict or data_args.test_file is not None: + if "test" not in raw_datasets and "test_matched" not in raw_datasets: + raise ValueError("--do_predict requires a test dataset") + predict_dataset = raw_datasets["test"] + if data_args.max_predict_samples is not None: + predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) + + # Log a few random samples from the training set: + if training_args.do_train: + for index in random.sample(range(len(train_dataset)), 3): + logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") + + # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a + # predictions and label_ids field) and has to return a dictionary string to float. + def compute_metrics(p: EvalPrediction): + preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions + preds = np.argmax(preds, axis=1) + return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} + + # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. 
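+    # In short: with static padding we can use default_data_collator; under fp16 we pad dynamically to a
+    # multiple of 8 (a tensor-core friendly shape); otherwise None lets the Trainer fall back to its
+    # default DataCollatorWithPadding.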
+ if data_args.pad_to_max_length: + data_collator = default_data_collator + elif training_args.fp16: + data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) + else: + data_collator = None + + # Initialize our Trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + compute_metrics=compute_metrics, + tokenizer=tokenizer, + data_collator=data_collator, + ) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + metrics = train_result.metrics + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.save_model() # Saves the tokenizer too for easy upload + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + + metrics = trainer.evaluate(eval_dataset=eval_dataset) + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + if training_args.do_predict: + logger.info("*** Predict ***") + + # Removing the `label` columns because it contains -1 and Trainer won't like that. + predict_dataset = predict_dataset.remove_columns("label") + predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions + predictions = np.argmax(predictions, axis=1) + + output_predict_file = os.path.join(training_args.output_dir, "predict_results_tabfact.txt") + if trainer.is_world_process_zero(): + with open(output_predict_file, "w") as writer: + logger.info("***** Predict Results *****") + writer.write("index\tprediction\n") + for index, item in enumerate(predictions): + item = label_list[item] + writer.write(f"{index}\t{item}\n") + + kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"} + + if training_args.push_to_hub: + trainer.push_to_hub(**kwargs) + else: + trainer.create_model_card(**kwargs) + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() diff --git a/examples/research_projects/tapex/run_wikisql_with_tapex.py b/examples/research_projects/tapex/run_wikisql_with_tapex.py new file mode 100644 index 000000000000..594c83cb6be5 --- /dev/null +++ b/examples/research_projects/tapex/run_wikisql_with_tapex.py @@ -0,0 +1,629 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2022 The Microsoft and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Fine-tuning the library models for tapex on table-based question answering tasks. +Adapted from script: https://github.com/huggingface/transformers/blob/master/examples/pytorch/summarization/run_summarization.py +""" + +import logging +import os +import sys +from collections import defaultdict +from copy import deepcopy +from dataclasses import dataclass, field +from functools import partial +from typing import List, Optional + +import nltk # Here to have a nice missing dependency error message early on +import numpy as np +import pandas as pd +from datasets import load_dataset + +import transformers +from filelock import FileLock +from transformers import ( + AutoConfig, + BartForConditionalGeneration, + DataCollatorForSeq2Seq, + HfArgumentParser, + Seq2SeqTrainer, + Seq2SeqTrainingArguments, + TapexTokenizer, + set_seed, +) +from transformers.file_utils import is_offline_mode +from transformers.trainer_utils import get_last_checkpoint, is_main_process +from transformers.utils import check_min_version +from wikisql_utils import _TYPE_CONVERTER, retrieve_wikisql_query_answer_tapas + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.17.0.dev0") + +logger = logging.getLogger(__name__) + +try: + nltk.data.find("tokenizers/punkt") +except (LookupError, OSError): + if is_offline_mode(): + raise LookupError( + "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files" + ) + with FileLock(".lock") as lock: + nltk.download("punkt", quiet=True) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}, + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, + metadata={ + "help": "Pretrained tokenizer name or path if not the same as model_name. " + "By default we use BART-large tokenizer for TAPEX-large." + }, + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. 
+ """ + + dataset_name: Optional[str] = field( + default="wikisql", metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field( + default=None, metadata={"help": "The input training data file (a jsonlines or csv file)."} + ) + validation_file: Optional[str] = field( + default=None, + metadata={ + "help": "An optional input evaluation data file to evaluate the metrics (rouge) on " + "(a jsonlines or csv file)." + }, + ) + test_file: Optional[str] = field( + default=None, + metadata={ + "help": "An optional input test data file to evaluate the metrics (rouge) on " "(a jsonlines or csv file)." + }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + max_source_length: Optional[int] = field( + default=1024, + metadata={ + "help": "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + max_target_length: Optional[int] = field( + default=128, + metadata={ + "help": "The maximum total sequence length for target text after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + val_max_target_length: Optional[int] = field( + default=None, + metadata={ + "help": "The maximum total sequence length for validation target text after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`." + "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " + "during ``evaluate`` and ``predict``." + }, + ) + pad_to_max_length: bool = field( + default=False, + metadata={ + "help": "Whether to pad all samples to model maximum sentence length. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " + "efficient on GPU but very bad for TPU." + }, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + max_predict_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " + "value if set." + }, + ) + num_beams: Optional[int] = field( + default=None, + metadata={ + "help": "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, " + "which is used during ``evaluate`` and ``predict``." + }, + ) + ignore_pad_token_for_loss: bool = field( + default=True, + metadata={ + "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not." 
+ }, + ) + + def __post_init__(self): + if self.dataset_name is None and self.train_file is None and self.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." + if self.val_max_target_length is None: + self.val_max_target_length = self.max_target_length + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + # Set the verbosity to info of the Transformers logger (on main process only): + if is_main_process(training_args.local_rank): + transformers.utils.logging.set_verbosity_info() + logger.info(f"Training/evaluation parameters {training_args}") + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For JSON files, this script will use the `question` column for the input question and `table` column for the corresponding table. + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. 
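+    # With the default `--dataset_name wikisql`, this yields "train", "validation" and "test" splits whose
+    # "question", "table" and "sql" columns are consumed by the preprocessing function below.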
+ if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) + else: + data_files = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + extension = data_args.train_file.split(".")[-1] + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = data_args.validation_file.split(".")[-1] + if data_args.test_file is not None: + data_files["test"] = data_args.test_file + extension = data_args.test_file.split(".")[-1] + datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) + + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + # IMPORTANT: the initial BART model's decoding is penalized by no_repeat_ngram_size, and thus + # we should disable it here to avoid problematic generation + config.no_repeat_ngram_size = 0 + config.max_length = 1024 + config.early_stopping = False + + # load tapex tokenizer + tokenizer = TapexTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + add_prefix_space=True, + ) + + # load Bart based Tapex model (default tapex-large) + model = BartForConditionalGeneration.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + if model.config.decoder_start_token_id is None: + raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") + + # Preprocessing the datasets. + # We need to tokenize inputs and targets. + if training_args.do_train: + column_names = datasets["train"].column_names + elif training_args.do_eval: + column_names = datasets["validation"].column_names + elif training_args.do_predict: + column_names = datasets["test"].column_names + else: + logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") + return + + # Temporarily set max_target_length for training. + max_target_length = data_args.max_target_length + padding = "max_length" if data_args.pad_to_max_length else False + + if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): + logger.warning( + "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" + f"`{model.__class__.__name__}`. 
This will lead to loss being calculated twice and will take up more memory" + ) + + def preprocess_tableqa_function(examples, is_training=False): + """ + The is_training FLAG is used to identify if we could use the supervision + to truncate the table content if it is required. + """ + + # this function is specific for WikiSQL since the util function need the data structure + # to retrieve the WikiSQL answer for each question + def _convert_table_types(_table): + """Runs the type converter over the table cells.""" + ret_table = deepcopy(_table) + types = ret_table["types"] + ret_table["real_rows"] = ret_table["rows"] + typed_rows = [] + for row in ret_table["rows"]: + typed_row = [] + for column, cell_value in enumerate(row): + typed_row.append(_TYPE_CONVERTER[types[column]](cell_value)) + typed_rows.append(typed_row) + ret_table["rows"] = typed_rows + return ret_table + + questions = [question.lower() for question in examples["question"]] + example_tables = examples["table"] + example_sqls = examples["sql"] + tables = [ + pd.DataFrame.from_records(example_table["rows"], columns=example_table["header"]) + for example_table in example_tables + ] + + # using tapas utils to obtain wikisql answer + answers = [] + for example_sql, example_table in zip(example_sqls, example_tables): + tapas_table = _convert_table_types(example_table) + answer_list: List[str] = retrieve_wikisql_query_answer_tapas(tapas_table, example_sql) + # you can choose other delimiters to split each answer + answers.append(answer_list) + + # IMPORTANT: we cannot pass by answers during evaluation, answers passed during training are used to + # truncate large tables in the train set! + if is_training: + model_inputs = tokenizer( + table=tables, + query=questions, + answer=answers, + max_length=data_args.max_source_length, + padding=padding, + truncation=True, + ) + else: + model_inputs = tokenizer( + table=tables, query=questions, max_length=data_args.max_source_length, padding=padding, truncation=True + ) + + with tokenizer.as_target_tokenizer(): + labels = tokenizer( + answer=[", ".join(answer) for answer in answers], + max_length=max_target_length, + padding=padding, + truncation=True, + ) + + # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore + # padding in the loss. 
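+        # e.g. with pad_token_id == 1 a label sequence [0, 8, 42, 1, 1] becomes [0, 8, 42, -100, -100],
+        # so the padded positions are ignored by the loss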
+ if padding == "max_length" and data_args.ignore_pad_token_for_loss: + labels["input_ids"] = [ + [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] + ] + + model_inputs["labels"] = labels["input_ids"] + + return model_inputs + + # in training, we can use the answer as extra information to truncate large tables + preprocess_tableqa_function_training = partial(preprocess_tableqa_function, is_training=True) + + if training_args.do_train: + if "train" not in datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = datasets["train"] + if data_args.max_train_samples is not None: + train_dataset = train_dataset.select(range(data_args.max_train_samples)) + train_dataset = train_dataset.map( + preprocess_tableqa_function_training, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + ) + + if training_args.do_eval: + max_target_length = data_args.val_max_target_length + if "validation" not in datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = datasets["validation"] + if data_args.max_eval_samples is not None: + eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) + eval_dataset = eval_dataset.map( + preprocess_tableqa_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + ) + + if training_args.do_predict: + max_target_length = data_args.val_max_target_length + if "test" not in datasets: + raise ValueError("--do_predict requires a test dataset") + predict_dataset = datasets["test"] + if data_args.max_predict_samples is not None: + predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) + predict_dataset = predict_dataset.map( + preprocess_tableqa_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + ) + + # Data collator + label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id + data_collator = DataCollatorForSeq2Seq( + tokenizer, + model=model, + label_pad_token_id=label_pad_token_id, + pad_to_multiple_of=8 if training_args.fp16 else None, + ) + + def postprocess_text(preds, labels): + preds = [pred.strip() for pred in preds] + labels = [label.strip() for label in labels] + + return preds, labels + + def compute_metrics(eval_preds): + preds, labels = eval_preds + if isinstance(preds, tuple): + preds = preds[0] + decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) + if data_args.ignore_pad_token_for_loss: + # Replace -100 in the labels as we can't decode them. 
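+            # np.where maps every -100 back to the pad token id so that batch_decode can handle the
+            # sequences; the pad tokens are then dropped again by skip_special_tokens=True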
+ labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + + # Some simple post-processing + decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) + + delimiter = ", " + + # define example evaluation + def evaluate_example(predict_str: str, ground_str: str): + predict_spans = predict_str.split(delimiter) + ground_spans = ground_str.split(delimiter) + predict_values = defaultdict(lambda: 0) + ground_values = defaultdict(lambda: 0) + for span in predict_spans: + try: + predict_values[float(span)] += 1 + except ValueError: + predict_values[span.strip()] += 1 + for span in ground_spans: + try: + ground_values[float(span)] += 1 + except ValueError: + ground_values[span.strip()] += 1 + is_correct = predict_values == ground_values + return is_correct + + def get_denotation_accuracy(predictions: List[str], references: List[str]): + assert len(predictions) == len(references) + correct_num = 0 + for predict_str, ground_str in zip(predictions, references): + is_correct = evaluate_example(predict_str.lower(), ground_str.lower()) + if is_correct: + correct_num += 1 + return correct_num / len(predictions) + + accuracy = get_denotation_accuracy(decoded_preds, decoded_labels) + result = {"denotation_accuracy": accuracy} + + return result + + # Initialize our Trainer + trainer = Seq2SeqTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics if training_args.predict_with_generate else None, + ) + + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + results = {} + if training_args.do_eval: + logger.info("*** Evaluate ***") + + metrics = trainer.evaluate( + max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, metric_key_prefix="eval" + ) + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + if training_args.do_predict: + logger.info("*** Predict ***") + + predict_results = trainer.predict( + predict_dataset, + metric_key_prefix="predict", + max_length=data_args.val_max_target_length, + num_beams=data_args.num_beams, + ) + metrics = predict_results.metrics + max_predict_samples = ( + data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) + ) + metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) + + trainer.log_metrics("predict", metrics) + trainer.save_metrics("predict", metrics) + + if trainer.is_world_process_zero(): + if 
training_args.predict_with_generate: + predictions = tokenizer.batch_decode( + predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True + ) + predictions = [pred.strip() for pred in predictions] + output_prediction_file = os.path.join(training_args.output_dir, "tapex_predictions.txt") + with open(output_prediction_file, "w") as writer: + writer.write("\n".join(predictions)) + + return results + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() diff --git a/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py b/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py new file mode 100644 index 000000000000..4398309566a8 --- /dev/null +++ b/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py @@ -0,0 +1,605 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2022 The Microsoft and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Fine-tuning the library models for tapex on table-based question answering tasks. +Adapted from script: https://github.com/huggingface/transformers/blob/master/examples/pytorch/summarization/run_summarization.py +""" + +import logging +import os +import sys +from collections import defaultdict +from dataclasses import dataclass, field +from functools import partial +from typing import List, Optional + +import nltk # Here to have a nice missing dependency error message early on +import numpy as np +import pandas as pd +from datasets import load_dataset + +import transformers +from filelock import FileLock +from transformers import ( + AutoConfig, + BartForConditionalGeneration, + DataCollatorForSeq2Seq, + HfArgumentParser, + Seq2SeqTrainer, + Seq2SeqTrainingArguments, + TapexTokenizer, + set_seed, +) +from transformers.file_utils import is_offline_mode +from transformers.trainer_utils import get_last_checkpoint, is_main_process +from transformers.utils import check_min_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.17.0.dev0") + +logger = logging.getLogger(__name__) + +try: + nltk.data.find("tokenizers/punkt") +except (LookupError, OSError): + if is_offline_mode(): + raise LookupError( + "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files" + ) + with FileLock(".lock") as lock: + nltk.download("punkt", quiet=True) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}, + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, + metadata={ + "help": "Pretrained tokenizer name or path if not the same as model_name. 
" + "By default we use BART-large tokenizer for TAPEX-large." + }, + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + dataset_name: Optional[str] = field( + default="wikitablequestions", metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field( + default=None, metadata={"help": "The input training data file (a jsonlines or csv file)."} + ) + validation_file: Optional[str] = field( + default=None, + metadata={ + "help": "An optional input evaluation data file to evaluate the metrics (rouge) on " + "(a jsonlines or csv file)." + }, + ) + test_file: Optional[str] = field( + default=None, + metadata={ + "help": "An optional input test data file to evaluate the metrics (rouge) on " "(a jsonlines or csv file)." + }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + max_source_length: Optional[int] = field( + default=1024, + metadata={ + "help": "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + max_target_length: Optional[int] = field( + default=128, + metadata={ + "help": "The maximum total sequence length for target text after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + val_max_target_length: Optional[int] = field( + default=None, + metadata={ + "help": "The maximum total sequence length for validation target text after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`." + "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " + "during ``evaluate`` and ``predict``." + }, + ) + pad_to_max_length: bool = field( + default=False, + metadata={ + "help": "Whether to pad all samples to model maximum sentence length. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " + "efficient on GPU but very bad for TPU." + }, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." 
+ }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + max_predict_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " + "value if set." + }, + ) + num_beams: Optional[int] = field( + default=None, + metadata={ + "help": "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, " + "which is used during ``evaluate`` and ``predict``." + }, + ) + ignore_pad_token_for_loss: bool = field( + default=True, + metadata={ + "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not." + }, + ) + + def __post_init__(self): + if self.dataset_name is None and self.train_file is None and self.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." + if self.val_max_target_length is None: + self.val_max_target_length = self.max_target_length + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
+ ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + # Set the verbosity to info of the Transformers logger (on main process only): + if is_main_process(training_args.local_rank): + transformers.utils.logging.set_verbosity_info() + logger.info(f"Training/evaluation parameters {training_args}") + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For JSON files, this script will use the `question` column for the input question and `table` column for the corresponding table. + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) + else: + data_files = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + extension = data_args.train_file.split(".")[-1] + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = data_args.validation_file.split(".")[-1] + if data_args.test_file is not None: + data_files["test"] = data_args.test_file + extension = data_args.test_file.split(".")[-1] + datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) + + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
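+    # As in the WikiSQL script above: build the config (with generation settings patched just below),
+    # the TAPEX tokenizer and a BART-based conditional-generation model.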
+ + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + # IMPORTANT: the initial BART model's decoding is penalized by no_repeat_ngram_size, and thus + # we should disable it here to avoid problematic generation + config.no_repeat_ngram_size = 0 + config.max_length = 1024 + config.early_stopping = False + + # load tapex tokenizer + tokenizer = TapexTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + add_prefix_space=True, + ) + + # load Bart based Tapex model (default tapex-large) + model = BartForConditionalGeneration.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + if model.config.decoder_start_token_id is None: + raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") + + # Preprocessing the datasets. + # We need to tokenize inputs and targets. + if training_args.do_train: + column_names = datasets["train"].column_names + elif training_args.do_eval: + column_names = datasets["validation"].column_names + elif training_args.do_predict: + column_names = datasets["test"].column_names + else: + logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") + return + + # Temporarily set max_target_length for training. + max_target_length = data_args.max_target_length + padding = "max_length" if data_args.pad_to_max_length else False + + if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): + logger.warning( + "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" + f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" + ) + + def preprocess_tableqa_function(examples, is_training=False): + """ + The is_training FLAG is used to identify if we could use the supervision + to truncate the table content if it is required. + """ + + questions = [question.lower() for question in examples["question"]] + example_tables = examples["table"] + tables = [ + pd.DataFrame.from_records(example_table["rows"], columns=example_table["header"]) + for example_table in example_tables + ] + + # using wikitablequestion's answer set + answers = examples["answers"] + + # IMPORTANT: we cannot pass by answers during evaluation, answers passed during training are used to + # truncate large tables in the train set! 
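+        # i.e. during training the answer lets the tokenizer decide which table content to keep when the
+        # flattened table has to be shortened to max_source_length; at evaluation time no such hint exists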
+ if is_training: + model_inputs = tokenizer( + table=tables, + query=questions, + answer=answers, + max_length=data_args.max_source_length, + padding=padding, + truncation=True, + ) + else: + model_inputs = tokenizer( + table=tables, query=questions, max_length=data_args.max_source_length, padding=padding, truncation=True + ) + + with tokenizer.as_target_tokenizer(): + labels = tokenizer( + answer=[", ".join(answer) for answer in answers], + max_length=max_target_length, + padding=padding, + truncation=True, + ) + + # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore + # padding in the loss. + if padding == "max_length" and data_args.ignore_pad_token_for_loss: + labels["input_ids"] = [ + [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] + ] + + model_inputs["labels"] = labels["input_ids"] + + return model_inputs + + # in training, we can use the answer as extra information to truncate large tables + preprocess_tableqa_function_training = partial(preprocess_tableqa_function, is_training=True) + + if training_args.do_train: + if "train" not in datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = datasets["train"] + if data_args.max_train_samples is not None: + train_dataset = train_dataset.select(range(data_args.max_train_samples)) + train_dataset = train_dataset.map( + preprocess_tableqa_function_training, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + ) + + if training_args.do_eval: + max_target_length = data_args.val_max_target_length + if "validation" not in datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = datasets["validation"] + if data_args.max_eval_samples is not None: + eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) + eval_dataset = eval_dataset.map( + preprocess_tableqa_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + ) + + if training_args.do_predict: + max_target_length = data_args.val_max_target_length + if "test" not in datasets: + raise ValueError("--do_predict requires a test dataset") + predict_dataset = datasets["test"] + if data_args.max_predict_samples is not None: + predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) + predict_dataset = predict_dataset.map( + preprocess_tableqa_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + ) + + # Data collator + label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id + data_collator = DataCollatorForSeq2Seq( + tokenizer, + model=model, + label_pad_token_id=label_pad_token_id, + pad_to_multiple_of=8 if training_args.fp16 else None, + ) + + def postprocess_text(preds, labels): + preds = [pred.strip() for pred in preds] + labels = [label.strip() for label in labels] + + return preds, labels + + def compute_metrics(eval_preds): + preds, labels = eval_preds + if isinstance(preds, tuple): + preds = preds[0] + decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) + if data_args.ignore_pad_token_for_loss: + # Replace -100 in the labels as we can't decode them. 
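+            # After decoding, predictions and references are comma-joined answer strings (built with
+            # ", ".join(answer) in the preprocessing above), which the denotation-accuracy helper splits on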
+ labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + + # Some simple post-processing + decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) + + delimiter = ", " + + # define example evaluation + def evaluate_example(predict_str: str, ground_str: str): + predict_spans = predict_str.split(delimiter) + ground_spans = ground_str.split(delimiter) + predict_values = defaultdict(lambda: 0) + ground_values = defaultdict(lambda: 0) + for span in predict_spans: + try: + predict_values[float(span)] += 1 + except ValueError: + predict_values[span.strip()] += 1 + for span in ground_spans: + try: + ground_values[float(span)] += 1 + except ValueError: + ground_values[span.strip()] += 1 + _is_correct = predict_values == ground_values + return _is_correct + + def get_denotation_accuracy(predictions: List[str], references: List[str]): + assert len(predictions) == len(references) + correct_num = 0 + for predict_str, ground_str in zip(predictions, references): + is_correct = evaluate_example(predict_str.lower(), ground_str.lower()) + if is_correct: + correct_num += 1 + return correct_num / len(predictions) + + accuracy = get_denotation_accuracy(decoded_preds, decoded_labels) + result = {"denotation_accuracy": accuracy} + + return result + + # Initialize our Trainer + trainer = Seq2SeqTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics if training_args.predict_with_generate else None, + ) + + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + results = {} + if training_args.do_eval: + logger.info("*** Evaluate ***") + + metrics = trainer.evaluate( + max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, metric_key_prefix="eval" + ) + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + if training_args.do_predict: + logger.info("*** Predict ***") + + predict_results = trainer.predict( + predict_dataset, + metric_key_prefix="predict", + max_length=data_args.val_max_target_length, + num_beams=data_args.num_beams, + ) + metrics = predict_results.metrics + max_predict_samples = ( + data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) + ) + metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) + + trainer.log_metrics("predict", metrics) + trainer.save_metrics("predict", metrics) + + if trainer.is_world_process_zero(): + if 
training_args.predict_with_generate: + predictions = tokenizer.batch_decode( + predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True + ) + predictions = [pred.strip() for pred in predictions] + output_prediction_file = os.path.join(training_args.output_dir, "tapex_predictions.txt") + with open(output_prediction_file, "w") as writer: + writer.write("\n".join(predictions)) + + return results + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() diff --git a/examples/research_projects/tapex/wikisql_utils.py b/examples/research_projects/tapex/wikisql_utils.py new file mode 100644 index 000000000000..9147fdc882e4 --- /dev/null +++ b/examples/research_projects/tapex/wikisql_utils.py @@ -0,0 +1,259 @@ +# coding=utf-8 +# Copyright 2022 The Microsoft, The Google and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dataclasses +import enum +import functools +import math +import re + +# The following script is adapted from the script of TaPas. +# Original: https://github.com/google-research/tapas/master/wikisql_utils.py +from typing import Any, List, Text + +import six + + +EMPTY_ANSWER = "none" +EMPTY_ANSWER_AGG = "none" + + +def _split_thousands(delimiter, value): + split = value.split(delimiter) + return len(split) > 1 and any(map(lambda x: len(x) == 3, split)) + + +def convert_to_float(value): + """Converts value to a float using a series of increasingly complex heuristics. + Args: + value: object that needs to be converted. Allowed types include + float/int/strings. + Returns: + A float interpretation of value. + Raises: + ValueError if the float conversion of value fails. + """ + if isinstance(value, float): + return value + if isinstance(value, int): + return float(value) + if not isinstance(value, six.string_types): + raise ValueError("Argument value is not a string. Can't parse it as float") + sanitized = value + + try: + # Example: 1,000.7 + if "." in sanitized and "," in sanitized: + return float(sanitized.replace(",", "")) + # 1,000 + if "," in sanitized and _split_thousands(",", sanitized): + return float(sanitized.replace(",", "")) + # 5,5556 + if "," in sanitized and sanitized.count(",") == 1 and not _split_thousands(",", sanitized): + return float(sanitized.replace(",", ".")) + # 0.0.0.1 + if sanitized.count(".") > 1: + return float(sanitized.replace(".", "")) + # 0,0,0,1 + if sanitized.count(",") > 1: + return float(sanitized.replace(",", "")) + return float(sanitized) + except ValueError: + # Avoid adding the sanitized value in the error message. 
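+        # e.g. convert_to_float("1,000.7") == 1000.7 and convert_to_float("5,5556") == 5.5556, while a
+        # value like "n/a" falls through to float() and ends up here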
+ raise ValueError("Unable to convert value to float") + + +def _normalize_float(answer): + if answer is None: + return None + try: + value = convert_to_float(answer) + if isinstance(value, float) and math.isnan(value): + return None + return value + except ValueError: + return answer.lower() + + +_TYPE_CONVERTER = { + "text": lambda x: x, + "real": convert_to_float, +} + + +class _Aggregation(enum.Enum): + """Aggregations as defined by WikiSQL. Indexes match the data.""" + + NONE = 0 + MAX = 1 + MIN = 2 + COUNT = 3 + SUM = 4 + AVERAGE = 5 + + +class _Operator(enum.Enum): + """The boolean operators used by WikiSQL. Indexes match the data.""" + + EQUALS = 0 + GREATER = 1 + LESSER = 2 + + +@dataclasses.dataclass +class _Condition: + """Represents an SQL where clauses (e.g A = "a" or B > 5).""" + + column: Text + operator: _Operator + cmp_value: Any + + +_TOKENIZER = re.compile(r"\w+|[^\w\s]+", re.UNICODE | re.MULTILINE | re.DOTALL) + + +def _normalize_for_match(x): + return [t for t in _TOKENIZER.findall(x.lower())] + + +def _compare(operator, src, tgt): + if operator == _Operator.EQUALS: + return src == tgt + elif operator == _Operator.GREATER: + return src > tgt + elif operator == _Operator.LESSER: + return src < tgt + raise ValueError(f"Unknown operator: {operator}") + + +def _parse_value(table, column, cell_value): + """Convert numeric values to floats and keeps everything else as string.""" + types = table["types"] + return _TYPE_CONVERTER[types[column]](cell_value) + + +def _is_string(x): + return isinstance(x, str) + + +def _respect_conditions(table, row, conditions): + """True if 'row' satisfies all 'conditions'.""" + for cond in conditions: + table_value = row[cond.column] + + cmp_value = _parse_value(table, cond.column, cond.cmp_value) + + if _is_string(table_value) and _is_string(cmp_value): + table_value = _normalize_for_match(table_value) + cmp_value = _normalize_for_match(cmp_value) + + if not isinstance(table_value, type(cmp_value)): + raise ValueError("Type difference {} != {}".format(type(table_value), type(cmp_value))) + + if not _compare(cond.operator, table_value, cmp_value): + return False + return True + + +def _get_float_answer(table, answer_coordinates, aggregation_op): + """Applies operation to produce reference float answer.""" + if not answer_coordinates: + if aggregation_op == _Aggregation.COUNT: + return 0.0 + else: + return EMPTY_ANSWER_AGG + + # Count can support non numeric answers. + if aggregation_op == _Aggregation.COUNT: + return float(len(answer_coordinates)) + + # If we have just one answer, if float returns it or try a conversion. + values = [table["rows"][i][j] for (i, j) in answer_coordinates] + if len(answer_coordinates) == 1: + try: + return convert_to_float(values[0]) + except ValueError as e: + if aggregation_op != _Aggregation.NONE: + raise e + + if aggregation_op == _Aggregation.NONE: + return None + + # Other aggregation only support numeric values. Bail out if we have strings. + if not all((isinstance(v, (int, float)) for v in values)): + return None + + if aggregation_op == _Aggregation.SUM: + return float(sum(values)) + elif aggregation_op == _Aggregation.AVERAGE: + return sum(values) / len(answer_coordinates) + else: + raise ValueError(f"Unknown aggregation: {aggregation_op}") + + +def _get_answer_coordinates(table, sql_query): + """Retrieves references coordinates by executing SQL.""" + # MAX and MIN are automatically supported by the model. 
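+    # Only COUNT (3), SUM (4) and AVERAGE (5) are kept as aggregations here; NONE (0), MAX (1) and MIN (2)
+    # are resolved directly to cell coordinates further down.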
+ aggregation_op_index = sql_query["agg"] + if aggregation_op_index >= 3: + aggregation_op = _Aggregation(aggregation_op_index) + else: + aggregation_op = _Aggregation.NONE + + target_column = sql_query["sel"] + conditions = [ + _Condition(column, _Operator(operator), cmp_value) + for column, operator, cmp_value in zip( + sql_query["conds"]["column_index"], sql_query["conds"]["operator_index"], sql_query["conds"]["condition"] + ) + ] + + indices = [] + for row in range(len(table["rows"])): + if _respect_conditions(table, table["rows"][row], conditions): + indices.append((row, target_column)) + + if not indices: + return [], aggregation_op + + if len(indices) == 1: + return indices, aggregation_op + + # Parsing of MIN/MAX. + if aggregation_op_index in (1, 2): + operators = {2: min, 1: max} + values = [(table["rows"][i][j], index) for index, (i, j) in enumerate(indices)] + reduced = functools.reduce(operators[sql_query["agg"]], values) + + ret = [indices[reduced[1]]] + return ret, _Aggregation.NONE + + return indices, aggregation_op + + +def _get_answer_text(table, answer_coordinates, float_answer): + if float_answer is not None: + return [str(float_answer)] + return [str(table["real_rows"][r][c]) for r, c in answer_coordinates] + + +def retrieve_wikisql_query_answer_tapas(table, example) -> List: + answer_coordinates, aggregation_op = _get_answer_coordinates(table, example) + float_answer = _get_float_answer(table, answer_coordinates, aggregation_op) + answer_text = _get_answer_text(table, answer_coordinates, float_answer) + # keep the original data the same with TaPas + if len(answer_text) == 0: + answer_text = [EMPTY_ANSWER] + return answer_text diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index 2c78ab39fa60..a1f39eeeb011 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index 891219d3a1a2..877fe8800999 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -46,7 +46,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index 09aa8f90de3d..6c4f1e5a9ed9 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -50,7 +50,7 @@ # region Checking dependencies # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
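
Stepping back to the new `wikisql_utils.py` helper above: `retrieve_wikisql_query_answer_tapas` takes a WikiSQL-style `table` dict (with `types`, `rows` and `real_rows` keys) and an `example` dict holding the SQL query (`agg`, `sel`, `conds`). A minimal sketch with hypothetical toy data, assuming it is run from `examples/research_projects/tapex` so the module is importable (illustrative only, not part of the patch):

```python
# Illustrative only: toy inputs for the WikiSQL answer helper.
from wikisql_utils import retrieve_wikisql_query_answer_tapas

table = {
    "types": ["text", "real"],              # per-column types drive _TYPE_CONVERTER
    "rows": [["a", 1.0], ["b", 2.0]],       # parsed cell values
    "real_rows": [["A", "1"], ["B", "2"]],  # original strings, used for text answers
}
example = {
    "agg": 4,                   # _Aggregation.SUM
    "sel": 1,                   # answer column
    "conds": {                  # WHERE column 0 == "a"
        "column_index": [0],
        "operator_index": [0],  # _Operator.EQUALS
        "condition": ["a"],
    },
}

print(retrieve_wikisql_query_answer_tapas(table, example))  # ['1.0']
```
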
-check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index fa8cb98a5a6e..c36476120eab 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -61,7 +61,7 @@ def on_epoch_end(self, epoch, logs=None): # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") task_to_keys = { "cola": ("sentence", None), diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index c6921bbf3c51..f81148a4af0b 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -53,7 +53,7 @@ # region Dependencies and constants # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.18.0.dev0") +check_min_version("4.19.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py new file mode 100755 index 000000000000..903b4e0dd6d5 --- /dev/null +++ b/scripts/benchmark/trainer-benchmark.py @@ -0,0 +1,448 @@ +#!/usr/bin/env python + +# HF Trainer benchmarking tool +# +# This tool can be used to run and compare multiple dimensions of the HF Trainers args. +# +# It then prints a report once in github format with all the information that needs to be shared +# with others and second time in a console-friendly format, so it's easier to use for tuning things up. +# +# The main idea is: +# +# ./trainer-benchmark.py --base-cmd '' \ +# --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1' \ +# --target-metric-key train_samples_per_second +# +# The variations can be any command line argument that you want to compare and not just dtype as in +# the example. +# +# --variations allows you to compare variations in multiple dimensions. +# +# as the first dimention has 2 options and the second 3 in our example, this will run the trainer 6 +# times adding one of: +# +# 1. --tf32 0 --fp16 0 +# 2. --tf32 0 --fp16 1 +# 3. --tf32 0 --bf16 1 +# 4. --tf32 1 --fp16 0 +# 5. --tf32 1 --fp16 1 +# 6. --tf32 1 --bf16 1 +# +# and print the results. This is just a cartesian product - and more than 2 dimensions can be used. +# +# If you want to rely on defaults, this: +# --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1' +# is identical to this: +# --variations '--tf32 0|--tf32 1' '|--fp16|--bf16' +# +# the leading empty variation in the 2nd dimension is a valid variation. +# +# So here we get the following 6 variations: +# +# 1. --tf32 0 +# 2. --tf32 0 --fp16 +# 3. --tf32 0 --bf16 +# 4. --tf32 1 +# 5. --tf32 1 --fp16 +# 6. --tf32 1 --bf16 +# +# In this particular case we don't know what the default tf32 setting is as it's normally +# pytorch-version dependent). 
That's why it's best to do an explicit setting of each variation: +# `--tf32 0|--tf32 1` +# +# Here is a full example of a train: +# +# CUDA_VISIBLE_DEVICES=0 python ./scripts/benchmark/trainer-benchmark.py \ +# --base-cmd \ +# ' examples/pytorch/translation/run_translation.py --model_name_or_path t5-small \ +# --output_dir output_dir --do_train --label_smoothing 0.1 --logging_strategy no \ +# --save_strategy no --per_device_train_batch_size 32 --max_source_length 512 \ +# --max_target_length 512 --num_train_epochs 1 --overwrite_output_dir \ +# --source_lang en --target_lang ro --dataset_name wmt16 --dataset_config "ro-en" \ +# --source_prefix "translate English to Romanian: " --warmup_steps 50 \ +# --max_train_samples 20000 --dataloader_num_workers 2 ' \ +# --target-metric-key train_samples_per_second --repeat-times 1 --variations \ +# '|--fp16|--bf16' '--tf32 0|--tf32 1' --report-metric-keys train_loss \ +# --repeat-times 1 --base-variation '--tf32 0' +# +# and here is a possible output: +# +# +# | Variation | Train | Diff | Train | +# | | samples | % | loss | +# | | per | | | +# | | second | | | +# |:----------------|----------:|-------:|--------:| +# | --tf32 0 | 285.11 | 0 | 2.51 | +# | --tf32 1 | 342.09 | 20 | 2.51 | +# | --fp16 --tf32 0 | 423.49 | 49 | 2.51 | +# | --fp16 --tf32 1 | 423.13 | 48 | 2.51 | +# | --bf16 --tf32 0 | 416.80 | 46 | 2.52 | +# | --bf16 --tf32 1 | 415.87 | 46 | 2.52 | +# +# +# So you can quickly compare the different outcomes. +# +# Typically running each experiment once is enough, but if the environment is unstable you can +# re-run each multiple times, e.g., 3 using --repeat-times 3 and it will report the averaged results. +# +# By default it'll use the lowest result as the base line to use as 100% and then compare the rest to +# it as can be seen from the table above, but you can also specify which combination is the one to use as +# the baseline, e.g., to change to another entry use: --base-variation '--tf32 1 --fp16 0' +# +# --target-metric-key is there to tell the program which metrics to compare - the different metric keys are +# inside output_dir/all_results.json. e.g., to measure eval performance instead of train use: +# --target-metric-key eval_samples_per_second +# but of course you will need to adjust the --base-cmd value in the example to perform evaluation as +# well (as currently it doesn't) +# + +import argparse +import datetime +import io +import itertools +import json +import math +import os +import platform +import re +import shlex +import subprocess +import sys +from pathlib import Path +from statistics import fmean + +import pandas as pd +import torch +from tqdm import tqdm + +import transformers + + +nan = float("nan") + + +class Tee: + """ + A helper class to tee print's output into a file. + Usage: + sys.stdout = Tee(filename) + """ + + def __init__(self, filename): + self.stdout = sys.stdout + self.file = open(filename, "a") + + def __getattr__(self, attr): + return getattr(self.stdout, attr) + + def write(self, msg): + self.stdout.write(msg) + # strip tqdm codes + self.file.write(re.sub(r"^.*\r", "", msg, 0, re.M)) + + +def get_original_command(max_width=80, full_python_path=False): + """ + Return the original command line string that can be replayed nicely and wrapped for 80 char width. + + Args: + max_width (`int`, `optional`, defaults to 80): + The width to wrap for. + full_python_path (`bool`, `optional`, defaults to `False`): + Whether to replicate the full path or just the last segment (i.e. `python`). 
+ """ + + cmd = [] + + # deal with critical env vars + env_keys = ["CUDA_VISIBLE_DEVICES"] + for key in env_keys: + val = os.environ.get(key, None) + if val is not None: + cmd.append(f"{key}={val}") + + # python executable (not always needed if the script is executable) + python = sys.executable if full_python_path else sys.executable.split("/")[-1] + cmd.append(python) + + # now the normal args + cmd += list(map(shlex.quote, sys.argv)) + + # split up into up to MAX_WIDTH lines with shell multi-line escapes + lines = [] + current_line = "" + while len(cmd) > 0: + current_line += f"{cmd.pop(0)} " + if len(cmd) == 0 or len(current_line) + len(cmd[0]) + 1 > max_width - 1: + lines.append(current_line) + current_line = "" + return "\\\n".join(lines) + + +def get_base_command(args, output_dir): + + # unwrap multi-line input + args.base_cmd = re.sub(r"[\\\n]+", " ", args.base_cmd) + + # remove --output_dir if any and set our own + args.base_cmd = re.sub("--output_dir\s+[^\s]+", "", args.base_cmd) + args.base_cmd += f" --output_dir {output_dir}" + + # ensure we have --overwrite_output_dir + args.base_cmd = re.sub("--overwrite_output_dir\s+", "", args.base_cmd) + args.base_cmd += " --overwrite_output_dir" + + return [sys.executable] + shlex.split(args.base_cmd) + + +def process_run_single(id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose): + + # Enable to debug everything but the run itself, to do it fast and see the progress. + # This is useful for debugging the output formatting quickly - we can remove it later once + # everybody is happy with the output + if 0: + import random + from time import sleep + + sleep(0) + return dict( + {k: random.uniform(0, 100) for k in metric_keys}, + **{target_metric_key: random.choice([nan, 10.31, 100.2, 55.6666, 222.22222222])}, + ) + + result = subprocess.run(cmd, capture_output=True, text=True) + + if verbose: + print("STDOUT", result.stdout) + print("STDERR", result.stderr) + + # save the streams + prefix = variation.replace(" ", "-") + with open(Path(output_dir) / f"log.{prefix}.stdout.txt", "w") as f: + f.write(result.stdout) + with open(Path(output_dir) / f"log.{prefix}.stderr.txt", "w") as f: + f.write(result.stderr) + + if result.returncode != 0: + if verbose: + print("failed") + return {target_metric_key: nan} + + with io.open(f"{output_dir}/all_results.json", "r", encoding="utf-8") as f: + metrics = json.load(f) + + # filter out just the keys we want + return {k: v for k, v in metrics.items() if k in metric_keys} + + +def process_run( + id, + cmd, + variation_key, + variation, + longest_variation_len, + target_metric_key, + report_metric_keys, + repeat_times, + output_dir, + verbose, +): + results = [] + metrics = [] + preamble = f"{id}: {variation:<{longest_variation_len}}" + outcome = f"{preamble}: " + metric_keys = set(report_metric_keys + [target_metric_key]) + for i in tqdm(range(repeat_times), desc=preamble, leave=False): + single_run_metrics = process_run_single( + id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose + ) + result = single_run_metrics[target_metric_key] + if not math.isnan(result): + metrics.append(single_run_metrics) + results.append(result) + outcome += "✓" + else: + outcome += "✘" + outcome = f"\33[2K\r{outcome}" + if len(metrics) > 0: + mean_metrics = {k: fmean([x[k] for x in metrics]) for k in metrics[0].keys()} + mean_target = round(mean_metrics[target_metric_key], 2) + results_str = f"{outcome} {mean_target}" + if len(metrics) > 1: + results_str += f" {tuple(round(x, 2) for x in 
results)}" + print(results_str) + mean_metrics[variation_key] = variation + return mean_metrics + else: + print(outcome) + return {variation_key: variation, target_metric_key: nan} + + +def get_versions(): + properties = torch.cuda.get_device_properties(torch.device("cuda")) + return f""" +Datetime : {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} + +Software: +transformers: {transformers.__version__} +torch : {torch.__version__} +cuda : {torch.version.cuda} +python : {platform.python_version()} + +Hardware: +{torch.cuda.device_count()} GPUs : {properties.name}, {properties.total_memory/2**30:0.2f}GB +""" + + +def process_results(results, target_metric_key, report_metric_keys, base_variation, output_dir): + + df = pd.DataFrame(results) + variation_key = "variation" + diff_key = "diff_%" + + sentinel_value = nan + if base_variation is not None and len(df[df[variation_key] == base_variation]): + # this may still return nan + sentinel_value = df.loc[df[variation_key] == base_variation][target_metric_key].item() + if math.isnan(sentinel_value): + # as a fallback, use the minimal value as the sentinel + sentinel_value = df.loc[df[target_metric_key] != nan][target_metric_key].min() + + # create diff column if possible + if not math.isnan(sentinel_value): + df[diff_key] = df.apply( + lambda r: round(100 * (r[target_metric_key] - sentinel_value) / sentinel_value) + if not math.isnan(r[target_metric_key]) + else 0, + axis="columns", + ) + + # re-order columns + cols = [variation_key, target_metric_key, diff_key, *report_metric_keys] + df = df.reindex(cols, axis="columns") # reorder cols + + # capitalize + df = df.rename(str.capitalize, axis="columns") + + # make the cols as narrow as possible + df_github = df.rename(lambda c: c.replace("_", "
"), axis="columns") + df_console = df.rename(lambda c: c.replace("_", "\n"), axis="columns") + + report = ["", "Copy between the cut-here-lines and paste as is to github or a forum"] + report += ["----------8<-----------------8<--------"] + report += ["*** Results:", df_github.to_markdown(index=False, floatfmt=".2f")] + report += ["```"] + report += ["*** Setup:", get_versions()] + report += ["*** The benchmark command line was:", get_original_command()] + report += ["```"] + report += ["----------8<-----------------8<--------"] + report += ["*** Results (console):", df_console.to_markdown(index=False, floatfmt=".2f")] + + print("\n\n".join(report)) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--base-cmd", + default=None, + type=str, + required=True, + help="Base cmd", + ) + parser.add_argument( + "--variations", + default=None, + type=str, + nargs="+", + required=True, + help="Multi-dimensional variations, example: '|--fp16|--bf16' '|--tf32'", + ) + parser.add_argument( + "--base-variation", + default=None, + type=str, + help="Baseline variation to compare to. if None the minimal target value will be used to compare against", + ) + parser.add_argument( + "--target-metric-key", + default=None, + type=str, + required=True, + help="Target metric key in output_dir/all_results.json, e.g., train_samples_per_second", + ) + parser.add_argument( + "--report-metric-keys", + default="", + type=str, + help="Report metric keys - other metric keys from output_dir/all_results.json to report, e.g., train_loss. Use a single argument e.g., 'train_loss train_samples", + ) + parser.add_argument( + "--repeat-times", + default=1, + type=int, + help="How many times to re-run each variation - an average will be reported", + ) + parser.add_argument( + "--output_dir", + default="output_benchmark", + type=str, + help="The output directory where all the benchmark reports will go to and additionally this directory will be used to override --output_dir in the script that is being benchmarked", + ) + parser.add_argument( + "--verbose", + default=False, + action="store_true", + help="Whether to show the outputs of each run or just the benchmark progress", + ) + args = parser.parse_args() + + output_dir = args.output_dir + Path(output_dir).mkdir(exist_ok=True) + base_cmd = get_base_command(args, output_dir) + + # split each dimension into its --foo variations + dims = [list(map(str.strip, re.split(r"\|", x))) for x in args.variations] + # build a cartesian product of dimensions and convert those back into cmd-line arg strings, + # while stripping white space for inputs that were empty + variations = list(map(str.strip, map(" ".join, itertools.product(*dims)))) + longest_variation_len = max(len(x) for x in variations) + + # split wanted keys + report_metric_keys = args.report_metric_keys.split() + + # capture prints into a log file for convenience + report_fn = f"benchmark-report-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt" + print(f"\nNote: each run's output is also logged under {output_dir}/log.*.std*.txt") + print(f"and this script's output is also piped into {report_fn}") + + sys.stdout = Tee(report_fn) + + print(f"\n*** Running {len(variations)} benchmarks:") + print(f"Base command: {' '.join(base_cmd)}") + + variation_key = "variation" + results = [] + for id, variation in enumerate(tqdm(variations, desc="Total completion: ", leave=False)): + cmd = base_cmd + variation.split() + results.append( + process_run( + id + 1, + cmd, + variation_key, + variation, + 
longest_variation_len, + args.target_metric_key, + report_metric_keys, + args.repeat_times, + output_dir, + args.verbose, + ) + ) + + process_results(results, args.target_metric_key, report_metric_keys, args.base_variation, output_dir) + + +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index 56ba7d4c4784..4d386ae00825 100644 --- a/setup.py +++ b/setup.py @@ -385,7 +385,7 @@ def run(self): setup( name="transformers", - version="4.18.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.19.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Patrick von Platen, Sylvain Gugger, Suraj Patil, Stas Bekman, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors", author_email="thomas@huggingface.co", description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 841b830c3b1d..dbdef5824d6a 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -22,7 +22,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). -__version__ = "4.18.0.dev0" +__version__ = "4.19.0.dev0" from typing import TYPE_CHECKING @@ -242,6 +242,7 @@ "models.rag": ["RagConfig", "RagRetriever", "RagTokenizer"], "models.realm": ["REALM_PRETRAINED_CONFIG_ARCHIVE_MAP", "RealmConfig", "RealmTokenizer"], "models.reformer": ["REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "ReformerConfig"], + "models.regnet": ["REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "RegNetConfig"], "models.rembert": ["REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RemBertConfig"], "models.resnet": ["RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "ResNetConfig"], "models.retribert": ["RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RetriBertConfig", "RetriBertTokenizer"], @@ -266,6 +267,7 @@ "models.swin": ["SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP", "SwinConfig"], "models.t5": ["T5_PRETRAINED_CONFIG_ARCHIVE_MAP", "T5Config"], "models.tapas": ["TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP", "TapasConfig", "TapasTokenizer"], + "models.tapex": ["TapexTokenizer"], "models.transfo_xl": [ "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP", "TransfoXLConfig", @@ -1342,6 +1344,14 @@ "ReformerPreTrainedModel", ] ) + _import_structure["models.regnet"].extend( + [ + "REGNET_PRETRAINED_MODEL_ARCHIVE_LIST", + "RegNetForImageClassification", + "RegNetModel", + "RegNetPreTrainedModel", + ] + ) _import_structure["models.rembert"].extend( [ "REMBERT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -1699,6 +1709,8 @@ _import_structure["benchmark.benchmark_args_tf"] = ["TensorFlowBenchmarkArguments"] _import_structure["benchmark.benchmark_tf"] = ["TensorFlowBenchmark"] _import_structure["generation_tf_logits_process"] = [ + "TFForcedBOSTokenLogitsProcessor", + "TFForcedEOSTokenLogitsProcessor", "TFLogitsProcessor", "TFLogitsProcessorList", "TFLogitsWarper", @@ -2610,6 +2622,7 @@ from .models.rag import RagConfig, RagRetriever, RagTokenizer from .models.realm import REALM_PRETRAINED_CONFIG_ARCHIVE_MAP, RealmConfig, RealmTokenizer from .models.reformer import REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ReformerConfig + from .models.regnet import REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP, RegNetConfig from 
.models.rembert import REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RemBertConfig from .models.resnet import RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ResNetConfig from .models.retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig, RetriBertTokenizer @@ -2631,6 +2644,7 @@ from .models.swin import SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP, SwinConfig from .models.t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config from .models.tapas import TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP, TapasConfig, TapasTokenizer + from .models.tapex import TapexTokenizer from .models.transfo_xl import ( TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig, @@ -3535,6 +3549,12 @@ ReformerModelWithLMHead, ReformerPreTrainedModel, ) + from .models.regnet import ( + REGNET_PRETRAINED_MODEL_ARCHIVE_LIST, + RegNetForImageClassification, + RegNetModel, + RegNetPreTrainedModel, + ) from .models.rembert import ( REMBERT_PRETRAINED_MODEL_ARCHIVE_LIST, RemBertForCausalLM, @@ -3827,6 +3847,8 @@ # Benchmarks from .benchmark.benchmark_tf import TensorFlowBenchmark from .generation_tf_logits_process import ( + TFForcedBOSTokenLogitsProcessor, + TFForcedEOSTokenLogitsProcessor, TFLogitsProcessor, TFLogitsProcessorList, TFLogitsWarper, diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index 1599cfab8a28..0c9276e948f5 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -220,12 +220,11 @@ class DataCollatorWithPadding: Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: - - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence - is provided). + - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single + sequence is provided). - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different - lengths). + - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths). max_length (`int`, *optional*): Maximum length of the returned list and optionally padding length (see above). 
pad_to_multiple_of (`int`, *optional*): diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py index 46cfb9730760..6feabdaa8095 100644 --- a/src/transformers/deepspeed.py +++ b/src/transformers/deepspeed.py @@ -250,7 +250,7 @@ def trainer_config_process(self, args): self.fill_match("bf16.enabled", (args.bf16 or args.bf16_full_eval), "bf16|bf16_full_eval") # deepspeed's default mode is fp16 unless there is a config that says differently - if self.is_true("bfoat16.enabled"): + if self.is_true("bf16.enabled"): self._dtype = torch.bfloat16 elif self.is_false("fp16.enabled"): self._dtype = torch.float32 diff --git a/src/transformers/generation_tf_logits_process.py b/src/transformers/generation_tf_logits_process.py index 22ece088ae42..92bae58eb8fd 100644 --- a/src/transformers/generation_tf_logits_process.py +++ b/src/transformers/generation_tf_logits_process.py @@ -216,14 +216,10 @@ def __init__(self, min_length: int, eos_token_id: int): self.min_length = min_length self.eos_token_id = eos_token_id - def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor: - # create boolean flag to decide if min length penalty should be applied - cur_len = input_ids.shape[-1] - apply_penalty = 1 - tf.clip_by_value(cur_len - self.min_length, 0, 1) - + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor: # TODO(Matt) - this if statement has to be rewritten for XLA. Leaving it now though since # generate is not XLA - compileable anyways - if apply_penalty: + if cur_len < self.min_length: eos_token_id_mask = tf.broadcast_to(tf.range(scores.shape[-1]) == self.eos_token_id, scores.shape) scores = set_tensor_by_indices_to_value(scores, eos_token_id_mask, float("-inf")) @@ -259,8 +255,8 @@ def _create_score_penalties(self, input_ids, logits): np.put(token_penalties[i], prev_input_id, logit_penalties) return tf.convert_to_tensor(token_penalties, dtype=tf.float32) - def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor: - score_penalties = self._create_score_penalties(input_ids, scores) + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor: + score_penalties = self._create_score_penalties(input_ids[:, :cur_len], scores) scores = tf.math.multiply(scores, score_penalties) @@ -330,12 +326,12 @@ def _tokens_match(prev_tokens, tokens): return banned_tokens - def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor: + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor: vocab_size = scores.shape[-1] # calculate a list of banned tokens according to bad words - banned_tokens = self.calc_banned_bad_words_ids(input_ids) + banned_tokens = self.calc_banned_bad_words_ids(input_ids[:, :cur_len]) banned_tokens_indices_mask = [] for banned_tokens_slice in banned_tokens: @@ -365,12 +361,13 @@ def __init__(self, ngram_size: int): raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}") self.ngram_size = ngram_size - def calc_banned_ngram_tokens(self, prev_input_ids, num_hypos, cur_len): + def calc_banned_ngram_tokens(self, input_ids, num_hypos, cur_len): # Copied from fairseq for no_repeat_ngram in beam_search if cur_len + 1 < self.ngram_size: # return no banned tokens if we haven't generated ngram_size tokens yet return [[] for _ in range(num_hypos)] generated_ngrams = [{} for _ in range(num_hypos)] + prev_input_ids = input_ids[:, :cur_len] for idx in range(num_hypos): gen_tokens = prev_input_ids[idx].numpy().tolist() 
generated_ngram = generated_ngrams[idx] @@ -388,10 +385,9 @@ def _get_generated_ngrams(hypo_idx): return banned_tokens - def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor: + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor: batch_size, vocab_size = scores.shape - cur_len = input_ids.shape[-1] banned_tokens = self.calc_banned_ngram_tokens(input_ids, batch_size, cur_len) # create banned_tokens boolean mask @@ -406,3 +402,66 @@ def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor: ) return scores + + +class TFForcedBOSTokenLogitsProcessor(TFLogitsProcessor): + r""" + [`TFLogitsProcessor`] that enforces the specified token as the first generated token. + + Args: + bos_token_id (`int`): + The id of the token to force as the first generated token. + """ + + def __init__(self, bos_token_id: int): + if bos_token_id < 0: + raise ValueError(f"The forced bos token id must be a non-negative integer, got {bos_token_id}") + self.bos_token_id = bos_token_id + + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor: + if cur_len == 1: + batch_size, num_tokens = scores.shape + # sets the score to 0 in the bos_token_id column + scores = tf.zeros((batch_size, 1)) + # sets the score to -inf everywhere else + if self.bos_token_id > 0: + scores = tf.concat((tf.broadcast_to(-float("inf"), (batch_size, self.bos_token_id)), scores), axis=-1) + if self.bos_token_id < (num_tokens - 1): + scores = tf.concat( + (scores, tf.broadcast_to(-float("inf"), (batch_size, (num_tokens - 1) - self.bos_token_id))), + axis=-1, + ) + return scores + + +class TFForcedEOSTokenLogitsProcessor(TFLogitsProcessor): + r""" + [`TFLogitsProcessor`] that enforces the specified token as the last generated token when `max_length` is reached. + + Args: + max_length (`int`): + The maximum length of the sequence to be generated. + eos_token_id (`int`): + The id of the token to force as the last generated token when `max_length` is reached. 
+ """ + + def __init__(self, max_length: int, eos_token_id: int): + self.max_length = max_length + if eos_token_id < 0: + raise ValueError(f"The forced eos token id must be a non-negative integer, got {eos_token_id}") + self.eos_token_id = eos_token_id + + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor: + if cur_len == self.max_length - 1: + batch_size, num_tokens = scores.shape + # sets the score to 0 in the eos_token_id column + scores = tf.zeros((batch_size, 1)) + # sets the score to -inf everywhere else + if self.eos_token_id > 0: + scores = tf.concat((tf.broadcast_to(-float("inf"), (batch_size, self.eos_token_id)), scores), axis=-1) + if self.eos_token_id < (num_tokens - 1): + scores = tf.concat( + (scores, tf.broadcast_to(-float("inf"), (batch_size, (num_tokens - 1) - self.eos_token_id))), + axis=-1, + ) + return scores diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index e96eb191e6a2..8668e6f8dcc2 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -16,12 +16,15 @@ import inspect from dataclasses import dataclass +from functools import partial from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np import tensorflow as tf from .generation_tf_logits_process import ( + TFForcedBOSTokenLogitsProcessor, + TFForcedEOSTokenLogitsProcessor, TFLogitsProcessorList, TFMinLengthLogitsProcessor, TFNoBadWordsLogitsProcessor, @@ -560,7 +563,7 @@ def generate( num_beams = num_beams if num_beams is not None else self.config.num_beams do_sample = do_sample if do_sample is not None else self.config.do_sample - if num_beams == 1: + if do_sample is False or num_beams == 1: return self._generate( input_ids=input_ids, max_length=max_length, @@ -586,6 +589,8 @@ def generate( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict_in_generate=return_dict_in_generate, + forced_bos_token_id=forced_bos_token_id, + forced_eos_token_id=forced_eos_token_id, ) # We cannot generate if the model does not have a LM head @@ -1377,8 +1382,8 @@ def _generate( the target language token. forced_eos_token_id (`int`, *optional*): The id of the token to force as the last generated token when `max_length` is reached. - model_specific_kwargs: - Additional model specific kwargs will be forwarded to the `forward` function of the model. + model_kwargs: + Additional model specific kwargs will be forwarded to the `call` function of the model. Return: [`~utils.ModelOutput`] or `tf.Tensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True` or when @@ -1452,12 +1457,20 @@ def _generate( # 1. 
Set generation parameters if not already defined max_length = max_length if max_length is not None else self.config.max_length min_length = min_length if min_length is not None else self.config.min_length + length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + forced_bos_token_id = ( + forced_bos_token_id if forced_bos_token_id is not None else self.config.forced_bos_token_id + ) + forced_eos_token_id = ( + forced_eos_token_id if forced_eos_token_id is not None else self.config.forced_eos_token_id + ) + output_scores = output_scores if output_scores is not None else self.config.output_scores output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -1489,10 +1502,13 @@ def _generate( model_kwargs["output_hidden_states"] = output_hidden_states if use_cache is not None: model_kwargs["use_cache"] = use_cache + if attention_mask is not None: + model_kwargs["attention_mask"] = attention_mask + accepts_attention_mask = "attention_mask" in set(inspect.signature(self.call).parameters.keys()) requires_attention_mask = "encoder_outputs" not in model_kwargs - if model_kwargs.get("attention_mask", None) is None and requires_attention_mask: + if model_kwargs.get("attention_mask", None) is None and requires_attention_mask and accepts_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation(input_ids, pad_token_id) # 4. Prepare model inputs which will be used for auto-regressive generation @@ -1519,6 +1535,7 @@ def _generate( # TODO(Matt, Joao, Patrick) - add more use cases here is_greedy_gen_mode = (num_beams == 1) and do_sample is False is_sample_gen_mode = (num_beams == 1) and do_sample is True + is_beam_gen_mode = (num_beams > 1) and do_sample is False # 6. prepare distribution pre_processing samplers logits_processor = self._get_logits_processor( @@ -1526,7 +1543,10 @@ def _generate( no_repeat_ngram_size=no_repeat_ngram_size, bad_words_ids=bad_words_ids, min_length=min_length, + max_length=max_length, eos_token_id=eos_token_id, + forced_bos_token_id=forced_bos_token_id, + forced_eos_token_id=forced_eos_token_id, ) # 7. go into different generation modes @@ -1571,18 +1591,62 @@ def _generate( **model_kwargs, ) + elif is_beam_gen_mode: + if num_beams < num_return_sequences: + raise ValueError( + "Greedy beam search decoding cannot return more sequences than it has beams. Please set " + f"num_beams >= num_return_sequences, got {num_beams} and {num_return_sequences} (respectivelly)" + ) + + # 8. broadcast inputs to the desired number of beams + input_ids = self._expand_to_num_beams(input_ids, num_beams=num_beams) + + if "encoder_outputs" in model_kwargs: + model_kwargs["encoder_outputs"]["last_hidden_state"] = self._expand_to_num_beams( + model_kwargs["encoder_outputs"]["last_hidden_state"], num_beams=num_beams + ) + + if "attention_mask" in model_kwargs: + model_kwargs["attention_mask"] = self._expand_to_num_beams( + model_kwargs["attention_mask"], num_beams=num_beams + ) + + # 9. 
run beam search + return self.beam_search( + input_ids, + max_length=max_length, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + length_penalty=length_penalty, + early_stopping=early_stopping, + logits_processor=logits_processor, + return_dict_in_generate=return_dict_in_generate, + num_return_sequences=num_return_sequences, + **model_kwargs, + ) + + else: + # TODO(Matt, Joao, Patrick) - add more sub-generation methods here + raise NotImplementedError("Beam sampling is currently not implemented.") + + @staticmethod + def _expand_to_num_beams(tensor: tf.Tensor, num_beams: int) -> tf.Tensor: + return tf.broadcast_to(tensor[:, None], (tensor.shape[0], num_beams) + tensor.shape[1:]) + def _prepare_attention_mask_for_generation( self, - input_ids: tf.Tensor, + inputs: tf.Tensor, pad_token_id: int, ) -> tf.Tensor: - # prepare `attention_mask` if not passed - if (pad_token_id is not None) and tf.math.reduce_any(input_ids == pad_token_id): - return tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32) + is_input_ids = len(inputs.shape) == 2 and inputs.dtype in (tf.int32, tf.int64) + is_pad_token_in_inputs = (pad_token_id is not None) and tf.math.reduce_any(inputs == pad_token_id) + # Check if input is input_ids and padded -> only then is attention_mask defined + if is_input_ids and is_pad_token_in_inputs: + return tf.cast(tf.math.not_equal(inputs, pad_token_id), dtype=tf.int32) else: - return tf.ones(input_ids.shape[:2], dtype=tf.int32) + return tf.ones(inputs.shape[:2], dtype=tf.int32) - def _prepare_encoder_decoder_kwargs_for_generation(self, input_ids: tf.Tensor, model_kwargs) -> Dict[str, Any]: + def _prepare_encoder_decoder_kwargs_for_generation(self, inputs_tensor: tf.Tensor, model_kwargs) -> Dict[str, Any]: # get encoder and store encoder outputs encoder = self.get_encoder() @@ -1595,11 +1659,9 @@ def _prepare_encoder_decoder_kwargs_for_generation(self, input_ids: tf.Tensor, m } # vision models don't use `attention_mask`. 
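
The rewritten `_prepare_attention_mask_for_generation` above only derives a mask from the inputs when they look like `input_ids` (a 2D integer tensor) and actually contain the pad token; otherwise it falls back to an all-ones mask. A standalone sketch of that heuristic (illustrative only, not part of the patch):

```python
# Illustrative only: the attention-mask heuristic in isolation.
import tensorflow as tf


def make_attention_mask(inputs, pad_token_id):
    is_input_ids = len(inputs.shape) == 2 and inputs.dtype in (tf.int32, tf.int64)
    is_pad_token_in_inputs = pad_token_id is not None and bool(tf.math.reduce_any(inputs == pad_token_id))
    if is_input_ids and is_pad_token_in_inputs:
        # mask out the padding positions
        return tf.cast(tf.math.not_equal(inputs, pad_token_id), dtype=tf.int32)
    # e.g. pixel values, or no padding present: attend everywhere
    return tf.ones(inputs.shape[:2], dtype=tf.int32)


input_ids = tf.constant([[5, 6, 0, 0], [7, 8, 9, 0]], dtype=tf.int32)
print(make_attention_mask(input_ids, pad_token_id=0).numpy())
# [[1 1 0 0]
#  [1 1 1 0]]
```
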
- signature = dict(inspect.signature(encoder.call).parameters) - if "attention_mask" not in signature: - encoder_kwargs.pop("attention_mask") - - encoder_outputs = encoder(input_ids, **encoder_kwargs) + encoder_kwargs["return_dict"] = True + encoder_kwargs[self.main_input_name] = inputs_tensor + encoder_outputs = encoder(**encoder_kwargs) model_kwargs["encoder_outputs"] = encoder_outputs return model_kwargs @@ -1757,7 +1819,10 @@ def _get_logits_processor( no_repeat_ngram_size: int, bad_words_ids: List[List[int]], min_length: int, + max_length: int, eos_token_id: int, + forced_bos_token_id: int, + forced_eos_token_id: int, ) -> TFLogitsProcessorList: """ This class returns a [`TFLogitsProcessorList`] list object that contains all relevant [`TFLogitsProcessor`] @@ -1781,6 +1846,10 @@ def _get_logits_processor( processors.append(TFNoBadWordsLogitsProcessor(bad_words_ids, eos_token_id)) if min_length is not None and eos_token_id is not None and min_length > 0: processors.append(TFMinLengthLogitsProcessor(min_length, eos_token_id)) + if forced_bos_token_id is not None: + processors.append(TFForcedBOSTokenLogitsProcessor(forced_bos_token_id)) + if forced_eos_token_id is not None: + processors.append(TFForcedEOSTokenLogitsProcessor(max_length, forced_eos_token_id)) return processors @@ -1949,7 +2018,7 @@ def greedy_search_body_fn(generated, finished_sequences, next_tokens, current_po if not use_xla: input_ids = tf.reshape(generated.concat(), (-1, batch_size)) input_ids = tf.transpose(input_ids[: current_pos[0]]) - next_tokens_scores = logits_processor(input_ids, next_token_logits) + next_tokens_scores = logits_processor(input_ids, next_token_logits, cur_len=current_pos[0]) # argmax next_tokens = tf.argmax(next_tokens_scores, axis=-1, output_type=tf.int32) @@ -2064,7 +2133,7 @@ def sample( input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): The sequence used as a prompt for the generation. logits_processor (`TFLogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`TFLogitsProcessor`] + An instance of [`TFLogitsProcessorList`]. List of instances of class derived from [`TFLogitsProcessor`] used to modify the prediction scores of the language modeling head applied at each generation step. logits_warper (`TFLogitsProcessorList`, *optional*): An instance of [`TFLogitsProcessorList`]. 
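
Since `_get_logits_processor` above now appends the new forced-token processors, here is a minimal look at what `TFForcedBOSTokenLogitsProcessor` does at the first generation step. This assumes the `transformers` version from this patch, where the class is exported at the top level (illustrative only):

```python
# Illustrative only: the forced-BOS processor puts all probability mass on bos_token_id at cur_len == 1.
import tensorflow as tf

from transformers import TFForcedBOSTokenLogitsProcessor

processor = TFForcedBOSTokenLogitsProcessor(bos_token_id=2)
scores = tf.random.normal((1, 5))               # (batch_size, vocab_size)
input_ids = tf.constant([[0]], dtype=tf.int32)  # only the decoder start token generated so far
forced_scores = processor(input_ids, scores, cur_len=1)
print(forced_scores.numpy())  # [[-inf -inf   0. -inf -inf]]
```
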
List of instances of class derived from [`TFLogitsWarper`] @@ -2184,7 +2253,7 @@ def sample( next_token_logits = outputs.logits[:, -1, :] # pre-process distribution - next_token_scores = logits_processor(input_ids, next_token_logits) + next_token_scores = logits_processor(input_ids, next_token_logits, cur_len=cur_len) next_token_scores = logits_warper(input_ids, next_token_scores) # Store scores, attentions and hidden_states when required @@ -2259,6 +2328,527 @@ def sample( else: return input_ids + def beam_search( + self, + input_ids: tf.Tensor, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + length_penalty: Optional[float] = None, + early_stopping: Optional[bool] = None, + logits_processor: Optional[TFLogitsProcessorList] = None, + num_return_sequences: Optional[int] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + **model_kwargs, + ) -> Union[TFBeamSearchOutput, tf.Tensor]: + r""" + Generates sequences for models with a language modeling head using beam search with multinomial sampling. + + Parameters: + + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + max_length (`int`, *optional*, defaults to 20): + The maximum length of the sequence to be generated. + pad_token_id (`int`, *optional*): + The id of the *padding* token. + eos_token_id (`int`, *optional*): + The id of the *end-of-sequence* token. + length_penalty (`float`, *optional*, defaults to 1.0): + Exponential penalty to the length. 1.0 means no penalty. + early_stopping (`bool`, *optional*, defaults to `False`): + Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not. + logits_processor (`[TFLogitsProcessorList]`, *optional*): + An instance of [`TFLogitsProcessorList`]. List of instances of class derived from [`TFLogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + num_return_sequences(`int`, *optional*, defaults to 1): + The number of independently computed returned sequences for each element in the batch. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + model_kwargs: + Additional model specific kwargs will be forwarded to the `call` function of the model. If model is an + encoder-decoder model the kwargs should include `encoder_outputs`. + + Return: + [`~generation_tf_utils.TFBeamSearchDecoderOnlyOutput`], + [`~generation_tf_utils.TFBeamSearchEncoderDecoderOutput`] or `tf.Tensor`: A `tf.Tensor` containing the + generated tokens (default behaviour) or a [`~generation_tf_utils.TFBeamSearchDecoderOnlyOutput`] if + `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a + [`~generation_tf_utils.TFBeamSearchEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`. + + Examples: + + ```python + >>> from transformers import ( + ... 
AutoTokenizer, + ... TFAutoModelForSeq2SeqLM, + ... TFLogitsProcessorList, + ... TFMinLengthLogitsProcessor, + ... ) + >>> import tensorflow as tf + + >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") + >>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-base") + + >>> encoder_input_str = "translate English to German: How old are you?" + >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="tf").input_ids + + >>> # lets run beam search using 3 beams + >>> num_beams = 3 + >>> # define decoder start token ids + >>> input_ids = tf.ones((num_beams, 1), dtype=tf.int64) + >>> input_ids = input_ids * model.config.decoder_start_token_id + + >>> # add encoder_outputs to model keyword arguments + >>> model_kwargs = { + ... "encoder_outputs": model.get_encoder()( + ... tf.repeat(encoder_input_ids, num_beams, axis=0), return_dict=True + ... ) + ... } + + >>> # instantiate logits processors + >>> logits_processor = TFLogitsProcessorList( + ... [TFMinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id)] + ... ) + + >>> outputs = model.beam_search(input_ids, logits_processor=logits_processor, **model_kwargs) + + >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True)) + ```""" + + def flatten_beam_dim(tensor, batch_axis=0): + """Flattens the first two dimensions of a non-scalar array.""" + # ignore scalars (e.g. cache index) + if tf.rank(tensor) == 0: + return tensor + return tf.reshape( + tensor, + tensor.shape[:batch_axis] + + [tensor.shape[batch_axis] * tensor.shape[batch_axis + 1]] + + tensor.shape[batch_axis + 2 :], + ) + + def unflatten_beam_dim(tensor, batch_size, num_beams, batch_axis=0): + """Unflattens the first, flat batch*beam dimension of a non-scalar array.""" + # ignore scalars (e.g. cache index) + if tf.rank(tensor) == 0: + return tensor + return tf.reshape( + tensor, tensor.shape[:batch_axis] + [batch_size, num_beams] + tensor.shape[batch_axis + 1 :] + ) + + def gather_beams(nested, beam_indices, batch_axis=0): + """Gathers the beam slices indexed by beam_indices into new beam array.""" + + def gather_fn(tensor): + # ignore scalars (e.g. cache index) + if tf.rank(tensor) == 0: + return tensor + else: + if batch_axis > 0: + # pushes all dimentions before the batch to the end, so we get (batch, beam_id, ...) + perm = [axis for axis in range(tf.rank(tensor)) if axis >= batch_axis] + list( + range(batch_axis) + ) + tensor = tf.transpose(tensor, perm=perm) + + gathered_tensor = tf.gather(params=tensor, indices=beam_indices, axis=1, batch_dims=1) + if batch_axis > 0: + # transposes back to the original dimensions + perm = [axis for axis in range(tf.rank(tensor)) if axis >= batch_axis] + list( + range(batch_axis) + ) + perm = tf.math.invert_permutation(perm) + gathered_tensor = tf.transpose(gathered_tensor, perm=perm) + + return gathered_tensor + + return tf.nest.map_structure(gather_fn, nested) + + # 1. 
init beam_search values + logits_processor = logits_processor if logits_processor is not None else TFLogitsProcessorList() + + max_length = max_length if max_length is not None else self.config.max_length + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + num_return_sequences = ( + num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_scores = output_scores if output_scores is not None else self.config.output_scores + return_dict_in_generate = ( + return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate + ) + + length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty + early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping + + use_xla = not tf.executing_eagerly() + # TODO (Joao): fix cache format or find programatic way to detect cache index + # GPT2 and other models has a slightly different cache structure, with a different batch axis + model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self) + cache_batch_axis = 1 if any([model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")]) else 0 + + # 2. init `attentions`, `hidden_states`, and `scores` tuples + scores = [] if (return_dict_in_generate and output_scores) else None + decoder_attentions = [] if (return_dict_in_generate and output_attentions) else None + cross_attentions = [] if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = [] if (return_dict_in_generate and output_hidden_states) else None + + # 3. init tensors to use for "xla-compileable" generate function + batch_size, num_beams, cur_len = input_ids.shape + + # per batch, beam-item holding current token in loop, pre-populated with `pad_token_id` + sequences = tf.TensorArray( + element_shape=(batch_size, num_beams), + dtype=tf.int32, + dynamic_size=False, + size=max_length, + clear_after_read=False, + ) + running_sequences = tf.TensorArray( + element_shape=(batch_size, num_beams), + dtype=tf.int32, + dynamic_size=False, + size=max_length, + clear_after_read=False, + ) + intermediary_running_sequences = tf.TensorArray( + element_shape=(batch_size, num_beams * 2), + dtype=tf.int32, + dynamic_size=False, + size=max_length, + clear_after_read=False, + ) + for i in range(max_length): + sequences = sequences.write(i, tf.broadcast_to(pad_token_id, (batch_size, num_beams))) + running_sequences = running_sequences.write(i, tf.broadcast_to(pad_token_id, (batch_size, num_beams))) + intermediary_running_sequences = intermediary_running_sequences.write( + i, tf.broadcast_to(pad_token_id, (batch_size, num_beams * 2)) + ) + + # write prompt to running_sequences + for i in range(cur_len): + running_sequences = running_sequences.write(i, input_ids[:, :, i]) + + # per batch,beam-item state bit indicating if sentence has finished. 
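
The `tf.TensorArray`s above hold one time step per array index, which is why the beam-search body later stacks them and transposes back to `(batch_size, num_beams, seq_len)`. A tiny sketch of that layout (illustrative only, not part of the patch):

```python
# Illustrative only: TensorArray layout used for the beam-search sequences.
import tensorflow as tf

batch_size, num_beams, max_length, pad_token_id = 2, 3, 4, 0
sequences = tf.TensorArray(
    element_shape=(batch_size, num_beams),
    dtype=tf.int32,
    size=max_length,
    dynamic_size=False,
    clear_after_read=False,
)
for i in range(max_length):
    sequences = sequences.write(i, tf.broadcast_to(pad_token_id, (batch_size, num_beams)))

print(sequences.stack().shape)                                # (4, 2, 3): seq_len comes first
print(tf.transpose(sequences.stack(), perm=[1, 2, 0]).shape)  # (2, 3, 4): batch, beams, seq_len
```
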
+ is_sent_finished = tf.zeros((batch_size, num_beams), dtype=tf.bool) + + # per batch, beam-item score, logprobs + running_scores = tf.tile( + tf.expand_dims(tf.convert_to_tensor([0.0] + [-1.0e9] * (num_beams - 1)), axis=0), [batch_size, 1] + ) + scores = tf.ones((batch_size, num_beams)) * -1.0e9 + + # flatten beam dim + if "encoder_outputs" in model_kwargs: + model_kwargs["encoder_outputs"]["last_hidden_state"] = flatten_beam_dim( + model_kwargs["encoder_outputs"]["last_hidden_state"] + ) + if "attention_mask" in model_kwargs: + model_kwargs["attention_mask"] = flatten_beam_dim(model_kwargs["attention_mask"]) + + # 4. define "xla-compile-able" stop-condition and auto-regressive function + # define stop-condition and auto-regressive function + def beam_search_cond_fn( + cur_len, running_sequences, running_scores, sequences, scores, is_sent_finished, model_kwargs + ): + """ + Beam Search termination condition function -- halts the generation loop if any of these conditions becomes + False + """ + # 1. is less than max length? + not_max_length_yet = cur_len < max_length + + # 2. can the new beams still improve? + best_running_score = running_scores[:, :1] / (max_length**length_penalty) + worst_finished_score = tf.where( + is_sent_finished, tf.math.reduce_min(scores, axis=1, keepdims=True), -1.0e9 + ) + improvement_still_possible = tf.math.reduce_all(worst_finished_score < best_running_score) + + # 3. is there still a beam that has not finished? + still_open_beam = ~(tf.math.reduce_all(is_sent_finished) & early_stopping) + + return not_max_length_yet & (still_open_beam | improvement_still_possible) + + def beam_search_body_fn( + cur_len, + running_sequences, + running_scores, + sequences, + scores, + is_sent_finished, + model_kwargs, + input_ids_length=1, + intermediary_running_sequences=None, + ): + """ + Beam Search iterative update function -- each iteration adds a new token and updates the best sequences + seen so far + """ + # TODO (joao): this loop is probably faster with gather/scatters, instead of using `tf.TensorArray`. + # Alternativelly, attempt to rewrite function with permuted axis, when enabling XLA. + + # 1. 
Forward current tokens + + # TF places the dynamic dimension (seq_len) in the first axis, we want it in the last + running_sequences_seq_last = tf.transpose(running_sequences.stack(), perm=[1, 2, 0]) + input_token = tf.slice( + running_sequences_seq_last, + (0, 0, cur_len - input_ids_length), + (batch_size, num_beams, input_ids_length), + ) + model_inputs = self.prepare_inputs_for_generation( + flatten_beam_dim(input_token), use_xla=use_xla, **model_kwargs + ) + model_outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + logits = unflatten_beam_dim(model_outputs.logits[:, -1], batch_size, num_beams) + + # Store scores, attentions and hidden_states when required + if not use_xla and return_dict_in_generate: + if output_scores: + scores.append(model_outputs.logits[:, -1]) + if output_attentions and self.config.is_encoder_decoder: + decoder_attentions.append(model_outputs.decoder_attentions) + elif output_attentions and not self.config.is_encoder_decoder: + decoder_attentions.append(model_outputs.attentions) + if self.config.is_encoder_decoder: + cross_attentions.append(model_outputs.cross_attentions) + + if output_hidden_states and self.config.is_encoder_decoder: + decoder_hidden_states.append(model_outputs.decoder_hidden_states) + elif output_hidden_states and self.config.is_encoder_decoder: + decoder_hidden_states.append(model_outputs.hidden_states) + + # 2. Compute log probs + # get log probabilities from logits, process logits with processors (*e.g.* min_length, ...), and + # add new logprobs to existing running logprobs scores. + log_probs = tf.nn.log_softmax(logits) + log_probs = logits_processor( + flatten_beam_dim(running_sequences_seq_last), flatten_beam_dim(log_probs), cur_len=cur_len + ) + log_probs = unflatten_beam_dim(log_probs, batch_size, num_beams) + log_probs = log_probs + tf.expand_dims(running_scores, axis=2) + vocab_size = log_probs.shape[2] + log_probs = tf.reshape(log_probs, (batch_size, num_beams * vocab_size)) + + # 3. Retrieve top-K + # Each item in batch has num_beams * vocab_size candidate sequences. For each item, get the top 2*k + # candidates with the highest log-probabilities. We gather the top 2*K beams here so that even if the + # best K sequences reach EOS simultaneously, we have another K sequences remaining to continue the live + # beam search. + # Gather the top 2*K scores from _all_ beams. + # Gather 2*k top beams. + # Recover the beam index by floor division. + # Recover token id by modulo division and expand Id array for broadcasting. + # Update sequences for the 2*K top-k new sequences. + beams_to_keep = 2 * num_beams + topk_log_probs, topk_indices = tf.math.top_k(log_probs, k=beams_to_keep) + topk_beam_indices = topk_indices // vocab_size + topk_running_sequences_seq_last = gather_beams(running_sequences_seq_last, topk_beam_indices) + topk_ids = topk_indices % vocab_size + + # writes the new token + intermediary_running_sequences = intermediary_running_sequences.unstack( + tf.transpose(topk_running_sequences_seq_last, perm=[2, 0, 1]) + ) + topk_sequences = intermediary_running_sequences.write(cur_len, topk_ids) + topk_sequences_seq_last = tf.transpose(topk_sequences.stack(), perm=[1, 2, 0]) + + # 4. Check which sequences have ended + # Update current sequences: Did the top `num_beams` sequences reach an end marker? 
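
The top-K step above ranks candidates over a flattened `(num_beams * vocab_size)` axis and then recovers which beam each candidate extends (integer division) and which token it appends (modulo). A small numeric sketch (illustrative only, not part of the patch):

```python
# Illustrative only: recovering beam index and token id from flattened top-k indices.
import tensorflow as tf

batch_size, num_beams, vocab_size = 1, 2, 5
log_probs = tf.constant(
    [[[-1.0, -2.0, -0.5, -3.0, -4.0],
      [-0.1, -2.5, -1.5, -0.2, -5.0]]]
)  # (batch_size, num_beams, vocab_size)

flat_log_probs = tf.reshape(log_probs, (batch_size, num_beams * vocab_size))
topk_log_probs, topk_indices = tf.math.top_k(flat_log_probs, k=2 * num_beams)
topk_beam_indices = topk_indices // vocab_size  # which beam each candidate came from
topk_ids = topk_indices % vocab_size            # which token is appended to it

print(topk_beam_indices.numpy())  # [[1 1 0 0]]
print(topk_ids.numpy())           # [[0 3 2 0]]
```
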
+ # To prevent these just finished sequences from being added to the current sequences + # set of active beam search sequences, set their log probs to a very large negative value. + eos_in_next_token = topk_sequences_seq_last[:, :, cur_len] == eos_token_id + if eos_token_id is None: + eos_in_next_token = tf.broadcast_to(eos_in_next_token, topk_sequences_seq_last[:, :, cur_len].shape) + did_topk_just_finished = eos_in_next_token & tf.broadcast_to( + tf.concat((tf.ones((num_beams), dtype=tf.bool), tf.zeros((num_beams), dtype=tf.bool)), axis=0), + eos_in_next_token.shape, + ) + + # non-top `num_beams` eos tokens can't be used to finish a beam, but the others can't be used in the next + # running sentences either + running_topk_log_probs = topk_log_probs + tf.cast(eos_in_next_token, tf.float32) * -1.0e9 + + # 5. Get running sequences scores for next + # Determine the top k beam indices (from top 2*k beams) from log probs and gather top k beams + # (from top 2*k beams). + next_topk_indices = tf.math.top_k(running_topk_log_probs, k=num_beams)[1] + next_running_sequences_seq_last, next_running_scores = gather_beams( + [topk_sequences_seq_last, running_topk_log_probs], next_topk_indices + ) + + # 6. Process topk logits + # Further process log probs: + # - add length penalty + # - make sure no scores can be added anymore if beam is full + # - make sure still running sequences cannot be chosen as finalized beam + topk_log_probs = topk_log_probs / (cur_len**length_penalty) + beams_in_batch_are_full = ( + tf.broadcast_to( + tf.math.reduce_all(is_sent_finished, axis=-1, keepdims=True), did_topk_just_finished.shape + ) + & early_stopping + ) + add_penalty = ~did_topk_just_finished | beams_in_batch_are_full + topk_log_probs += tf.cast(add_penalty, tf.float32) * -1.0e9 + + # 7. Get scores, sequences, is sentence finished for next. + # Combine sequences, scores, and flags along the beam dimension and compare new finished sequence scores + # to existing finished scores and select the best from the new set of beams + sequences_seq_last = tf.transpose(sequences.stack(), perm=[1, 2, 0]) + merged_sequences = tf.concat([sequences_seq_last, topk_sequences_seq_last], axis=1) + merged_scores = tf.concat([scores, topk_log_probs], axis=1) + merged_is_sent_finished = tf.concat([is_sent_finished, did_topk_just_finished], axis=1) + topk_merged_indices = tf.math.top_k(merged_scores, k=num_beams)[1] + next_sequences_seq_last, next_scores, next_is_sent_finished = gather_beams( + [merged_sequences, merged_scores, merged_is_sent_finished], topk_merged_indices + ) + + # 8. Prepare data for the next iteration + # Determine the top k beam indices from the original set of all beams. With these, gather the top k + # beam-associated caches. 
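
The `cur_len**length_penalty` division above normalises a finished beam's summed log-probability by its length, so `length_penalty` values above 1.0 favour longer sequences and values below 1.0 favour shorter ones. A quick numeric illustration with a hypothetical score (illustrative only, not part of the patch):

```python
# Illustrative only: effect of length_penalty on a finished beam's normalised score.
raw_log_prob = -6.0  # hypothetical sum of token log-probs for one beam
for cur_len, length_penalty in [(6, 1.0), (12, 1.0), (12, 2.0)]:
    print(cur_len, length_penalty, raw_log_prob / (cur_len**length_penalty))
# 6 1.0 -1.0
# 12 1.0 -0.5
# 12 2.0 -0.041666666666666664
```
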
+ if "past_key_values" in model_outputs: + cache = tf.nest.map_structure( + lambda tensor: unflatten_beam_dim(tensor, batch_size, num_beams, batch_axis=cache_batch_axis), + model_outputs.past_key_values, + ) + next_running_indices = gather_beams(topk_beam_indices, next_topk_indices) + next_cache = gather_beams(cache, next_running_indices, batch_axis=cache_batch_axis) + model_outputs["past_key_values"] = tf.nest.map_structure( + lambda tensor: flatten_beam_dim(tensor, batch_axis=cache_batch_axis), next_cache + ) + + if use_xla: + next_model_kwargs = self._update_model_kwargs_for_xla_generation( + model_outputs, model_kwargs, cur_len, max_length + ) + else: + next_model_kwargs = self._update_model_kwargs_for_generation( + model_outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + + # if we don't cache past key values we need the whole input + if model_kwargs.get("past", None) is None: + input_ids_length = cur_len + 1 + # let's throw out `past` since we don't want `None` tensors + model_kwargs.pop("past", None) + + # 9. Prepare the `tf.TensorArray` for the next iteration + next_sequences = sequences.unstack(tf.transpose(next_sequences_seq_last, perm=[2, 0, 1])) + next_running_sequences = running_sequences.unstack( + tf.transpose(next_running_sequences_seq_last, perm=[2, 0, 1]) + ) + + return ( + cur_len + 1, + next_running_sequences, + next_running_scores, + next_sequences, + next_scores, + next_is_sent_finished, + next_model_kwargs, + ) + + # 5. run generation + # Adds the `intermediary_running_sequences` TensorArray into the body, needed as a scratchpad + beam_search_body_fn = partial( + beam_search_body_fn, intermediary_running_sequences=intermediary_running_sequences + ) + + # 1st generation step has to be run before to initialize `past` + beam_search_body_fn_first_iter = partial(beam_search_body_fn, input_ids_length=cur_len) + ( + cur_len, + running_sequences, + running_scores, + sequences, + scores, + is_sent_finished, + model_kwargs, + ) = beam_search_body_fn_first_iter( + cur_len, running_sequences, running_scores, sequences, scores, is_sent_finished, model_kwargs + ) + + # 2-to-n generation steps can then be run in autoregressive fashion (only in case 1st generation step does + # NOT yield EOS token though) + if beam_search_cond_fn( + cur_len, running_sequences, running_scores, sequences, scores, is_sent_finished, model_kwargs + ): + maximum_iterations = max_length - cur_len + cur_len, running_sequences, running_scores, sequences, scores, is_sent_finished, _ = tf.while_loop( + beam_search_cond_fn, + beam_search_body_fn, + (cur_len, running_sequences, running_scores, sequences, scores, is_sent_finished, model_kwargs), + maximum_iterations=maximum_iterations, + ) + + # 6. prepare outputs + # convert the sequneces to tf.Tensor with shape (batch_size, num_beams, seq_len) + sequences_seq_last = tf.transpose(sequences.stack(), perm=[1, 2, 0]) + running_sequences_seq_last = tf.transpose(running_sequences.stack(), perm=[1, 2, 0]) + + # Account for the edge-case where there are no finished sequences for a particular batch item. If so, return + # running sequences for that batch item. 
+ none_finished = tf.math.reduce_any(is_sent_finished, axis=1) + sequences_seq_last = tf.where(none_finished[:, None, None], sequences_seq_last, running_sequences_seq_last) + scores = tf.where(none_finished[:, None], scores, running_scores) + + # Take best beams for each batch (the score is sorted in ascending order) + sequences_seq_last = flatten_beam_dim(sequences_seq_last[:, :num_return_sequences, :]) + scores = flatten_beam_dim(scores[:, :num_return_sequences]) + + if not use_xla: + # Cut for backward compatibility + sequences_seq_last = sequences_seq_last[:, :cur_len] + + if return_dict_in_generate: + if self.config.is_encoder_decoder: + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + return TFBeamSearchEncoderDecoderOutput( + sequences=sequences_seq_last, + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + ) + else: + return TFBeamSearchDecoderOnlyOutput( + sequences=sequences_seq_last, + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + ) + else: + return sequences_seq_last + def _create_next_token_logits_penalties(input_ids, logits, repetition_penalty): # create logit penalties for already seen input_ids @@ -2445,7 +3035,6 @@ def is_done(self, best_sum_logprobs, cur_len): If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst one in the heap, then we are done with this sentence. """ - if len(self) < self.num_beams: return False elif self.early_stopping: diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index ee5b32886b07..efa37e32bd75 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -38,7 +38,6 @@ from .configuration_utils import PretrainedConfig from .dynamic_module_utils import custom_object_save from .generation_tf_utils import TFGenerationMixin -from .modeling_tf_outputs import TFSeq2SeqLMOutput from .tf_utils import shape_list from .tokenization_utils_base import BatchEncoding from .utils import ( @@ -53,6 +52,7 @@ RevisionNotFoundError, cached_path, copy_func, + find_labels, has_file, hf_bucket_url, is_offline_mode, @@ -715,6 +715,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu base_model_prefix = "" main_input_name = "input_ids" _auto_class = None + _using_dummy_loss = None # a list of re pattern of tensor names to ignore from the model when loading the model weights # (and avoid unnecessary warnings). @@ -899,24 +900,46 @@ def compile( function themselves. """ if loss == "passthrough": + if metrics is not None: + raise ValueError( + "Passing metrics as a dict is not supported when using the internal loss! " + "Please either compile the model with a loss, or remove the metrics argument. " + "Note that advanced metrics using the `KerasMetricCallback` can still be used with the internal " + "loss." + ) logger.warning( "No loss specified in compile() - the model's internal loss computation will be used as the " "loss. Don't panic - this is a common way to train TensorFlow models in Transformers! 
" - "Please ensure your labels are passed as keys in the input dict so that they are " - "accessible to the model during the forward pass. To disable this behaviour, please pass a " - "loss argument, or explicitly pass loss=None if you do not want your model to compute a loss." + "To disable this behaviour, please pass a loss argument, or explicitly pass " + "`loss=None` if you do not want your model to compute a loss." + ) + loss = dummy_loss + self._using_dummy_loss = True + else: + self._using_dummy_loss = False + parent_args = list(inspect.signature(tf.keras.Model.compile).parameters.keys()) + if "steps_per_execution" in parent_args: + super().compile( + optimizer=optimizer, + loss=loss, + metrics=metrics, + loss_weights=loss_weights, + weighted_metrics=weighted_metrics, + run_eagerly=run_eagerly, + steps_per_execution=steps_per_execution, + **kwargs, + ) + else: + super().compile( + optimizer=optimizer, + loss=loss, + metrics=metrics, + loss_weights=loss_weights, + weighted_metrics=weighted_metrics, + run_eagerly=run_eagerly, + experimental_steps_per_execution=steps_per_execution, + **kwargs, ) - loss = {"loss": dummy_loss} - super().compile( - optimizer=optimizer, - loss=loss, - metrics=metrics, - loss_weights=loss_weights, - weighted_metrics=weighted_metrics, - run_eagerly=run_eagerly, - steps_per_execution=steps_per_execution, - **kwargs, - ) def compute_loss(self, *args, **kwargs): if hasattr(tf.keras.Model, "compute_loss"): @@ -935,40 +958,54 @@ def compute_loss(self, *args, **kwargs): def train_step(self, data): """ A modification of Keras's default `train_step` that cleans up the printed metrics when we use a dummy loss. If - a user specifies a loss at model compile time, this function behaves as the original Keras `train_step`. In - this case, it expects the same `data` as the original function (i.e. `(inputs, labels)`). - - However, when the model is compiled without specifying the loss AND the expected label columns are passed as - part of the input dictionary, the loss is computed internally (inside the model class) and is used in the - backwards pass. In this case, `data` is a singleton tuple containing `(inputs,)`. + a user specifies a loss at model compile time, this function behaves as the original Keras `train_step`. - This is possible under the aforementioned circumstances because our overriden compile function can set an - additional loss function that reduces a `loss` output, and the model will output a `loss` component (notice the - name matching) containing the loss that was used to train the pre-trained model. + When the model is compiled without specifying the loss, our overridden compile function can set a simple dummy + loss that just reads the loss output head of the model. When using this dummy loss, inputs can be passed either + as keys in the input dictionary, or as normal Keras labels. """ + # These are the only transformations `Model.fit` applies to user-input # data when a `tf.data.Dataset` is provided. 
- data = data_adapter.expand_1d(data) + if not self._using_dummy_loss: + data = data_adapter.expand_1d(data) x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) - # These next two lines differ from the base method - they avoid issues when the labels are in - # the input dict (and loss is computed internally) - if y is None and "labels" in x: - y = x["labels"] # Stops confusion with metric computations - elif y is None and "input_ids" in x: - # Just make any kind of dummy array to make loss work - y = tf.zeros(tf.shape(x["input_ids"])[0], dtype=tf.int64) + + # When using a dummy loss, we ensure that separate labels are copied to the correct model arguments, + # if those keys are not already present in the input dict + if self._using_dummy_loss and y is not None: + arg_names = list(dict(inspect.signature(self.call).parameters).keys()) + label_kwargs = find_labels(self.__class__) + # If y is a tensor and the model only has one label-like input, map y to that input + if len(label_kwargs) == 1 and isinstance(y, tf.Tensor): + if isinstance(x, tf.Tensor): + x = {arg_names[0]: x} + label_kwarg = next(iter(label_kwargs)) + if label_kwarg not in x: + x[label_kwarg] = y + # Otherwise, copy keys from y to x as long as they weren't already present in x + elif isinstance(y, dict): + if isinstance(x, tf.Tensor): + x = {arg_names[0]: x} + for key, val in y.items(): + if key in arg_names and key not in x: + x[key] = val + # Run forward pass. with tf.GradientTape() as tape: y_pred = self(x, training=True) - loss = self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.losses) + if self._using_dummy_loss: + loss = self.compiled_loss(y_pred.loss, y_pred.loss, sample_weight, regularization_losses=self.losses) + else: + loss = self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.losses) # Run backwards pass. self.optimizer.minimize(loss, self.trainable_variables, tape=tape) - # When y_pred is a ModelOutput and y is a tf.Tensor the metrics update - # should be done only with the relevant ModelOutput param that is - # considered by the loss. - if isinstance(y_pred, TFSeq2SeqLMOutput) and isinstance(y, tf.Tensor): - y_pred = y_pred["logits"] - self.compiled_metrics.update_state(y, y_pred, sample_weight) + + # When using the dummy_loss we know metrics are not present, so we can skip a lot of this + if self._using_dummy_loss: + self.compiled_metrics.update_state(y_pred.loss, y_pred.loss, sample_weight) + else: + self.compiled_metrics.update_state(y, y_pred, sample_weight) # Collect metrics to return return_metrics = {} for metric in self.metrics: @@ -985,23 +1022,51 @@ def train_step(self, data): def test_step(self, data): """ - A modification of Keras's default test_step that cleans up the printed metrics when we use a dummy loss. + A modification of Keras's default `test_step` that cleans up the printed metrics when we use a dummy loss. If a + user specifies a loss at model compile time, this function behaves as the original Keras `test_step`. + + When the model is compiled without specifying the loss, our overridden compile function can set a simple dummy + loss that just reads the loss output head of the model. When using this dummy loss, inputs can be passed either + as keys in the input dictionary, or as normal Keras labels. """ - data = data_adapter.expand_1d(data) + # These are the only transformations `Model.fit` applies to user-input + # data when a `tf.data.Dataset` is provided. 
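The label-routing block added to `train_step` above is what lets labels arrive either inside the input dict or as a separate Keras-style `y`: when `y` is given, its entries are copied onto the model's label argument(s) before the forward pass. The same logic, reduced to plain Python with a stand-in for `find_labels` (all names here are illustrative, not the library internals):

```python
import inspect


def route_labels(call_fn, label_kwargs, x, y):
    """Copy separate labels `y` into the input dict `x` under the model's label argument names."""
    if y is None:
        return x
    arg_names = list(inspect.signature(call_fn).parameters.keys())
    if not isinstance(x, dict):
        x = {arg_names[0]: x}
    if isinstance(y, dict):
        # copy label keys the model accepts and that were not already provided
        for key, val in y.items():
            if key in arg_names and key not in x:
                x[key] = val
    elif len(label_kwargs) == 1:
        # a bare array/tensor maps onto the single label argument
        x.setdefault(next(iter(label_kwargs)), y)
    return x


def toy_call(input_ids=None, labels=None):  # stands in for a model's call() signature
    ...


print(route_labels(toy_call, {"labels"}, {"input_ids": [1, 2, 3]}, [0]))
# {'input_ids': [1, 2, 3], 'labels': [0]}
```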
+ if not self._using_dummy_loss: + data = data_adapter.expand_1d(data) x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) - # These next two lines differ from the base method - they avoid issues when the labels are in - # the input dict (and loss is computed internally) - if y is None and "labels" in x: - y = x["labels"] # Stops confusion with metric computations - elif y is None and "input_ids" in x: - # Just make any kind of dummy array to make loss work - y = tf.zeros(tf.shape(x["input_ids"])[0], dtype=tf.int64) + + # When using a dummy loss, we ensure that separate labels are copied to the correct model arguments, + # if those keys are not already present in the input dict + if self._using_dummy_loss and y is not None: + arg_names = list(dict(inspect.signature(self.call).parameters).keys()) + label_kwargs = find_labels(self.__class__) + # If y is a tensor and the model only has one label-like input, map y to that input + if len(label_kwargs) == 1 and isinstance(y, tf.Tensor): + if isinstance(x, tf.Tensor): + x = {arg_names[0]: x} + label_kwarg = next(iter(label_kwargs)) + if label_kwarg not in x: + x[label_kwarg] = y + # Otherwise, copy keys from y to x as long as they weren't already present in x + elif isinstance(y, dict): + if isinstance(x, tf.Tensor): + x = {arg_names[0]: x} + for key, val in y.items(): + if key in arg_names and key not in x: + x[key] = val + + # Run forward pass. y_pred = self(x, training=False) - self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.losses) - # Updates stateful loss metrics. - if isinstance(y_pred, TFSeq2SeqLMOutput) and isinstance(y, tf.Tensor): - y_pred = y_pred["logits"] - self.compiled_metrics.update_state(y, y_pred, sample_weight) + if self._using_dummy_loss: + self.compiled_loss(y_pred.loss, y_pred.loss, sample_weight, regularization_losses=self.losses) + else: + self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.losses) + + # When using the dummy_loss we know metrics are not present, so we can skip a lot of this + if self._using_dummy_loss: + self.compiled_metrics.update_state(y_pred.loss, y_pred.loss, sample_weight) + else: + self.compiled_metrics.update_state(y, y_pred, sample_weight) # Collect metrics to return return_metrics = {} for metric in self.metrics: diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 33401c3c093f..6941e0fdf065 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1361,32 +1361,46 @@ def save_pretrained( if ignore_key in state_dict.keys(): del state_dict[ignore_key] - # Shard the model if it is too big. - shards, index = shard_checkpoint(state_dict, max_shard_size=max_shard_size) + from .utils import is_sagemaker_mp_enabled + if is_sagemaker_mp_enabled(): + # Do not shard checkpoints when sagemaker model parallel is enabled + # Clean the folder from a previous save + for filename in os.listdir(save_directory): + full_filename = os.path.join(save_directory, filename) + if filename.startswith(WEIGHTS_NAME[:-4]) and os.path.isfile(full_filename): + os.remove(full_filename) + output_model_file = os.path.join(save_directory, WEIGHTS_NAME) + + # Save the model + save_function(state_dict, output_model_file) + logger.info(f"Model weights saved in {output_model_file}") + else: + # Shard the model if it is too big. 
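From the user's side, the combined `compile`/`train_step`/`test_step` changes mean a model can be trained on its internal loss with labels supplied either way. A hedged usage sketch (checkpoint name and toy batch are placeholders; exact behaviour depends on the installed `transformers` version):

```python
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.compile(optimizer=tf.keras.optimizers.Adam(3e-5))  # no loss -> internal loss head is used

features = {
    "input_ids": tf.constant([[101, 2023, 2003, 102]]),
    "attention_mask": tf.constant([[1, 1, 1, 1]]),
}
labels = tf.constant([1])

# labels inside the input dict ...
model.fit({**features, "labels": labels}, epochs=1, verbose=0)
# ... or as a separate Keras-style labels argument
model.fit(features, labels, epochs=1, verbose=0)
```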
+ shards, index = shard_checkpoint(state_dict, max_shard_size=max_shard_size) - # Clean the folder from a previous save - for filename in os.listdir(save_directory): - full_filename = os.path.join(save_directory, filename) - if filename.startswith(WEIGHTS_NAME[:-4]) and os.path.isfile(full_filename): - os.remove(full_filename) + # Clean the folder from a previous save + for filename in os.listdir(save_directory): + full_filename = os.path.join(save_directory, filename) + if filename.startswith(WEIGHTS_NAME[:-4]) and os.path.isfile(full_filename): + os.remove(full_filename) - # Save the model - for shard_file, shard in shards.items(): - save_function(shard, os.path.join(save_directory, shard_file)) + # Save the model + for shard_file, shard in shards.items(): + save_function(shard, os.path.join(save_directory, shard_file)) - if index is None: - logger.info(f"Model weights saved in {os.path.join(save_directory, WEIGHTS_NAME)}") - else: - save_index_file = os.path.join(save_directory, WEIGHTS_INDEX_NAME) - # Save the index as well - with open(save_index_file, "w", encoding="utf-8") as f: - content = json.dumps(index, indent=2, sort_keys=True) + "\n" - f.write(content) - logger.info( - f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be " - f"split in {len(shards)} checkpoint shards. You can find where each parameters has been saved in the " - f"index located at {save_index_file}." - ) + if index is None: + logger.info(f"Model weights saved in {os.path.join(save_directory, WEIGHTS_NAME)}") + else: + save_index_file = os.path.join(save_directory, WEIGHTS_INDEX_NAME) + # Save the index as well + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + logger.info( + f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be " + f"split in {len(shards)} checkpoint shards. You can find where each parameters has been saved in the " + f"index located at {save_index_file}." + ) if push_to_hub: url = self._push_to_hub(repo, commit_message=commit_message) @@ -1487,12 +1501,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P Please refer to the mirror site for more information. _fast_init(`bool`, *optional*, defaults to `True`): Whether or not to disable fast initialization. - low_cpu_mem_usage(`bool``, *optional*, defaults to `False`): - Tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. - This is an experimental feature and a subject to change at any moment. - torch_dtype (`str` or `torch.dtype`, *optional*): - Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype - will be automatically derived from the model's weights. @@ -1502,6 +1510,12 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P + low_cpu_mem_usage(`bool`, *optional*, defaults to `False`): + Tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. + This is an experimental feature and a subject to change at any moment. + torch_dtype (`str` or `torch.dtype`, *optional*): + Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype + will be automatically derived from the model's weights. 
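For context, the sharding branch above delegates the actual splitting to `shard_checkpoint`, which walks the state dict and starts a new shard whenever adding the next tensor would push the current shard past `max_shard_size`. A simplified sketch of that greedy grouping (not the real implementation, which also builds the weight-to-file index):

```python
import torch


def naive_shard(state_dict, max_shard_bytes):
    """Greedily split a state dict into shards no larger than max_shard_bytes (sketch)."""
    shards, current, current_size = [], {}, 0
    for name, tensor in state_dict.items():
        size = tensor.numel() * tensor.element_size()
        if current and current_size + size > max_shard_bytes:
            shards.append(current)
            current, current_size = {}, 0
        current[name] = tensor
        current_size += size
    if current:
        shards.append(current)
    return shards


state_dict = {f"layer_{i}.weight": torch.zeros(1024, 1024) for i in range(8)}  # 4 MiB each
print(len(naive_shard(state_dict, max_shard_bytes=10 * 1024 * 1024)))  # 4 shards of two tensors
```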
kwargs (remaining dictionary of keyword arguments, *optional*): Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., `output_attentions=True`). Behaves differently depending on whether a `config` is provided or @@ -1792,7 +1806,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P # load pt weights early so that we know which dtype to init the model under if from_pt: - if not is_sharded: + if not is_sharded and state_dict is None: # Time to load the checkpoint state_dict = load_state_dict(resolved_archive_file) # set dtype to instantiate the model under: @@ -1823,7 +1837,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P if is_sharded: loaded_state_dict_keys = sharded_metadata["all_checkpoint_keys"] else: - state_dict = load_state_dict(resolved_archive_file) loaded_state_dict_keys = [k for k in state_dict.keys()] del state_dict # free CPU memory - will reload again later @@ -2103,8 +2116,8 @@ def retrieve_modules_from_names(self, names, add_prefix=False, remove_prefix=Fal return retrieved_modules - @classmethod - def _load_pretrained_model_low_mem(cls, model, loaded_state_dict_keys, resolved_archive_file): + @staticmethod + def _load_pretrained_model_low_mem(model, loaded_state_dict_keys, resolved_archive_file): """ This is an experimental function that loads the model using ~1.x model size CPU memory @@ -2159,16 +2172,18 @@ def find_submodule_and_param_name(model, long_key): resolved_archive_file = [resolved_archive_file] for archive_file in resolved_archive_file: - state_dict = torch.load(resolved_archive_file, map_location="cpu") + state_dict = torch.load(archive_file, map_location="cpu") # materialize state_dict entries one by one on CPU for k in loaded_state_dict_keys: - submodule, param_name = find_submodule_and_param_name(model, k) - if submodule is not None: - new_val = state_dict[k] - if isinstance(getattr(submodule, param_name), torch.nn.Parameter): - new_val = torch.nn.Parameter(new_val) - setattr(submodule, param_name, new_val) + if k in state_dict: + submodule, param_name = find_submodule_and_param_name(model, k) + if submodule is not None: + param_dtype = getattr(submodule, param_name).dtype + new_val = state_dict[k].to(param_dtype) + if isinstance(getattr(submodule, param_name), torch.nn.Parameter): + new_val = torch.nn.Parameter(new_val) + setattr(submodule, param_name, new_val) del state_dict diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 7045f18c5561..d21f2789b3d7 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -96,6 +96,7 @@ rag, realm, reformer, + regnet, rembert, resnet, retribert, @@ -112,6 +113,7 @@ swin, t5, tapas, + tapex, transfo_xl, trocr, unispeech, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 1899c3c249fa..88235528d6a6 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -29,6 +29,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( [ # Add configs here + ("tapex", "BartConfig"), ("dpt", "DPTConfig"), ("decision_transformer", "DecisionTransformerConfig"), ("glpn", "GLPNConfig"), @@ -38,6 +39,7 @@ ("convnext", "ConvNextConfig"), ("van", "VanConfig"), ("resnet", "ResNetConfig"), + ("regnet", "RegNetConfig"), ("yoso", "YosoConfig"), ("swin", "SwinConfig"), ("vilt", "ViltConfig"), @@ -134,7 +136,7 @@ 
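The low-memory loader above materializes checkpoint entries one at a time: for each key it locates the owning submodule, casts the loaded tensor to the parameter's existing dtype, and swaps it in. A simplified sketch of that per-key lookup and assignment (the real helper also strips the model prefix and skips keys missing from a shard):

```python
import torch
from torch import nn


def find_submodule_and_param_name(model, long_key):
    """Split e.g. 'encoder.0.weight' into (submodule, 'weight'), or (None, None) if not found."""
    *module_path, param_name = long_key.split(".")
    submodule = model
    for part in module_path:
        submodule = getattr(submodule, part, None)
        if submodule is None:
            return None, None
    return (submodule, param_name) if hasattr(submodule, param_name) else (None, None)


model = nn.Sequential(nn.Linear(2, 2))
state_dict = {"0.weight": torch.ones(2, 2)}

for key, value in state_dict.items():
    submodule, param_name = find_submodule_and_param_name(model, key)
    if submodule is not None:
        param_dtype = getattr(submodule, param_name).dtype
        new_val = value.to(param_dtype)
        if isinstance(getattr(submodule, param_name), nn.Parameter):
            new_val = nn.Parameter(new_val)
        setattr(submodule, param_name, new_val)
```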
CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict( [ - # Add archive maps here + # Add archive maps here) ("dpt", "DPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("glpn", "GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("maskformer", "MASKFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -142,6 +144,7 @@ ("convnext", "CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("van", "VAN_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("resnet", "RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("regnet", "REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("yoso", "YOSO_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("swin", "SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("vilt", "VILT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -226,6 +229,7 @@ MODEL_NAMES_MAPPING = OrderedDict( [ # Add full (and cased) model names here + ("tapex", "TAPEX"), ("dpt", "DPT"), ("decision_transformer", "Decision Transformer"), ("glpn", "GLPN"), @@ -234,6 +238,7 @@ ("convnext", "ConvNext"), ("van", "VAN"), ("resnet", "ResNet"), + ("regnet", "RegNet"), ("yoso", "YOSO"), ("swin", "Swin"), ("vilt", "ViLT"), @@ -389,7 +394,13 @@ def __getitem__(self, key): module_name = model_type_to_module_name(key) if module_name not in self._modules: self._modules[module_name] = importlib.import_module(f".{module_name}", "transformers.models") - return getattr(self._modules[module_name], value) + if hasattr(self._modules[module_name], value): + return getattr(self._modules[module_name], value) + + # Some of the mappings have entries model_type -> config of another model type. In that case we try to grab the + # object at the top level. + transformers_module = importlib.import_module("transformers") + return getattr(transformers_module, value) def keys(self): return list(self._mapping.keys()) + list(self._extra_content.keys()) diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index dc83fc133fad..dad7e165e8d7 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -54,6 +54,7 @@ ("convnext", "ConvNextFeatureExtractor"), ("van", "ConvNextFeatureExtractor"), ("resnet", "ConvNextFeatureExtractor"), + ("regnet", "ConvNextFeatureExtractor"), ("poolformer", "PoolFormerFeatureExtractor"), ("maskformer", "MaskFormerFeatureExtractor"), ] diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index b0cfb4767249..fbe3b50c98a9 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -38,6 +38,7 @@ ("convnext", "ConvNextModel"), ("van", "VanModel"), ("resnet", "ResNetModel"), + ("regnet", "RegNetModel"), ("yoso", "YosoModel"), ("swin", "SwinModel"), ("vilt", "ViltModel"), @@ -303,6 +304,7 @@ ("convnext", "ConvNextForImageClassification"), ("van", "VanForImageClassification"), ("resnet", "ResNetForImageClassification"), + ("regnet", "RegNetForImageClassification"), ("poolformer", "PoolFormerForImageClassification"), ] ) @@ -388,6 +390,7 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict( [ # Model for Seq2Seq Causal LM mapping + ("tapex", "BartForConditionalGeneration"), ("plbart", "PLBartForConditionalGeneration"), ("bigbird_pegasus", "BigBirdPegasusForConditionalGeneration"), ("m2m_100", "M2M100ForConditionalGeneration"), @@ -417,6 +420,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Sequence Classification mapping + ("tapex", "BartForSequenceClassification"), ("yoso", "YosoForSequenceClassification"), ("nystromformer", 
"NystromformerForSequenceClassification"), ("plbart", "PLBartForSequenceClassification"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index a60484134096..b3b0960b23f1 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -105,6 +105,7 @@ ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)), ("blenderbot-small", ("BlenderbotSmallTokenizer", None)), ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")), + ("tapex", ("TapexTokenizer", None)), ("bart", ("BartTokenizer", "BartTokenizerFast")), ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)), ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index cfb4632a0958..8e5003a4efda 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -56,11 +56,14 @@ _EXPECTED_OUTPUT_SHAPE = [1, 8, 768] # SequenceClassification docstring -_SEQ_CLASS_EXPECTED_OUTPUT_SHAPE = [1, 2] +_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "valhalla/bart-large-sst2" +_SEQ_CLASS_EXPECTED_LOSS = 0.0 +_SEQ_CLASS_EXPECTED_OUTPUT = "'POSITIVE'" # QuestionAsnwering docstring -_QA_EXPECTED_LOSS = 2.98 -_QA_EXPECTED_OUTPUT_SHAPE = [1, 17] +_CHECKPOINT_FOR_QA = "valhalla/bart-large-finetuned-squadv1" +_QA_EXPECTED_LOSS = 0.59 +_QA_EXPECTED_OUTPUT = "' nice puppet'" BART_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -1447,10 +1450,11 @@ def __init__(self, config: BartConfig, **kwargs): @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION, output_type=Seq2SeqSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, - expected_output=_SEQ_CLASS_EXPECTED_OUTPUT_SHAPE, + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + expected_loss=_SEQ_CLASS_EXPECTED_LOSS, ) def forward( self, @@ -1572,11 +1576,11 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_QA, output_type=Seq2SeqQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, expected_loss=_QA_EXPECTED_LOSS, - expected_output=_QA_EXPECTED_OUTPUT_SHAPE, + expected_output=_QA_EXPECTED_OUTPUT, ) def forward( self, diff --git a/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py b/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py index 4eaffae3fa6e..048b1b9cb678 100644 --- a/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py +++ b/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py @@ -13,13 +13,16 @@ # limitations under the License. 
""" -This script can be used to convert a head-less TF2.x Bert model to PyTorch, as published on the official GitHub: -https://github.com/tensorflow/models/tree/master/official/nlp/bert +This script can be used to convert a head-less TF2.x Bert model to PyTorch, as published on the official (now +deprecated) GitHub: https://github.com/tensorflow/models/tree/v2.3.0/official/nlp/bert TF2.x uses different variable names from the original BERT (TF 1.4) implementation. The script re-maps the TF2.x Bert weight names to the original names, so the model can be imported with Huggingface/transformer. You may adapt this script to include classification/MLM/NSP/etc. heads. + +Note: This script is only working with an older version of the TensorFlow models repository (<= v2.3.0). + Models trained with never versions are not compatible with this script. """ import argparse import os diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index b765a854009d..85b48170f70c 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -18,7 +18,7 @@ import math import os from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Optional, Tuple, Union import numpy as np import torch @@ -1592,7 +1592,7 @@ def forward( to_mask=None, blocked_encoder_mask=None, return_dict=True, - ): + ) -> Union[BaseModelOutputWithPastAndCrossAttentions, Tuple]: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -1986,20 +1986,20 @@ def set_attention_type(self, value: str): ) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[BaseModelOutputWithPoolingAndCrossAttentions, Tuple[torch.FloatTensor]]: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -2280,18 +2280,18 @@ def set_output_embeddings(self, new_embeddings): @replace_return_docstrings(output_type=BigBirdForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - next_sentence_label=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.FloatTensor] = None, + next_sentence_label: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[BigBirdForPreTrainingOutput, Tuple[torch.FloatTensor]]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -2395,19 +2395,19 @@ def set_output_embeddings(self, new_embeddings): ) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[MaskedLMOutput, Tuple[torch.FloatTensor]]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., @@ -2493,21 +2493,21 @@ def set_output_embeddings(self, new_embeddings): @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - labels=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[CausalLMOutputWithCrossAttentions, Tuple[torch.FloatTensor]]: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -2664,17 +2664,17 @@ def __init__(self, config): ) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -2762,17 +2762,17 @@ def __init__(self, config): ) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[MultipleChoiceModelOutput, Tuple[torch.FloatTensor]]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. 
Indices should be in `[0, ..., @@ -2858,17 +2858,17 @@ def __init__(self, config): ) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[TokenClassifierOutput, Tuple[torch.FloatTensor]]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. @@ -2957,19 +2957,19 @@ def __init__(self, config, add_pooling_layer=False): ) def forward( self, - input_ids=None, - attention_mask=None, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.FloatTensor] = None, question_lengths=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - start_positions=None, - end_positions=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[BigBirdForQuestionAnsweringModelOutput, Tuple[torch.FloatTensor]]: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. 
diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 1fb8de8e1452..d7025193e930 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -51,17 +51,20 @@ _CHECKPOINT_FOR_DOC = "google/bigbird-pegasus-large-arxiv" _CONFIG_FOR_DOC = "BigBirdPegasusConfig" -_TOKENIZER_FOR_DOC = "PegasusTokenizer" +_TOKENIZER_FOR_DOC = "PegasusTokenizerFast" # Base model docstring _EXPECTED_OUTPUT_SHAPE = [1, 7, 1024] # SequenceClassification docstring -_SEQ_CLASS_EXPECTED_OUTPUT_SHAPE = [1, 2] +_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "hf-internal-testing/tiny-random-bigbird_pegasus" +_SEQ_CLASS_EXPECTED_LOSS = 0.69 +_SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'" # QuestionAsnwering docstring -_QA_EXPECTED_LOSS = 2.56 -_QA_EXPECTED_OUTPUT_SHAPE = [1, 12] +_CHECKPOINT_FOR_QA = "hf-internal-testing/tiny-random-bigbird_pegasus" +_QA_EXPECTED_LOSS = 3.96 +_QA_EXPECTED_OUTPUT = "''" BIGBIRD_PEGASUS_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -2645,10 +2648,11 @@ def __init__(self, config: BigBirdPegasusConfig, **kwargs): @add_start_docstrings_to_model_forward(BIGBIRD_PEGASUS_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION, output_type=Seq2SeqSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, - expected_output=_SEQ_CLASS_EXPECTED_OUTPUT_SHAPE, + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + expected_loss=_SEQ_CLASS_EXPECTED_LOSS, ) def forward( self, @@ -2771,11 +2775,11 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(BIGBIRD_PEGASUS_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_QA, output_type=Seq2SeqQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, expected_loss=_QA_EXPECTED_LOSS, - expected_output=_QA_EXPECTED_OUTPUT_SHAPE, + expected_output=_QA_EXPECTED_OUTPUT, ) def forward( self, diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index d6219131aeec..418e1fd39e5f 100755 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -898,6 +898,13 @@ def get_text_features( >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) ```""" + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + text_outputs = self.text_model( input_ids=input_ids, attention_mask=attention_mask, @@ -942,6 +949,13 @@ def get_image_features( >>> image_features = model.get_image_features(**inputs) ```""" + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
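The `# Use CLIP model's config ...` additions implement the library's usual per-call override pattern: an explicitly passed flag wins, otherwise the top-level CLIP config decides, so the vision/text sub-configs no longer silently control these outputs. The pattern in isolation (generic sketch with a stand-in config):

```python
from dataclasses import dataclass


@dataclass
class ToyConfig:
    output_attentions: bool = False
    output_hidden_states: bool = False
    use_return_dict: bool = True


def resolve_flags(config, output_attentions=None, output_hidden_states=None, return_dict=None):
    """Explicit per-call arguments win; unset (None) arguments fall back to the config."""
    output_attentions = output_attentions if output_attentions is not None else config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else config.output_hidden_states
    )
    return_dict = return_dict if return_dict is not None else config.use_return_dict
    return output_attentions, output_hidden_states, return_dict


print(resolve_flags(ToyConfig(), output_attentions=True))  # (True, False, True)
```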
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + vision_outputs = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, @@ -991,7 +1005,13 @@ def forward( >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities ```""" - return_dict = return_dict if return_dict is not None else self.config.return_dict + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + vision_outputs = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, diff --git a/src/transformers/models/ctrl/modeling_ctrl.py b/src/transformers/models/ctrl/modeling_ctrl.py index f03046928ca4..78c09339f718 100644 --- a/src/transformers/models/ctrl/modeling_ctrl.py +++ b/src/transformers/models/ctrl/modeling_ctrl.py @@ -15,7 +15,7 @@ # limitations under the License. """ PyTorch CTRL model.""" -from typing import Tuple +from typing import Optional, Tuple, Union import numpy as np import torch @@ -359,18 +359,18 @@ def _prune_heads(self, heads_to_prune): ) def forward( self, - input_ids=None, - past_key_values=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPast]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -521,19 +521,19 @@ def prepare_inputs_for_generation(self, input_ids, past=None, use_cache=None, ** ) def forward( self, - input_ids=None, - past_key_values=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: 
Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithPast]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set @@ -625,19 +625,19 @@ def __init__(self, config): ) def forward( self, - input_ids=None, - past_key_values=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py index 96a771049337..14823c4352df 100644 --- a/src/transformers/models/fsmt/modeling_fsmt.py +++ b/src/transformers/models/fsmt/modeling_fsmt.py @@ -29,7 +29,7 @@ import math import random -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union import torch from torch import Tensor, nn @@ -1009,20 +1009,20 @@ def __init__(self, config: FSMTConfig): ) def forward( self, - input_ids, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - encoder_outputs: Optional[Tuple] = None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: torch.LongTensor, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None, + past_key_values: Optional[Tuple[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], Seq2SeqModelOutput]: if decoder_input_ids is None: use_cache = False @@ -1131,21 +1131,21 @@ def __init__(self, config: FSMTConfig): @add_end_docstrings(FSMT_GENERATION_EXAMPLE) def forward( self, - input_ids, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - 
encoder_outputs=None, - past_key_values=None, - labels=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: torch.LongTensor, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None, + past_key_values: Optional[Tuple[torch.FloatTensor]] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], Seq2SeqLMOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index df3b97b84e40..ae0605f28454 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -18,7 +18,7 @@ import math import random from dataclasses import dataclass -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Union import torch import torch.utils.checkpoint @@ -1567,11 +1567,11 @@ class LEDSeq2SeqQuestionAnsweringModelOutput(ModelOutput): If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - ``decoder_input_ids``` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of - shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` - you can choose to directly pass an embedded representation. This is useful if you want more control over - how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup - matrix. + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded representation. 
If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be @@ -2187,23 +2187,23 @@ def get_decoder(self): ) def forward( self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - encoder_outputs=None, - global_attention_mask=None, - past_key_values=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + global_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], LEDSeq2SeqModelOutput]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -2323,24 +2323,24 @@ def set_output_embeddings(self, new_embeddings): @add_end_docstrings(LED_GENERATION_EXAMPLE) def forward( self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - encoder_outputs=None, - global_attention_mask=None, - past_key_values=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - labels=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + global_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], LEDSeq2SeqLMOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., @@ -2487,23 +2487,23 @@ def __init__(self, config: LEDConfig, **kwargs): ) def forward( self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - encoder_outputs=None, - global_attention_mask=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - labels=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + global_attention_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], LEDSeq2SeqSequenceClassifierOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -2614,24 +2614,24 @@ def __init__(self, config): ) def forward( self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - encoder_outputs=None, - global_attention_mask=None, - start_positions=None, - end_positions=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + global_attention_mask: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], LEDSeq2SeqQuestionAnsweringModelOutput]: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. 
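The LED hunks above only add type annotations, so runtime behaviour is unchanged. A minimal sketch of what the new `Union[Tuple[torch.Tensor], LEDSeq2SeqLMOutput]` return annotation encodes, assuming the public `allenai/led-base-16384` checkpoint is available:

```python
# With return_dict=True the model returns an LEDSeq2SeqLMOutput dataclass,
# with return_dict=False it returns a plain tuple (loss first, then logits, ...).
import torch
from transformers import LEDForConditionalGeneration, LEDTokenizer

tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")
model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")

inputs = tokenizer("summarize: the quick brown fox jumps over the lazy dog", return_tensors="pt")
with torch.no_grad():
    as_output = model(**inputs, labels=inputs["input_ids"], return_dict=True)
    as_tuple = model(**inputs, labels=inputs["input_ids"], return_dict=False)

print(type(as_output).__name__)     # LEDSeq2SeqLMOutput
print(isinstance(as_tuple, tuple))  # True
print(as_output.loss, as_output.logits.shape)
```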
diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py index d816218824e1..f0c0fa4c456d 100755 --- a/src/transformers/models/m2m_100/modeling_m2m_100.py +++ b/src/transformers/models/m2m_100/modeling_m2m_100.py @@ -17,7 +17,7 @@ import math import random -from typing import Optional, Tuple +from typing import Optional, Tuple, Union import torch from torch import nn @@ -1137,22 +1137,22 @@ def get_decoder(self): ) def forward( self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - encoder_outputs=None, - past_key_values=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], Seq2SeqModelOutput]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1255,23 +1255,23 @@ def set_output_embeddings(self, new_embeddings): @add_end_docstrings(M2M_100_GENERATION_EXAMPLE) def forward( self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - encoder_outputs=None, - past_key_values=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - labels=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], Seq2SeqLMOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 6ed7c24ab176..ccf792489eb5 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -55,11 +55,14 @@ _EXPECTED_OUTPUT_SHAPE = [1, 8, 1024] # SequenceClassification docstring -_SEQ_CLASS_EXPECTED_OUTPUT_SHAPE = [1, 2] +_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "hf-internal-testing/tiny-random-mbart" +_SEQ_CLASS_EXPECTED_LOSS = 0.69 +_SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'" # QuestionAsnwering docstring -_QA_EXPECTED_LOSS = 3.04 -_QA_EXPECTED_OUTPUT_SHAPE = [1, 16] +_CHECKPOINT_FOR_QA = "hf-internal-testing/tiny-random-mbart" +_QA_EXPECTED_LOSS = 3.55 +_QA_EXPECTED_OUTPUT = "'? Jim Henson was a'" MBART_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -1437,10 +1440,11 @@ def __init__(self, config: MBartConfig, **kwargs): @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION, output_type=Seq2SeqSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, - expected_output=_SEQ_CLASS_EXPECTED_OUTPUT_SHAPE, + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + expected_loss=_SEQ_CLASS_EXPECTED_LOSS, ) # Copied from transformers.models.bart.modeling_bart.BartForSequenceClassification.forward def forward( @@ -1563,11 +1567,11 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_QA, output_type=Seq2SeqQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, expected_loss=_QA_EXPECTED_LOSS, - expected_output=_QA_EXPECTED_OUTPUT_SHAPE, + expected_output=_QA_EXPECTED_OUTPUT, ) # Copied from transformers.models.bart.modeling_bart.BartForQuestionAnswering.forward def forward( diff --git a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py index c6169c0ff5e7..bcf155ad7257 100644 --- a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py +++ b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py @@ -300,6 +300,10 @@ def main(): if args.config_file == "": # Default config of megatron-bert 345m config = MegatronBertConfig() + + # different megatron-bert-*-345m models have different vocab sizes, so override the default + # config (which is for megatron-bert-cased-345m) with the actual vocab dimension + config.vocab_size = input_state_dict["model"]["lm_head"]["bias"].numel() else: config = MegatronBertConfig.from_json_file(args.config_file) diff --git a/src/transformers/models/mpnet/modeling_mpnet.py b/src/transformers/models/mpnet/modeling_mpnet.py index 195f961dcf8b..61f50f72a706 100644 --- a/src/transformers/models/mpnet/modeling_mpnet.py +++ b/src/transformers/models/mpnet/modeling_mpnet.py @@ -17,6 +17,7 @@ import math +from typing import Optional, Tuple, Union import torch from torch import nn @@ -516,16 +517,16 @@ class PreTrainedModel ) def forward( self, - input_ids=None, - attention_mask=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: 
Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, **kwargs, - ): + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPooling]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -599,16 +600,16 @@ def set_output_embeddings(self, new_embeddings): ) def forward( self, - input_ids=None, - attention_mask=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -702,16 +703,16 @@ def __init__(self, config): ) def forward( self, - input_ids=None, - attention_mask=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -797,16 +798,16 @@ def __init__(self, config): ) def forward( self, - input_ids=None, - attention_mask=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. 
Indices should be in `[0, ..., @@ -890,16 +891,16 @@ def __init__(self, config): ) def forward( self, - input_ids=None, - attention_mask=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. @@ -989,17 +990,17 @@ def __init__(self, config): ) def forward( self, - input_ids=None, - attention_mask=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - start_positions=None, - end_positions=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. 
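The MPNet hunks follow the same pattern: every argument becomes `Optional[...]` with a `None` default, and the return annotation is `Union[Tuple[torch.Tensor], <task-specific output>]`. A hedged sketch exercising the newly annotated `MPNetForQuestionAnswering.forward` with a tiny, randomly initialised config (the hyperparameter values below are illustrative only):

```python
import torch
from transformers import MPNetConfig, MPNetForQuestionAnswering

# tiny config so the model builds instantly; not a recommended setting
config = MPNetConfig(
    vocab_size=64, hidden_size=32, num_hidden_layers=2, num_attention_heads=2, intermediate_size=64
)
model = MPNetForQuestionAnswering(config)

input_ids = torch.randint(0, config.vocab_size, (1, 16))
start_positions = torch.tensor([3])  # torch.LongTensor, matching the new annotation
end_positions = torch.tensor([7])

outputs = model(input_ids=input_ids, start_positions=start_positions, end_positions=end_positions)
print(outputs.loss)                # cross-entropy averaged over start/end logits
print(outputs.start_logits.shape)  # torch.Size([1, 16])
```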
diff --git a/src/transformers/models/nystromformer/modeling_nystromformer.py b/src/transformers/models/nystromformer/modeling_nystromformer.py index 70ba709e92cb..3e1592721fad 100755 --- a/src/transformers/models/nystromformer/modeling_nystromformer.py +++ b/src/transformers/models/nystromformer/modeling_nystromformer.py @@ -16,6 +16,7 @@ import math +from typing import Optional, Tuple, Union import torch import torch.utils.checkpoint @@ -586,16 +587,16 @@ class PreTrainedModel ) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -689,17 +690,17 @@ def set_output_embeddings(self, new_embeddings): ) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -787,17 +788,17 @@ def __init__(self, config): ) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. 
Indices should be in `[0, ..., @@ -884,17 +885,17 @@ def __init__(self, config): ) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., @@ -979,17 +980,17 @@ def __init__(self, config): ) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. @@ -1059,18 +1060,18 @@ def __init__(self, config): ) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - start_positions=None, - end_positions=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. 
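The flags are annotated `Optional[bool] = None` rather than `bool` because `None` means "defer to the config", as the resolution lines visible in these hunks show. A self-contained sketch of that idiom, using a stand-in `DummyConfig` for illustration rather than a real transformers class:

```python
from typing import Optional, Tuple

class DummyConfig:
    # stand-in attributes mirroring the config fields the real forwards fall back to
    output_attentions = False
    output_hidden_states = True
    use_return_dict = True

def resolve_flags(
    config: DummyConfig,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Tuple[bool, bool, bool]:
    # same "x if x is not None else config.x" pattern as in the hunks above
    output_attentions = output_attentions if output_attentions is not None else config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else config.output_hidden_states
    )
    return_dict = return_dict if return_dict is not None else config.use_return_dict
    return output_attentions, output_hidden_states, return_dict

print(resolve_flags(DummyConfig()))                     # (False, True, True)
print(resolve_flags(DummyConfig(), return_dict=False))  # (False, True, False)
```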
diff --git a/src/transformers/models/openai/modeling_openai.py b/src/transformers/models/openai/modeling_openai.py index ce5ea166781b..ec943347a9bb 100644 --- a/src/transformers/models/openai/modeling_openai.py +++ b/src/transformers/models/openai/modeling_openai.py @@ -20,7 +20,7 @@ import math import os from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Optional, Tuple, Union import torch from torch import nn @@ -437,16 +437,16 @@ def _prune_heads(self, heads_to_prune): ) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutput]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -557,17 +557,17 @@ def set_output_embeddings(self, new_embeddings): ) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], CausalLMOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set @@ -642,19 +642,19 @@ def set_output_embeddings(self, new_embeddings): @replace_return_docstrings(output_type=OpenAIGPTDoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - mc_token_ids=None, - labels=None, - mc_labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + mc_token_ids: Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + mc_labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], OpenAIGPTDoubleHeadsModelOutput]: r""" mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input): Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) - @@ -765,17 +765,17 @@ def __init__(self, config): ) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. 
Indices should be in `[0, ..., diff --git a/src/transformers/models/plbart/modeling_plbart.py b/src/transformers/models/plbart/modeling_plbart.py index 37230541e9db..af46fa8b449f 100755 --- a/src/transformers/models/plbart/modeling_plbart.py +++ b/src/transformers/models/plbart/modeling_plbart.py @@ -54,7 +54,9 @@ _EXPECTED_OUTPUT_SHAPE = [1, 8, 768] # SequenceClassification docstring -_SEQ_CLASS_EXPECTED_OUTPUT_SHAPE = [1, 2] +_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "hf-internal-testing/tiny-plbart" +_SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'" +_SEQ_CLASS_EXPECTED_LOSS = 0.69 PLBART_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -1408,10 +1410,11 @@ def __init__(self, config: PLBartConfig, **kwargs): @add_start_docstrings_to_model_forward(PLBART_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION, output_type=Seq2SeqSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, - expected_output=_SEQ_CLASS_EXPECTED_OUTPUT_SHAPE, + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + expected_loss=_SEQ_CLASS_EXPECTED_LOSS, ) # Copied from transformers.models.bart.modeling_bart.BartForSequenceClassification.forward def forward( diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py index e3a303e1fd2f..593efa694e5b 100644 --- a/src/transformers/models/rag/modeling_rag.py +++ b/src/transformers/models/rag/modeling_rag.py @@ -496,7 +496,7 @@ def __init__( config: Optional[PretrainedConfig] = None, question_encoder: Optional[PreTrainedModel] = None, generator: Optional[PreTrainedModel] = None, - retriever: Optional = None, # or maybe just use a `set_retriever(...)` method + retriever: Optional[RagRetriever] = None, # or maybe just use a `set_retriever(...)` method **kwargs, ): assert config is not None or ( @@ -537,21 +537,21 @@ def __init__( @replace_return_docstrings(output_type=RetrievAugLMOutput, config_class=_CONFIG_FOR_DOC) def forward( self, - input_ids=None, - attention_mask=None, - encoder_outputs=None, - decoder_input_ids=None, - decoder_attention_mask=None, - past_key_values=None, - doc_scores=None, - context_input_ids=None, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + doc_scores: Optional[torch.FloatTensor] = None, + context_input_ids: Optional[torch.LongTensor] = None, context_attention_mask=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - output_retrieved=None, - n_docs=None, - ): + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_retrieved: Optional[bool] = None, + n_docs: Optional[int] = None, + ) -> Union[Tuple[torch.Tensor], RetrievAugLMOutput]: r""" Returns: @@ -740,7 +740,7 @@ def __init__( config: Optional[PretrainedConfig] = None, question_encoder: Optional[PreTrainedModel] = None, generator: Optional[PreTrainedModel] = None, - retriever: Optional = None, + retriever: Optional[RagRetriever] = None, **kwargs, ): assert config is not None or ( @@ -1137,7 +1137,7 @@ def __init__( config: Optional[PretrainedConfig] = None, question_encoder: Optional[PreTrainedModel] = None, generator: Optional[PreTrainedModel] = None, - 
retriever: Optional = None, + retriever: Optional[RagRetriever] = None, **kwargs, ): assert config is not None or ( diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py index c5245108e8e9..30f50a29ff40 100644 --- a/src/transformers/models/rag/modeling_tf_rag.py +++ b/src/transformers/models/rag/modeling_tf_rag.py @@ -1245,7 +1245,10 @@ def extend_enc_output(tensor, num_beams=None): no_repeat_ngram_size=no_repeat_ngram_size, bad_words_ids=bad_words_ids, min_length=min_length, + max_length=max_length, eos_token_id=eos_token_id, + forced_bos_token_id=None, + forced_eos_token_id=None, ) model_kwargs["attention_mask"] = context_attention_mask diff --git a/src/transformers/models/regnet/__init__.py b/src/transformers/models/regnet/__init__.py new file mode 100644 index 000000000000..185ead37b640 --- /dev/null +++ b/src/transformers/models/regnet/__init__.py @@ -0,0 +1,52 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +# rely on isort to merge the imports +from ...file_utils import _LazyModule, is_torch_available + + +_import_structure = { + "configuration_regnet": ["REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "RegNetConfig"], +} + +if is_torch_available(): + _import_structure["modeling_regnet"] = [ + "REGNET_PRETRAINED_MODEL_ARCHIVE_LIST", + "RegNetForImageClassification", + "RegNetModel", + "RegNetPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_regnet import REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP, RegNetConfig + + if is_torch_available(): + from .modeling_regnet import ( + REGNET_PRETRAINED_MODEL_ARCHIVE_LIST, + RegNetForImageClassification, + RegNetModel, + RegNetPreTrainedModel, + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure) diff --git a/src/transformers/models/regnet/configuration_regnet.py b/src/transformers/models/regnet/configuration_regnet.py new file mode 100644 index 000000000000..9bfe35ec9b3d --- /dev/null +++ b/src/transformers/models/regnet/configuration_regnet.py @@ -0,0 +1,94 @@ +# coding=utf-8 +# Copyright 2022 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
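A hedged usage sketch for the lazy `__init__.py` added above: `_LazyModule` only imports a submodule when one of its symbols is first accessed, so the configuration class can be pulled in cheaply while the torch-backed model classes resolve on demand (the snippet assumes torch is installed so `RegNetModel` can be materialised):

```python
from transformers.models.regnet import RegNetConfig  # resolved via _LazyModule on first access

config = RegNetConfig()   # pure configuration object, no model weights involved
print(config.layer_type)  # "y" by default

from transformers.models.regnet import RegNetModel  # only available when torch is installed
model = RegNetModel(config)
print(sum(p.numel() for p in model.parameters()))
```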
+""" RegNet model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "regnety-40": "https://huggingface.co/zuppif/regnety-040/blob/main/config.json", +} + + +class RegNetConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`RegNetModel`]. It is used to instantiate a RegNet + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the + [facebook/regnet-y-40](https://huggingface.co/facebook/regnet-y-40) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + embedding_size (`int`, *optional*, defaults to 64): + Dimensionality (hidden size) for the embedding layer. + hidden_sizes (`List[int]`, *optional*, defaults to `[256, 512, 1024, 2048]`): + Dimensionality (hidden size) at each stage. + depths (`List[int]`, *optional*, defaults to `[3, 4, 6, 3]`): + Depth (number of layers) for each stage. + layer_type (`str`, *optional*, defaults to `"y"`): + The layer to use, it can be either `"x" or `"y"`. An `x` layer is a ResNet's BottleNeck layer with + `reduction` fixed to `1`. While a `y` layer is a `x` but with squeeze and excitation. Please refer to the + paper for a detailed explanation of how these layers were constructed. + hidden_act (`str`, *optional*, defaults to `"relu"`): + The non-linear activation function in each block. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` + are supported. + downsample_in_first_stage (`bool`, *optional*, defaults to `False`): + If `True`, the first stage will downsample the inputs using a `stride` of 2. + + Example: + ```python + >>> from transformers import RegNetConfig, RegNetModel + + >>> # Initializing a RegNet regnet-y-40 style configuration + >>> configuration = RegNetConfig() + >>> # Initializing a model from the regnet-y-40 style configuration + >>> model = RegNetModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + model_type = "regnet" + layer_types = ["x", "y"] + + def __init__( + self, + num_channels=3, + embedding_size=32, + hidden_sizes=[128, 192, 512, 1088], + depths=[2, 6, 12, 2], + groups_width=64, + layer_type="y", + hidden_act="relu", + **kwargs + ): + super().__init__(**kwargs) + if layer_type not in self.layer_types: + raise ValueError(f"layer_type={layer_type} is not one of {','.join(self.layer_types)}") + self.num_channels = num_channels + self.embedding_size = embedding_size + self.hidden_sizes = hidden_sizes + self.depths = depths + self.groups_width = groups_width + self.layer_type = layer_type + self.hidden_act = hidden_act + # always downsample in the first stage + self.downsample_in_first_stage = True diff --git a/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py b/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py new file mode 100644 index 000000000000..8024ef679201 --- /dev/null +++ b/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py @@ -0,0 +1,301 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert RegNet 10B checkpoints vissl.""" +# You need to install a specific version of classy vision +# pip install git+https://github.com/FrancescoSaverioZuppichini/ClassyVision.git@convert_weights + +import argparse +import json +import os +import re +from collections import OrderedDict +from dataclasses import dataclass, field +from functools import partial +from pathlib import Path +from pprint import pprint +from typing import Dict, List, Tuple + +import torch +import torch.nn as nn +from torch import Tensor + +from classy_vision.models.regnet import RegNet, RegNetParams +from huggingface_hub import cached_download, hf_hub_url +from transformers import AutoFeatureExtractor, RegNetConfig, RegNetForImageClassification, RegNetModel +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging +from vissl.models.model_helpers import get_trunk_forward_outputs + + +logging.set_verbosity_info() +logger = logging.get_logger() + + +@dataclass +class Tracker: + module: nn.Module + traced: List[nn.Module] = field(default_factory=list) + handles: list = field(default_factory=list) + name2module: Dict[str, nn.Module] = field(default_factory=OrderedDict) + + def _forward_hook(self, m, inputs: Tensor, outputs: Tensor, name: str): + has_not_submodules = len(list(m.modules())) == 1 or isinstance(m, nn.Conv2d) or isinstance(m, nn.BatchNorm2d) + if has_not_submodules: + self.traced.append(m) + self.name2module[name] = m + + def __call__(self, x: Tensor): + for name, m in self.module.named_modules(): + self.handles.append(m.register_forward_hook(partial(self._forward_hook, name=name))) + self.module(x) + list(map(lambda x: x.remove(), self.handles)) + return self + + @property + def parametrized(self): + # check the len of the state_dict keys to see if we have learnable params + return {k: v for k, v in self.name2module.items() if len(list(v.state_dict().keys())) > 0} + + +class FakeRegNetVisslWrapper(nn.Module): + """ + Fake wrapper for RegNet that mimics what vissl does without the need to pass a config file. + """ + + def __init__(self, model: nn.Module): + super().__init__() + + feature_blocks: List[Tuple[str, nn.Module]] = [] + # - get the stem + feature_blocks.append(("conv1", model.stem)) + # - get all the feature blocks + for k, v in model.trunk_output.named_children(): + assert k.startswith("block"), f"Unexpected layer name {k}" + block_index = len(feature_blocks) + 1 + feature_blocks.append((f"res{block_index}", v)) + + self._feature_blocks = nn.ModuleDict(feature_blocks) + + def forward(self, x: Tensor): + return get_trunk_forward_outputs( + x, + out_feat_keys=None, + feature_blocks=self._feature_blocks, + ) + + +class FakeRegNetParams(RegNetParams): + """ + Used to instantiace a RegNet model from classy vision with the same depth as the 10B one but with super small + parameters, so we can trace it in memory. 
+ """ + + def get_expanded_params(self): + return [(8, 2, 2, 8, 1.0), (8, 2, 7, 8, 1.0), (8, 2, 17, 8, 1.0), (8, 2, 1, 8, 1.0)] + + +def get_from_to_our_keys(model_name: str) -> Dict[str, str]: + """ + Returns a dictionary that maps from original model's key -> our implementation's keys + """ + + # create our model (with small weights) + our_config = RegNetConfig(depths=[2, 7, 17, 1], hidden_sizes=[8, 8, 8, 8], groups_width=8) + if "in1k" in model_name: + our_model = RegNetForImageClassification(our_config) + else: + our_model = RegNetModel(our_config) + # create from model (with small weights) + from_model = FakeRegNetVisslWrapper( + RegNet(FakeRegNetParams(depth=27, group_width=1010, w_0=1744, w_a=620.83, w_m=2.52)) + ) + + with torch.no_grad(): + from_model = from_model.eval() + our_model = our_model.eval() + + x = torch.randn((1, 3, 32, 32)) + # trace both + dest_tracker = Tracker(our_model) + dest_traced = dest_tracker(x).parametrized + + pprint(dest_tracker.name2module) + src_tracker = Tracker(from_model) + src_traced = src_tracker(x).parametrized + + # convert the keys -> module dict to keys -> params + def to_params_dict(dict_with_modules): + params_dict = OrderedDict() + for name, module in dict_with_modules.items(): + for param_name, param in module.state_dict().items(): + params_dict[f"{name}.{param_name}"] = param + return params_dict + + from_to_ours_keys = {} + + src_state_dict = to_params_dict(src_traced) + dst_state_dict = to_params_dict(dest_traced) + + for (src_key, src_param), (dest_key, dest_param) in zip(src_state_dict.items(), dst_state_dict.items()): + from_to_ours_keys[src_key] = dest_key + logger.info(f"{src_key} -> {dest_key}") + # if "in1k" was in the model_name it means it must have a classification head (was finetuned) + if "in1k" in model_name: + from_to_ours_keys["0.clf.0.weight"] = "classifier.1.weight" + from_to_ours_keys["0.clf.0.bias"] = "classifier.1.bias" + + return from_to_ours_keys + + +def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True): + filename = "imagenet-1k-id2label.json" + num_labels = 1000 + + repo_id = "datasets/huggingface/label-files" + num_labels = num_labels + id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename)), "r")) + id2label = {int(k): v for k, v in id2label.items()} + + id2label = id2label + label2id = {v: k for k, v in id2label.items()} + + ImageNetPreTrainedConfig = partial(RegNetConfig, num_labels=num_labels, id2label=id2label, label2id=label2id) + + names_to_config = { + "regnet-y-10b-seer": ImageNetPreTrainedConfig( + depths=[2, 7, 17, 1], hidden_sizes=[2020, 4040, 11110, 28280], groups_width=1010 + ), + # finetuned on imagenet + "regnet-y-10b-seer-in1k": ImageNetPreTrainedConfig( + depths=[2, 7, 17, 1], hidden_sizes=[2020, 4040, 11110, 28280], groups_width=1010 + ), + } + + # add seer weights logic + def load_using_classy_vision(checkpoint_url: str) -> Tuple[Dict, Dict]: + files = torch.hub.load_state_dict_from_url(checkpoint_url, model_dir=str(save_directory), map_location="cpu") + # check if we have a head, if yes add it + model_state_dict = files["classy_state_dict"]["base_model"]["model"] + return model_state_dict["trunk"], model_state_dict["heads"] + + names_to_from_model = { + "regnet-y-10b-seer": partial( + load_using_classy_vision, + "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_regnet10B/model_iteration124500_conso.torch", + ), + "regnet-y-10b-seer-in1k": partial( + load_using_classy_vision, + 
"https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_finetuned/seer_10b_finetuned_in1k_model_phase28_conso.torch", + ), + } + + from_to_ours_keys = get_from_to_our_keys(model_name) + + if not (save_directory / f"{model_name}.pth").exists(): + logger.info("Loading original state_dict.") + from_state_dict_trunk, from_state_dict_head = names_to_from_model[model_name]() + from_state_dict = from_state_dict_trunk + if "in1k" in model_name: + # add the head + from_state_dict = {**from_state_dict_trunk, **from_state_dict_head} + logger.info("Done!") + + converted_state_dict = {} + + not_used_keys = list(from_state_dict.keys()) + regex = r"\.block.-part." + # this is "interesting", so the original checkpoints have `block[0,1]-part` in each key name, we remove it + for key in from_state_dict.keys(): + # remove the weird "block[0,1]-part" from the key + src_key = re.sub(regex, "", key) + # now src_key from the model checkpoints is the one we got from the original model after tracing, so use it to get the correct destination key + dest_key = from_to_ours_keys[src_key] + # store the parameter with our key + converted_state_dict[dest_key] = from_state_dict[key] + not_used_keys.remove(key) + # check that all keys have been updated + assert len(not_used_keys) == 0, f"Some keys where not used {','.join(not_used_keys)}" + + logger.info(f"The following keys were not used: {','.join(not_used_keys)}") + + # save our state dict to disk + torch.save(converted_state_dict, save_directory / f"{model_name}.pth") + + del converted_state_dict + else: + logger.info("The state_dict was already stored on disk.") + if push_to_hub: + logger.info(f"Token is {os.environ['HF_TOKEN']}") + logger.info("Loading our model.") + # create our model + our_config = names_to_config[model_name] + our_model_func = RegNetModel + if "in1k" in model_name: + our_model_func = RegNetForImageClassification + our_model = our_model_func(our_config) + # place our model to the meta device (so remove all the weights) + our_model.to(torch.device("meta")) + logger.info("Loading state_dict in our model.") + # load state dict + state_dict_keys = our_model.state_dict().keys() + PreTrainedModel._load_pretrained_model_low_mem( + our_model, state_dict_keys, [save_directory / f"{model_name}.pth"] + ) + logger.info("Finally, pushing!") + # push it to hub + our_model.push_to_hub( + repo_path_or_name=save_directory / model_name, + commit_message="Add model", + output_dir=save_directory / model_name, + ) + size = 384 + # we can use the convnext one + feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/convnext-base-224-22k-1k", size=size) + feature_extractor.push_to_hub( + repo_path_or_name=save_directory / model_name, + commit_message="Add feature extractor", + output_dir=save_directory / model_name, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default=None, + type=str, + help="The name of the model you wish to convert, it must be one of the supported regnet* architecture, currently: regnetx-*, regnety-*. 
If `None`, all of them will the converted.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=Path, + required=True, + help="Path to the output PyTorch model directory.", + ) + parser.add_argument( + "--push_to_hub", + default=True, + type=bool, + required=False, + help="If True, push model and feature extractor to the hub.", + ) + + args = parser.parse_args() + + pytorch_dump_folder_path: Path = args.pytorch_dump_folder_path + pytorch_dump_folder_path.mkdir(exist_ok=True, parents=True) + convert_weights_and_push(pytorch_dump_folder_path, args.model_name, args.push_to_hub) diff --git a/src/transformers/models/regnet/convert_regnet_to_pytorch.py b/src/transformers/models/regnet/convert_regnet_to_pytorch.py new file mode 100644 index 000000000000..96e4ab700ab5 --- /dev/null +++ b/src/transformers/models/regnet/convert_regnet_to_pytorch.py @@ -0,0 +1,455 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert RegNet checkpoints from timm and vissl.""" + + +import argparse +import json +from dataclasses import dataclass, field +from functools import partial +from pathlib import Path +from typing import Callable, Dict, List, Tuple + +import torch +import torch.nn as nn +from torch import Tensor + +import timm +from classy_vision.models.regnet import RegNet, RegNetParams, RegNetY32gf, RegNetY64gf, RegNetY128gf +from huggingface_hub import cached_download, hf_hub_url +from transformers import AutoFeatureExtractor, RegNetConfig, RegNetForImageClassification, RegNetModel +from transformers.utils import logging +from vissl.models.model_helpers import get_trunk_forward_outputs + + +logging.set_verbosity_info() +logger = logging.get_logger() + + +@dataclass +class Tracker: + module: nn.Module + traced: List[nn.Module] = field(default_factory=list) + handles: list = field(default_factory=list) + + def _forward_hook(self, m, inputs: Tensor, outputs: Tensor): + has_not_submodules = len(list(m.modules())) == 1 or isinstance(m, nn.Conv2d) or isinstance(m, nn.BatchNorm2d) + if has_not_submodules: + self.traced.append(m) + + def __call__(self, x: Tensor): + for m in self.module.modules(): + self.handles.append(m.register_forward_hook(self._forward_hook)) + self.module(x) + list(map(lambda x: x.remove(), self.handles)) + return self + + @property + def parametrized(self): + # check the len of the state_dict keys to see if we have learnable params + return list(filter(lambda x: len(list(x.state_dict().keys())) > 0, self.traced)) + + +@dataclass +class ModuleTransfer: + src: nn.Module + dest: nn.Module + verbose: int = 1 + src_skip: List = field(default_factory=list) + dest_skip: List = field(default_factory=list) + raise_if_mismatch: bool = True + + def __call__(self, x: Tensor): + """ + Transfer the weights of `self.src` to `self.dest` by performing a forward pass using `x` as input. Under the + hood we tracked all the operations in both modules. 
+ """ + dest_traced = Tracker(self.dest)(x).parametrized + src_traced = Tracker(self.src)(x).parametrized + + src_traced = list(filter(lambda x: type(x) not in self.src_skip, src_traced)) + dest_traced = list(filter(lambda x: type(x) not in self.dest_skip, dest_traced)) + + if len(dest_traced) != len(src_traced) and self.raise_if_mismatch: + raise Exception( + f"Numbers of operations are different. Source module has {len(src_traced)} operations while destination module has {len(dest_traced)}." + ) + + for dest_m, src_m in zip(dest_traced, src_traced): + dest_m.load_state_dict(src_m.state_dict()) + if self.verbose == 1: + print(f"Transfered from={src_m} to={dest_m}") + + +class FakeRegNetVisslWrapper(nn.Module): + """ + Fake wrapper for RegNet that mimics what vissl does without the need to pass a config file. + """ + + def __init__(self, model: nn.Module): + super().__init__() + + feature_blocks: List[Tuple[str, nn.Module]] = [] + # - get the stem + feature_blocks.append(("conv1", model.stem)) + # - get all the feature blocks + for k, v in model.trunk_output.named_children(): + assert k.startswith("block"), f"Unexpected layer name {k}" + block_index = len(feature_blocks) + 1 + feature_blocks.append((f"res{block_index}", v)) + + self._feature_blocks = nn.ModuleDict(feature_blocks) + + def forward(self, x: Tensor): + return get_trunk_forward_outputs( + x, + out_feat_keys=None, + feature_blocks=self._feature_blocks, + ) + + +class NameToFromModelFuncMap(dict): + """ + A Dictionary with some additional logic to return a function that creates the correct original model. + """ + + def convert_name_to_timm(self, x: str) -> str: + x_split = x.split("-") + return x_split[0] + x_split[1] + "_" + "".join(x_split[2:]) + + def __getitem__(self, x: str) -> Callable[[], Tuple[nn.Module, Dict]]: + # default to timm! + if x not in self: + x = self.convert_name_to_timm(x) + val = partial(lambda: (timm.create_model(x, pretrained=True).eval(), None)) + + else: + val = super().__getitem__(x) + + return val + + +class NameToOurModelFuncMap(dict): + """ + A Dictionary with some additional logic to return the correct hugging face RegNet class reference. 
+ """ + + def __getitem__(self, x: str) -> Callable[[], nn.Module]: + if "seer" in x and "in1k" not in x: + val = RegNetModel + else: + val = RegNetForImageClassification + return val + + +def manually_copy_vissl_head(from_state_dict, to_state_dict, keys: List[Tuple[str, str]]): + for from_key, to_key in keys: + to_state_dict[to_key] = from_state_dict[from_key].clone() + print(f"Copied key={from_key} to={to_key}") + return to_state_dict + + +def convert_weight_and_push( + name: str, + from_model_func: Callable[[], nn.Module], + our_model_func: Callable[[], nn.Module], + config: RegNetConfig, + save_directory: Path, + push_to_hub: bool = True, +): + print(f"Converting {name}...") + with torch.no_grad(): + from_model, from_state_dict = from_model_func() + our_model = our_model_func(config).eval() + module_transfer = ModuleTransfer(src=from_model, dest=our_model, raise_if_mismatch=False) + x = torch.randn((1, 3, 224, 224)) + module_transfer(x) + + if from_state_dict is not None: + keys = [] + # for seer - in1k finetuned we have to manually copy the head + if "seer" in name and "in1k" in name: + keys = [("0.clf.0.weight", "classifier.1.weight"), ("0.clf.0.bias", "classifier.1.bias")] + to_state_dict = manually_copy_vissl_head(from_state_dict, our_model.state_dict(), keys) + our_model.load_state_dict(to_state_dict) + + our_outputs = our_model(x, output_hidden_states=True) + our_output = ( + our_outputs.logits if isinstance(our_model, RegNetForImageClassification) else our_outputs.last_hidden_state + ) + + from_output = from_model(x) + from_output = from_output[-1] if type(from_output) is list else from_output + + # now since I don't want to use any config files, vissl seer model doesn't actually have an head, so let's just check the last hidden state + if "seer" in name and "in1k" in name: + our_output = our_outputs.hidden_states[-1] + + assert torch.allclose(from_output, our_output), "The model logits don't match the original one." 
+ + if push_to_hub: + our_model.push_to_hub( + repo_path_or_name=save_directory / name, + commit_message="Add model", + use_temp_dir=True, + ) + + size = 224 if "seer" not in name else 384 + # we can use the convnext one + feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/convnext-base-224-22k-1k", size=size) + feature_extractor.push_to_hub( + repo_path_or_name=save_directory / name, + commit_message="Add feature extractor", + use_temp_dir=True, + ) + + print(f"Pushed {name}") + + +def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True): + filename = "imagenet-1k-id2label.json" + num_labels = 1000 + expected_shape = (1, num_labels) + + repo_id = "datasets/huggingface/label-files" + num_labels = num_labels + id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename)), "r")) + id2label = {int(k): v for k, v in id2label.items()} + + id2label = id2label + label2id = {v: k for k, v in id2label.items()} + + ImageNetPreTrainedConfig = partial(RegNetConfig, num_labels=num_labels, id2label=id2label, label2id=label2id) + + names_to_config = { + "regnet-x-002": ImageNetPreTrainedConfig( + depths=[1, 1, 4, 7], hidden_sizes=[24, 56, 152, 368], groups_width=8, layer_type="x" + ), + "regnet-x-004": ImageNetPreTrainedConfig( + depths=[1, 2, 7, 12], hidden_sizes=[32, 64, 160, 384], groups_width=16, layer_type="x" + ), + "regnet-x-006": ImageNetPreTrainedConfig( + depths=[1, 3, 5, 7], hidden_sizes=[48, 96, 240, 528], groups_width=24, layer_type="x" + ), + "regnet-x-008": ImageNetPreTrainedConfig( + depths=[1, 3, 7, 5], hidden_sizes=[64, 128, 288, 672], groups_width=16, layer_type="x" + ), + "regnet-x-016": ImageNetPreTrainedConfig( + depths=[2, 4, 10, 2], hidden_sizes=[72, 168, 408, 912], groups_width=24, layer_type="x" + ), + "regnet-x-032": ImageNetPreTrainedConfig( + depths=[2, 6, 15, 2], hidden_sizes=[96, 192, 432, 1008], groups_width=48, layer_type="x" + ), + "regnet-x-040": ImageNetPreTrainedConfig( + depths=[2, 5, 14, 2], hidden_sizes=[80, 240, 560, 1360], groups_width=40, layer_type="x" + ), + "regnet-x-064": ImageNetPreTrainedConfig( + depths=[2, 4, 10, 1], hidden_sizes=[168, 392, 784, 1624], groups_width=56, layer_type="x" + ), + "regnet-x-080": ImageNetPreTrainedConfig( + depths=[2, 5, 15, 1], hidden_sizes=[80, 240, 720, 1920], groups_width=120, layer_type="x" + ), + "regnet-x-120": ImageNetPreTrainedConfig( + depths=[2, 5, 11, 1], hidden_sizes=[224, 448, 896, 2240], groups_width=112, layer_type="x" + ), + "regnet-x-160": ImageNetPreTrainedConfig( + depths=[2, 6, 13, 1], hidden_sizes=[256, 512, 896, 2048], groups_width=128, layer_type="x" + ), + "regnet-x-320": ImageNetPreTrainedConfig( + depths=[2, 7, 13, 1], hidden_sizes=[336, 672, 1344, 2520], groups_width=168, layer_type="x" + ), + # y variant + "regnet-y-002": ImageNetPreTrainedConfig(depths=[1, 1, 4, 7], hidden_sizes=[24, 56, 152, 368], groups_width=8), + "regnet-y-004": ImageNetPreTrainedConfig( + depths=[1, 3, 6, 6], hidden_sizes=[48, 104, 208, 440], groups_width=8 + ), + "regnet-y-006": ImageNetPreTrainedConfig( + depths=[1, 3, 7, 4], hidden_sizes=[48, 112, 256, 608], groups_width=16 + ), + "regnet-y-008": ImageNetPreTrainedConfig( + depths=[1, 3, 8, 2], hidden_sizes=[64, 128, 320, 768], groups_width=16 + ), + "regnet-y-016": ImageNetPreTrainedConfig( + depths=[2, 6, 17, 2], hidden_sizes=[48, 120, 336, 888], groups_width=24 + ), + "regnet-y-032": ImageNetPreTrainedConfig( + depths=[2, 5, 13, 1], hidden_sizes=[72, 216, 576, 1512], groups_width=24 + ), + 
"regnet-y-040": ImageNetPreTrainedConfig( + depths=[2, 6, 12, 2], hidden_sizes=[128, 192, 512, 1088], groups_width=64 + ), + "regnet-y-064": ImageNetPreTrainedConfig( + depths=[2, 7, 14, 2], hidden_sizes=[144, 288, 576, 1296], groups_width=72 + ), + "regnet-y-080": ImageNetPreTrainedConfig( + depths=[2, 4, 10, 1], hidden_sizes=[168, 448, 896, 2016], groups_width=56 + ), + "regnet-y-120": ImageNetPreTrainedConfig( + depths=[2, 5, 11, 1], hidden_sizes=[224, 448, 896, 2240], groups_width=112 + ), + "regnet-y-160": ImageNetPreTrainedConfig( + depths=[2, 4, 11, 1], hidden_sizes=[224, 448, 1232, 3024], groups_width=112 + ), + "regnet-y-320": ImageNetPreTrainedConfig( + depths=[2, 5, 12, 1], hidden_sizes=[232, 696, 1392, 3712], groups_width=232 + ), + # models created by SEER -> https://arxiv.org/abs/2202.08360 + "regnet-y-320-seer": RegNetConfig(depths=[2, 5, 12, 1], hidden_sizes=[232, 696, 1392, 3712], groups_width=232), + "regnet-y-640-seer": RegNetConfig(depths=[2, 5, 12, 1], hidden_sizes=[328, 984, 1968, 4920], groups_width=328), + "regnet-y-1280-seer": RegNetConfig( + depths=[2, 7, 17, 1], hidden_sizes=[528, 1056, 2904, 7392], groups_width=264 + ), + "regnet-y-2560-seer": RegNetConfig( + depths=[3, 7, 16, 1], hidden_sizes=[640, 1696, 2544, 5088], groups_width=640 + ), + "regnet-y-10b-seer": ImageNetPreTrainedConfig( + depths=[2, 7, 17, 1], hidden_sizes=[2020, 4040, 11110, 28280], groups_width=1010 + ), + # finetuned on imagenet + "regnet-y-320-seer-in1k": ImageNetPreTrainedConfig( + depths=[2, 5, 12, 1], hidden_sizes=[232, 696, 1392, 3712], groups_width=232 + ), + "regnet-y-640-seer-in1k": ImageNetPreTrainedConfig( + depths=[2, 5, 12, 1], hidden_sizes=[328, 984, 1968, 4920], groups_width=328 + ), + "regnet-y-1280-seer-in1k": ImageNetPreTrainedConfig( + depths=[2, 7, 17, 1], hidden_sizes=[528, 1056, 2904, 7392], groups_width=264 + ), + "regnet-y-2560-seer-in1k": ImageNetPreTrainedConfig( + depths=[3, 7, 16, 1], hidden_sizes=[640, 1696, 2544, 5088], groups_width=640 + ), + "regnet-y-10b-seer-in1k": ImageNetPreTrainedConfig( + depths=[2, 7, 17, 1], hidden_sizes=[2020, 4040, 11110, 28280], groups_width=1010 + ), + } + + names_to_ours_model_map = NameToOurModelFuncMap() + names_to_from_model_map = NameToFromModelFuncMap() + # add seer weights logic + + def load_using_classy_vision(checkpoint_url: str, model_func: Callable[[], nn.Module]) -> Tuple[nn.Module, Dict]: + files = torch.hub.load_state_dict_from_url(checkpoint_url, model_dir=str(save_directory), map_location="cpu") + model = model_func() + # check if we have a head, if yes add it + model_state_dict = files["classy_state_dict"]["base_model"]["model"] + state_dict = model_state_dict["trunk"] + model.load_state_dict(state_dict) + return model.eval(), model_state_dict["heads"] + + # pretrained + names_to_from_model_map["regnet-y-320-seer"] = partial( + load_using_classy_vision, + "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_regnet32d/seer_regnet32gf_model_iteration244000.torch", + lambda: FakeRegNetVisslWrapper(RegNetY32gf()), + ) + + names_to_from_model_map["regnet-y-640-seer"] = partial( + load_using_classy_vision, + "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_regnet64/seer_regnet64gf_model_final_checkpoint_phase0.torch", + lambda: FakeRegNetVisslWrapper(RegNetY64gf()), + ) + + names_to_from_model_map["regnet-y-1280-seer"] = partial( + load_using_classy_vision, + 
"https://dl.fbaipublicfiles.com/vissl/model_zoo/swav_ig1b_regnet128Gf_cnstant_bs32_node16_sinkhorn10_proto16k_syncBN64_warmup8k/model_final_checkpoint_phase0.torch", + lambda: FakeRegNetVisslWrapper(RegNetY128gf()), + ) + + names_to_from_model_map["regnet-y-10b-seer"] = partial( + load_using_classy_vision, + "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_regnet10B/model_iteration124500_conso.torch", + lambda: FakeRegNetVisslWrapper( + RegNet(RegNetParams(depth=27, group_width=1010, w_0=1744, w_a=620.83, w_m=2.52)) + ), + ) + + # IN1K finetuned + names_to_from_model_map["regnet-y-320-seer-in1k"] = partial( + load_using_classy_vision, + "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_finetuned/seer_regnet32_finetuned_in1k_model_final_checkpoint_phase78.torch", + lambda: FakeRegNetVisslWrapper(RegNetY32gf()), + ) + + names_to_from_model_map["regnet-y-640-seer-in1k"] = partial( + load_using_classy_vision, + "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_finetuned/seer_regnet64_finetuned_in1k_model_final_checkpoint_phase78.torch", + lambda: FakeRegNetVisslWrapper(RegNetY64gf()), + ) + + names_to_from_model_map["regnet-y-1280-seer-in1k"] = partial( + load_using_classy_vision, + "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_finetuned/seer_regnet128_finetuned_in1k_model_final_checkpoint_phase78.torch", + lambda: FakeRegNetVisslWrapper(RegNetY128gf()), + ) + + names_to_from_model_map["regnet-y-10b-seer-in1k"] = partial( + load_using_classy_vision, + "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_finetuned/seer_10b_finetuned_in1k_model_phase28_conso.torch", + lambda: FakeRegNetVisslWrapper( + RegNet(RegNetParams(depth=27, group_width=1010, w_0=1744, w_a=620.83, w_m=2.52)) + ), + ) + + if model_name: + convert_weight_and_push( + model_name, + names_to_from_model_map[model_name], + names_to_ours_model_map[model_name], + names_to_config[model_name], + save_directory, + push_to_hub, + ) + else: + for model_name, config in names_to_config.items(): + convert_weight_and_push( + model_name, + names_to_from_model_map[model_name], + names_to_ours_model_map[model_name], + config, + save_directory, + push_to_hub, + ) + return config, expected_shape + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default=None, + type=str, + help="The name of the model you wish to convert, it must be one of the supported regnet* architecture, currently: regnetx-*, regnety-*. If `None`, all of them will the converted.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=Path, + required=True, + help="Path to the output PyTorch model directory.", + ) + parser.add_argument( + "--push_to_hub", + default=True, + type=bool, + required=False, + help="If True, push model and feature extractor to the hub.", + ) + + args = parser.parse_args() + + pytorch_dump_folder_path: Path = args.pytorch_dump_folder_path + pytorch_dump_folder_path.mkdir(exist_ok=True, parents=True) + convert_weights_and_push(pytorch_dump_folder_path, args.model_name, args.push_to_hub) diff --git a/src/transformers/models/regnet/modeling_regnet.py b/src/transformers/models/regnet/modeling_regnet.py new file mode 100644 index 000000000000..0ebd05a25ce1 --- /dev/null +++ b/src/transformers/models/regnet/modeling_regnet.py @@ -0,0 +1,444 @@ +# coding=utf-8 +# Copyright 2022 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch RegNet model.""" + +from typing import Optional + +import torch +import torch.utils.checkpoint +from torch import Tensor, nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward +from ...modeling_outputs import ( + BaseModelOutputWithNoAttention, + BaseModelOutputWithPoolingAndNoAttention, + ImageClassifierOutputWithNoAttention, +) +from ...modeling_utils import PreTrainedModel +from ...utils import logging +from .configuration_regnet import RegNetConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "RegNetConfig" +_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "facebook/regnet-y-040" +_EXPECTED_OUTPUT_SHAPE = [1, 1088, 7, 7] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "facebook/regnet-y-040" +_IMAGE_CLASS_EXPECTED_OUTPUT = "'tabby, tabby cat'" + +REGNET_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/regnet-y-040", + # See all regnet models at https://huggingface.co/models?filter=regnet +] + + +class RegNetConvLayer(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int = 3, + stride: int = 1, + groups: int = 1, + activation: Optional[str] = "relu", + ): + super().__init__() + self.convolution = nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=kernel_size // 2, + groups=groups, + bias=False, + ) + self.normalization = nn.BatchNorm2d(out_channels) + self.activation = ACT2FN[activation] if activation is not None else nn.Identity() + + def forward(self, hidden_state): + hidden_state = self.convolution(hidden_state) + hidden_state = self.normalization(hidden_state) + hidden_state = self.activation(hidden_state) + return hidden_state + + +class RegNetEmbeddings(nn.Module): + """ + RegNet Embedddings (stem) composed of a single aggressive convolution. + """ + + def __init__(self, config: RegNetConfig): + super().__init__() + self.embedder = RegNetConvLayer( + config.num_channels, config.embedding_size, kernel_size=3, stride=2, activation=config.hidden_act + ) + + def forward(self, hidden_state): + hidden_state = self.embedder(hidden_state) + return hidden_state + + +# Copied from transformers.models.resnet.modeling_resnet.ResNetShortCut with ResNet->RegNet +class RegNetShortCut(nn.Sequential): + """ + RegNet shortcut, used to project the residual features to the correct size. If needed, it is also used to + downsample the input using `stride=2`. 
+ """ + + def __init__(self, in_channels: int, out_channels: int, stride: int = 2): + super().__init__() + self.convolution = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False) + self.normalization = nn.BatchNorm2d(out_channels) + + +class RegNetSELayer(nn.Module): + """ + Squeeze and Excitation layer (SE) proposed in [Squeeze-and-Excitation Networks](https://arxiv.org/abs/1709.01507). + """ + + def __init__(self, in_channels: int, reduced_channels: int): + super().__init__() + + self.pooler = nn.AdaptiveAvgPool2d((1, 1)) + self.attention = nn.Sequential( + nn.Conv2d(in_channels, reduced_channels, kernel_size=1), + nn.ReLU(), + nn.Conv2d(reduced_channels, in_channels, kernel_size=1), + nn.Sigmoid(), + ) + + def forward(self, hidden_state): + # b c h w -> b c 1 1 + pooled = self.pooler(hidden_state) + attention = self.attention(pooled) + hidden_state = hidden_state * attention + return hidden_state + + +class RegNetXLayer(nn.Module): + """ + RegNet's layer composed by three `3x3` convolutions, same as a ResNet bottleneck layer with reduction = 1. + """ + + def __init__(self, config: RegNetConfig, in_channels: int, out_channels: int, stride: int = 1): + super().__init__() + should_apply_shortcut = in_channels != out_channels or stride != 1 + groups = max(1, out_channels // config.groups_width) + self.shortcut = ( + RegNetShortCut(in_channels, out_channels, stride=stride) if should_apply_shortcut else nn.Identity() + ) + self.layer = nn.Sequential( + RegNetConvLayer(in_channels, out_channels, kernel_size=1, activation=config.hidden_act), + RegNetConvLayer(out_channels, out_channels, stride=stride, groups=groups, activation=config.hidden_act), + RegNetConvLayer(out_channels, out_channels, kernel_size=1, activation=None), + ) + self.activation = ACT2FN[config.hidden_act] + + def forward(self, hidden_state): + residual = hidden_state + hidden_state = self.layer(hidden_state) + residual = self.shortcut(residual) + hidden_state += residual + hidden_state = self.activation(hidden_state) + return hidden_state + + +class RegNetYLayer(nn.Module): + """ + RegNet's Y layer: an X layer with Squeeze and Excitation. + """ + + def __init__(self, config: RegNetConfig, in_channels: int, out_channels: int, stride: int = 1): + super().__init__() + should_apply_shortcut = in_channels != out_channels or stride != 1 + groups = max(1, out_channels // config.groups_width) + self.shortcut = ( + RegNetShortCut(in_channels, out_channels, stride=stride) if should_apply_shortcut else nn.Identity() + ) + self.layer = nn.Sequential( + RegNetConvLayer(in_channels, out_channels, kernel_size=1, activation=config.hidden_act), + RegNetConvLayer(out_channels, out_channels, stride=stride, groups=groups, activation=config.hidden_act), + RegNetSELayer(out_channels, reduced_channels=int(round(in_channels / 4))), + RegNetConvLayer(out_channels, out_channels, kernel_size=1, activation=None), + ) + self.activation = ACT2FN[config.hidden_act] + + def forward(self, hidden_state): + residual = hidden_state + hidden_state = self.layer(hidden_state) + residual = self.shortcut(residual) + hidden_state += residual + hidden_state = self.activation(hidden_state) + return hidden_state + + +class RegNetStage(nn.Module): + """ + A RegNet stage composed by stacked layers. 
+ """ + + def __init__( + self, + config: RegNetConfig, + in_channels: int, + out_channels: int, + stride: int = 2, + depth: int = 2, + ): + super().__init__() + + layer = RegNetXLayer if config.layer_type == "x" else RegNetYLayer + + self.layers = nn.Sequential( + # downsampling is done in the first layer with stride of 2 + layer( + config, + in_channels, + out_channels, + stride=stride, + ), + *[layer(config, out_channels, out_channels) for _ in range(depth - 1)], + ) + + def forward(self, hidden_state): + hidden_state = self.layers(hidden_state) + return hidden_state + + +class RegNetEncoder(nn.Module): + def __init__(self, config: RegNetConfig): + super().__init__() + self.stages = nn.ModuleList([]) + # based on `downsample_in_first_stage`, the first layer of the first stage may or may not downsample the input + self.stages.append( + RegNetStage( + config, + config.embedding_size, + config.hidden_sizes[0], + stride=2 if config.downsample_in_first_stage else 1, + depth=config.depths[0], + ) + ) + in_out_channels = zip(config.hidden_sizes, config.hidden_sizes[1:]) + for (in_channels, out_channels), depth in zip(in_out_channels, config.depths[1:]): + self.stages.append(RegNetStage(config, in_channels, out_channels, depth=depth)) + + def forward( + self, hidden_state: Tensor, output_hidden_states: bool = False, return_dict: bool = True + ) -> BaseModelOutputWithNoAttention: + hidden_states = () if output_hidden_states else None + + for stage_module in self.stages: + if output_hidden_states: + hidden_states = hidden_states + (hidden_state,) + + hidden_state = stage_module(hidden_state) + + if output_hidden_states: + hidden_states = hidden_states + (hidden_state,) + + if not return_dict: + return tuple(v for v in [hidden_state, hidden_states] if v is not None) + + return BaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=hidden_states) + + +# Copied from transformers.models.resnet.modeling_resnet.ResNetPreTrainedModel with ResNet->RegNet,resnet->regnet +class RegNetPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = RegNetConfig + base_model_prefix = "regnet" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + if isinstance(module, nn.Conv2d): + nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") + elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(module.weight, 1) + nn.init.constant_(module.bias, 0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, RegNetModel): + module.gradient_checkpointing = value + + +REGNET_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`RegNetConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +REGNET_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. 
Pixel values can be obtained using [`AutoFeatureExtractor`]. See + [`AutoFeatureExtractor.__call__`] for details. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare RegNet model outputting raw features without any specific head on top.", + REGNET_START_DOCSTRING, +) +# Copied from transformers.models.resnet.modeling_resnet.ResNetModel with RESNET->REGNET,ResNet->RegNet +class RegNetModel(RegNetPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.embedder = RegNetEmbeddings(config) + self.encoder = RegNetEncoder(config) + self.pooler = nn.AdaptiveAvgPool2d((1, 1)) + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(REGNET_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndNoAttention, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None + ) -> BaseModelOutputWithPoolingAndNoAttention: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + embedding_output = self.embedder(pixel_values) + + encoder_outputs = self.encoder( + embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict + ) + + last_hidden_state = encoder_outputs[0] + + pooled_output = self.pooler(last_hidden_state) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndNoAttention( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + ) + + +@add_start_docstrings( + """ + RegNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for + ImageNet. 
+ """, + REGNET_START_DOCSTRING, +) +# Copied from transformers.models.resnet.modeling_resnet.ResNetForImageClassification with RESNET->REGNET,ResNet->RegNet,resnet->regnet +class RegNetForImageClassification(RegNetPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.regnet = RegNetModel(config) + # classification head + self.classifier = nn.Sequential( + nn.Flatten(), + nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity(), + ) + # initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(REGNET_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=ImageClassifierOutputWithNoAttention, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Tensor = None, + labels: Tensor = None, + output_hidden_states: bool = None, + return_dict: bool = None, + ) -> ImageClassifierOutputWithNoAttention: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.regnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + + pooled_output = outputs.pooler_output if return_dict else outputs[1] + + logits = self.classifier(pooled_output) + + loss = None + + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return (loss,) + output if loss is not None else output + + return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) diff --git a/src/transformers/models/resnet/modeling_resnet.py b/src/transformers/models/resnet/modeling_resnet.py index 00c589d68f7b..f2f555d7f519 100644 --- a/src/transformers/models/resnet/modeling_resnet.py +++ b/src/transformers/models/resnet/modeling_resnet.py @@ -66,12 +66,14 @@ def __init__( class ResNetEmbeddings(nn.Sequential): """ - ResNet Embedddings (stem) composed of a single aggressive convolution. + ResNet Embeddings (stem) composed of a single aggressive convolution. 
""" - def __init__(self, num_channels: int, out_channels: int, activation: str = "relu"): + def __init__(self, config: ResNetConfig): super().__init__() - self.embedder = ResNetConvLayer(num_channels, out_channels, kernel_size=7, stride=2, activation=activation) + self.embedder = ResNetConvLayer( + config.num_channels, config.embedding_size, kernel_size=7, stride=2, activation=config.hidden_act + ) self.pooler = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) @@ -267,7 +269,7 @@ class ResNetModel(ResNetPreTrainedModel): def __init__(self, config): super().__init__(config) self.config = config - self.embedder = ResNetEmbeddings(config.num_channels, config.embedding_size, config.hidden_act) + self.embedder = ResNetEmbeddings(config) self.encoder = ResNetEncoder(config) self.pooler = nn.AdaptiveAvgPool2d((1, 1)) # Initialize weights and apply final processing diff --git a/src/transformers/models/segformer/configuration_segformer.py b/src/transformers/models/segformer/configuration_segformer.py index 58c8b29f7371..fa54c62c227c 100644 --- a/src/transformers/models/segformer/configuration_segformer.py +++ b/src/transformers/models/segformer/configuration_segformer.py @@ -54,7 +54,7 @@ class SegformerConfig(PretrainedConfig): Patch size before each encoder block. strides (`List[int]`, *optional*, defaults to [4, 2, 2, 2]): Stride before each encoder block. - num_attention_heads (`List[int]`, *optional*, defaults to [1, 2, 4, 8]): + num_attention_heads (`List[int]`, *optional*, defaults to [1, 2, 5, 8]): Number of attention heads for each attention layer in each block of the Transformer encoder. mlp_ratios (`List[int]`, *optional*, defaults to [4, 4, 4, 4]): Ratio of the size of the hidden layer compared to the size of the input layer of the Mix FFNs in the diff --git a/src/transformers/models/tapex/__init__.py b/src/transformers/models/tapex/__init__.py new file mode 100644 index 000000000000..36c5938d23c9 --- /dev/null +++ b/src/transformers/models/tapex/__init__.py @@ -0,0 +1,36 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +# rely on isort to merge the imports +from ...file_utils import _LazyModule + + +_import_structure = { + "tokenization_tapex": ["TapexTokenizer"], +} + + +if TYPE_CHECKING: + from .tokenization_tapex import TapexTokenizer + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure) diff --git a/src/transformers/models/tapex/tokenization_tapex.py b/src/transformers/models/tapex/tokenization_tapex.py new file mode 100644 index 000000000000..0b5c1241415a --- /dev/null +++ b/src/transformers/models/tapex/tokenization_tapex.py @@ -0,0 +1,1512 @@ +# coding=utf-8 +# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for TAPEX.""" + +import json +import os +import random +from contextlib import contextmanager +from functools import lru_cache +from typing import Dict, List, Optional, Tuple, Union + +import regex as re + +from ...file_utils import ExplicitEnum, PaddingStrategy, TensorType, add_end_docstrings, is_pandas_available +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...tokenization_utils_base import ENCODE_KWARGS_DOCSTRING, BatchEncoding, TextInput, TruncationStrategy +from ...utils import logging + + +if is_pandas_available(): + import pandas as pd + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "microsoft/tapex-base": "https://huggingface.co/microsoft/tapex-base/resolve/main/vocab.json", + }, + "merges_file": { + "microsoft/tapex-base": "https://huggingface.co/microsoft/tapex-base/resolve/main/merges.txt", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "microsoft/tapex-base": 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + "microsoft/tapex-base": {"do_lower_case": True}, +} + + +class TapexTruncationStrategy(ExplicitEnum): + """ + Possible values for the `truncation` argument in [`~TapasTokenizer.__call__`]. Useful for tab-completion in an IDE. + """ + + DROP_ROWS_TO_FIT = "drop_rows_to_fit" + + +class TokenizerStrategy(ExplicitEnum): + + TOKENIZE_SOURCE = "tokenize_source" + TOKENIZE_TARGET = "tokenize_target" + + +TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" + add_special_tokens (`bool`, *optional*, defaults to `True`): + Whether or not to encode the sequences with the special tokens relative to their model. + padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`): + Activates and controls padding. Accepts the following values: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + truncation (`bool`, `str`, [`TapexTruncationStrategy`] or [`~tokenization_utils_base.TruncationStrategy`], + *optional*, defaults to `False`): + + Activates and controls truncation. Accepts the following values: + + - `'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will truncate + row by row, removing rows from the table. + - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or + to the maximum acceptable input length for the model if that argument is not provided. 
This will + truncate token by token, removing a token from the longest sequence in the pair if a pair of + sequences (or a batch of pairs) is provided. + - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths + greater than the model maximum admissible input size). + max_length (`int`, *optional*): + Controls the maximum length to use by one of the truncation/padding parameters. If left unset or set to + `None`, this will use the predefined model maximum length if a maximum length is required by one of the + truncation/padding parameters. If the model has no specific maximum input length (like XLNet) + truncation/padding to a maximum length will be deactivated. + stride (`int`, *optional*, defaults to 0): + If set to a number along with `max_length`, the overflowing tokens returned when + `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence + returned to provide some overlap between truncated and overflowing sequences. The value of this + argument defines the number of overlapping tokens. + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. This is especially useful to enable + the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). + return_tensors (`str` or [`~file_utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. +""" + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control + characters the bpe code barfs on. The reversible bpe codes work on unicode strings. This means you need a large # + of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset + you end up needing around 5K for decent coverage. This is a significant percentage of your normal, say, 32K bpe + vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length + strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class IndexedRowTableLinearize: + """ + FORMAT: col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ... 
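+    For example, following `process_header` and `process_row` below, a table with header ["year", "city"] and
+    one row ["2012", "london"] is linearized to "col : year | city row 1 : 2012 | london".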
+ """ + + def process_table(self, table_content: Dict): + """ + Given a table, TableLinearize aims at converting it into a flatten sequence with special symbols. + """ + assert "header" in table_content and "rows" in table_content, self.PROMPT_MESSAGE + # process header + table_str = self.process_header(table_content["header"]) + " " + # process rows + for i, row_example in enumerate(table_content["rows"]): + # NOTE: the row should start from row 1 instead of 0 + table_str += self.process_row(row_example, row_index=i + 1) + " " + return table_str.strip() + + def process_header(self, headers: List): + """ + Given a list of headers, TableLinearize aims at converting it into a flatten sequence with special symbols. + """ + return "col : " + " | ".join(headers) + + def process_row(self, row: List, row_index: int): + """ + Given a row, TableLinearize aims at converting it into a flatten sequence with special symbols. + """ + row_str = "" + row_cell_values = [] + for cell_value in row: + if isinstance(cell_value, int): + row_cell_values.append(str(cell_value)) + else: + row_cell_values.append(cell_value) + row_str += " | ".join(row_cell_values) + return "row " + str(row_index) + " : " + row_str + + +class TapexTokenizer(PreTrainedTokenizer): + r""" + Construct a TAPEX tokenizer. Based on byte-level Byte-Pair-Encoding (BPE). + + This tokenizer can be used to flatten one or more table(s) and concatenate them with one or more related sentences + to be used by TAPEX models. The format that the TAPEX tokenizer creates is the following: + + sentence col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ... + + The tokenizer supports a single table + single query, a single table and multiple queries (in which case the table + will be duplicated for every query), a single query and multiple tables (in which case the query will be duplicated + for every table), and multiple tables and queries. In other words, you can provide a batch of tables + questions to + the tokenizer for instance to prepare them for the model. + + Tokenization itself is based on the BPE algorithm. It is identical to the one used by BART, RoBERTa and GPT-2. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. 
two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `"<s>"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `"<unk>"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `"<pad>"`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `"<mask>"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + add_prefix_space (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial space to the input. This allows the leading word to be treated just as any + other word. (The BART tokenizer detects the beginning of words by the preceding space). + max_cell_length (`int`, *optional*, defaults to 15): + Maximum number of characters per cell when linearizing a table. If this number is exceeded, truncation + takes place. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + merges_file, + do_lower_case=True, + errors="replace", + bos_token="<s>", + eos_token="</s>", + sep_token="</s>", + cls_token="<s>", + unk_token="<unk>", + pad_token="<pad>", + mask_token="<mask>", + add_prefix_space=False, + max_cell_length=15, + **kwargs + ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + + # Mask token behave like a normal word, i.e.
include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + super().__init__( + vocab_file=vocab_file, + merges_file=merges_file, + do_lower_case=do_lower_case, + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + max_cell_length=max_cell_length, + **kwargs, + ) + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + bpe_merges = merges_handle.read().split("\n")[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_merges] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.add_prefix_space = add_prefix_space + self.do_lower_case = do_lower_case + + # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + # additional properties + self.max_cell_length = max_cell_length + self.table_linearize = IndexedRowTableLinearize() + + # property to decide using which call function + self.current_tokenizer = TokenizerStrategy.TOKENIZE_SOURCE + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A TAPEX sequence has the following format: + - single sequence: ` X ` + - pair of sequences: ` A B ` + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Args: + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
+ """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Args: + Create a mask from the two sequences passed to be used in a sequence-pair classification task. TAPEX does not: + make use of token type ids, therefore a list of zeros is returned. + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + Returns: + `List[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) + if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): + text = " " + text + return (text, kwargs) + + @property + def vocab_size(self): + return len(self.encoder) + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, 
(filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def __call__( + self, + table: Union["pd.DataFrame", List["pd.DataFrame"]] = None, + query: Optional[Union[TextInput, List[TextInput]]] = None, + answer: Union[str, List[str]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + """ + Main method to tokenize and prepare for the model one or several table-sequence pair(s). + + Args: + table (`pd.DataFrame`, `List[pd.DataFrame]`): + Table(s) containing tabular data. + query (`str` or `List[str]`, *optional*): + Sentence or batch of sentences related to one or more table(s) to be encoded. Note that the number of + sentences must match the number of tables. + answer (`str` or `List[str]`, *optional*): + Optionally, the corresponding answer to the questions as supervision. 
+ """ + + if self.current_tokenizer == TokenizerStrategy.TOKENIZE_SOURCE: + if table is None: + raise ValueError("Please ensure that the table is not empty if you use TAPEX to encode source.") + return self.source_call_func( + table=table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + else: + if answer is None: + raise ValueError("Please ensure that the answer is not empty if you use TAPEX to encode target.") + return self.target_call_func( + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def source_call_func( + self, + table: Union["pd.DataFrame", List["pd.DataFrame"]], + query: Optional[Union[TextInput, List[TextInput]]] = None, + answer: Union[str, List[str]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + # Input type checking for clearer error + valid_table = False + valid_query = False + + # Check that table have a valid type + if isinstance(table, pd.DataFrame): + valid_table = True + elif isinstance(table, (list, tuple)) and isinstance(table[0], pd.DataFrame): + valid_table = True + + # Check that query have a valid type + if query is None or isinstance(query, str): + valid_query = True + elif isinstance(query, (list, tuple)): + if len(query) == 0 or isinstance(query[0], str): + valid_query = True + + if not valid_table: + raise ValueError( + "table input must of type `pd.DataFrame` (single example), `List[pd.DataFrame]` (batch of examples). " + ) + if not valid_query: + raise ValueError("query input must of type `str` (single example), `List[str]` (batch of examples). 
") + is_batched = isinstance(table, (list, tuple)) or isinstance(query, (list, tuple)) + + if is_batched: + return self.batch_encode_plus( + table=table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + else: + return self.encode_plus( + table=table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def batch_encode_plus( + self, + table: Union["pd.DataFrame", List["pd.DataFrame"]], + query: Optional[List[TextInput]] = None, + answer: List[str] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str] = False, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + """ + + + This method is deprecated, `__call__` should be used instead. 
+ + + """ + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._batch_encode_plus( + table=table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _batch_encode_plus( + self, + table: Union["pd.DataFrame", List["pd.DataFrame"]], + query: Optional[List[TextInput]] = None, + answer: Optional[List[str]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers. " + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast." 
+ ) + + if isinstance(table, pd.DataFrame) and isinstance(query, (list, tuple)): + # single table, many queries case + # duplicate table for every query + table = [table] * len(query) + if isinstance(table, (list, tuple)) and isinstance(query, str): + # many tables, single query case + # duplicate query for every table + query = [query] * len(table) + + batch_outputs = self._batch_prepare_for_model( + table=table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=return_tensors, + verbose=verbose, + ) + + return BatchEncoding(batch_outputs) + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def _batch_prepare_for_model( + self, + table: Union["pd.DataFrame", List["pd.DataFrame"]], + query: Optional[Union[TextInput, List[TextInput]]] = None, + answer: Optional[Union[str, List[str]]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_length: bool = False, + verbose: bool = True, + ) -> BatchEncoding: + """ + This method adds special tokens, truncates sequences if overflowing while taking into account the special + tokens and manages a moving window (with user defined stride) for overflowing tokens. 
+ """ + batch_outputs = {} + if answer is None: + answer = [None] * len(table) + for _table, _query, _answer in zip(table, query, answer): + text = self.prepare_table_query( + _table, _query, _answer, truncation_strategy=truncation_strategy, max_length=max_length + ) + + if self.do_lower_case: + text = text.lower() + + tokens = self.tokenize(text) + outputs = self.prepare_for_model( + ids=self.convert_tokens_to_ids(tokens), + add_special_tokens=add_special_tokens, + padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterwards + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=None, # we pad in batch afterwards + return_attention_mask=False, # we pad in batch afterwards + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=None, # We convert the whole batch to tensors at the end + prepend_batch_axis=False, + verbose=verbose, + ) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + batch_outputs = self.pad( + batch_outputs, + padding=padding_strategy.value, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors) + + return batch_outputs + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING) + def encode( + self, + table: "pd.DataFrame", + query: Optional[TextInput] = None, + answer: Optional[str] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy, TapexTruncationStrategy] = False, + max_length: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs + ) -> List[int]: + """ + Prepare a table, a string and possible answer for the model. This method does not return token type IDs, + attention masks, etc. which are necessary for the model to work correctly. Use this method if you want to build + your processing on your own, otherwise refer to `__call__`. 
+ """ + encoded_inputs = self.encode_plus( + table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + return_tensors=return_tensors, + **kwargs, + ) + + return encoded_inputs["input_ids"] + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def encode_plus( + self, + table: "pd.DataFrame", + query: Optional[TextInput] = None, + answer: Optional[str] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str] = False, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._encode_plus( + table=table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _encode_plus( + self, + table: "pd.DataFrame", + query: Optional[TextInput] = None, + answer: Optional[str] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers. " + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast. 
" + "More information on available tokenizers at " + "https://github.com/huggingface/transformers/pull/2674" + ) + + text = self.prepare_table_query( + table, query, answer, truncation_strategy=truncation_strategy, max_length=max_length + ) + + # if necessary, perform lower case + if self.do_lower_case: + text = text.lower() + + tokens = self.tokenize(text) + + return self.prepare_for_model( + ids=self.convert_tokens_to_ids(tokens), + add_special_tokens=add_special_tokens, + padding=padding_strategy.value, + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + prepend_batch_axis=True, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + verbose=verbose, + ) + + def target_call_func( + self, + answer: Union[str, List[str]], + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + """ + The method tokenizes and prepares the answer label for the model. + + Args: + answer (`str` or `List[str]`): + Corresponding answer supervision to the queries for training the model. 
+ """ + is_batched = isinstance(answer, (list, tuple)) + + if is_batched: + return self.target_batch_encode_plus( + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + else: + return self.target_encode_plus( + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def target_batch_encode_plus( + self, + answer: List[str], + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str] = False, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + """ + Prepare answer strings for the model. + + Args: + answer `List[str]`: + Corresponding answer supervision to the queries for training the model. 
+ """ + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._target_batch_encode_plus( + answer=answer, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _target_batch_encode_plus( + self, + answer: List[str], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + + batch_outputs = {} + for text in answer: + + if self.do_lower_case: + text = text.lower() + + tokens = self.tokenize(text) + outputs = self.prepare_for_model( + ids=self.convert_tokens_to_ids(tokens), + add_special_tokens=add_special_tokens, + padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterwards + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=None, # we pad in batch afterwards + return_attention_mask=False, # we pad in batch afterwards + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=None, # We convert the whole batch to tensors at the end + prepend_batch_axis=False, + verbose=verbose, + ) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + batch_outputs = self.pad( + batch_outputs, + padding=padding_strategy.value, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors) + + return BatchEncoding(batch_outputs) + + def target_encode( + self, + answer: str, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy, TapexTruncationStrategy] = False, + max_length: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs + ) -> List[int]: + """ + Prepare the answer string for the model. This method does not return token type IDs, attention masks, etc. + which are necessary for the model to work correctly. Use this method if you want to build your processing on + your own, otherwise refer to `__call__`. 
+
+        Args:
+            answer `str`:
+                Corresponding answer supervision to the queries for training the model.
+        """
+        encoded_outputs = self.target_encode_plus(
+            answer=answer,
+            add_special_tokens=add_special_tokens,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            return_tensors=return_tensors,
+            **kwargs,
+        )
+
+        return encoded_outputs["input_ids"]
+
+    def target_encode_plus(
+        self,
+        answer: str,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str] = False,
+        max_length: Optional[int] = None,
+        pad_to_multiple_of: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs
+    ) -> BatchEncoding:
+        """
+        Prepare an answer string for the model.
+
+        Args:
+            answer `str`:
+                Corresponding answer supervision to the queries for training the model.
+        """
+        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        return self._target_encode_plus(
+            answer=answer,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
+    def _target_encode_plus(
+        self,
+        answer: str,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs
+    ) -> BatchEncoding:
+        if return_offsets_mapping:
+            raise NotImplementedError(
+                "return_offset_mapping is not available when using Python tokenizers. "
+                "To use this feature, change your tokenizer to one deriving from "
+                "transformers.PreTrainedTokenizerFast. "
" + "More information on available tokenizers at " + "https://github.com/huggingface/transformers/pull/2674" + ) + + text = answer + + # if necessary, perform lower case + if self.do_lower_case: + text = text.lower() + + tokens = self.tokenize(text) + + return self.prepare_for_model( + ids=self.convert_tokens_to_ids(tokens), + add_special_tokens=add_special_tokens, + padding=padding_strategy.value, + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + prepend_batch_axis=True, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + verbose=verbose, + ) + + @contextmanager + def as_target_tokenizer(self): + """ + Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to + sequence-to-sequence models that need a slightly different processing for the labels. + """ + self.current_tokenizer = TokenizerStrategy.TOKENIZE_TARGET + yield + # restore the call function + self.current_tokenizer = TokenizerStrategy.TOKENIZE_SOURCE + + def prepare_table_query( + self, + table, + query, + answer=None, + truncation_strategy=Union[str, TruncationStrategy, TapexTruncationStrategy], + max_length=None, + ): + """ + This method can be used to linearize a table and add a corresponding query. + + Optionally, it also handles truncation of the table (cells). + + An answer can be provided for more precise truncation. + """ + if not table.empty: + # step 1: create table dictionary + table_content = {"header": list(table.columns), "rows": [list(row.values) for i, row in table.iterrows()]} + + # step 2: modify table internally + # always truncate table cells based on self.max_cell_length + # optionally truncate rows if truncation_strategy is set to it + self.truncate_table_cells(table_content, query, answer) + if truncation_strategy == TapexTruncationStrategy.DROP_ROWS_TO_FIT: + self.truncate_table_rows(table_content, query, answer, max_length=max_length) + + # step 3: linearize table + linear_table = self.table_linearize.process_table(table_content) + else: + linear_table = "" + + if linear_table == "": + logger.warning( + "You provide an empty table, or all cells contain much tokens (e.g., >= 1024 tokens). " + + f"Please carefully check the corresponding table with the query : {query}." + ) + if query == "": + logger.warning("You provide nothing to query with respect to the table.") + # step 4: concatenate query with linear_table + separator = " " if query and linear_table else "" + joint_input = (query + separator + linear_table) if query else linear_table + + return joint_input + + def truncate_table_cells(self, table_content: Dict, question: str, answer: List): + # TODO (Qian): is it possible to revert the original cell if it is in the final answer? 
+        cell_mapping = {}
+        for row in table_content["rows"]:
+            for i, cell in enumerate(row):
+                truncate_cell = self.truncate_cell(cell)
+                if truncate_cell is not None:
+                    cell_mapping[cell] = truncate_cell
+                    row[i] = truncate_cell
+
+        # modify the answer list
+        if answer is not None:
+            for i, case in enumerate(answer):
+                if case in cell_mapping.keys():
+                    answer[i] = cell_mapping[case]
+
+    def truncate_cell(self, cell_value):
+        # do not process numeric cell values
+        if isinstance(cell_value, int) or isinstance(cell_value, float):
+            return cell_value
+        if cell_value.strip() != "":
+            try_tokens = self.tokenize(cell_value)
+            if len(try_tokens) >= self.max_cell_length:
+                retain_tokens = try_tokens[: self.max_cell_length]
+                retain_cell_value = self.convert_tokens_to_string(retain_tokens)
+                return retain_cell_value
+            else:
+                return None
+        else:
+            return cell_value
+
+    def truncate_table_rows(
+        self, table_content: Dict, question: str, answer: Optional[Union[str, List[str]]] = None, max_length=None
+    ):
+        """
+        Args:
+            table_content:
+                {"header": xxx, "rows": xxx, "id" (optional): xxx}
+            question:
+                natural language sentence
+            answer:
+                the answer supervision during training; empty otherwise
+        """
+        delete_ratio, remain_token_len = self.estimate_delete_ratio(table_content, question, max_length)
+        # randomly delete unrelated rows
+        self.delete_unrelated_rows(table_content, question, answer, delete_ratio)
+        # guarantee the result < max_length
+        maximum_keep_rows = 0
+        for ind, row_example in enumerate(table_content["rows"]):
+            value_string = self.table_linearize.process_row(row_example, ind + 1)
+            value_token_len = len(self.tokenize(value_string))
+            # stop once the next row would exceed the remaining token budget
+            if value_token_len > remain_token_len:
+                break
+            remain_token_len -= value_token_len
+            maximum_keep_rows += 1
+        del table_content["rows"][maximum_keep_rows:]
+
+    def estimate_delete_ratio(self, table_content: Dict, question: str, max_length=None):
+        if "header" not in table_content or "rows" not in table_content:
+            raise ValueError("The table content should contain both 'header' and 'rows' keys.")
+        # calculate the tokens of the question; special tokens are only added around the question
+        question_tokens = self.tokenize(question, add_special_tokens=True)
+        # calculate the tokens of the header
+        header_string = self.table_linearize.process_header(table_content["header"])
+        header_tokens = self.tokenize(header_string, add_special_tokens=False)
+        # tokens already used by the question and the header
+        used_token_len = len(question_tokens) + len(header_tokens)
+        # remaining token space for rows
+        remain_token_len = max_length - used_token_len
+
+        value_string = ""
+        for _, row_example in enumerate(table_content["rows"]):
+            # use a fixed dummy row index to roughly estimate the overall token length
+            value_string += self.table_linearize.process_row(row_example, 100) + " "
+        value_token_len = len(self.tokenize(value_string))
+
+        if value_token_len < remain_token_len:
+            # no row will be deleted
+            return 0.0, remain_token_len
+        else:
+            # compute a rough delete ratio
+            return 1.0 - remain_token_len / value_token_len, remain_token_len
+
+    def delete_unrelated_rows(self, table_content: Dict, question: str, answer: List, delete_ratio: float):
+        """
+        The argument answer is used only during training.
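+
+        Rows that share no cell value with the answer set and no token with the question are candidates for
+        deletion; rows within two positions of a related row are always kept to preserve context.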
+ """ + truncated_unrelated_indices = [] + related_indices = [] + if answer is None or len(answer) == 0: + answer_set = set([]) + else: + answer_set = set([ans_ex.lower() for ans_ex in answer]) + # add question key words into answer set + if question is not None: + answer_set.update(question.split()) + question_set = set(question.strip("?!.,").split(" ")) + row_max_len = len(table_content["rows"]) + for _row_idx, row in enumerate(table_content["rows"]): + lower_row = set([str(cell).lower() for cell in row]) + if len(lower_row & answer_set) == 0 and len(lower_row & question_set) == 0: + truncated_unrelated_indices.append(_row_idx) + else: + # add neighbours to preserve information aggressively + related_indices.extend([_row_idx - 2, _row_idx - 1, _row_idx, _row_idx + 1, _row_idx + 2]) + + # remove the neighbours + truncated_unrelated_indices = [ + _row_idx for _row_idx in truncated_unrelated_indices if _row_idx not in related_indices + ] + # select some cases to drop + drop_items = min(len(truncated_unrelated_indices), int(len(table_content["rows"]) * delete_ratio)) + drop_row_indices = random.choices(truncated_unrelated_indices, k=drop_items) + + for _row_idx in reversed(range(row_max_len)): + if _row_idx in drop_row_indices: + del table_content["rows"][_row_idx] + + # only when the drop ratio is too large, logging for warning. + if "id" in table_content and len(drop_row_indices) > 0: + logger.warning("Delete {:.2f} rows in table {}".format(len(drop_row_indices), table_content["id"])) diff --git a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py index 8ad931150edc..d1f7ad01dd5c 100644 --- a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py +++ b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py @@ -998,12 +998,13 @@ def call( pred_hid = last_hidden[:, -tgt_len:] softmax_output = self.crit(pred_hid, labels, training=training) + prediction_scores = softmax_output if labels is None else () if not return_dict: - return (softmax_output,) + transformer_outputs[1:] + return (prediction_scores,) + transformer_outputs[1:] return TFTransfoXLLMHeadModelOutput( - prediction_scores=softmax_output, + prediction_scores=prediction_scores, mems=transformer_outputs.mems, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, diff --git a/src/transformers/models/vit/modeling_tf_vit.py b/src/transformers/models/vit/modeling_tf_vit.py index cbf935f4f743..e3c039ca83e1 100644 --- a/src/transformers/models/vit/modeling_tf_vit.py +++ b/src/transformers/models/vit/modeling_tf_vit.py @@ -33,14 +33,23 @@ unpack_inputs, ) from ...tf_utils import shape_list -from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_vit import ViTConfig logger = logging.get_logger(__name__) +# General docstring _CONFIG_FOR_DOC = "ViTConfig" -_CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224" +_FEAT_EXTRACTOR_FOR_DOC = "ViTFeatureExtractor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224-in21k" +_EXPECTED_OUTPUT_SHAPE = [1, 197, 768] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "google/vit-base-patch16-224" +_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat" # Inspired by @@ -645,7 +654,14 @@ def __init__(self, config: ViTConfig, *inputs, add_pooling_layer=True, **kwargs) 
@unpack_inputs @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) def call( self, pixel_values: Optional[TFModelInputType] = None, @@ -656,26 +672,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: - r""" - Returns: - - Examples: - - ```python - >>> from transformers import ViTFeatureExtractor, TFViTModel - >>> from PIL import Image - >>> import requests - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k") - >>> model = TFViTModel.from_pretrained("google/vit-base-patch16-224-in21k") - - >>> inputs = feature_extractor(images=image, return_tensors="tf") - >>> outputs = model(**inputs) - >>> last_hidden_states = outputs.last_hidden_state - ```""" outputs = self.vit( pixel_values=pixel_values, @@ -744,7 +740,13 @@ def __init__(self, config: ViTConfig, *inputs, **kwargs): @unpack_inputs @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) def call( self, pixel_values: Optional[TFModelInputType] = None, @@ -761,30 +763,7 @@ def call( Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - - Returns: - - Examples: - - ```python - >>> from transformers import ViTFeatureExtractor, TFViTForImageClassification - >>> import tensorflow as tf - >>> from PIL import Image - >>> import requests - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224") - >>> model = TFViTForImageClassification.from_pretrained("google/vit-base-patch16-224") - - >>> inputs = feature_extractor(images=image, return_tensors="tf") - >>> outputs = model(**inputs) - >>> logits = outputs.logits - >>> # model predicts one of the 1000 ImageNet classes - >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0] - >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)]) - ```""" + """ outputs = self.vit( pixel_values=pixel_values, diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index a2fb10b9e040..684a96ad6494 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -836,16 +836,20 @@ def create_optimizer(self): We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the Trainer's init through `optimizers`, or subclass and override this method in a subclass. 
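+
+        Note: under SageMaker Model Parallelism, the parameter groups below are built from
+        `self.model_wrapped` rather than `self.model`.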
""" + if is_sagemaker_mp_enabled(): + opt_model = self.model_wrapped + else: + opt_model = self.model if self.optimizer is None: - decay_parameters = get_parameter_names(self.model, [nn.LayerNorm]) + decay_parameters = get_parameter_names(opt_model, [nn.LayerNorm]) decay_parameters = [name for name in decay_parameters if "bias" not in name] optimizer_grouped_parameters = [ { - "params": [p for n, p in self.model.named_parameters() if n in decay_parameters], + "params": [p for n, p in opt_model.named_parameters() if n in decay_parameters], "weight_decay": self.args.weight_decay, }, { - "params": [p for n, p in self.model.named_parameters() if n not in decay_parameters], + "params": [p for n, p in opt_model.named_parameters() if n not in decay_parameters], "weight_decay": 0.0, }, ] @@ -1032,6 +1036,8 @@ def _wrap_model(self, model, training=True): # Wrapping the base model twice in a DistributedModel will raise an error. if isinstance(self.model_wrapped, smp.model.DistributedModel): return self.model_wrapped + if self.args.smp_tensor_parallel_full_model: + smp.set_tensor_parallelism(model) return smp.DistributedModel(model, backward_passes_per_step=self.args.gradient_accumulation_steps) # already initialized its own DDP and AMP @@ -1165,8 +1171,16 @@ def train( if resume_from_checkpoint is None: raise ValueError(f"No valid checkpoint found in output directory ({args.output_dir})") + if resume_from_checkpoint is not None: - if not os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)): + if self.args.smp_load_partial: + import glob + # SMP partial checkpoints are in {filename}_{pp_rank()}_{tp_rank()} or {filename}_{pp_rank()}_{tp_rank()}_{rdp_rank()} format. + checkpoint_file_exists = glob.glob(os.path.join(resume_from_checkpoint, WEIGHTS_NAME) + "_*") + else: + checkpoint_file_exists = os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)) + + if not checkpoint_file_exists: raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}") logger.info(f"Loading model from {resume_from_checkpoint}).") @@ -1185,13 +1199,12 @@ def train( # will be resumed in deepspeed_init pass else: - # We load the model state dict on the CPU to avoid an OOM error. - state_dict = torch.load(os.path.join(resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu") - # If the model is on the GPU, it still works! - self._load_state_dict_in_model(state_dict) + if not is_sagemaker_mp_enabled(): + state_dict = torch.load(os.path.join(resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu") + self._load_state_dict_in_model(state_dict) - # release memory - del state_dict + # release memory + del state_dict # If model was re-initialized, put it on the right device and update self.model_wrapped if model_reloaded: @@ -1272,6 +1285,10 @@ def train( model = self._wrap_model(self.model_wrapped) + if resume_from_checkpoint is not None and is_sagemaker_mp_enabled(): + state_dict = smp.load(os.path.join(resume_from_checkpoint, WEIGHTS_NAME), partial=self.args.smp_load_partial) + model.load_state_dict(state_dict) + # for the rest of this function `model` is the outside model, whether it was wrapped or not if model is not self.model: self.model_wrapped = model @@ -1534,13 +1551,20 @@ def train( xm.rendezvous("load_best_model_at_end") elif args.local_rank != -1: dist.barrier() + elif is_sagemaker_mp_enabled(): + smp.barrier() logger.info( f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." 
) best_model_path = os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME) - if os.path.exists(best_model_path): + if self.args.smp_load_partial: + import glob + # SMP partial checkpoints are in {filename}_{pp_rank()}_{tp_rank()} or {filename}_{pp_rank()}_{tp_rank()}_{rdp_rank()} format. + smp_checkpoint_file_exists = glob.glob(best_model_path + "_*") + + if os.path.exists(best_model_path) or smp_checkpoint_file_exists: if self.deepspeed: # temp hack until Deepspeed fixes the problem with resume from an existing engine that did some stepping deepspeed_engine, optimizer, lr_scheduler = deepspeed_reinit(self) @@ -1554,9 +1578,13 @@ def train( ) else: # We load the model state dict on the CPU to avoid an OOM error. - state_dict = torch.load(best_model_path, map_location="cpu") - # If the model is on the GPU, it still works! - self._load_state_dict_in_model(state_dict) + if is_sagemaker_mp_enabled(): + state_dict = smp.load(best_model_path, partial=self.args.smp_load_partial) + model.load_state_dict(state_dict) + else: + state_dict = torch.load(best_model_path, map_location="cpu") + # If the model is on the GPU, it still works! + self._load_state_dict_in_model(state_dict) else: logger.warning( f"Could not locate the best model at {best_model_path}, if you are running a distributed training " @@ -1714,17 +1742,20 @@ def _save_checkpoint(self, model, trial, metrics=None): xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) reissue_pt_warnings(caught_warnings) elif is_sagemaker_mp_enabled(): - if smp.rdp_rank() == 0: - # Consolidate the state dict on all processed of rdp_rank 0 + # Consolidate the state dict on all processes + if self.args.smp_save_partial: + opt_state_dict = self.optimizer.local_state_dict(gather_if_shard=False) + else: opt_state_dict = self.optimizer.state_dict() - # Save it and the scheduler on the main process - if self.args.should_save: - torch.save(opt_state_dict, os.path.join(output_dir, OPTIMIZER_NAME)) - with warnings.catch_warnings(record=True) as caught_warnings: - torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) - reissue_pt_warnings(caught_warnings) - if self.do_grad_scaling: - torch.save(self.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME)) + if self.args.should_save or smp.state.cfg.shard_optimizer_state: + smp.save(opt_state_dict, os.path.join(output_dir, OPTIMIZER_NAME), partial=self.args.smp_save_partial, v3=smp.state.cfg.shard_optimizer_state) + # Save it and the scheduler on the main process + if self.args.should_save: + with warnings.catch_warnings(record=True) as caught_warnings: + torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) + reissue_pt_warnings(caught_warnings) + if self.do_grad_scaling: + torch.save(self.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME)) elif self.args.should_save and not self.deepspeed: # deepspeed.save_checkpoint above saves model/optim/sched torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) @@ -1795,7 +1826,14 @@ def _load_optimizer_and_scheduler(self, checkpoint): # deepspeed loads optimizer/lr_scheduler together with the model in deepspeed_init return - if os.path.isfile(os.path.join(checkpoint, OPTIMIZER_NAME)) and os.path.isfile( + if self.args.smp_load_partial: + import glob + # SMP partial checkpoints are in {filename}_{pp_rank()}_{tp_rank()} or {filename}_{pp_rank()}_{tp_rank()}_{rdp_rank()} format. 
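+            # For example, with pipeline-parallel rank 0 and tensor-parallel rank 0 the shard would be named
+            # "<OPTIMIZER_NAME>_0_0" (illustrative), which is why the glob below matches any "_*" suffix.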
+ checkpoint_file_exists = glob.glob(os.path.join(checkpoint, OPTIMIZER_NAME) + "_*") + else: + checkpoint_file_exists = os.path.isfile(os.path.join(checkpoint, OPTIMIZER_NAME)) + + if checkpoint_file_exists and os.path.isfile( os.path.join(checkpoint, SCHEDULER_NAME) ): # Load in optimizer and scheduler states @@ -1813,7 +1851,12 @@ def _load_optimizer_and_scheduler(self, checkpoint): self.lr_scheduler.load_state_dict(lr_scheduler_state) else: map_location = "cpu" if is_sagemaker_mp_enabled() else self.args.device - self.optimizer.load_state_dict( + if is_sagemaker_mp_enabled(): + def opt_load_hook(mod, opt): + opt.load_state_dict(smp.load(os.path.join(checkpoint, OPTIMIZER_NAME), partial=self.args.smp_load_partial)) + self.model_wrapped.register_post_step_hook(opt_load_hook) + else: + self.optimizer.load_state_dict( torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location) ) with warnings.catch_warnings(record=True) as caught_warnings: @@ -2083,11 +2126,15 @@ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = Fa if output_dir is None: output_dir = self.args.output_dir + if is_torch_tpu_available(): self._save_tpu(output_dir) elif is_sagemaker_mp_enabled(): # Calling the state_dict needs to be done on the wrapped model and on all processes. - state_dict = self.model_wrapped.state_dict() + if self.args.smp_save_partial: + state_dict = self.model_wrapped.local_state_dict() + else: + state_dict = self.model_wrapped.state_dict() if self.args.should_save: self._save(output_dir, state_dict=state_dict) elif ( @@ -2175,9 +2222,15 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.") if state_dict is None: state_dict = self.model.state_dict() - torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) + if self.args.smp_save_partial: + smp.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME), partial=self.args.smp_save_partial) + else: + torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) else: - self.model.save_pretrained(output_dir, state_dict=state_dict) + if self.args.smp_save_partial: + self.model.save_pretrained(output_dir, save_function=smp.save, state_dict=state_dict) + else: + self.model.save_pretrained(output_dir, save_function=torch.save, state_dict=state_dict) if self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) @@ -2437,10 +2490,13 @@ def evaluation_loop( losses_host = None preds_host = None labels_host = None + inputs_host = None + # losses/preds/labels on CPU (final containers) all_losses = None all_preds = None all_labels = None + all_inputs = None # Will be useful when we have an iterable dataset so don't know its length. 
observed_num_examples = 0 @@ -2456,6 +2512,7 @@ def evaluation_loop( # Prediction step loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) + inputs_decode = inputs["input_ids"] if args.include_inputs_for_metrics else None if is_torch_tpu_available(): xm.mark_step() @@ -2468,6 +2525,14 @@ def evaluation_loop( labels = self._pad_across_processes(labels) labels = self._nested_gather(labels) labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) + if inputs_decode is not None: + inputs_decode = self._pad_across_processes(inputs_decode) + inputs_decode = self._nested_gather(inputs_decode) + inputs_host = ( + inputs_decode + if inputs_host is None + else nested_concat(inputs_host, inputs_decode, padding_index=-100) + ) if logits is not None: logits = self._pad_across_processes(logits) logits = self._nested_gather(logits) @@ -2484,6 +2549,13 @@ def evaluation_loop( if preds_host is not None: logits = nested_numpify(preds_host) all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) + if inputs_host is not None: + inputs_decode = nested_numpify(inputs_host) + all_inputs = ( + inputs_decode + if all_inputs is None + else nested_concat(all_inputs, inputs_decode, padding_index=-100) + ) if labels_host is not None: labels = nested_numpify(labels_host) all_labels = ( @@ -2491,7 +2563,7 @@ def evaluation_loop( ) # Set back to None to begin a new accumulation - losses_host, preds_host, labels_host = None, None, None + losses_host, preds_host, inputs_host, labels_host = None, None, None, None if args.past_index and hasattr(self, "_past"): # Clean the state at the end of the evaluation loop @@ -2504,6 +2576,11 @@ def evaluation_loop( if preds_host is not None: logits = nested_numpify(preds_host) all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) + if inputs_host is not None: + inputs_decode = nested_numpify(inputs_host) + all_inputs = ( + inputs_decode if all_inputs is None else nested_concat(all_inputs, inputs_decode, padding_index=-100) + ) if labels_host is not None: labels = nested_numpify(labels_host) all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) @@ -2529,10 +2606,17 @@ def evaluation_loop( all_preds = nested_truncate(all_preds, num_samples) if all_labels is not None: all_labels = nested_truncate(all_labels, num_samples) + if all_inputs is not None: + all_inputs = nested_truncate(all_inputs, num_samples) # Metrics! 
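+        # When `args.include_inputs_for_metrics` is set, the gathered inputs are forwarded to
+        # `compute_metrics` through `EvalPrediction.inputs`.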
if self.compute_metrics is not None and all_preds is not None and all_labels is not None: - metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) + if args.include_inputs_for_metrics: + metrics = self.compute_metrics( + EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs) + ) + else: + metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) else: metrics = {} @@ -2561,7 +2645,10 @@ def _nested_gather(self, tensors, name=None): name = "nested_gather" tensors = nested_xla_mesh_reduce(tensors, name) elif is_sagemaker_mp_enabled(): - tensors = smp_gather(tensors) + if smp.state.cfg.ddp: + tensors = distributed_concat(tensors.cuda()) + else: + tensors = smp_gather(tensors) elif self.args.local_rank != -1: tensors = distributed_concat(tensors) return tensors @@ -2913,7 +3000,6 @@ def prediction_loop( # if eval is called w/o train init deepspeed here if args.deepspeed and not self.deepspeed: - # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval # from the checkpoint eventually deepspeed_engine, _, _ = deepspeed_init(self, num_training_steps=0, resume_from_checkpoint=None) @@ -2944,6 +3030,7 @@ def prediction_loop( losses_host: torch.Tensor = None preds_host: Union[torch.Tensor, List[torch.Tensor]] = None labels_host: Union[torch.Tensor, List[torch.Tensor]] = None + inputs_host: Union[torch.Tensor, List[torch.Tensor]] = None world_size = max(1, args.world_size) @@ -2956,6 +3043,7 @@ def prediction_loop( make_multiple_of = dataloader.sampler.batch_size preds_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) labels_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) + inputs_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) model.eval() @@ -2969,6 +3057,8 @@ def prediction_loop( for step, inputs in enumerate(dataloader): loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) + inputs_decode = inputs["input_ids"] if args.include_inputs_for_metrics else None + if loss is not None: losses = loss.repeat(batch_size) losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0) @@ -2976,6 +3066,12 @@ def prediction_loop( preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) if labels is not None: labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) + if inputs_decode is not None: + inputs_host = ( + inputs_decode + if inputs_host is None + else nested_concat(inputs_host, inputs_decode, padding_index=-100) + ) self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. 
@@ -2984,9 +3080,10 @@ def prediction_loop( if not prediction_loss_only: preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) + inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids")) # Set back to None to begin a new accumulation - losses_host, preds_host, labels_host = None, None, None + losses_host, preds_host, labels_host, inputs_host = None, None, None, None if args.past_index and hasattr(self, "_past"): # Clean the state at the end of the evaluation loop @@ -2997,13 +3094,20 @@ def prediction_loop( if not prediction_loss_only: preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) + inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids")) eval_loss = eval_losses_gatherer.finalize() preds = preds_gatherer.finalize() if not prediction_loss_only else None label_ids = labels_gatherer.finalize() if not prediction_loss_only else None + inputs_ids = inputs_gatherer.finalize() if not prediction_loss_only else None if self.compute_metrics is not None and preds is not None and label_ids is not None: - metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) + if args.include_inputs_for_metrics: + metrics = self.compute_metrics( + EvalPrediction(predictions=preds, label_ids=label_ids, inputs=inputs_ids) + ) + else: + metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) else: metrics = {} diff --git a/src/transformers/trainer_seq2seq.py b/src/transformers/trainer_seq2seq.py index 73d5181ec364..5513b58bef94 100644 --- a/src/transformers/trainer_seq2seq.py +++ b/src/transformers/trainer_seq2seq.py @@ -163,9 +163,11 @@ def prediction_step( if "attention_mask" in inputs: gen_kwargs["attention_mask"] = inputs.get("attention_mask", None) + if "global_attention_mask" in inputs: + gen_kwargs["global_attention_mask"] = inputs.get("global_attention_mask", None) # prepare generation inputs - # some encoder-decoder models can have varying encder's and thus + # some encoder-decoder models can have varying encoder's and thus # varying model input names if hasattr(self.model, "encoder") and self.model.encoder.main_input_name != self.model.main_input_name: generation_inputs = inputs[self.model.encoder.main_input_name] diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index d8d15affb9ed..4450bfde646e 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -63,17 +63,43 @@ def set_seed(seed: int): tf.random.set_seed(seed) -class EvalPrediction(NamedTuple): +class EvalPrediction: """ Evaluation output (always contains labels), to be used to compute metrics. Parameters: predictions (`np.ndarray`): Predictions of the model. label_ids (`np.ndarray`): Targets to be matched. 
+        inputs (`np.ndarray`, *optional*): Input data passed to the model.
    """

-    predictions: Union[np.ndarray, Tuple[np.ndarray]]
-    label_ids: Union[np.ndarray, Tuple[np.ndarray]]
+    def __init__(
+        self,
+        predictions: Union[np.ndarray, Tuple[np.ndarray]],
+        label_ids: Union[np.ndarray, Tuple[np.ndarray]],
+        inputs: Optional[Union[np.ndarray, Tuple[np.ndarray]]] = None,
+    ):
+        self.predictions = predictions
+        self.label_ids = label_ids
+        self.inputs = inputs
+
+    def __iter__(self):
+        if self.inputs is not None:
+            return iter((self.predictions, self.label_ids, self.inputs))
+        else:
+            return iter((self.predictions, self.label_ids))
+
+    def __getitem__(self, idx):
+        if idx < 0 or idx > 2:
+            raise IndexError("tuple index out of range")
+        if idx == 2 and self.inputs is None:
+            raise IndexError("tuple index out of range")
+        if idx == 0:
+            return self.predictions
+        elif idx == 1:
+            return self.label_ids
+        elif idx == 2:
+            return self.inputs


 class EvalLoopOutput(NamedTuple):
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 6176794ab032..dd9a8bd0ff01 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -247,8 +247,10 @@ class TrainingArguments:
            Whether to use full float16 evaluation instead of 32-bit. This will be faster and save memory but can harm
            metric values.
        tf32 (`bool`, *optional*):
-           Whether to enable tf32 mode, available in Ampere and newer GPU architectures. This is an experimental API
-           and it may change.
+           Whether to enable the TF32 mode, available in Ampere and newer GPU architectures. The default value depends
+           on PyTorch's version default of `torch.backends.cuda.matmul.allow_tf32`. For more details please refer to
+           the [TF32](https://huggingface.co/docs/transformers/performance#tf32) documentation. This is an
+           experimental API and it may change.
        local_rank (`int`, *optional*, defaults to -1):
            Rank of the process during distributed training.
        xpu_backend (`str`, *optional*):
@@ -416,6 +418,18 @@ class TrainingArguments:
            `huggingface-cli login`.
        gradient_checkpointing (`bool`, *optional*, defaults to `False`):
            If True, use gradient checkpointing to save memory at the expense of slower backward pass.
+       include_inputs_for_metrics (`bool`, *optional*, defaults to `False`):
+           Whether or not the inputs will be passed to the `compute_metrics` function. This is intended for metrics
+           that need inputs, predictions and references for scoring calculation in the Metric class.
+       smp_save_partial (`bool`, *optional*, defaults to `False`):
+           If True, saves checkpoints partially. `smp_save_partial` can only be used with the SageMaker Model Parallel library.
+       smp_load_partial (`bool`, *optional*, defaults to `False`):
+           If True, loads partial checkpoints. `smp_load_partial` can only be used with the SageMaker Model Parallel library.
+       smp_tensor_parallel_full_model (`bool`, *optional*, defaults to `False`):
+           If True, apply tensor parallelism to the full model. `smp_tensor_parallel_full_model` can only be used with the SageMaker Model Parallel library.
    """

    output_dir: str = field(
@@ -739,6 +753,16 @@ class TrainingArguments:
            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
        },
    )
+   include_inputs_for_metrics: bool = field(
+       default=False, metadata={"help": "Whether or not the inputs will be passed to the `compute_metrics` function."}
+   )
+   smp_save_partial: bool = field(default=False, metadata={"help": "Save checkpoints partially for SMP."})
+   smp_load_partial: bool = field(default=False, metadata={"help": "Load partial checkpoints for SMP."})
+   smp_tensor_parallel_full_model: bool = field(default=False, metadata={"help": "Enables tensor parallelism for the full model in SMP."})
+
    # Deprecated arguments
    fp16_backend: str = field(
        default="auto",
@@ -757,6 +781,7 @@ class TrainingArguments:
        metadata={"help": "Used by the SageMaker launcher to send mp-specific args. Ignored in Trainer"},
    )
 
+
    def __post_init__(self):
        # Handle --use_env option in torch.distributed.launch (local_rank not passed as an arg then).
        # This needs to happen before any call to self.device or self.n_gpu.
@@ -974,6 +999,19 @@ def __post_init__(self):
            FutureWarning,
        )
 
+       if (self.smp_save_partial or self.smp_load_partial) and not is_sagemaker_mp_enabled():
+           raise ValueError("smp_save_partial and smp_load_partial can only be used with the SageMaker Model Parallel library.")
+
+       if is_sagemaker_mp_enabled() and not self.smp_save_partial and smp.state.cfg.shard_optimizer_state:
+           warnings.warn("Optimizer sharding can only be used with partial checkpointing. Switching to partial checkpointing.")
+           self.smp_save_partial = True
+
+       if is_sagemaker_mp_enabled() and self.smp_save_partial and self.load_best_model_at_end:
+           self.smp_load_partial = True
+
+       if is_sagemaker_mp_enabled() and (not self.smp_save_partial) and (self.save_strategy != IntervalStrategy.NO):
+           warnings.warn("Saving weights but not the optimizer state.")
+
    def __str__(self):
        self_as_dict = asdict(self)
@@ -1207,7 +1245,10 @@ def should_save(self):
            return self.local_process_index == 0
        else:
            if is_sagemaker_mp_enabled():
-               return smp.rank() == 0
+               if self.smp_save_partial:
+                   return smp.rdp_rank() == 0
+               else:
+                   return smp.rank() == 0
            else:
                return self.process_index == 0
diff --git a/src/transformers/utils/doc.py b/src/transformers/utils/doc.py
index eaf59ba50215..39508e18d222 100644
--- a/src/transformers/utils/doc.py
+++ b/src/transformers/utils/doc.py
@@ -207,7 +207,8 @@ def _prepare_output_docstrings(output_type, config_class, min_indent=None):
    ```python
    >>> # target is "nice puppet"
-   >>> target_start_index, target_end_index = torch.tensor([14]), torch.tensor([15])
+   >>> target_start_index = torch.tensor([{qa_target_start_index}])
+   >>> target_end_index = torch.tensor([{qa_target_end_index}])

    >>> outputs = model(**inputs, start_positions=target_start_index, end_positions=target_end_index)
    >>> loss = outputs.loss
@@ -269,9 +270,10 @@ def _prepare_output_docstrings(output_type, config_class, min_indent=None):
    ```python
    >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
    >>> num_labels = len(model.config.id2label)
-   >>> model = {model_class}.from_pretrained("{checkpoint}", num_labels=num_labels)
+   >>> model = {model_class}.from_pretrained(
+   ...     "{checkpoint}", num_labels=num_labels, problem_type="multi_label_classification"
+   ... )

-   >>> num_labels = len(model.config.id2label)
    >>> labels = torch.nn.functional.one_hot(torch.tensor([predicted_class_id]), num_classes=num_labels).to(
    ...     torch.float
    ...
) @@ -666,7 +668,8 @@ def _prepare_output_docstrings(output_type, config_class, min_indent=None): ```python >>> # target is "nice puppet" - >>> target_start_index, target_end_index = tf.constant([14]), tf.constant([15]) + >>> target_start_index = tf.constant([{qa_target_start_index}]) + >>> target_end_index = tf.constant([{qa_target_end_index}]) >>> outputs = model(**inputs, start_positions=target_start_index, end_positions=target_end_index) >>> loss = tf.math.reduce_mean(outputs.loss) @@ -1053,6 +1056,8 @@ def add_code_sample_docstrings( output_type=None, config_class=None, mask="[MASK]", + qa_target_start_index=14, + qa_target_end_index=15, model_cls=None, modality=None, expected_output="", @@ -1077,6 +1082,8 @@ def docstring_decorator(fn): processor_class=processor_class, checkpoint=checkpoint, mask=mask, + qa_target_start_index=qa_target_start_index, + qa_target_end_index=qa_target_end_index, expected_output=expected_output, expected_loss=expected_loss, ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 2bd71277b520..3233b44d896c 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -3350,6 +3350,30 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +REGNET_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class RegNetForImageClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RegNetModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RegNetPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + REMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index db6d8d467aa2..cbd3358ac8ff 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -17,6 +17,20 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) +class TFForcedBOSTokenLogitsProcessor(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFForcedEOSTokenLogitsProcessor(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + class TFLogitsProcessor(metaclass=DummyObject): _backends = ["tf"] diff --git a/templates/adding_a_missing_tokenization_test/README.md b/templates/adding_a_missing_tokenization_test/README.md new file mode 100644 index 000000000000..935f21c5ca8a --- /dev/null +++ b/templates/adding_a_missing_tokenization_test/README.md @@ -0,0 +1,39 @@ + + +This folder contains a template to add a tokenization test. + +## Usage + +Using the `cookiecutter` utility requires to have all the `dev` dependencies installed. + +Let's first [fork](https://docs.github.com/en/get-started/quickstart/fork-a-repo) the `transformers` repo on github. Once it's done you can clone your fork and install `transformers` in our environment: + +```shell script +git clone https://github.com/YOUR-USERNAME/transformers +cd transformers +pip install -e ".[dev]" +``` + +Once the installation is done, you can generate the template by running the following command. 
Be careful: the template will be generated inside a new folder in your current working directory.
+
+```shell script
+cookiecutter path-to-the-folder/adding_a_missing_tokenization_test/
+```
+
+You will then have to answer some questions about the tokenizer for which you want to add tests. The `modelname` should be cased according to the plain text casing, e.g., BERT, RoBERTa, DeBERTa.
+
+Once the command has finished, you should have one new file inside the newly created folder named `test_tokenization_Xxx.py`. At this point the template is finished and you can move it to the sub-folder of the corresponding model in the test folder.
diff --git a/templates/adding_a_missing_tokenization_test/cookiecutter-template-{{cookiecutter.modelname}}/test_tokenization_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_missing_tokenization_test/cookiecutter-template-{{cookiecutter.modelname}}/test_tokenization_{{cookiecutter.lowercase_modelname}}.py new file mode 100644 index 000000000000..631886f6b2eb --- /dev/null +++ b/templates/adding_a_missing_tokenization_test/cookiecutter-template-{{cookiecutter.modelname}}/test_tokenization_{{cookiecutter.lowercase_modelname}}.py @@ -0,0 +1,78 @@
+# coding=utf-8
+# Copyright 2022 {{cookiecutter.authors}}. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the {{cookiecutter.modelname}} tokenizer.
""" + + +import unittest + +{% if cookiecutter.has_slow_class == "True" and cookiecutter.has_fast_class == "True" -%} +from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer, {{cookiecutter.camelcase_modelname}}TokenizerFast +{% elif cookiecutter.has_slow_class == "True" -%} +from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer +{% elif cookiecutter.has_fast_class == "True" -%} +from transformers import {{cookiecutter.camelcase_modelname}}TokenizerFast +{% endif -%} +{% if cookiecutter.has_fast_class == "True" and cookiecutter.slow_tokenizer_use_sentencepiece == "True" -%} +from transformers.testing_utils import require_sentencepiece, require_tokenizers +from ..test_tokenization_common import TokenizerTesterMixin + + +@require_sentencepiece +@require_tokenizers +{% elif cookiecutter.slow_tokenizer_use_sentencepiece == "True" -%} +from transformers.testing_utils import require_sentencepiece +from ..test_tokenization_common import TokenizerTesterMixin + + +@require_sentencepiece +{% elif cookiecutter.has_fast_class == "True" -%} +from transformers.testing_utils import require_tokenizers +from ..test_tokenization_common import TokenizerTesterMixin + + +@require_tokenizers +{% else -%} +from ..test_tokenization_common import TokenizerTesterMixin + + +{% endif -%} +class {{cookiecutter.camelcase_modelname}}TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + {% if cookiecutter.has_slow_class == "True" -%} + tokenizer_class = {{cookiecutter.camelcase_modelname}}Tokenizer + test_slow_tokenizer = True + {% else -%} + tokenizer_class = None + test_slow_tokenizer = False + {% endif -%} + {% if cookiecutter.has_fast_class == "True" -%} + rust_tokenizer_class = {{cookiecutter.camelcase_modelname}}TokenizerFast + test_rust_tokenizer = True + {% else -%} + rust_tokenizer_class = None + test_rust_tokenizer = False + {% endif -%} + {% if cookiecutter.slow_tokenizer_use_sentencepiece == "True" -%} + test_sentencepiece = True + {% endif -%} + # TODO: Check in `TokenizerTesterMixin` if other attributes need to be changed + def setUp(self): + super().setUp() + + raise NotImplementedError( + "Here you have to implement the saving of a toy tokenizer in " + "`self.tmpdirname`." 
+ ) + + # TODO: add tests with hard-coded target values \ No newline at end of file diff --git a/templates/adding_a_missing_tokenization_test/cookiecutter.json b/templates/adding_a_missing_tokenization_test/cookiecutter.json new file mode 100644 index 000000000000..2e53818f9bb6 --- /dev/null +++ b/templates/adding_a_missing_tokenization_test/cookiecutter.json @@ -0,0 +1,10 @@ +{ + "modelname": "BrandNewBERT", + "uppercase_modelname": "BRAND_NEW_BERT", + "lowercase_modelname": "brand_new_bert", + "camelcase_modelname": "BrandNewBert", + "has_slow_class": ["True", "False"], + "has_fast_class": ["True", "False"], + "slow_tokenizer_use_sentencepiece": ["True", "False"], + "authors": "The HuggingFace Team" +} diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py index 57fd95dd3ff6..0f4d7824c164 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py @@ -259,6 +259,7 @@ def create_and_check_causal_lm_model_as_decoder( list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size] ) + def create_and_check_causal_lm_model_past( self, config, @@ -597,6 +598,10 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip(reason="Template classes interact badly with this test.") + def test_keras_fit(self): + pass + def test_causal_lm_base_model(self): """Test the base model of the causal LM model @@ -947,6 +952,10 @@ def _get_word_embedding_weight(model, embedding_layer): models_equal = False self.assertTrue(models_equal) + @unittest.skip(reason="Template classes interact badly with this test.") + def test_keras_fit(self): + pass + def _assert_tensors_equal(a, b, atol=1e-12, prefix=""): """If tensors not close, or a and b arent both tensors, raise a nice Assertion error.""" diff --git a/tests/auto/test_modeling_auto.py b/tests/auto/test_modeling_auto.py index ae04501ea2dd..02ecb08e1e2f 100644 --- a/tests/auto/test_modeling_auto.py +++ b/tests/auto/test_modeling_auto.py @@ -74,12 +74,9 @@ MODEL_FOR_MASKED_LM_MAPPING, MODEL_FOR_PRETRAINING_MAPPING, MODEL_FOR_QUESTION_ANSWERING_MAPPING, - MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, - MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_MAPPING, - MODEL_WITH_LM_HEAD_MAPPING, ) from transformers.models.bert.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_LIST from transformers.models.gpt2.modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_LIST @@ -251,40 +248,6 @@ def test_from_pretrained_with_tuple_values(self): model = AutoModel.from_pretrained(tmp_dir) self.assertIsInstance(model, FunnelBaseModel) - def test_parents_and_children_in_mappings(self): - # Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered - # by the parents and will return the wrong configuration type when using auto models - - mappings = ( - MODEL_MAPPING, - MODEL_FOR_PRETRAINING_MAPPING, - MODEL_FOR_QUESTION_ANSWERING_MAPPING, - 
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, - MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, - MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, - MODEL_WITH_LM_HEAD_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, - MODEL_FOR_MASKED_LM_MAPPING, - MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, - ) - - for mapping in mappings: - mapping = tuple(mapping.items()) - for index, (child_config, child_model) in enumerate(mapping[1:]): - for parent_config, parent_model in mapping[: index + 1]: - assert not issubclass( - child_config, parent_config - ), f"{child_config.__name__} is child of {parent_config.__name__}" - - # Tuplify child_model and parent_model since some of them could be tuples. - if not isinstance(child_model, (list, tuple)): - child_model = (child_model,) - if not isinstance(parent_model, (list, tuple)): - parent_model = (parent_model,) - - for child, parent in [(a, b) for a in child_model for b in parent_model]: - assert not issubclass(child, parent), f"{child.__name__} is child of {parent.__name__}" - def test_from_pretrained_dynamic_model_local(self): try: AutoConfig.register("custom", CustomConfig) diff --git a/tests/auto/test_modeling_tf_auto.py b/tests/auto/test_modeling_tf_auto.py index 04f2b4862cd7..a803a3451107 100644 --- a/tests/auto/test_modeling_tf_auto.py +++ b/tests/auto/test_modeling_tf_auto.py @@ -58,12 +58,9 @@ TF_MODEL_FOR_MASKED_LM_MAPPING, TF_MODEL_FOR_PRETRAINING_MAPPING, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, - TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, - TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, TF_MODEL_MAPPING, - TF_MODEL_WITH_LM_HEAD_MAPPING, ) from transformers.models.bert.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST from transformers.models.gpt2.modeling_tf_gpt2 import TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST @@ -218,38 +215,6 @@ def test_from_pretrained_with_tuple_values(self): model = TFAutoModel.from_pretrained(tmp_dir) self.assertIsInstance(model, TFFunnelBaseModel) - def test_parents_and_children_in_mappings(self): - # Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered - # by the parents and will return the wrong configuration type when using auto models - mappings = ( - TF_MODEL_MAPPING, - TF_MODEL_FOR_PRETRAINING_MAPPING, - TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, - TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, - TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, - TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, - TF_MODEL_WITH_LM_HEAD_MAPPING, - TF_MODEL_FOR_CAUSAL_LM_MAPPING, - TF_MODEL_FOR_MASKED_LM_MAPPING, - TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, - ) - - for mapping in mappings: - mapping = tuple(mapping.items()) - for index, (child_config, child_model) in enumerate(mapping[1:]): - for parent_config, parent_model in mapping[: index + 1]: - with self.subTest(msg=f"Testing if {child_config.__name__} is child of {parent_config.__name__}"): - self.assertFalse(issubclass(child_config, parent_config)) - - # Tuplify child_model and parent_model since some of them could be tuples. 
- if not isinstance(child_model, (list, tuple)): - child_model = (child_model,) - if not isinstance(parent_model, (list, tuple)): - parent_model = (parent_model,) - - for child, parent in [(a, b) for a in child_model for b in parent_model]: - assert not issubclass(child, parent), f"{child.__name__} is child of {parent.__name__}" - def test_new_model_registration(self): try: AutoConfig.register("new-model", NewModelConfig) diff --git a/tests/auto/test_tokenization_auto.py b/tests/auto/test_tokenization_auto.py index ae4e5896508d..57041e583029 100644 --- a/tests/auto/test_tokenization_auto.py +++ b/tests/auto/test_tokenization_auto.py @@ -151,19 +151,6 @@ def test_tokenizer_identifier_non_existent(self): ): _ = tokenizer_class.from_pretrained("julien-c/herlolip-not-exists") - def test_parents_and_children_in_mappings(self): - # Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered - # by the parents and will return the wrong configuration type when using auto models - - mappings = (TOKENIZER_MAPPING,) - - for mapping in mappings: - mapping = tuple(mapping.items()) - for index, (child_config, _) in enumerate(mapping[1:]): - for parent_config, _ in mapping[: index + 1]: - with self.subTest(msg=f"Testing if {child_config.__name__} is child of {parent_config.__name__}"): - self.assertFalse(issubclass(child_config, parent_config)) - def test_model_name_edge_cases_in_mappings(self): # tests: https://github.com/huggingface/transformers/pull/13251 # 1. models with `-`, e.g. xlm-roberta -> xlm_roberta diff --git a/tests/convnext/test_modeling_tf_convnext.py b/tests/convnext/test_modeling_tf_convnext.py index edab09fb69b9..579c27dd27a6 100644 --- a/tests/convnext/test_modeling_tf_convnext.py +++ b/tests/convnext/test_modeling_tf_convnext.py @@ -143,6 +143,13 @@ def setUp(self): def test_inputs_embeds(self): pass + @unittest.skipIf( + not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0, + reason="TF (<=2.8) does not support backprop for grouped convolutions on CPU.", + ) + def test_keras_fit(self): + pass + @unittest.skip(reason="ConvNext does not support input and output embeddings") def test_model_common_attributes(self): pass diff --git a/tests/generation/test_generation_logits_process.py b/tests/generation/test_generation_logits_process.py index b95110d0e06b..8489efd44d16 100644 --- a/tests/generation/test_generation_logits_process.py +++ b/tests/generation/test_generation_logits_process.py @@ -472,14 +472,14 @@ def test_forced_eos_token_logits_processor(self): logits_processor = ForcedEOSTokenLogitsProcessor(max_length=max_length, eos_token_id=eos_token_id) - # check that all scores are -inf except the eos_token_id when max_length is reached + # check that all scores are -inf except the eos_token_id when max_length-1 is reached input_ids = ids_tensor((batch_size, 4), vocab_size=20) scores = self._get_uniform_logits(batch_size, vocab_size) scores = logits_processor(input_ids, scores) self.assertTrue(torch.isneginf(scores[:, eos_token_id + 1 :]).all()) self.assertListEqual(scores[:, eos_token_id].tolist(), 4 * [0]) # score for eos_token_id should be zero - # check that eos_token_id is not forced if max_length is not reached + # check that eos_token_id is not forced if max_length-1 is not reached input_ids = ids_tensor((batch_size, 3), vocab_size=20) scores = self._get_uniform_logits(batch_size, vocab_size) scores = logits_processor(input_ids, scores) diff --git a/tests/generation/test_generation_tf_logits_process.py 
b/tests/generation/test_generation_tf_logits_process.py index 2dc366f05c76..8a5ee053680e 100644 --- a/tests/generation/test_generation_tf_logits_process.py +++ b/tests/generation/test_generation_tf_logits_process.py @@ -26,6 +26,8 @@ import tensorflow as tf from transformers.generation_tf_logits_process import ( + TFForcedBOSTokenLogitsProcessor, + TFForcedEOSTokenLogitsProcessor, TFLogitsProcessorList, TFMinLengthLogitsProcessor, TFNoBadWordsLogitsProcessor, @@ -43,7 +45,7 @@ @require_tf class TFLogitsProcessorTest(unittest.TestCase): def _get_uniform_logits(self, batch_size: int, length: int): - scores = np.ones((batch_size, length), dtype=np.float32) / length + scores = tf.ones((batch_size, length), dtype=tf.float32) / length return scores def test_min_length_dist_processor(self): @@ -54,15 +56,17 @@ def test_min_length_dist_processor(self): min_dist_processor = TFMinLengthLogitsProcessor(min_length=10, eos_token_id=eos_token_id) # check that min length is applied at length 5 - input_ids = ids_tensor((batch_size, 5), vocab_size=20) + cur_len = 5 + input_ids = ids_tensor((batch_size, cur_len), vocab_size=20) scores = self._get_uniform_logits(batch_size, vocab_size) - scores_before_min_length = min_dist_processor(input_ids, scores) + scores_before_min_length = min_dist_processor(input_ids, scores, cur_len) self.assertListEqual(scores_before_min_length[:, eos_token_id].numpy().tolist(), 4 * [-float("inf")]) # check that min length is not applied anymore at length 15 - input_ids = ids_tensor((batch_size, 15), vocab_size=20) + cur_len = 15 + input_ids = ids_tensor((batch_size, cur_len), vocab_size=20) scores = self._get_uniform_logits(batch_size, vocab_size) - scores_before_min_length = min_dist_processor(input_ids, scores) + scores_before_min_length = min_dist_processor(input_ids, scores, cur_len) self.assertFalse(tf.math.reduce_any(tf.math.is_inf(scores_before_min_length)).numpy()) def test_temperature_dist_warper(self): @@ -72,8 +76,10 @@ def test_temperature_dist_warper(self): scores = self._get_uniform_logits(batch_size=2, length=length) # tweak scores to not be uniform anymore + scores = scores.numpy() scores[1, 5] = (1 / length) + 0.1 # peak, 1st batch scores[1, 10] = (1 / length) - 0.4 # valley, 1st batch + scores = tf.convert_to_tensor(scores) # compute softmax probs = tf.nn.softmax(scores, axis=-1) @@ -97,8 +103,11 @@ def test_temperature_dist_warper(self): self.assertLess(tf.math.reduce_min(probs[1, :]), tf.math.reduce_min(warped_prob_smooth[1, :])) def test_repetition_penalty_dist_process(self): - input_ids = tf.constant([[0, 1], [5, 0]], dtype=tf.int32) vocab_size = 10 + cur_len = 2 + + input_ids = tf.constant([[0, 1], [5, 0]], dtype=tf.int32) + self.assertEqual(cur_len, input_ids.shape[1]) scores = self._get_uniform_logits(batch_size=2, length=vocab_size) @@ -109,7 +118,7 @@ def test_repetition_penalty_dist_process(self): rep_penalty_proc = TFRepetitionPenaltyLogitsProcessor(penalty=2.0) - scores = rep_penalty_proc(input_ids, tf.identity(scores)) + scores = rep_penalty_proc(input_ids, tf.identity(scores), cur_len) # check that values were correctly changed self.assertAlmostEqual(scores[0, 0].numpy(), -(1 / vocab_size) * 2) @@ -188,15 +197,18 @@ def test_top_p_dist_warper(self): def test_no_repeat_ngram_dist_processor(self): vocab_size = 3 batch_size = 2 + cur_len = 4 input_ids = tf.constant([[1, 1, 2, 1], [0, 1, 0, 1]], dtype=tf.int32) + self.assertEqual(cur_len, input_ids.shape[1]) + scores = self._get_uniform_logits(batch_size, vocab_size) no_repeat_proc_2_gram = 
TFNoRepeatNGramLogitsProcessor(2) no_repeat_proc_3_gram = TFNoRepeatNGramLogitsProcessor(3) - filtered_scores_2_gram = no_repeat_proc_2_gram(input_ids, tf.identity(scores)) - filtered_scores_3_gram = no_repeat_proc_3_gram(input_ids, tf.identity(scores)) + filtered_scores_2_gram = no_repeat_proc_2_gram(input_ids, tf.identity(scores), cur_len) + filtered_scores_3_gram = no_repeat_proc_3_gram(input_ids, tf.identity(scores), cur_len) # 2-gram would forbid 2nd and 3rd token (1,2) at 1st batch and 1st token (0) at 2nd batch self.assertListEqual( @@ -212,14 +224,17 @@ def test_no_bad_words_dist_processor(self): vocab_size = 5 batch_size = 2 eos_token_id = 4 + cur_len = 4 input_ids = tf.constant([[0, 1, 3, 1], [0, 1, 0, 1]], dtype=tf.int32) + self.assertEqual(cur_len, input_ids.shape[1]) + bad_word_tokens = [[1], [4], [1, 0], [0, 1, 2], [1, 3, 1, 3]] scores = self._get_uniform_logits(batch_size, vocab_size) no_bad_words_dist_proc = TFNoBadWordsLogitsProcessor(bad_words_ids=bad_word_tokens, eos_token_id=eos_token_id) - filtered_scores = no_bad_words_dist_proc(input_ids, tf.identity(scores)) + filtered_scores = no_bad_words_dist_proc(input_ids, tf.identity(scores), cur_len) # batch 1: 1st, 2nd, and 4th (0, 1, 3) token are forbidden # batch 2: 1st, 2nd, and 3rd (0, 1, 2) token are forbidden @@ -228,14 +243,65 @@ def test_no_bad_words_dist_processor(self): [[True, True, False, True, True], [True, True, True, False, True]], ) + def test_forced_bos_token_logits_processor(self): + vocab_size = 20 + batch_size = 4 + bos_token_id = 0 + + logits_processor = TFForcedBOSTokenLogitsProcessor(bos_token_id=bos_token_id) + + # check that all scores are -inf except the bos_token_id score + cur_len = 1 + input_ids = ids_tensor((batch_size, cur_len), vocab_size=20) + scores = self._get_uniform_logits(batch_size, vocab_size) + scores = logits_processor(input_ids, scores, cur_len) + self.assertTrue( + tf.math.reduce_all(tf.math.is_inf(scores[:, bos_token_id + 1 :]) & (scores[:, bos_token_id + 1 :] < 0)) + ) + self.assertListEqual(scores[:, bos_token_id].numpy().tolist(), 4 * [0]) # score for bos_token_id shold be zero + + # check that bos_token_id is not forced if current length is greater than 1 + cur_len = 4 + input_ids = ids_tensor((batch_size, cur_len), vocab_size=20) + scores = self._get_uniform_logits(batch_size, vocab_size) + scores = logits_processor(input_ids, scores, cur_len) + self.assertFalse(tf.math.reduce_any(tf.math.is_inf((scores)))) + + def test_forced_eos_token_logits_processor(self): + vocab_size = 20 + batch_size = 4 + eos_token_id = 0 + max_length = 5 + + logits_processor = TFForcedEOSTokenLogitsProcessor(max_length=max_length, eos_token_id=eos_token_id) + + # check that all scores are -inf except the eos_token_id when max_length-1 is reached + cur_len = 4 + input_ids = ids_tensor((batch_size, cur_len), vocab_size=20) + scores = self._get_uniform_logits(batch_size, vocab_size) + scores = logits_processor(input_ids, scores, cur_len) + self.assertTrue( + tf.math.reduce_all(tf.math.is_inf(scores[:, eos_token_id + 1 :]) & (scores[:, eos_token_id + 1 :] < 0)) + ) + self.assertListEqual( + scores[:, eos_token_id].numpy().tolist(), 4 * [0] + ) # score for eos_token_id should be zero + + # check that eos_token_id is not forced if max_length-1 is not reached + cur_len = 3 + input_ids = ids_tensor((batch_size, cur_len), vocab_size=20) + scores = self._get_uniform_logits(batch_size, vocab_size) + scores = logits_processor(input_ids, scores, cur_len) + 
self.assertFalse(tf.math.reduce_any(tf.math.is_inf((scores)))) + def test_processor_list(self): batch_size = 4 - sequence_length = 10 + cur_len = 10 vocab_size = 15 eos_token_id = 0 # dummy input_ids and scores - input_ids = ids_tensor((batch_size, sequence_length), vocab_size) + input_ids = ids_tensor((batch_size, cur_len), vocab_size) input_ids_comp = tf.identity(input_ids) scores = self._get_uniform_logits(batch_size, vocab_size) @@ -251,13 +317,13 @@ def test_processor_list(self): no_bad_words_dist_proc = TFNoBadWordsLogitsProcessor(bad_words_ids=[[1]], eos_token_id=eos_token_id) # no processor list - scores = min_dist_proc(input_ids, scores) + scores = min_dist_proc(input_ids, scores, cur_len) scores = temp_dist_warp(input_ids, scores) - scores = rep_penalty_proc(input_ids, scores) + scores = rep_penalty_proc(input_ids, scores, cur_len) scores = top_k_warp(input_ids, scores) scores = top_p_warp(input_ids, scores) - scores = no_repeat_proc(input_ids, scores) - scores = no_bad_words_dist_proc(input_ids, scores) + scores = no_repeat_proc(input_ids, scores, cur_len) + scores = no_bad_words_dist_proc(input_ids, scores, cur_len) # with processor list processor = TFLogitsProcessorList( @@ -271,7 +337,7 @@ def test_processor_list(self): no_bad_words_dist_proc, ] ) - scores_comp = processor(input_ids, scores_comp) + scores_comp = processor(input_ids, scores_comp, cur_len=cur_len) # remove inf scores = set_tensor_by_indices_to_value(scores, tf.math.is_inf(scores), -1e9) diff --git a/tests/gpt2/test_modeling_tf_gpt2.py b/tests/gpt2/test_modeling_tf_gpt2.py index d6470c0d1526..5b78bdf1bd2e 100644 --- a/tests/gpt2/test_modeling_tf_gpt2.py +++ b/tests/gpt2/test_modeling_tf_gpt2.py @@ -536,7 +536,6 @@ def test_lm_generate_greedy_distilgpt2_beam_search_special(self): "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids], "no_repeat_ngram_size": 2, "do_sample": False, - "repetition_penalty": 1.3, "num_beams": 2, } @@ -544,8 +543,8 @@ def test_lm_generate_greedy_distilgpt2_beam_search_special(self): output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) expected_output_string = [ - "Today is a beautiful day and I hope you enjoy it.\nI am very happy to announce that", - "Yesterday was the first time I've ever seen a game where you can play with", + "Today is a beautiful day and a great day for all of us.\n\nI’m", + "Yesterday was the first day of the year for the second time in a row,", ] self.assertListEqual(output_strings, expected_output_string) diff --git a/tests/regnet/__init__.py b/tests/regnet/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/regnet/test_modeling_regnet.py b/tests/regnet/test_modeling_regnet.py new file mode 100644 index 000000000000..331e45296bc3 --- /dev/null +++ b/tests/regnet/test_modeling_regnet.py @@ -0,0 +1,271 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch RegNet model. 
""" + + +import inspect +import unittest + +from transformers import RegNetConfig +from transformers.file_utils import cached_property, is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from ..test_configuration_common import ConfigTester +from ..test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import RegNetForImageClassification, RegNetModel + from transformers.models.regnet.modeling_regnet import REGNET_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoFeatureExtractor + + +class RegNetModelTester: + def __init__( + self, + parent, + batch_size=3, + image_size=32, + num_channels=3, + embeddings_size=10, + hidden_sizes=[10, 20, 30, 40], + depths=[1, 1, 2, 1], + is_training=True, + use_labels=True, + hidden_act="relu", + num_labels=3, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.num_channels = num_channels + self.embeddings_size = embeddings_size + self.hidden_sizes = hidden_sizes + self.depths = depths + self.is_training = is_training + self.use_labels = use_labels + self.hidden_act = hidden_act + self.num_labels = num_labels + self.scope = scope + self.num_stages = len(hidden_sizes) + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return RegNetConfig( + num_channels=self.num_channels, + embeddings_size=self.embeddings_size, + hidden_sizes=self.hidden_sizes, + depths=self.depths, + hidden_act=self.hidden_act, + num_labels=self.num_labels, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = RegNetModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + # expected last hidden states: B, C, H // 32, W // 32 + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32), + ) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = RegNetForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class RegNetModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as RegNet does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = (RegNetModel, RegNetForImageClassification) if is_torch_available() else () + + test_pruning = False + test_torchscript = False + test_resize_embeddings = False + test_head_masking = False + has_attentions = False + + def setUp(self): + self.model_tester = RegNetModelTester(self) + self.config_tester = ConfigTester(self, config_class=RegNetConfig, has_text_modality=False) + + def test_config(self): + self.create_and_test_config_common_properties() + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + self.config_tester.check_config_arguments_init() + + def create_and_test_config_common_properties(self): + return + + @unittest.skip(reason="RegNet does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="RegNet does not support input and output embeddings") + def test_model_common_attributes(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config=config) + for name, module in model.named_modules(): + if isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)): + self.assertTrue( + torch.all(module.weight == 1), + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + self.assertTrue( + torch.all(module.bias == 0), + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_stages = self.model_tester.num_stages + self.assertEqual(len(hidden_states), expected_num_stages + 1) + + # RegNet's feature maps are of shape (batch_size, num_channels, height, width) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [self.model_tester.image_size // 2, self.model_tester.image_size // 2], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + layers_type = ["basic", "bottleneck"] + for model_class in self.all_model_classes: + for layer_type in layers_type: + config.layer_type = layer_type + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + 
check_hidden_states_output(inputs_dict, config, model_class) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in REGNET_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = RegNetModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class RegNetModelIntegrationTest(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return ( + AutoFeatureExtractor.from_pretrained(REGNET_PRETRAINED_MODEL_ARCHIVE_LIST[0]) + if is_vision_available() + else None + ) + + @slow + def test_inference_image_classification_head(self): + model = RegNetForImageClassification.from_pretrained(REGNET_PRETRAINED_MODEL_ARCHIVE_LIST[0]).to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([-0.4180, -1.5051, -3.4836]).to(torch_device) + + self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) diff --git a/tests/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py b/tests/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py index 403255c4ce22..873c09105bc0 100644 --- a/tests/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py +++ b/tests/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py @@ -413,28 +413,22 @@ def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict): pt_inputs = {k: torch.tensor(v.tolist()) for k, v in flax_inputs.items()} with torch.no_grad(): - pt_outputs = pt_model(**pt_inputs) - pt_logits = pt_outputs.logits - pt_outputs = pt_outputs.to_tuple() - - fx_outputs = fx_model(**inputs_dict) - fx_logits = fx_outputs.logits - fx_outputs = fx_outputs.to_tuple() + pt_outputs = pt_model(**pt_inputs).to_tuple() + fx_outputs = fx_model(**inputs_dict).to_tuple() self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") - self.assert_almost_equals(fx_logits, pt_logits.numpy(), 4e-2) + for fx_output, pt_output in zip(fx_outputs, pt_outputs): + self.assert_almost_equals(fx_output, pt_output.numpy(), 1e-5) # PT -> Flax with tempfile.TemporaryDirectory() as tmpdirname: pt_model.save_pretrained(tmpdirname) fx_model_loaded = FlaxSpeechEncoderDecoderModel.from_pretrained(tmpdirname, from_pt=True) - fx_outputs_loaded = fx_model_loaded(**inputs_dict) - fx_logits_loaded = fx_outputs_loaded.logits - fx_outputs_loaded = fx_outputs_loaded.to_tuple() - + fx_outputs_loaded = fx_model_loaded(**inputs_dict).to_tuple() self.assertEqual(len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch") - self.assert_almost_equals(fx_logits_loaded, pt_logits.numpy(), 4e-2) + for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs): + self.assert_almost_equals(fx_output_loaded, pt_output.numpy(), 1e-5) # Flax -> PT with tempfile.TemporaryDirectory() as tmpdirname: @@ -445,12 
+439,11 @@ def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict): pt_model_loaded.eval() with torch.no_grad(): - pt_outputs_loaded = pt_model_loaded(**pt_inputs) - pt_logits_loaded = pt_outputs_loaded.logits - pt_outputs_loaded = pt_outputs_loaded.to_tuple() + pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple() self.assertEqual(len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch") - self.assert_almost_equals(fx_logits, pt_logits_loaded.numpy(), 4e-2) + for fx_output, pt_output_loaded in zip(fx_outputs, pt_outputs_loaded): + self.assert_almost_equals(fx_output, pt_output_loaded.numpy(), 1e-5) def check_equivalence_pt_to_flax(self, config, decoder_config, inputs_dict): diff --git a/tests/speech_to_text/test_modeling_tf_speech_to_text.py b/tests/speech_to_text/test_modeling_tf_speech_to_text.py index d8d538fe194a..897c54722a2f 100644 --- a/tests/speech_to_text/test_modeling_tf_speech_to_text.py +++ b/tests/speech_to_text/test_modeling_tf_speech_to_text.py @@ -508,7 +508,7 @@ def test_lm_head_model_random_beam_search_generate(self): # if bos token id is not defined model needs input_ids, num_return_sequences = 1 self._check_generated_ids(model.generate(input_features, do_sample=True, num_beams=2)) - with self.assertRaises(AssertionError): + with self.assertRaises(ValueError): # generating more sequences than having beams leads is not possible model.generate(input_features, do_sample=False, num_return_sequences=3, num_beams=2) diff --git a/tests/t5/test_modeling_tf_t5.py b/tests/t5/test_modeling_tf_t5.py index 7ac0b33e426b..7445aae53001 100644 --- a/tests/t5/test_modeling_tf_t5.py +++ b/tests/t5/test_modeling_tf_t5.py @@ -804,33 +804,3 @@ def test_translation_en_to_ro(self): translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) self.assertEqual(translation, expected_translation) - - def test_finetune_keras_trainer(self): - """Ensure that the model can be fine-tuned via the keras API and - that metrics work as expected. - """ - - # This metric expects to be called with the logits output - def _accuracy(y_true, y_pred): - return tf.keras.metrics.sparse_categorical_crossentropy(y_true[:, 0], y_pred[:, 0]) - - # measure the accuracy of the first token - class FirstTokenAccuracy(tf.keras.metrics.MeanMetricWrapper): - def __init__(self, name="accuracy", **kwargs): - super().__init__(_accuracy, name=name, **kwargs) - - model = self.model - model.compile("adam", metrics=FirstTokenAccuracy()) - tokenizer = T5Tokenizer.from_pretrained("t5-small") - - examples = [ - ("sentiment: Everything is awesome!", "positive"), - ("sentiment: Tensorflow datasets are hard to use", "negative"), - ] - - inputs = dict(tokenizer([x[0] for x in examples], padding=True, return_tensors="tf")) - inputs["labels"] = tokenizer([x[1] for x in examples], return_tensors="tf").input_ids - - model.fit(inputs) - m = model.evaluate(inputs) - self.assertEqual(len(m), 2) diff --git a/tests/tapex/__init__.py b/tests/tapex/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/tapex/test_tokenization_tapex.py b/tests/tapex/test_tokenization_tapex.py new file mode 100644 index 000000000000..dd9f3d4bcf25 --- /dev/null +++ b/tests/tapex/test_tokenization_tapex.py @@ -0,0 +1,909 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os +import shutil +import tempfile +import unittest +from typing import List + +import pandas as pd + +from transformers import AddedToken, TapexTokenizer +from transformers.models.tapex.tokenization_tapex import VOCAB_FILES_NAMES +from transformers.testing_utils import is_pt_tf_cross_test, require_pandas, slow + +from ..test_tokenization_common import TokenizerTesterMixin + + +@require_pandas +class TapexTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + tokenizer_class = TapexTokenizer + test_rust_tokenizer = False + from_pretrained_kwargs = {"cls_token": ""} + test_seq2seq = False + + def setUp(self): + super().setUp() + + # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt + # fmt: off + vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "\u0120", "\u0120l", "\u0120n", "\u0120lo", "\u0120low", "er", "\u0120lowest", "\u0120newer", "\u0120wider", ""] # noqa: E231 + # fmt: on + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] + self.special_tokens_map = {"unk_token": ""} + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(self.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + def get_table(self, tokenizer, length=5): + toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))] + + if length == 0: + data = {} + else: + data = {toks[0]: [toks[tok] for tok in range(1, length)]} + + table = pd.DataFrame.from_dict(data) + + return table + + def get_table_and_query(self, tokenizer, length=5): + toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))] + table = self.get_table(tokenizer, length=length - 3) + query = " ".join(toks[:3]) + + return table, query + + def get_clean_sequence( + self, + tokenizer, + with_prefix_space=False, + max_length=20, + min_length=5, + empty_table: bool = False, + add_special_tokens: bool = True, + return_table_and_query: bool = False, + ): + + toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))] + + if empty_table: + table = pd.DataFrame.from_dict({}) + query = " ".join(toks[:min_length]) + else: + data = {toks[0]: [toks[tok] for tok in range(1, min_length - 3)]} + table = pd.DataFrame.from_dict(data) + query = " ".join(toks[:3]) + + output_ids = tokenizer.encode(table, query, add_special_tokens=add_special_tokens) + output_txt = tokenizer.decode(output_ids) + + if len(output_ids) < min_length: + raise ValueError("Update the code to generate the sequences so that they are larger") + if len(output_ids) > max_length: + raise ValueError("Update the code to generate the sequences so that they are smaller") + + if return_table_and_query: + return output_txt, output_ids, table, query + + return output_txt, output_ids + + def 
get_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "lower newer" + output_text = "lower newer" + return input_text, output_text + + def test_full_tokenizer_roberta(self): + tokenizer = self.tokenizer_class(self.vocab_file, self.merges_file, **self.special_tokens_map) + text = "lower newer" + bpe_tokens = ["l", "o", "w", "er", "\u0120", "n", "e", "w", "er"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + def roberta_dict_integration_testing(self): + tokenizer = self.get_tokenizer() + + self.assertListEqual(tokenizer.encode("Hello world!", add_special_tokens=False), [0, 31414, 232, 328, 2]) + self.assertListEqual( + tokenizer.encode("Hello world! cécé herlolip 418", add_special_tokens=False), + [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2], + ) + + def test_add_tokens_tokenizer(self): + tokenizers: List[TapexTokenizer] = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + table = self.get_table(tokenizer, length=0) + vocab_size = tokenizer.vocab_size + all_size = len(tokenizer) + + self.assertNotEqual(vocab_size, 0) + + # We usually have added tokens from the start in tests because our vocab fixtures are + # smaller than the original vocabs - let's not assert this + # self.assertEqual(vocab_size, all_size) + + new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"] + added_toks = tokenizer.add_tokens(new_toks) + vocab_size_2 = tokenizer.vocab_size + all_size_2 = len(tokenizer) + + self.assertNotEqual(vocab_size_2, 0) + self.assertEqual(vocab_size, vocab_size_2) + self.assertEqual(added_toks, len(new_toks)) + self.assertEqual(all_size_2, all_size + len(new_toks)) + + tokens = tokenizer.encode(table, "aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False) + + self.assertGreaterEqual(len(tokens), 4) + self.assertGreater(tokens[0], tokenizer.vocab_size - 1) + self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) + + new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"} + added_toks_2 = tokenizer.add_special_tokens(new_toks_2) + vocab_size_3 = tokenizer.vocab_size + all_size_3 = len(tokenizer) + + self.assertNotEqual(vocab_size_3, 0) + self.assertEqual(vocab_size, vocab_size_3) + self.assertEqual(added_toks_2, len(new_toks_2)) + self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) + + tokens = tokenizer.encode( + table, + ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", + add_special_tokens=False, + ) + + self.assertGreaterEqual(len(tokens), 6) + self.assertGreater(tokens[0], tokenizer.vocab_size - 1) + self.assertGreater(tokens[0], tokens[1]) + self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) + self.assertGreater(tokens[-2], tokens[-3]) + self.assertEqual(tokens[0], tokenizer.eos_token_id) + self.assertEqual(tokens[-2], tokenizer.pad_token_id) + + def test_token_type_ids(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + empty_table = self.get_table(tokenizer, length=0) + seq_0 = "Test this method." 
+ + # We want to have sequence 0 and sequence 1 are tagged + # respectively with 0 and 1 token_ids + # (regardless of whether the model use token type ids) + # We use this assumption in the QA pipeline among other place + output = tokenizer(empty_table, seq_0, return_token_type_ids=True) + + # Assert that the token type IDs have the same length as the input IDs + self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"])) + self.assertIn(0, output["token_type_ids"]) + + def test_add_special_tokens(self): + tokenizers: List[TapexTokenizer] = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + input_table = self.get_table(tokenizer, length=0) + + special_token = "[SPECIAL_TOKEN]" + + tokenizer.add_special_tokens({"cls_token": special_token}) + encoded_special_token = tokenizer.encode(input_table, special_token, add_special_tokens=False) + self.assertEqual(len(encoded_special_token), 1) + + decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True) + self.assertTrue(special_token not in decoded) + + def test_batch_encode_plus_overflowing_tokens(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + table = self.get_table(tokenizer, length=10) + string_sequences = ["Testing the prepare_for_model method.", "Test"] + + if tokenizer.pad_token is None: + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + + tokenizer.batch_encode_plus( + table, string_sequences, return_overflowing_tokens=True, truncation=True, padding=True, max_length=3 + ) + + @is_pt_tf_cross_test + def test_batch_encode_plus_tensors(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + sequences = [ + "Testing batch encode plus", + "Testing batch encode plus with different sequence lengths", + "Testing batch encode plus with different sequence lengths correctly pads", + ] + + table = self.get_table(tokenizer, length=0) + + # A Tensor cannot be build by sequences which are not the same size + self.assertRaises(ValueError, tokenizer.batch_encode_plus, table, sequences, return_tensors="pt") + self.assertRaises(ValueError, tokenizer.batch_encode_plus, table, sequences, return_tensors="tf") + + if tokenizer.pad_token_id is None: + self.assertRaises( + ValueError, + tokenizer.batch_encode_plus, + table, + sequences, + padding=True, + return_tensors="pt", + ) + self.assertRaises( + ValueError, + tokenizer.batch_encode_plus, + table, + sequences, + padding="longest", + return_tensors="tf", + ) + else: + pytorch_tensor = tokenizer.batch_encode_plus(table, sequences, padding=True, return_tensors="pt") + tensorflow_tensor = tokenizer.batch_encode_plus( + table, sequences, padding="longest", return_tensors="tf" + ) + encoded_sequences = tokenizer.batch_encode_plus(table, sequences, padding=True) + + for key in encoded_sequences.keys(): + pytorch_value = pytorch_tensor[key].tolist() + tensorflow_value = tensorflow_tensor[key].numpy().tolist() + encoded_value = encoded_sequences[key] + + self.assertEqual(pytorch_value, tensorflow_value, encoded_value) + + def test_call(self): + # Tests that all call wrap to encode_plus and batch_encode_plus + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + sequences = [ + "Testing batch encode plus", + "Testing batch encode plus with different sequence lengths", + 
"Testing batch encode plus with different sequence lengths correctly pads", + ] + + # Test not batched + table = self.get_table(tokenizer, length=0) + encoded_sequences_1 = tokenizer.encode_plus(table, sequences[0]) + encoded_sequences_2 = tokenizer(table, sequences[0]) + self.assertEqual(encoded_sequences_1, encoded_sequences_2) + + # Test not batched pairs + table = self.get_table(tokenizer, length=10) + encoded_sequences_1 = tokenizer.encode_plus(table, sequences[1]) + encoded_sequences_2 = tokenizer(table, sequences[1]) + self.assertEqual(encoded_sequences_1, encoded_sequences_2) + + # Test batched + table = self.get_table(tokenizer, length=0) + encoded_sequences_1 = tokenizer.batch_encode_plus(table, sequences) + encoded_sequences_2 = tokenizer(table, sequences) + self.assertEqual(encoded_sequences_1, encoded_sequences_2) + + def test_internal_consistency(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + table = self.get_table(tokenizer, length=0) + input_text, output_text = self.get_input_output_texts(tokenizer) + + tokens = tokenizer.tokenize(input_text) + ids = tokenizer.convert_tokens_to_ids(tokens) + ids_2 = tokenizer.encode(table, input_text, add_special_tokens=False) + self.assertListEqual(ids, ids_2) + + tokens_2 = tokenizer.convert_ids_to_tokens(ids) + self.assertNotEqual(len(tokens_2), 0) + text_2 = tokenizer.decode(ids) + self.assertIsInstance(text_2, str) + + self.assertEqual(text_2, output_text) + + def test_save_and_load_tokenizer(self): + # safety check on max_len default value so we are sure the test works + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + self.assertNotEqual(tokenizer.model_max_length, 42) + + # Now let's start the test + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Isolate this from the other tests because we save additional tokens/etc + table = self.get_table(tokenizer, length=0) + tmpdirname = tempfile.mkdtemp() + + sample_text = " He is very happy, UNwant\u00E9d,running" + before_tokens = tokenizer.encode(table, sample_text, add_special_tokens=False) + before_vocab = tokenizer.get_vocab() + tokenizer.save_pretrained(tmpdirname) + + after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) + after_tokens = after_tokenizer.encode(table, sample_text, add_special_tokens=False) + after_vocab = after_tokenizer.get_vocab() + self.assertListEqual(before_tokens, after_tokens) + self.assertDictEqual(before_vocab, after_vocab) + + shutil.rmtree(tmpdirname) + + def test_number_of_added_tokens(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + + table, query = self.get_table_and_query(tokenizer) + + sequences = tokenizer.encode(table, query, add_special_tokens=False) + attached_sequences = tokenizer.encode(table, query, add_special_tokens=True) + + self.assertEqual(2, len(attached_sequences) - len(sequences)) + + @unittest.skip("TAPEX cannot handle `prepare_for_model` without passing by `encode_plus` or `batch_encode_plus`") + def test_prepare_for_model(self): + pass + + @unittest.skip("TAPEX tokenizer does not support pairs.") + def test_maximum_encoding_length_pair_input(self): + pass + + @unittest.skip("TAPEX tokenizer does not support pairs.") + def test_maximum_encoding_length_single_input(self): + pass + + 
@unittest.skip("Not implemented") + def test_right_and_left_truncation(self): + pass + + def test_encode_decode_with_spaces(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + table = self.get_table(tokenizer, length=0) + + new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)] + tokenizer.add_tokens(new_toks) + input = "[ABC][DEF][ABC][DEF]" + if self.space_between_special_tokens: + output = "[ABC] [DEF] [ABC] [DEF]" + else: + output = input + encoded = tokenizer.encode(table, input, add_special_tokens=False) + decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens) + self.assertIn(decoded, [output, output.lower()]) + + def test_tokenize_special_tokens(self): + """Test `tokenize` with special tokens.""" + tokenizers = self.get_tokenizers(fast=True, do_lower_case=True) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + SPECIAL_TOKEN_1 = "[SPECIAL_TOKEN_1]" + SPECIAL_TOKEN_2 = "[SPECIAL_TOKEN_2]" + + # TODO: + # Can we combine `unique_no_split_tokens` and `all_special_tokens`(and properties related to it) + # with one variable(property) for a better maintainability? + + # `add_tokens` method stores special tokens only in `tokenizer.unique_no_split_tokens`. (in tokenization_utils.py) + tokenizer.add_tokens([SPECIAL_TOKEN_1], special_tokens=True) + # `add_special_tokens` method stores special tokens in `tokenizer.additional_special_tokens`, + # which also occur in `tokenizer.all_special_tokens`. (in tokenization_utils_base.py) + tokenizer.add_special_tokens({"additional_special_tokens": [SPECIAL_TOKEN_2]}) + + token_1 = tokenizer.tokenize(SPECIAL_TOKEN_1) + token_2 = tokenizer.tokenize(SPECIAL_TOKEN_2) + + self.assertEqual(len(token_1), 1) + self.assertEqual(len(token_2), 1) + self.assertEqual(token_1[0], SPECIAL_TOKEN_1) + self.assertEqual(token_2[0], SPECIAL_TOKEN_2) + + def test_special_tokens_mask(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + table = self.get_table(tokenizer, length=0) + sequence_0 = "Encode this." 
+ # Testing single inputs + encoded_sequence = tokenizer.encode(table, sequence_0, add_special_tokens=False) + encoded_sequence_dict = tokenizer.encode_plus( + table, sequence_0, add_special_tokens=True, return_special_tokens_mask=True + ) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) + + filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]] + self.assertEqual(encoded_sequence, filtered_sequence) + + def test_padding_to_max_length(self): + """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated""" + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + table = self.get_table(tokenizer) + sequence = "Sequence" + padding_size = 10 + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, sequence) + + padding_idx = tokenizer.pad_token_id + + # Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "right" + encoded_sequence = tokenizer.encode(table, sequence) + sequence_length = len(encoded_sequence) + padded_sequence = tokenizer.encode( + table, + sequence, + max_length=sequence_length + padding_size, + pad_to_max_length=True, + ) + padded_sequence_length = len(padded_sequence) + self.assertEqual(sequence_length + padding_size, padded_sequence_length) + self.assertListEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence) + + # Check that nothing is done when a maximum length is not specified + encoded_sequence = tokenizer.encode(table, sequence) + sequence_length = len(encoded_sequence) + + tokenizer.padding_side = "right" + padded_sequence_right = tokenizer.encode(table, sequence, pad_to_max_length=True) + padded_sequence_right_length = len(padded_sequence_right) + self.assertEqual(sequence_length, padded_sequence_right_length) + self.assertListEqual(encoded_sequence, padded_sequence_right) + + def test_padding_to_multiple_of(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + table = self.get_table(tokenizer, length=0) + if tokenizer.pad_token is None: + self.skipTest("No padding token.") + else: + empty_tokens = tokenizer(table, padding=True, pad_to_multiple_of=8) + normal_tokens = tokenizer(table, "This is a sample input", padding=True, pad_to_multiple_of=8) + for key, value in empty_tokens.items(): + self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + for key, value in normal_tokens.items(): + self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + + normal_tokens = tokenizer(table, "This", pad_to_multiple_of=8) + for key, value in normal_tokens.items(): + self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + + # Should also work with truncation + normal_tokens = tokenizer(table, "This", padding=True, truncation=True, pad_to_multiple_of=8) + for key, value in normal_tokens.items(): + self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + + def test_right_and_left_padding(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with 
+                table = self.get_table(tokenizer, length=0)
+                sequence = "Sequence"
+                padding_size = 10
+
+                # check correct behaviour if no pad_token_id exists, and add one if needed
+                self._check_no_pad_token_padding(tokenizer, sequence)
+
+                padding_idx = tokenizer.pad_token_id
+
+                # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
+                tokenizer.padding_side = "right"
+                encoded_sequence = tokenizer.encode(table, sequence)
+                sequence_length = len(encoded_sequence)
+                padded_sequence = tokenizer.encode(
+                    table, sequence, max_length=sequence_length + padding_size, padding="max_length"
+                )
+                padded_sequence_length = len(padded_sequence)
+                self.assertEqual(sequence_length + padding_size, padded_sequence_length)
+                self.assertListEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)
+
+                # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
+                tokenizer.padding_side = "left"
+                encoded_sequence = tokenizer.encode(table, sequence)
+                sequence_length = len(encoded_sequence)
+                padded_sequence = tokenizer.encode(
+                    table, sequence, max_length=sequence_length + padding_size, padding="max_length"
+                )
+                padded_sequence_length = len(padded_sequence)
+                self.assertEqual(sequence_length + padding_size, padded_sequence_length)
+                self.assertListEqual([padding_idx] * padding_size + encoded_sequence, padded_sequence)
+
+                # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding'
+                encoded_sequence = tokenizer.encode(table, sequence)
+                sequence_length = len(encoded_sequence)
+
+                tokenizer.padding_side = "right"
+                padded_sequence_right = tokenizer.encode(table, sequence, padding=True)
+                padded_sequence_right_length = len(padded_sequence_right)
+                self.assertEqual(sequence_length, padded_sequence_right_length)
+                self.assertListEqual(encoded_sequence, padded_sequence_right)
+
+                tokenizer.padding_side = "left"
+                padded_sequence_left = tokenizer.encode(table, sequence, padding="longest")
+                padded_sequence_left_length = len(padded_sequence_left)
+                self.assertEqual(sequence_length, padded_sequence_left_length)
+                self.assertListEqual(encoded_sequence, padded_sequence_left)
+
+                tokenizer.padding_side = "right"
+                padded_sequence_right = tokenizer.encode(table, sequence)
+                padded_sequence_right_length = len(padded_sequence_right)
+                self.assertEqual(sequence_length, padded_sequence_right_length)
+                self.assertListEqual(encoded_sequence, padded_sequence_right)
+
+                tokenizer.padding_side = "left"
+                padded_sequence_left = tokenizer.encode(table, sequence, padding=False)
+                padded_sequence_left_length = len(padded_sequence_left)
+                self.assertEqual(sequence_length, padded_sequence_left_length)
+                self.assertListEqual(encoded_sequence, padded_sequence_left)
+
+    def test_encode_plus_with_padding(self):
+        tokenizers = self.get_tokenizers(do_lower_case=False)
+        for tokenizer in tokenizers:
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                table = self.get_table(tokenizer, length=0)
+                sequence = "Sequence"
+
+                # check correct behaviour if no pad_token_id exists, and add one if needed
+                self._check_no_pad_token_padding(tokenizer, sequence)
+
+                padding_size = 10
+                padding_idx = tokenizer.pad_token_id
+                token_type_padding_idx = tokenizer.pad_token_type_id
+
+                encoded_sequence = tokenizer.encode_plus(table, sequence, return_special_tokens_mask=True)
+                input_ids = encoded_sequence["input_ids"]
+                special_tokens_mask = encoded_sequence["special_tokens_mask"]
+                sequence_length = len(input_ids)
+
+                # Test 'longest' and 'no_padding' don't do anything
+                tokenizer.padding_side = "right"
+
+                not_padded_sequence = tokenizer.encode_plus(
+                    table,
+                    sequence,
+                    padding=True,
+                    return_special_tokens_mask=True,
+                )
+                not_padded_input_ids = not_padded_sequence["input_ids"]
+
+                not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
+                not_padded_sequence_length = len(not_padded_input_ids)
+
+                self.assertEqual(sequence_length, not_padded_sequence_length)
+                self.assertListEqual(input_ids, not_padded_input_ids)
+                self.assertListEqual(special_tokens_mask, not_padded_special_tokens_mask)
+
+                not_padded_sequence = tokenizer.encode_plus(
+                    table,
+                    sequence,
+                    padding=False,
+                    return_special_tokens_mask=True,
+                )
+                not_padded_input_ids = not_padded_sequence["input_ids"]
+
+                not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
+                not_padded_sequence_length = len(not_padded_input_ids)
+
+                self.assertEqual(sequence_length, not_padded_sequence_length)
+                self.assertListEqual(input_ids, not_padded_input_ids)
+                self.assertListEqual(special_tokens_mask, not_padded_special_tokens_mask)
+
+                # Test right padding
+                tokenizer.padding_side = "right"
+
+                right_padded_sequence = tokenizer.encode_plus(
+                    table,
+                    sequence,
+                    max_length=sequence_length + padding_size,
+                    padding="max_length",
+                    return_special_tokens_mask=True,
+                )
+                right_padded_input_ids = right_padded_sequence["input_ids"]
+
+                right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
+                right_padded_sequence_length = len(right_padded_input_ids)
+
+                self.assertEqual(sequence_length + padding_size, right_padded_sequence_length)
+                self.assertListEqual(input_ids + [padding_idx] * padding_size, right_padded_input_ids)
+                self.assertListEqual(special_tokens_mask + [1] * padding_size, right_padded_special_tokens_mask)
+
+                # Test left padding
+                tokenizer.padding_side = "left"
+                left_padded_sequence = tokenizer.encode_plus(
+                    table,
+                    sequence,
+                    max_length=sequence_length + padding_size,
+                    padding="max_length",
+                    return_special_tokens_mask=True,
+                )
+                left_padded_input_ids = left_padded_sequence["input_ids"]
+                left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
+                left_padded_sequence_length = len(left_padded_input_ids)
+
+                self.assertEqual(sequence_length + padding_size, left_padded_sequence_length)
+                self.assertListEqual([padding_idx] * padding_size + input_ids, left_padded_input_ids)
+                self.assertListEqual([1] * padding_size + special_tokens_mask, left_padded_special_tokens_mask)
+
+                if "token_type_ids" in tokenizer.model_input_names:
+                    token_type_ids = encoded_sequence["token_type_ids"]
+                    left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
+                    right_padded_token_type_ids = right_padded_sequence["token_type_ids"]
+
+                    self.assertListEqual(
+                        token_type_ids + [[token_type_padding_idx] * 7] * padding_size, right_padded_token_type_ids
+                    )
+                    self.assertListEqual(
+                        [[token_type_padding_idx] * 7] * padding_size + token_type_ids, left_padded_token_type_ids
+                    )
+
+                if "attention_mask" in tokenizer.model_input_names:
+                    attention_mask = encoded_sequence["attention_mask"]
+                    right_padded_attention_mask = right_padded_sequence["attention_mask"]
+                    left_padded_attention_mask = left_padded_sequence["attention_mask"]
+
+                    self.assertListEqual(attention_mask + [0] * padding_size, right_padded_attention_mask)
+                    self.assertListEqual([0] * padding_size + attention_mask, left_padded_attention_mask)
+
+    def test_batch_encode_plus_padding(self):
+        # Test that padded sequences are equivalent between batch_encode_plus and encode_plus
+
+        # Right padding tests
+        tokenizers = self.get_tokenizers(do_lower_case=False)
+        for tokenizer in tokenizers:
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                table = self.get_table(tokenizer, length=0)
+                sequences = [
+                    "Testing batch encode plus",
+                    "Testing batch encode plus with different sequence lengths",
+                    "Testing batch encode plus with different sequence lengths correctly pads",
+                ]
+
+                max_length = 100
+
+                # check correct behaviour if no pad_token_id exists, and add one if needed
+                self._check_no_pad_token_padding(tokenizer, sequences)
+
+                encoded_sequences = [
+                    tokenizer.encode_plus(table, sequence, max_length=max_length, padding="max_length")
+                    for sequence in sequences
+                ]
+                encoded_sequences_batch = tokenizer.batch_encode_plus(
+                    table, sequences, max_length=max_length, padding="max_length"
+                )
+                self.assertListEqual(
+                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
+                )
+
+        # Left padding tests
+        tokenizers = self.get_tokenizers(do_lower_case=False)
+        for tokenizer in tokenizers:
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                tokenizer.padding_side = "left"
+                table = self.get_table(tokenizer, length=0)
+                sequences = [
+                    "Testing batch encode plus",
+                    "Testing batch encode plus with different sequence lengths",
+                    "Testing batch encode plus with different sequence lengths correctly pads",
+                ]
+
+                max_length = 100
+
+                # check correct behaviour if no pad_token_id exists, and add one if needed
+                self._check_no_pad_token_padding(tokenizer, sequences)
+
+                encoded_sequences = [
+                    tokenizer.encode_plus(table, sequence, max_length=max_length, padding="max_length")
+                    for sequence in sequences
+                ]
+                encoded_sequences_batch = tokenizer.batch_encode_plus(
+                    table, sequences, max_length=max_length, padding="max_length"
+                )
+                self.assertListEqual(
+                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
+                )
+
+    def test_batch_encode_plus_batch_sequence_length(self):
+        # Tests that all encoded values have the correct size
+        tokenizers = self.get_tokenizers(do_lower_case=False)
+        for tokenizer in tokenizers:
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                table = self.get_table(tokenizer, length=0)
+                sequences = [
+                    "Testing batch encode plus",
+                    "Testing batch encode plus with different sequence lengths",
+                    "Testing batch encode plus with different sequence lengths correctly pads",
+                ]
+
+                encoded_sequences = [tokenizer.encode_plus(table, sequence) for sequence in sequences]
+                encoded_sequences_batch = tokenizer.batch_encode_plus(table, sequences, padding=False)
+                self.assertListEqual(
+                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
+                )
+
+                maximum_length = len(
+                    max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len)
+                )
+
+                # check correct behaviour if no pad_token_id exists, and add one if needed
+                self._check_no_pad_token_padding(tokenizer, sequences)
+
+                encoded_sequences_padded = [
+                    tokenizer.encode_plus(table, sequence, max_length=maximum_length, padding="max_length")
+                    for sequence in sequences
+                ]
+
+                encoded_sequences_batch_padded = tokenizer.batch_encode_plus(table, sequences, padding=True)
+                self.assertListEqual(
+                    encoded_sequences_padded,
+                    self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch_padded),
+                )
+
+                # check 'longest' is insensitive to a max length
+                encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(table, sequences, padding=True)
+                encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
+                    table, sequences, max_length=maximum_length + 10, padding="longest"
+                )
+                for key in encoded_sequences_batch_padded_1.keys():
+                    self.assertListEqual(
+                        encoded_sequences_batch_padded_1[key],
+                        encoded_sequences_batch_padded_2[key],
+                    )
+
+                # check 'no_padding' is insensitive to a max length
+                encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(table, sequences, padding=False)
+                encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
+                    table, sequences, max_length=maximum_length + 10, padding=False
+                )
+                for key in encoded_sequences_batch_padded_1.keys():
+                    self.assertListEqual(
+                        encoded_sequences_batch_padded_1[key],
+                        encoded_sequences_batch_padded_2[key],
+                    )
+
+    def test_special_tokens_mask_input_pairs(self):
+        tokenizers = self.get_tokenizers(do_lower_case=False)
+        for tokenizer in tokenizers:
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                sequence_0 = "Encode this."
+                empty_table = self.get_table(tokenizer, length=0)
+                table = self.get_table(tokenizer, length=10)
+                encoded_sequence = tokenizer.encode(empty_table, sequence_0, add_special_tokens=False)
+                number_of_tokens = len(encoded_sequence)
+                encoded_sequence += tokenizer.encode(table, "", add_special_tokens=False)
+                encoded_sequence_dict = tokenizer.encode_plus(
+                    table,
+                    sequence_0,
+                    add_special_tokens=True,
+                    return_special_tokens_mask=True,
+                )
+                encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
+                special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
+                self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
+
+                filtered_sequence = [
+                    (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
+                ]
+                # NOTE: as TAPEX adds a space between a table and a sequence, we need to remove it
+                # in order to get results equivalent to encoding an empty table or an empty sequence
+                del filtered_sequence[number_of_tokens + 1]
+                filtered_sequence = [x for x in filtered_sequence if x is not None]
+                print("Encoded sequence:", encoded_sequence)
+                print("Filtered sequence:", filtered_sequence)
+                self.assertEqual(encoded_sequence, filtered_sequence)
+
+    @slow
+    def test_full_tokenizer(self):
+        question = "Greece held its last Summer Olympics in 2004"
+        table_dict = {
+            "header": ["Year", "City", "Country", "Nations"],
+            "rows": [
+                [1896, "Athens", "Greece", 14],
+                [1900, "Paris", "France", 24],
+                [1904, "St. Louis", "USA", 12],
+                [2004, "Athens", "Greece", 201],
+                [2008, "Beijing", "China", 204],
+                [2012, "London", "UK", 204],
+            ],
+        }
+        table = pd.DataFrame.from_dict(table_dict["rows"])
+        table.columns = table_dict["header"]
+
+        tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
+        encoding = tokenizer(table, question)
+
+        # fmt: off
+        expected_results = {'input_ids': [0, 821, 5314, 1755, 547, 63, 94, 1035, 1021, 31434, 2857, 11, 4482, 11311, 4832, 76, 1721, 343, 1721, 247, 1721, 3949, 3236, 112, 4832, 42773, 1721, 23, 27859, 1721, 821, 5314, 1755, 1721, 501, 3236, 132, 4832, 23137, 1721, 2242, 354, 1721, 6664, 2389, 1721, 706, 3236, 155, 4832, 42224, 1721, 1690, 4, 26120, 354, 1721, 201, 102, 1721, 316, 3236, 204, 4832, 4482, 1721, 23, 27859, 1721, 821, 5314, 1755, 1721, 21458, 3236, 195, 4832, 2266, 1721, 28, 40049, 1721, 1855, 1243, 1721, 28325, 3236, 231, 4832, 1125, 1721, 784, 24639, 1721, 1717, 330, 1721, 28325, 2]}
+        # fmt: on
+
+        self.assertListEqual(encoding.input_ids, expected_results["input_ids"])
+
+    def test_tokenizer_as_target(self):
+        # by default the tokenizer lowercases its input (do_lower_case=True)
+        tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base")
+        answer_text = "tapex is a good model!"
+        expected_src_tokens = [0, 90, 5776, 1178, 16, 10, 205, 1421, 328, 2]
+        with tokenizer.as_target_tokenizer():
+            answer_encoding = tokenizer(answer=answer_text)
+        self.assertListEqual(answer_encoding.input_ids, expected_src_tokens)
+
+    @slow
+    def test_tokenizer_lower_case(self):
+        cased_tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base", do_lower_case=False)
+        uncased_tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base", do_lower_case=True)
+        answer_text = "Beijing, London, Paris"
+        answer_text_lower = "beijing, london, paris"
+
+        with cased_tokenizer.as_target_tokenizer():
+            with uncased_tokenizer.as_target_tokenizer():
+                self.assertNotEqual(
+                    cased_tokenizer(answer=answer_text).input_ids, uncased_tokenizer(answer=answer_text).input_ids
+                )
+                self.assertEqual(
+                    cased_tokenizer(answer=answer_text_lower).input_ids,
+                    uncased_tokenizer(answer=answer_text).input_ids,
+                )
+                # batched encoding assert
+                self.assertNotEqual(
+                    cased_tokenizer(answer=[answer_text]).input_ids, uncased_tokenizer(answer=[answer_text]).input_ids
+                )
+                self.assertEqual(
+                    cased_tokenizer(answer=[answer_text_lower]).input_ids,
+                    uncased_tokenizer(answer=[answer_text]).input_ids,
+                )
+        # test input encoding lowercase
+        question = "Greece held its last Summer Olympics in 2004"
+        table_dict = {
+            "header": ["Year", "City", "Country", "Nations"],
+            "rows": [
+                [1896, "Athens", "Greece", 14],
+                [1900, "Paris", "France", 24],
+                [1904, "St. Louis", "USA", 12],
+                [2004, "Athens", "Greece", 201],
+                [2008, "Beijing", "China", 204],
+                [2012, "London", "UK", 204],
+            ],
+        }
+        table = pd.DataFrame.from_dict(table_dict["rows"])
+        table.columns = table_dict["header"]
+
+        self.assertNotEqual(
+            cased_tokenizer(table=table, query=question).input_ids,
+            uncased_tokenizer(table=table, query=question).input_ids,
+        )
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index b72034de6958..bbb0befeb631 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -1179,7 +1179,7 @@ def test_lm_head_model_random_beam_search_generate(self):
                 # num_return_sequences = 1
                 self._check_generated_ids(model.generate(do_sample=True, max_length=5, num_beams=2))
 
-            with self.assertRaises(AssertionError):
+            with self.assertRaises(ValueError):
                 # generating more sequences than having beams leads is not possible
                 model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2)
@@ -1302,6 +1302,56 @@ def test_loss_computation(self):
 
             self.assertEqual(loss.shape, [loss_size])
 
+    def test_keras_fit(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            if getattr(model, "hf_compute_loss", None):
+                # Test that the model correctly computes the loss with kwargs
+                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
+                # Is there a better way to remove these decoder inputs?
+                prepared_for_class = {
+                    key: val
+                    for key, val in prepared_for_class.items()
+                    if key not in ("head_mask", "decoder_head_mask", "cross_attn_head_mask", "decoder_input_ids")
+                }
+
+                possible_label_cols = {
+                    "labels",
+                    "label",
+                    "label_ids",
+                    "start_positions",
+                    "start_position",
+                    "end_positions",
+                    "end_position",
+                    "next_sentence_label",
+                }
+                label_names = possible_label_cols.intersection(set(prepared_for_class))
+                self.assertGreater(len(label_names), 0, msg="No matching label names found!")
+                labels = {key: val for key, val in prepared_for_class.items() if key in label_names}
+                inputs_minus_labels = {key: val for key, val in prepared_for_class.items() if key not in label_names}
+                self.assertGreater(len(inputs_minus_labels), 0)
+                # A zero learning rate keeps the weights fixed, so both fit() calls below see the same loss
+                model.compile(optimizer=tf.keras.optimizers.SGD(0.0), run_eagerly=True)
+                # Make sure the model fits without crashing regardless of where we pass the labels
+                history1 = model.fit(
+                    prepared_for_class,
+                    validation_data=prepared_for_class,
+                    steps_per_epoch=1,
+                    validation_steps=1,
+                    shuffle=False,
+                )
+                val_loss1 = history1.history["val_loss"][0]
+                history2 = model.fit(
+                    inputs_minus_labels,
+                    labels,
+                    validation_data=(inputs_minus_labels, labels),
+                    steps_per_epoch=1,
+                    validation_steps=1,
+                    shuffle=False,
+                )
+                val_loss2 = history2.history["val_loss"][0]
+                self.assertTrue(np.allclose(val_loss1, val_loss2, atol=1e-2, rtol=1e-3))
+
     def test_generate_with_headmasking(self):
         attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/vit_mae/test_modeling_vit_mae.py b/tests/vit_mae/test_modeling_vit_mae.py
index 6ae62cb1c2c3..8cbde5b2ce92 100644
--- a/tests/vit_mae/test_modeling_vit_mae.py
+++ b/tests/vit_mae/test_modeling_vit_mae.py
@@ -561,7 +561,7 @@ def test_inference_for_pretraining(self):
 
         # forward pass
         with torch.no_grad():
-            outputs = model(**inputs, noise=torch.from_numpy(noise))
+            outputs = model(**inputs, noise=torch.from_numpy(noise).to(device=torch_device))
 
         # verify the logits
         expected_shape = torch.Size((1, 196, 768))
diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt
index 372e63ad232b..8fe33b240ede 100644
--- a/utils/documentation_tests.txt
+++ b/utils/documentation_tests.txt
@@ -1,17 +1,12 @@
-docs/source/quicktour.mdx
-docs/source/quicktour.mdx
-docs/source/task_summary.mdx
-docs/source/task_summary.mdx
+docs/source/en/quicktour.mdx
+docs/source/en/task_summary.mdx
+docs/source/en/model_doc/speech_to_text.mdx
+docs/source/en/model_doc/tapex.mdx
 src/transformers/generation_utils.py
-src/transformers/generation_utils.py
-src/transformers/models/bart/modeling_bart.py
 src/transformers/models/bart/modeling_bart.py
 src/transformers/models/beit/modeling_beit.py
 src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
-src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
 src/transformers/models/blenderbot/modeling_blenderbot.py
-src/transformers/models/blenderbot/modeling_blenderbot.py
-src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
 src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
 src/transformers/models/convnext/modeling_convnext.py
 src/transformers/models/data2vec/modeling_data2vec_audio.py
@@ -20,16 +15,11 @@ src/transformers/models/dpt/modeling_dpt.py
 src/transformers/models/glpn/modeling_glpn.py
 src/transformers/models/hubert/modeling_hubert.py
 src/transformers/models/marian/modeling_marian.py
-src/transformers/models/marian/modeling_marian.py
-src/transformers/models/mbart/modeling_mbart.py
 src/transformers/models/mbart/modeling_mbart.py
 src/transformers/models/pegasus/modeling_pegasus.py
-src/transformers/models/pegasus/modeling_pegasus.py
-src/transformers/models/plbart/modeling_plbart.py
 src/transformers/models/plbart/modeling_plbart.py
 src/transformers/models/poolformer/modeling_poolformer.py
 src/transformers/models/resnet/modeling_resnet.py
-src/transformers/models/resnet/modeling_resnet.py
 src/transformers/models/roberta/modeling_roberta.py
 src/transformers/models/roberta/modeling_tf_roberta.py
 src/transformers/models/segformer/modeling_segformer.py
@@ -46,8 +36,9 @@ src/transformers/models/van/modeling_van.py
 src/transformers/models/vilt/modeling_vilt.py
 src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
 src/transformers/models/vit/modeling_vit.py
+src/transformers/models/vit/modeling_tf_vit.py
 src/transformers/models/vit_mae/modeling_vit_mae.py
 src/transformers/models/wav2vec2/modeling_wav2vec2.py
 src/transformers/models/wav2vec2/tokenization_wav2vec2.py
 src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
-src/transformers/models/wavlm/modeling_wavlm.py
\ No newline at end of file
+src/transformers/models/wavlm/modeling_wavlm.py
diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py
index 5ad7b4b1f788..16bf6348d387 100644
--- a/utils/tests_fetcher.py
+++ b/utils/tests_fetcher.py
@@ -466,6 +466,7 @@ def infer_tests_to_run(output_file, diff_with_last_commit=False, filters=None):
         # Example files are tested separately
         elif f.startswith("examples/pytorch"):
             test_files_to_run.append("examples/pytorch/test_pytorch_examples.py")
+            test_files_to_run.append("examples/pytorch/test_accelerate_examples.py")
         elif f.startswith("examples/flax"):
             test_files_to_run.append("examples/flax/test_flax_examples.py")
         else: