diff --git a/.circleci/config.yml b/.circleci/config.yml
index 856211e280cb..1bfe5d29f7f0 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -587,6 +587,7 @@ jobs:
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,sentencepiece,testing,torch-speech]
- run: pip install -r examples/pytorch/_tests_requirements.txt
+ - run: pip install git+https://github.com/huggingface/accelerate
- save_cache:
key: v0.4-torch_examples-{{ checksum "setup.py" }}
paths:
diff --git a/README.md b/README.md
index 024e45220129..f37b11088022 100644
--- a/README.md
+++ b/README.md
@@ -98,7 +98,7 @@ In Audio:
## If you are looking for custom support from the Hugging Face team
-
+
## Quick tour
@@ -303,6 +303,7 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
+1. **[RegNet](https://huggingface.co/docs/transformers/main/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
1. **[ResNet](https://huggingface.co/docs/transformers/main/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
@@ -317,6 +318,7 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
+1. **[TAPEX](https://huggingface.co/docs/transformers/main/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
diff --git a/README_ko.md b/README_ko.md
index 5d813b0cc76d..71aa11474401 100644
--- a/README_ko.md
+++ b/README_ko.md
@@ -281,6 +281,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+1. **[RegNet](https://huggingface.co/docs/transformers/main/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
1. **[ResNet](https://huggingface.co/docs/transformers/main/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
@@ -296,6 +297,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
+1. **[TAPEX](https://huggingface.co/docs/transformers/main/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
diff --git a/README_zh-hans.md b/README_zh-hans.md
index 5570335d49f9..efbe0e4547c7 100644
--- a/README_zh-hans.md
+++ b/README_zh-hans.md
@@ -305,6 +305,7 @@ conda install -c huggingface transformers
1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (来自 NVIDIA) 伴随论文 [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 由 Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius 发布。
1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (来自 Google Research) 伴随论文 [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) 由 Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang 发布。
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (来自 Google Research) 伴随论文 [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 由 Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya 发布。
+1. **[RegNet](https://huggingface.co/docs/transformers/main/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。
1. **[ResNet](https://huggingface.co/docs/transformers/main/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (来自 Facebook), 伴随论文 [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 由 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 发布。
@@ -320,6 +321,7 @@ conda install -c huggingface transformers
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (来自 Google AI) 伴随论文 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (来自 Google AI) 伴随论文 [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。
+1. **[TAPEX](https://huggingface.co/docs/transformers/main/model_doc/tapex)** (来自 Microsoft Research) 伴随论文 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 由 Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 发布。
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (来自 Microsoft) 伴随论文 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 由 Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 发布。
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (来自 Microsoft Research) 伴随论文 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 由 Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 发布。
diff --git a/README_zh-hant.md b/README_zh-hant.md
index 88ba012d5a89..c9396e45faa6 100644
--- a/README_zh-hant.md
+++ b/README_zh-hant.md
@@ -317,6 +317,7 @@ conda install -c huggingface transformers
1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+1. **[RegNet](https://huggingface.co/docs/transformers/main/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
1. **[ResNet](https://huggingface.co/docs/transformers/main/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
@@ -332,6 +333,7 @@ conda install -c huggingface transformers
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released with the paper [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
+1. **[TAPEX](https://huggingface.co/docs/transformers/main/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 69717477e1f2..c32004ff5214 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -296,6 +296,8 @@
title: Reformer
- local: model_doc/rembert
title: RemBERT
+ - local: model_doc/regnet
+ title: RegNet
- local: model_doc/resnet
title: ResNet
- local: model_doc/retribert
@@ -328,6 +330,8 @@
title: T5v1.1
- local: model_doc/tapas
title: TAPAS
+ - local: model_doc/tapex
+ title: TAPEX
- local: model_doc/transfo-xl
title: Transformer XL
- local: model_doc/trocr
diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx
index 281add6e5ef4..2071e41e672f 100644
--- a/docs/source/en/index.mdx
+++ b/docs/source/en/index.mdx
@@ -124,6 +124,7 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret
1. **[REALM](model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
1. **[Reformer](model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RemBERT](model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
+1. **[RegNet](model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
1. **[ResNet](model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
1. **[RoBERTa](model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
1. **[RoFormer](model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
@@ -138,6 +139,7 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret
1. **[T5](model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[T5v1.1](model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[TAPAS](model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
+1. **[TAPEX](model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[UniSpeech](model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
@@ -234,6 +236,7 @@ Flax), PyTorch, and/or TensorFlow.
| RAG | ✅ | ❌ | ✅ | ✅ | ❌ |
| Realm | ✅ | ✅ | ✅ | ❌ | ❌ |
| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ |
+| RegNet | ❌ | ❌ | ✅ | ❌ | ❌ |
| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
| ResNet | ❌ | ❌ | ✅ | ❌ | ❌ |
| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
@@ -250,6 +253,7 @@ Flax), PyTorch, and/or TensorFlow.
| Swin | ❌ | ❌ | ✅ | ❌ | ❌ |
| T5 | ✅ | ✅ | ✅ | ✅ | ✅ |
| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ |
+| TAPEX | ✅ | ✅ | ✅ | ✅ | ✅ |
| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ |
| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ |
| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ |
diff --git a/docs/source/en/model_doc/regnet.mdx b/docs/source/en/model_doc/regnet.mdx
new file mode 100644
index 000000000000..666a9ee39675
--- /dev/null
+++ b/docs/source/en/model_doc/regnet.mdx
@@ -0,0 +1,48 @@
+
+
+# RegNet
+
+## Overview
+
+The RegNet model was proposed in [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
+
+The authors design search spaces to perform Neural Architecture Search (NAS). They start from a high-dimensional search space and iteratively reduce it by empirically applying constraints based on the best-performing models sampled from the current search space.
+
+The abstract from the paper is the following:
+
+*In this work, we present a new network design paradigm. Our goal is to help advance the understanding of network design and discover design principles that generalize across settings. Instead of focusing on designing individual network instances, we design network design spaces that parametrize populations of networks. The overall process is analogous to classic manual design of networks, but elevated to the design space level. Using our methodology we explore the structure aspect of network design and arrive at a low-dimensional design space consisting of simple, regular networks that we call RegNet. The core insight of the RegNet parametrization is surprisingly simple: widths and depths of good networks can be explained by a quantized linear function. We analyze the RegNet design space and arrive at interesting findings that do not match the current practice of network design. The RegNet design space provides simple and fast networks that work well across a wide range of flop regimes. Under comparable training settings and flops, the RegNet models outperform the popular EfficientNet models while being up to 5x faster on GPUs.*
+
+Tips:
+
+- One can use [`AutoFeatureExtractor`] to prepare images for the model (see the short sketch after this list).
+- The huge 10B model from [Self-supervised Pretraining of Visual Features in the Wild](https://arxiv.org/abs/2103.01988), trained on one billion Instagram images, is available on the [hub](https://huggingface.co/facebook/regnet-y-10b-seer).
+
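+A minimal inference sketch is shown below. The `facebook/regnet-y-040` checkpoint name is an assumption used for illustration; any RegNet image-classification checkpoint on the hub can be used in the same way.
+
+```python
+>>> from transformers import AutoFeatureExtractor, RegNetForImageClassification
+>>> import torch
+>>> import requests
+>>> from PIL import Image
+
+>>> # load an example image (a COCO validation image)
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+>>> image = Image.open(requests.get(url, stream=True).raw)
+
+>>> # checkpoint name assumed for illustration
+>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/regnet-y-040")
+>>> model = RegNetForImageClassification.from_pretrained("facebook/regnet-y-040")
+
+>>> inputs = feature_extractor(image, return_tensors="pt")
+>>> with torch.no_grad():
+...     logits = model(**inputs).logits
+
+>>> # for an ImageNet-pretrained checkpoint, this is an ImageNet class name
+>>> predicted_label = model.config.id2label[logits.argmax(-1).item()]
+```
+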
+This model was contributed by [Francesco](https://huggingface.co/Francesco).
+The original code can be found [here](https://github.com/facebookresearch/pycls).
+
+
+## RegNetConfig
+
+[[autodoc]] RegNetConfig
+
+
+## RegNetModel
+
+[[autodoc]] RegNetModel
+ - forward
+
+
+## RegNetForImageClassification
+
+[[autodoc]] RegNetForImageClassification
+ - forward
\ No newline at end of file
diff --git a/docs/source/en/model_doc/tapex.mdx b/docs/source/en/model_doc/tapex.mdx
new file mode 100644
index 000000000000..f6e65764e50d
--- /dev/null
+++ b/docs/source/en/model_doc/tapex.mdx
@@ -0,0 +1,130 @@
+
+
+# TAPEX
+
+## Overview
+
+The TAPEX model was proposed in [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu,
+Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. TAPEX pre-trains a BART model to solve synthetic SQL queries, after
+which it can be fine-tuned to answer natural language questions related to tabular data, as well as to perform table fact checking.
+
+TAPEX has been fine-tuned on several datasets:
+- [SQA](https://www.microsoft.com/en-us/download/details.aspx?id=54253) (Sequential Question Answering by Microsoft)
+- [WTQ](https://github.com/ppasupat/WikiTableQuestions) (Wiki Table Questions by Stanford University)
+- [WikiSQL](https://github.com/salesforce/WikiSQL) (by Salesforce)
+- [TabFact](https://tabfact.github.io/) (by UCSB NLP Lab).
+
+The abstract from the paper is the following:
+
+*Recent progress in language model pre-training has achieved a great success via leveraging large-scale unstructured textual data. However, it is
+still a challenge to apply pre-training on structured tabular data due to the absence of large-scale high-quality tabular data. In this paper, we
+propose TAPEX to show that table pre-training can be achieved by learning a neural SQL executor over a synthetic corpus, which is obtained by automatically
+synthesizing executable SQL queries and their execution outputs. TAPEX addresses the data scarcity challenge via guiding the language model to mimic a SQL
+executor on the diverse, large-scale and high-quality synthetic corpus. We evaluate TAPEX on four benchmark datasets. Experimental results demonstrate that
+TAPEX outperforms previous table pre-training approaches by a large margin and achieves new state-of-the-art results on all of them. This includes improvements
+on the weakly-supervised WikiSQL denotation accuracy to 89.5% (+2.3%), the WikiTableQuestions denotation accuracy to 57.5% (+4.8%), the SQA denotation accuracy
+to 74.5% (+3.5%), and the TabFact accuracy to 84.2% (+3.2%). To our knowledge, this is the first work to exploit table pre-training via synthetic executable programs
+and to achieve new state-of-the-art results on various downstream tasks.*
+
+Tips:
+
+- TAPEX is a generative (seq2seq) model. One can directly plug in the weights of TAPEX into a BART model.
+- TAPEX has checkpoints on the hub that are either pre-trained only, or fine-tuned on WTQ, SQA, WikiSQL and TabFact.
+- Sentences + tables are presented to the model as `sentence + " " + linearized table`. The linearized table has the following format:
+  `col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...` (a short sketch after this list makes this concrete).
+- TAPEX has its own tokenizer, which makes it easy to prepare all data for the model. One can pass Pandas DataFrames and strings to the tokenizer,
+  and it will automatically create the `input_ids` and `attention_mask` (as shown in the usage examples below).
+
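+To make the linearization concrete, the following is a minimal sketch (assuming the pre-trained `microsoft/tapex-base` checkpoint) that encodes a small table and decodes the result back to text so the format can be inspected:
+
+```python
+>>> from transformers import TapexTokenizer
+>>> import pandas as pd
+
+>>> tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base")
+
+>>> table = pd.DataFrame.from_dict({"Actors": ["Brad Pitt"], "Number of movies": ["87"]})
+>>> encoding = tokenizer(table, "how many movies does Brad Pitt have?", return_tensors="pt")
+
+>>> # decoding shows the question followed by the linearized table
+>>> decoded = tokenizer.decode(encoding["input_ids"][0], skip_special_tokens=True)
+```
+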
+## Usage: inference
+
+Below, we illustrate how to use TAPEX for table question answering. Since the weights of TAPEX can be plugged directly into a BART model,
+we use the [Auto API](auto), which will automatically instantiate the appropriate tokenizer ([`TapexTokenizer`]) and model ([`BartForConditionalGeneration`]) for us,
+based on the configuration file of the checkpoint on the hub.
+
+```python
+>>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+>>> import pandas as pd
+
+>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
+>>> model = AutoModelForSeq2SeqLM.from_pretrained("microsoft/tapex-large-finetuned-wtq")
+
+>>> # prepare table + question
+>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
+>>> table = pd.DataFrame.from_dict(data)
+>>> question = "how many movies does Leonardo Di Caprio have?"
+
+>>> encoding = tokenizer(table, question, return_tensors="pt")
+
+>>> # let the model generate an answer autoregressively
+>>> outputs = model.generate(**encoding)
+
+>>> # decode back to text
+>>> predicted_answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
+>>> print(predicted_answer)
+53
+```
+
+Note that [`TapexTokenizer`] also supports batched inference. Hence, one can provide a batch of different tables/questions, or a batch of a single table
+and multiple questions, or a batch of a single query and multiple tables. Let's illustrate this:
+
+```python
+>>> # prepare table + question
+>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
+>>> table = pd.DataFrame.from_dict(data)
+>>> questions = [
+... "how many movies does Leonardo Di Caprio have?",
+... "which actor has 69 movies?",
+... "what's the first name of the actor who has 87 movies?",
+... ]
+>>> encoding = tokenizer(table, questions, padding=True, return_tensors="pt")
+
+>>> # let the model generate an answer autoregressively
+>>> outputs = model.generate(**encoding)
+
+>>> # decode back to text
+>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
+[' 53', ' george clooney', ' brad pitt']
+```
+
+In case one wants to do table verification (i.e. the task of determining whether a given sentence is supported or refuted by the contents
+of a table), one can instantiate a [`BartForSequenceClassification`] model. TAPEX has checkpoints on the hub fine-tuned on TabFact, an important
+benchmark for table fact checking (it achieves 84% accuracy). The code example below again leverages the [Auto API](auto).
+
+```python
+>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
+>>> import pandas as pd
+
+>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-tabfact")
+>>> model = AutoModelForSequenceClassification.from_pretrained("microsoft/tapex-large-finetuned-tabfact")
+
+>>> # prepare table + sentence
+>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
+>>> table = pd.DataFrame.from_dict(data)
+>>> sentence = "George Clooney has 30 movies"
+
+>>> encoding = tokenizer(table, sentence, return_tensors="pt")
+
+>>> # forward pass
+>>> outputs = model(**encoding)
+
+>>> # print prediction
+>>> predicted_class_idx = outputs.logits[0].argmax(dim=0).item()
+>>> print(model.config.id2label[predicted_class_idx])
+Refused
+```
+
+
+## TapexTokenizer
+
+[[autodoc]] TapexTokenizer
+ - __call__
+ - save_vocabulary
\ No newline at end of file
diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx
index 65fb5fa5cc54..acdabb717024 100644
--- a/docs/source/en/serialization.mdx
+++ b/docs/source/en/serialization.mdx
@@ -67,6 +67,7 @@ Ready-made configurations include the following architectures:
- PLBart
- RoBERTa
- T5
+- TAPEX
- ViT
- XLM-RoBERTa
- XLM-RoBERTa-XL
diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index 76eca5486939..247ba09d54ab 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -23,6 +23,7 @@
# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
import argparse
+import json
import logging
import math
import os
@@ -537,7 +538,10 @@ def group_texts(examples):
if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
- accelerator.save_state(f"step_{completed_steps}")
+ output_dir = f"step_{completed_steps}"
+ if args.output_dir is not None:
+ output_dir = os.path.join(args.output_dir, output_dir)
+ accelerator.save_state(output_dir)
if completed_steps >= args.max_train_steps:
break
@@ -581,7 +585,10 @@ def group_texts(examples):
)
if args.checkpointing_steps == "epoch":
- accelerator.save_state(f"epoch_{epoch}")
+ output_dir = f"epoch_{epoch}"
+ if args.output_dir is not None:
+ output_dir = os.path.join(args.output_dir, output_dir)
+ accelerator.save_state(output_dir)
if args.output_dir is not None:
accelerator.wait_for_everyone()
@@ -592,6 +599,9 @@ def group_texts(examples):
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
+ with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
+ json.dump({"perplexity": perplexity}, f)
+
if __name__ == "__main__":
main()
diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index 6a3b48c3b1c2..2634cc25e5b2 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -23,6 +23,7 @@
# You can also adapt this script on your own mlm task. Pointers for this are left as comments.
import argparse
+import json
import logging
import math
import os
@@ -457,9 +458,11 @@ def group_texts(examples):
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]
- # Log a few random samples from the training set:
- for index in random.sample(range(len(train_dataset)), 3):
- logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+ # Conditional for small test subsets
+ if len(train_dataset) > 3:
+ # Log a few random samples from the training set:
+ for index in random.sample(range(len(train_dataset)), 3):
+ logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
# Data collator
# This one will take care of randomly masking the tokens.
@@ -581,7 +584,10 @@ def group_texts(examples):
if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
- accelerator.save_state(f"step_{completed_steps}")
+ output_dir = f"step_{completed_steps}"
+ if args.output_dir is not None:
+ output_dir = os.path.join(args.output_dir, output_dir)
+ accelerator.save_state(output_dir)
if completed_steps >= args.max_train_steps:
break
@@ -625,7 +631,10 @@ def group_texts(examples):
)
if args.checkpointing_steps == "epoch":
- accelerator.save_state(f"epoch_{epoch}")
+ output_dir = f"epoch_{epoch}"
+ if args.output_dir is not None:
+ output_dir = os.path.join(args.output_dir, output_dir)
+ accelerator.save_state(output_dir)
if args.output_dir is not None:
accelerator.wait_for_everyone()
@@ -636,6 +645,9 @@ def group_texts(examples):
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
+ with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
+ json.dump({"perplexity": perplexity}, f)
+
if __name__ == "__main__":
main()
diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
index d97fb71f395c..a575644130f0 100755
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@@ -19,6 +19,7 @@
# You can also adapt this script on your own multiple choice task. Pointers for this are left as comments.
import argparse
+import json
import logging
import math
import os
@@ -540,7 +541,10 @@ def preprocess_function(examples):
if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
- accelerator.save_state(f"step_{completed_steps}")
+ output_dir = f"step_{completed_steps}"
+ if args.output_dir is not None:
+ output_dir = os.path.join(args.output_dir, output_dir)
+ accelerator.save_state(output_dir)
if completed_steps >= args.max_train_steps:
break
@@ -578,6 +582,12 @@ def preprocess_function(examples):
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
)
+ if args.checkpointing_steps == "epoch":
+ output_dir = f"epoch_{epoch}"
+ if args.output_dir is not None:
+ output_dir = os.path.join(args.output_dir, output_dir)
+ accelerator.save_state(output_dir)
+
if args.output_dir is not None:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
@@ -586,6 +596,8 @@ def preprocess_function(examples):
tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
+ with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
+ json.dump({"eval_accuracy": eval_metric["accuracy"]}, f)
if __name__ == "__main__":
diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py
index 08f8339036c2..6da75822398c 100755
--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@@ -19,6 +19,7 @@
# You can also adapt this script on your own question answering task. Pointers for this are left as comments.
import argparse
+import json
import logging
import math
import os
@@ -783,11 +784,20 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
- accelerator.save_state(f"step_{completed_steps}")
+ output_dir = f"step_{completed_steps}"
+ if args.output_dir is not None:
+ output_dir = os.path.join(args.output_dir, output_dir)
+ accelerator.save_state(output_dir)
if completed_steps >= args.max_train_steps:
break
+ if args.checkpointing_steps == "epoch":
+ output_dir = f"epoch_{epoch}"
+ if args.output_dir is not None:
+ output_dir = os.path.join(args.output_dir, output_dir)
+ accelerator.save_state(output_dir)
+
if args.push_to_hub and epoch < args.num_train_epochs - 1:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
@@ -879,9 +889,6 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
accelerator.log(log, step=completed_steps)
- if args.checkpointing_steps == "epoch":
- accelerator.save_state(f"epoch_{epoch}")
-
if args.output_dir is not None:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
@@ -890,6 +897,8 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
+ with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
+ json.dump({"eval_f1": eval_metric["f1"], "eval_exact": eval_metric["exact"]}, f)
if __name__ == "__main__":
diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py
index fd2bb2cc8162..adc9e616dda0 100644
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -19,6 +19,7 @@
# You can also adapt this script on your own summarization task. Pointers for this are left as comments.
import argparse
+import json
import logging
import math
import os
@@ -602,7 +603,10 @@ def postprocess_text(preds, labels):
if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
- accelerator.save_state(f"step_{completed_steps}")
+ output_dir = f"step_{completed_steps}"
+ if args.output_dir is not None:
+ output_dir = os.path.join(args.output_dir, output_dir)
+ accelerator.save_state(output_dir)
if completed_steps >= args.max_train_steps:
break
@@ -669,7 +673,10 @@ def postprocess_text(preds, labels):
)
if args.checkpointing_steps == "epoch":
- accelerator.save_state(f"epoch_{epoch}")
+ output_dir = f"epoch_{epoch}"
+ if args.output_dir is not None:
+ output_dir = os.path.join(args.output_dir, output_dir)
+ accelerator.save_state(output_dir)
if args.output_dir is not None:
accelerator.wait_for_everyone()
@@ -679,6 +686,16 @@ def postprocess_text(preds, labels):
tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
+ with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
+ json.dump(
+ {
+ "eval_rouge1": result["rouge1"],
+ "eval_rouge2": result["rouge2"],
+ "eval_rougeL": result["rougeL"],
+ "eval_rougeLsum": result["rougeLsum"],
+ },
+ f,
+ )
if __name__ == "__main__":
diff --git a/examples/pytorch/test_accelerate_examples.py b/examples/pytorch/test_accelerate_examples.py
new file mode 100644
index 000000000000..883dc434deb7
--- /dev/null
+++ b/examples/pytorch/test_accelerate_examples.py
@@ -0,0 +1,302 @@
+# coding=utf-8
+# Copyright 2018 HuggingFace Inc..
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import argparse
+import json
+import logging
+import os
+import sys
+from unittest.mock import patch
+
+import torch
+
+from transformers.testing_utils import TestCasePlus, get_gpu_count, slow, torch_device
+from transformers.utils import is_apex_available
+
+
+SRC_DIRS = [
+ os.path.join(os.path.dirname(__file__), dirname)
+ for dirname in [
+ "text-generation",
+ "text-classification",
+ "token-classification",
+ "language-modeling",
+ "multiple-choice",
+ "question-answering",
+ "summarization",
+ "translation",
+ "image-classification",
+ "speech-recognition",
+ "audio-classification",
+ "speech-pretraining",
+ "image-pretraining",
+ ]
+]
+sys.path.extend(SRC_DIRS)
+
+
+if SRC_DIRS is not None:
+ import run_clm_no_trainer
+ import run_glue_no_trainer
+ import run_mlm_no_trainer
+ import run_ner_no_trainer
+ import run_qa_no_trainer as run_squad_no_trainer
+ import run_summarization_no_trainer
+ import run_swag_no_trainer
+ import run_translation_no_trainer
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+
+
+def get_setup_file():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-f")
+ args = parser.parse_args()
+ return args.f
+
+
+def get_results(output_dir):
+ results = {}
+ path = os.path.join(output_dir, "all_results.json")
+ if os.path.exists(path):
+ with open(path, "r") as f:
+ results = json.load(f)
+ else:
+ raise ValueError(f"can't find {path}")
+ return results
+
+
+def is_cuda_and_apex_available():
+ is_using_cuda = torch.cuda.is_available() and torch_device == "cuda"
+ return is_using_cuda and is_apex_available()
+
+
+class ExamplesTestsNoTrainer(TestCasePlus):
+ def test_run_glue_no_trainer(self):
+ stream_handler = logging.StreamHandler(sys.stdout)
+ logger.addHandler(stream_handler)
+
+ tmp_dir = self.get_auto_remove_tmp_dir()
+ testargs = f"""
+ run_glue_no_trainer.py
+ --model_name_or_path distilbert-base-uncased
+ --output_dir {tmp_dir}
+ --train_file ./tests/fixtures/tests_samples/MRPC/train.csv
+ --validation_file ./tests/fixtures/tests_samples/MRPC/dev.csv
+ --per_device_train_batch_size=2
+ --per_device_eval_batch_size=1
+ --learning_rate=1e-4
+ --seed=42
+ --checkpointing_steps epoch
+ """.split()
+
+ if is_cuda_and_apex_available():
+ testargs.append("--fp16")
+
+ with patch.object(sys, "argv", testargs):
+ run_glue_no_trainer.main()
+ result = get_results(tmp_dir)
+ self.assertGreaterEqual(result["eval_accuracy"], 0.75)
+ self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
+
+ def test_run_clm_no_trainer(self):
+ stream_handler = logging.StreamHandler(sys.stdout)
+ logger.addHandler(stream_handler)
+
+ tmp_dir = self.get_auto_remove_tmp_dir()
+ testargs = f"""
+ run_clm_no_trainer.py
+ --model_name_or_path distilgpt2
+ --train_file ./tests/fixtures/sample_text.txt
+ --validation_file ./tests/fixtures/sample_text.txt
+ --block_size 128
+ --per_device_train_batch_size 5
+ --per_device_eval_batch_size 5
+ --num_train_epochs 2
+ --output_dir {tmp_dir}
+ --checkpointing_steps epoch
+ """.split()
+
+ if torch.cuda.device_count() > 1:
+ # Skipping because there are not enough batches to train the model + would need a drop_last to work.
+ return
+
+ with patch.object(sys, "argv", testargs):
+ run_clm_no_trainer.main()
+ result = get_results(tmp_dir)
+ self.assertLess(result["perplexity"], 100)
+ self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
+
+ def test_run_mlm_no_trainer(self):
+ stream_handler = logging.StreamHandler(sys.stdout)
+ logger.addHandler(stream_handler)
+
+ tmp_dir = self.get_auto_remove_tmp_dir()
+ testargs = f"""
+ run_mlm_no_trainer.py
+ --model_name_or_path distilroberta-base
+ --train_file ./tests/fixtures/sample_text.txt
+ --validation_file ./tests/fixtures/sample_text.txt
+ --output_dir {tmp_dir}
+ --num_train_epochs=1
+ --checkpointing_steps epoch
+ """.split()
+
+ with patch.object(sys, "argv", testargs):
+ run_mlm_no_trainer.main()
+ result = get_results(tmp_dir)
+ self.assertLess(result["perplexity"], 42)
+ self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
+
+ def test_run_ner_no_trainer(self):
+ stream_handler = logging.StreamHandler(sys.stdout)
+ logger.addHandler(stream_handler)
+
+ # with so little data distributed training needs more epochs to get the score on par with 0/1 gpu
+ epochs = 7 if get_gpu_count() > 1 else 2
+
+ tmp_dir = self.get_auto_remove_tmp_dir()
+ testargs = f"""
+ run_ner_no_trainer.py
+ --model_name_or_path bert-base-uncased
+ --train_file tests/fixtures/tests_samples/conll/sample.json
+ --validation_file tests/fixtures/tests_samples/conll/sample.json
+ --output_dir {tmp_dir}
+ --learning_rate=2e-4
+ --per_device_train_batch_size=2
+ --per_device_eval_batch_size=2
+ --num_train_epochs={epochs}
+ --seed 7
+ --checkpointing_steps epoch
+ """.split()
+
+ with patch.object(sys, "argv", testargs):
+ run_ner_no_trainer.main()
+ result = get_results(tmp_dir)
+ self.assertGreaterEqual(result["eval_accuracy"], 0.75)
+ self.assertLess(result["train_loss"], 0.5)
+ self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
+
+ def test_run_squad_no_trainer(self):
+ stream_handler = logging.StreamHandler(sys.stdout)
+ logger.addHandler(stream_handler)
+
+ tmp_dir = self.get_auto_remove_tmp_dir()
+ testargs = f"""
+ run_qa_no_trainer.py
+ --model_name_or_path bert-base-uncased
+ --version_2_with_negative=False
+ --train_file tests/fixtures/tests_samples/SQUAD/sample.json
+ --validation_file tests/fixtures/tests_samples/SQUAD/sample.json
+ --output_dir {tmp_dir}
+ --max_train_steps=10
+ --num_warmup_steps=2
+ --learning_rate=2e-4
+ --per_device_train_batch_size=2
+ --per_device_eval_batch_size=1
+ --checkpointing_steps epoch
+ """.split()
+
+ with patch.object(sys, "argv", testargs):
+ run_squad_no_trainer.main()
+ result = get_results(tmp_dir)
+ self.assertGreaterEqual(result["eval_f1"], 30)
+ self.assertGreaterEqual(result["eval_exact"], 30)
+ self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
+
+ def test_run_swag_no_trainer(self):
+ stream_handler = logging.StreamHandler(sys.stdout)
+ logger.addHandler(stream_handler)
+
+ tmp_dir = self.get_auto_remove_tmp_dir()
+ testargs = f"""
+ run_swag_no_trainer.py
+ --model_name_or_path bert-base-uncased
+ --train_file tests/fixtures/tests_samples/swag/sample.json
+ --validation_file tests/fixtures/tests_samples/swag/sample.json
+ --output_dir {tmp_dir}
+ --max_train_steps=20
+ --num_warmup_steps=2
+ --learning_rate=2e-4
+ --per_device_train_batch_size=2
+ --per_device_eval_batch_size=1
+ """.split()
+
+ with patch.object(sys, "argv", testargs):
+ run_swag_no_trainer.main()
+ result = get_results(tmp_dir)
+ self.assertGreaterEqual(result["eval_accuracy"], 0.8)
+
+ @slow
+ def test_run_summarization_no_trainer(self):
+ stream_handler = logging.StreamHandler(sys.stdout)
+ logger.addHandler(stream_handler)
+
+ tmp_dir = self.get_auto_remove_tmp_dir()
+ testargs = f"""
+ run_summarization_no_trainer.py
+ --model_name_or_path t5-small
+ --train_file tests/fixtures/tests_samples/xsum/sample.json
+ --validation_file tests/fixtures/tests_samples/xsum/sample.json
+ --output_dir {tmp_dir}
+ --max_train_steps=50
+ --num_warmup_steps=8
+ --learning_rate=2e-4
+ --per_device_train_batch_size=2
+ --per_device_eval_batch_size=1
+ --checkpointing_steps epoch
+ """.split()
+
+ with patch.object(sys, "argv", testargs):
+ run_summarization_no_trainer.main()
+ result = get_results(tmp_dir)
+ self.assertGreaterEqual(result["eval_rouge1"], 10)
+ self.assertGreaterEqual(result["eval_rouge2"], 2)
+ self.assertGreaterEqual(result["eval_rougeL"], 7)
+ self.assertGreaterEqual(result["eval_rougeLsum"], 7)
+ self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
+
+ @slow
+ def test_run_translation_no_trainer(self):
+ stream_handler = logging.StreamHandler(sys.stdout)
+ logger.addHandler(stream_handler)
+
+ tmp_dir = self.get_auto_remove_tmp_dir()
+ testargs = f"""
+ run_translation_no_trainer.py
+ --model_name_or_path sshleifer/student_marian_en_ro_6_1
+ --source_lang en
+ --target_lang ro
+ --train_file tests/fixtures/tests_samples/wmt16/sample.json
+ --validation_file tests/fixtures/tests_samples/wmt16/sample.json
+ --output_dir {tmp_dir}
+ --max_train_steps=50
+ --num_warmup_steps=8
+ --learning_rate=3e-3
+ --per_device_train_batch_size=2
+ --per_device_eval_batch_size=1
+ --source_lang en_XX
+ --target_lang ro_RO
+ --checkpointing_steps epoch
+ """.split()
+
+ with patch.object(sys, "argv", testargs):
+ run_translation_no_trainer.main()
+ result = get_results(tmp_dir)
+ self.assertGreaterEqual(result["eval_bleu"], 30)
+ self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py
index 5bd1d1fa1e5c..2c7fa186d0e0 100644
--- a/examples/pytorch/text-classification/run_glue_no_trainer.py
+++ b/examples/pytorch/text-classification/run_glue_no_trainer.py
@@ -14,6 +14,7 @@
# limitations under the License.
""" Finetuning a 🤗 Transformers model for sequence classification on GLUE."""
import argparse
+import json
import logging
import math
import os
@@ -150,7 +151,6 @@ def parse_args():
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
)
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
- parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
parser.add_argument(
"--checkpointing_steps",
type=str,
@@ -488,7 +488,10 @@ def preprocess_function(examples):
if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
- accelerator.save_state(f"step_{completed_steps}")
+ output_dir = f"step_{completed_steps}"
+ if args.output_dir is not None:
+ output_dir = os.path.join(args.output_dir, output_dir)
+ accelerator.save_state(output_dir)
if completed_steps >= args.max_train_steps:
break
@@ -526,7 +529,10 @@ def preprocess_function(examples):
)
if args.checkpointing_steps == "epoch":
- accelerator.save_state(f"epoch_{epoch}")
+ output_dir = f"epoch_{epoch}"
+ if args.output_dir is not None:
+ output_dir = os.path.join(args.output_dir, output_dir)
+ accelerator.save_state(output_dir)
if args.output_dir is not None:
accelerator.wait_for_everyone()
@@ -557,6 +563,10 @@ def preprocess_function(examples):
eval_metric = metric.compute()
logger.info(f"mnli-mm: {eval_metric}")
+ if args.output_dir is not None:
+ with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
+ json.dump({"eval_accuracy": eval_metric["accuracy"]}, f)
+
if __name__ == "__main__":
main()
diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py
index 57d3ceee905d..ab9fcce6df95 100755
--- a/examples/pytorch/token-classification/run_ner_no_trainer.py
+++ b/examples/pytorch/token-classification/run_ner_no_trainer.py
@@ -19,6 +19,7 @@
"""
import argparse
+import json
import logging
import math
import os
@@ -639,7 +640,10 @@ def compute_metrics():
if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
- accelerator.save_state(f"step_{completed_steps}")
+ output_dir = f"step_{completed_steps}"
+ if args.output_dir is not None:
+ output_dir = os.path.join(args.output_dir, output_dir)
+ accelerator.save_state(output_dir)
if completed_steps >= args.max_train_steps:
break
@@ -662,7 +666,6 @@ def compute_metrics():
references=refs,
) # predictions and preferences are expected to be a nested list of labels, not label_ids
- # eval_metric = metric.compute()
eval_metric = compute_metrics()
accelerator.print(f"epoch {epoch}:", eval_metric)
if args.with_tracking:
@@ -686,7 +689,10 @@ def compute_metrics():
)
if args.checkpointing_steps == "epoch":
- accelerator.save_state(f"epoch_{epoch}")
+ output_dir = f"epoch_{epoch}"
+ if args.output_dir is not None:
+ output_dir = os.path.join(args.output_dir, output_dir)
+ accelerator.save_state(output_dir)
if args.output_dir is not None:
accelerator.wait_for_everyone()
@@ -697,6 +703,9 @@ def compute_metrics():
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
+ with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
+ json.dump({"eval_accuracy": eval_metric["accuracy"], "train_loss": float(loss.cpu().detach().numpy())}, f)
+
if __name__ == "__main__":
main()
diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py
index bf7e15ae4dd1..034387582b84 100644
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@@ -19,6 +19,7 @@
# You can also adapt this script on your own text translation task. Pointers for this are left as comments.
import argparse
+import json
import logging
import math
import os
@@ -586,7 +587,10 @@ def postprocess_text(preds, labels):
if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
- accelerator.save_state(f"step_{completed_steps}")
+ output_dir = f"step_{completed_steps}"
+ if args.output_dir is not None:
+ output_dir = os.path.join(args.output_dir, output_dir)
+ accelerator.save_state(output_dir)
if completed_steps >= args.max_train_steps:
break
@@ -653,7 +657,10 @@ def postprocess_text(preds, labels):
)
if args.checkpointing_steps == "epoch":
- accelerator.save_state(f"epoch_{epoch}")
+            output_dir = f"epoch_{epoch}"
+ if args.output_dir is not None:
+ output_dir = os.path.join(args.output_dir, output_dir)
+ accelerator.save_state(output_dir)
if args.output_dir is not None:
accelerator.wait_for_everyone()
@@ -663,6 +670,8 @@ def postprocess_text(preds, labels):
tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
+ with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
+ json.dump({"eval_bleu": eval_metric["score"]}, f)
if __name__ == "__main__":
diff --git a/examples/research_projects/tapex/README.md b/examples/research_projects/tapex/README.md
new file mode 100644
index 000000000000..7d98901e281e
--- /dev/null
+++ b/examples/research_projects/tapex/README.md
@@ -0,0 +1,288 @@
+
+
+# Run Table Tasks with TAPEX
+
+TAPEX is a table pre-training approach for table-related tasks. By learning a neural SQL executor over a synthetic corpus on top of generative language models (e.g., BART), it achieves state-of-the-art performance on several table-based question answering benchmarks and table-based fact verification benchmarks. More details can be found in the original paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/pdf/2107.07653.pdf).
+
+> If you are also familiar with [fairseq](https://github.com/pytorch/fairseq), you may find [the official implementation](https://github.com/microsoft/Table-Pretraining) useful as well, since it is built on that framework.
+
+## Table Question Answering Tasks
+
+### What is Table Question Answering
+
+
+
+The task of Table Question Answering (TableQA) is to empower machines to answer users' questions over a given table. The resulting answer(s) can be a region in the table, or a number calculated by applying aggregation operators to a specific region.
+
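+To make the task concrete, here is a minimal inference sketch using the `TapexTokenizer` and a BART-based TAPEX model. It assumes the fine-tuned checkpoint `microsoft/tapex-base-finetuned-wikisql` (referenced in the evaluation section below); the toy table and the expected output are purely illustrative:
+
+```python
+import pandas as pd
+
+from transformers import BartForConditionalGeneration, TapexTokenizer
+
+tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base-finetuned-wikisql")
+model = BartForConditionalGeneration.from_pretrained("microsoft/tapex-base-finetuned-wikisql")
+
+# TAPEX consumes tables as pandas DataFrames with string cells
+table = pd.DataFrame.from_dict({"year": ["2004", "2008", "2012"], "city": ["athens", "beijing", "london"]})
+query = "in which year did beijing host the olympic games?"
+
+# the tokenizer linearizes the table together with the question into a single sequence
+encoding = tokenizer(table=table, query=query, return_tensors="pt")
+outputs = model.generate(**encoding)
+
+# the answer is generated as free-form text, e.g. something like [' 2008']
+print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
+```
+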
+### What Questions Can be Answered
+
+Thanks to the power of generative models, TAPEX can handle almost any kind of question over tables (given suitable training data). Below are some typical questions and their answers taken from [WikiTableQuestions](https://nlp.stanford.edu/blog/wikitablequestions-a-complex-real-world-question-understanding-dataset).
+
+| Question | Answer |
+| :---: | :---: |
+| What is the years won for each team? | 2004, 2008, 2012 |
+| How long did Taiki Tsuchiya last? | 4:27 |
+| What is the total amount of matches drawn? | 1 |
+| Besides Tiger Woods, what other player won between 2007 and 2009? | Camilo Villegas |
+| What was the last Baekje Temple? | Uija |
+| What is the difference between White voters and Black voters in 1948? | 0 |
+| What is the average number of sailors for each country during the worlds qualification tournament? | 2 |
+
+
+### How to Fine-tune TAPEX on TableQA
+
+We provide a fine-tuning script for TAPEX on the TableQA task using the [WikiSQL](https://github.com/salesforce/WikiSQL) benchmark.
+The script is customized for TAPEX models and can easily be adapted to other benchmarks such as WikiTableQuestions
+(only minor tweaks to the function `preprocess_tableqa_function` are needed).
+
+#### TAPEX-Base on WikiSQL
+
+Here is how to run the script on WikiSQL with `tapex-base`:
+> The default hyper-parameters should allow you to reproduce our reported tapex-base results within a memory budget of 16GB on a single GPU. If you have more GPUs, you can reduce `gradient_accumulation_steps` accordingly.
+
+```bash
+export EXP_NAME=wikisql_tapex_base
+
+python run_wikisql_with_tapex.py \
+ --do_train \
+ --do_eval \
+ --output_dir $EXP_NAME \
+ --model_name_or_path microsoft/tapex-base \
+ --overwrite_output_dir \
+ --per_device_train_batch_size 4 \
+ --gradient_accumulation_steps 8 \
+ --per_device_eval_batch_size 4 \
+ --learning_rate 3e-5 \
+ --logging_steps 10 \
+ --eval_steps 1000 \
+ --save_steps 1000 \
+ --warmup_steps 1000 \
+ --evaluation_strategy steps \
+ --predict_with_generate \
+ --num_beams 5 \
+ --weight_decay 1e-2 \
+ --label_smoothing_factor 0.1 \
+ --max_steps 20000
+```
+
+#### TAPEX-Large on WikiSQL
+
+Here is how to run the script on WikiSQL with `tapex-large`:
+> The default hyper-parameters should allow you to reproduce our reported tapex-large results within a memory budget of 16GB on a single GPU with fp16. If you have more GPUs, you can reduce `gradient_accumulation_steps` accordingly. If you have not installed apex or another mixed-precision-training library, you can disable the `predict_with_generate` option to save GPU memory and evaluate the model manually once fine-tuning has finished, or simply pick the last checkpoint, which usually performs well enough on this dataset.
+
+```bash
+export EXP_NAME=wikisql_tapex_large
+
+python run_wikisql_with_tapex.py \
+ --do_train \
+ --do_eval \
+ --output_dir $EXP_NAME \
+ --model_name_or_path microsoft/tapex-large \
+ --overwrite_output_dir \
+ --per_device_train_batch_size 1 \
+ --gradient_accumulation_steps 32 \
+ --per_device_eval_batch_size 4 \
+ --learning_rate 3e-5 \
+ --logging_steps 10 \
+ --eval_steps 1000 \
+ --save_steps 1000 \
+ --warmup_steps 1000 \
+ --evaluation_strategy steps \
+ --predict_with_generate \
+ --num_beams 5 \
+ --weight_decay 1e-2 \
+ --label_smoothing_factor 0.1 \
+ --max_steps 20000 \
+ --fp16
+```
+
+#### TAPEX-Base on WikiTableQuestions
+
+Here is how to run the script on WikiTableQuestions with `tapex-base`:
+> The default hyper-parameters should allow you to reproduce our reported tapex-base results within a memory budget of 16GB on a single GPU. If you have more GPUs, you can reduce `gradient_accumulation_steps` accordingly.
+
+```bash
+export EXP_NAME=wikitablequestions_tapex_base
+
+python run_wikitablequestions_with_tapex.py \
+ --do_train \
+ --do_eval \
+ --output_dir $EXP_NAME \
+ --model_name_or_path microsoft/tapex-base \
+ --overwrite_output_dir \
+ --per_device_train_batch_size 4 \
+ --gradient_accumulation_steps 8 \
+ --per_device_eval_batch_size 4 \
+ --learning_rate 3e-5 \
+ --logging_steps 10 \
+ --eval_steps 1000 \
+ --save_steps 1000 \
+ --warmup_steps 1000 \
+ --evaluation_strategy steps \
+ --predict_with_generate \
+ --num_beams 5 \
+ --weight_decay 1e-2 \
+ --label_smoothing_factor 0.1 \
+ --max_steps 20000
+```
+
+#### TAPEX-Large on WikiTableQuestions
+
+Here is how to run the script on WikiTableQuestions with `tapex-large`:
+> The default hyper-parameters should allow you to reproduce our reported tapex-large results within a memory budget of 16GB on a single GPU with fp16. If you have more GPUs, you can reduce `gradient_accumulation_steps` accordingly. If you have not installed apex or another mixed-precision-training library, you can reduce `per_device_train_batch_size` and `per_device_eval_batch_size` and try again, disable the `predict_with_generate` option to save GPU memory and evaluate the model manually once fine-tuning has finished, or simply pick the last checkpoint, which usually performs well enough on this dataset.
+
+```bash
+export EXP_NAME=wikitablequestions_tapex_large
+
+python run_wikitablequestions_with_tapex.py \
+ --do_train \
+ --do_eval \
+ --output_dir $EXP_NAME \
+ --model_name_or_path microsoft/tapex-large \
+ --overwrite_output_dir \
+ --per_device_train_batch_size 2 \
+ --gradient_accumulation_steps 12 \
+ --per_device_eval_batch_size 4 \
+ --learning_rate 3e-5 \
+ --logging_steps 10 \
+ --eval_steps 1000 \
+ --save_steps 1000 \
+ --warmup_steps 1000 \
+ --evaluation_strategy steps \
+ --predict_with_generate \
+ --num_beams 5 \
+ --weight_decay 1e-2 \
+ --label_smoothing_factor 0.1 \
+ --max_steps 20000 \
+ --fp16
+```
+
+### How to Evaluate TAPEX Fine-tuned Models on TableQA
+
+We provide fine-tuned model weights to reproduce our results. You can evaluate them using the following command:
+> You can also replace `microsoft/tapex-base-finetuned-wikisql` with your local directory to evaluate your own fine-tuned models. Note that for larger models you should reduce `per_device_eval_batch_size` to fit the memory budget.
+
+```bash
+export EXP_NAME=wikisql_tapex_base_eval
+
+python run_wikisql_with_tapex.py \
+ --do_eval \
+ --model_name_or_path microsoft/tapex-base-finetuned-wikisql \
+ --output_dir $EXP_NAME \
+ --per_device_eval_batch_size 4 \
+ --predict_with_generate \
+ --num_beams 5
+```
+
+## Table Fact Verification Tasks
+
+### What is Table Fact Verification
+
+
+
+The task of Table Fact Verification (TableFV) is to empower machines to verify whether a statement is supported by the facts in a given table. The result is a binary classification: `1` (entailed) or `0` (refused).
+
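+Analogously to TableQA above, here is a minimal inference sketch for TableFV, assuming the fine-tuned checkpoint `microsoft/tapex-base-finetuned-tabfact` (referenced in the evaluation section below); the toy table and statement are purely illustrative:
+
+```python
+import pandas as pd
+
+from transformers import BartForSequenceClassification, TapexTokenizer
+
+tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base-finetuned-tabfact")
+model = BartForSequenceClassification.from_pretrained("microsoft/tapex-base-finetuned-tabfact")
+
+table = pd.DataFrame.from_dict({"year": ["2004", "2008", "2012"], "city": ["athens", "beijing", "london"]})
+statement = "beijing hosted the olympic games in 2008"
+
+# the statement plays the role of the query when linearizing the table
+encoding = tokenizer(table=table, query=statement, return_tensors="pt")
+outputs = model(**encoding)
+
+# index 1 corresponds to "Entailed" and index 0 to "Refused",
+# mirroring the label mapping set in run_tabfact_with_tapex.py
+predicted_class = outputs.logits.argmax(dim=-1).item()
+print(model.config.id2label[predicted_class])
+```
+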
+### How to Fine-tune TAPEX on TableFV
+
+#### TAPEX-Base on TabFact
+
+We provide a fine-tuning script for TAPEX on the TableFV task using the [TabFact](https://github.com/wenhuchen/Table-Fact-Checking) benchmark.
+
+Here is how to run the script on TabFact:
+> The default hyper-parameters should allow you to reproduce our reported tapex-base results within a memory budget of 16GB on a single GPU. If you have more GPUs, you can reduce `gradient_accumulation_steps` accordingly. Note that `eval_accumulation_steps` is necessary; otherwise GPU memory may be exhausted during evaluation.
+
+```bash
+export EXP_NAME=tabfact_tapex_base
+
+python run_tabfact_with_tapex.py \
+ --do_train \
+ --do_eval \
+ --output_dir $EXP_NAME \
+ --model_name_or_path microsoft/tapex-base \
+ --overwrite_output_dir \
+ --per_device_train_batch_size 3 \
+ --gradient_accumulation_steps 16 \
+ --per_device_eval_batch_size 12 \
+ --eval_accumulation_steps 6 \
+    --warmup_steps 1000 \
+ --logging_steps 10 \
+ --learning_rate 3e-5 \
+ --eval_steps 1000 \
+ --save_steps 1000 \
+ --evaluation_strategy steps \
+ --weight_decay 1e-2 \
+ --max_steps 30000 \
+ --max_grad_norm 0.1
+```
+
+#### TAPEX-Large on TabFact
+
+Here is how to run the script on TabFact:
+> The default hyper-parameters should allow you to reproduce our reported tapex-large results within a memory budget of 24GB on a single GPU. Unfortunately, the memory consumption cannot be reduced much further, since a model input in TabFact usually contains around 1000 tokens. If you have more GPUs, you can reduce `gradient_accumulation_steps` accordingly. Note that `eval_accumulation_steps` is necessary; otherwise GPU memory may be exhausted during evaluation.
+
+```bash
+export EXP_NAME=tabfact_tapex_large
+
+python run_tabfact_with_tapex.py \
+ --do_train \
+ --do_eval \
+ --output_dir $EXP_NAME \
+ --model_name_or_path microsoft/tapex-large \
+ --overwrite_output_dir \
+ --per_device_train_batch_size 2 \
+ --gradient_accumulation_steps 18 \
+ --per_device_eval_batch_size 4 \
+ --eval_accumulation_steps 12 \
+    --warmup_steps 1000 \
+ --logging_steps 10 \
+ --learning_rate 3e-5 \
+ --eval_steps 1000 \
+ --save_steps 1000 \
+ --evaluation_strategy steps \
+ --weight_decay 1e-2 \
+ --max_steps 30000 \
+ --max_grad_norm 0.1
+```
+
+### How to Evaluate TAPEX Fine-tuned Models on TableFV
+
+We provide fine-tuned model weights to reproduce our results. You can evaluate them using the following command:
+> You can also replace `microsoft/tapex-base-finetuned-tabfact` with your local directory to evaluate your own fine-tuned models. Note that for larger models you should reduce `per_device_eval_batch_size` to fit the memory budget.
+
+```bash
+export EXP_NAME=tabfact_tapex_base_eval
+
+python run_tabfact_with_tapex.py \
+ --do_eval \
+ --model_name_or_path microsoft/tapex-base-finetuned-tabfact \
+ --output_dir $EXP_NAME \
+ --per_device_eval_batch_size 12 \
+ --eval_accumulation_steps 6
+```
+
+## Reproduced Results
+
+We obtain the following results on the dev sets of the benchmarks with the previous commands:
+
+| Task | Model Size | Metric | Result |
+|:---:|:---:|:---:|:---:|
+| WikiSQL (Weak) | Base | Denotation Accuracy | 88.1 |
+| WikiSQL (Weak) | Large | Denotation Accuracy | 89.5 |
+| WikiTableQuestions | Base | Denotation Accuracy | 47.1 |
+| WikiTableQuestions | Large | Denotation Accuracy | 57.2 |
+| TabFact | Base | Accuracy | 78.7 |
+| TabFact | Large | Accuracy | 83.6 |
diff --git a/examples/research_projects/tapex/requirements.txt b/examples/research_projects/tapex/requirements.txt
new file mode 100644
index 000000000000..2379012a9b23
--- /dev/null
+++ b/examples/research_projects/tapex/requirements.txt
@@ -0,0 +1,4 @@
+numpy
+datasets
+pandas
+nltk
\ No newline at end of file
diff --git a/examples/research_projects/tapex/run_tabfact_with_tapex.py b/examples/research_projects/tapex/run_tabfact_with_tapex.py
new file mode 100644
index 000000000000..0ed573ad9c1a
--- /dev/null
+++ b/examples/research_projects/tapex/run_tabfact_with_tapex.py
@@ -0,0 +1,459 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2022 The Microsoft and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Fine-tuning the library models for tapex on table-based fact verification tasks.
+Adapted from script: https://github.com/huggingface/transformers/blob/master/examples/pytorch/text-classification/run_glue.py
+"""
+
+import logging
+import os
+import random
+import sys
+from dataclasses import dataclass, field
+from typing import Optional
+
+import datasets
+import numpy as np
+import pandas as pd
+from datasets import load_dataset
+
+import transformers
+from transformers import (
+ AutoConfig,
+ BartForSequenceClassification,
+ DataCollatorWithPadding,
+ EvalPrediction,
+ HfArgumentParser,
+ TapexTokenizer,
+ Trainer,
+ TrainingArguments,
+ default_data_collator,
+ set_seed,
+)
+from transformers.trainer_utils import get_last_checkpoint
+from transformers.utils import check_min_version
+from transformers.utils.versions import require_version
+
+
+# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+check_min_version("4.17.0.dev0")
+
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DataTrainingArguments:
+ """
+ Arguments pertaining to what data we are going to input our model for training and eval.
+
+ Using `HfArgumentParser` we can turn this class
+ into argparse arguments to be able to specify them on
+ the command line.
+ """
+
+ dataset_name: Optional[str] = field(
+ default="tab_fact", metadata={"help": "The name of the dataset to use (via the datasets library)."}
+ )
+ dataset_config_name: Optional[str] = field(
+ default="tab_fact",
+ metadata={"help": "The configuration name of the dataset to use (via the datasets library)."},
+ )
+ max_seq_length: int = field(
+ default=1024,
+ metadata={
+ "help": "The maximum total input sequence length after tokenization. Sequences longer "
+ "than this will be truncated, sequences shorter will be padded."
+ },
+ )
+ overwrite_cache: bool = field(
+ default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
+ )
+ pad_to_max_length: bool = field(
+ default=False,
+ metadata={
+ "help": "Whether to pad all samples to `max_seq_length`. "
+ "If False, will pad the samples dynamically when batching to the maximum length in the batch."
+ },
+ )
+ max_train_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+ "value if set."
+ },
+ )
+ max_eval_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+ "value if set."
+ },
+ )
+ max_predict_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
+ "value if set."
+ },
+ )
+ train_file: Optional[str] = field(
+ default=None, metadata={"help": "A csv or a json file containing the training data."}
+ )
+ validation_file: Optional[str] = field(
+ default=None, metadata={"help": "A csv or a json file containing the validation data."}
+ )
+ test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."})
+
+ def __post_init__(self):
+ if self.dataset_name is not None:
+ pass
+ elif self.train_file is None or self.validation_file is None:
+ raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.")
+ else:
+ train_extension = self.train_file.split(".")[-1]
+ assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
+ validation_extension = self.validation_file.split(".")[-1]
+ assert (
+ validation_extension == train_extension
+ ), "`validation_file` should have the same extension (csv or json) as `train_file`."
+
+
+@dataclass
+class ModelArguments:
+ """
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+ """
+
+ model_name_or_path: str = field(
+ default=None, metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+ )
+ config_name: Optional[str] = field(
+ default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+ )
+ tokenizer_name: Optional[str] = field(
+ default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+ )
+ cache_dir: Optional[str] = field(
+ default=None,
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+ )
+ use_fast_tokenizer: bool = field(
+ default=True,
+ metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+ )
+ model_revision: str = field(
+ default="main",
+ metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+ )
+ use_auth_token: bool = field(
+ default=False,
+ metadata={
+ "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
+ "with private models)."
+ },
+ )
+
+
+def main():
+ # See all possible arguments in src/transformers/training_args.py
+ # or by passing the --help flag to this script.
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+ # If we pass only one argument to the script and it's the path to a json file,
+ # let's parse it to get our arguments.
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+ else:
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+ # Setup logging
+ logging.basicConfig(
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+ datefmt="%m/%d/%Y %H:%M:%S",
+ handlers=[logging.StreamHandler(sys.stdout)],
+ )
+
+ log_level = training_args.get_process_log_level()
+ logger.setLevel(log_level)
+ datasets.utils.logging.set_verbosity(log_level)
+ transformers.utils.logging.set_verbosity(log_level)
+ transformers.utils.logging.enable_default_handler()
+ transformers.utils.logging.enable_explicit_format()
+
+ # Log on each process the small summary:
+ logger.warning(
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+        + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+ )
+ logger.info(f"Training/evaluation parameters {training_args}")
+
+ # Detecting last checkpoint.
+ last_checkpoint = None
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+ raise ValueError(
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+ "Use --overwrite_output_dir to overcome."
+ )
+ elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+ logger.info(
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+ )
+
+ # Set seed before initializing model.
+ set_seed(training_args.seed)
+
+ # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
+ # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub).
+ #
+ # For JSON files, this script will use the `question` column for the input question and `table` column for the corresponding table.
+ #
+ # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this
+ # single column. You can easily tweak this behavior (see below)
+ #
+ # In distributed training, the load_dataset function guarantee that only one local process can concurrently
+ # download the dataset.
+ if data_args.dataset_name is not None:
+ # Downloading and loading a dataset from the hub.
+ raw_datasets = load_dataset(
+ data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+ )
+ else:
+ # Loading a dataset from your local files.
+ # CSV/JSON training and evaluation files are needed.
+ data_files = {"train": data_args.train_file, "validation": data_args.validation_file}
+
+ # Get the test dataset: you can provide your own CSV/JSON test file (see below)
+ # when you use `do_predict` without specifying a GLUE benchmark task.
+ if training_args.do_predict:
+ if data_args.test_file is not None:
+ train_extension = data_args.train_file.split(".")[-1]
+ test_extension = data_args.test_file.split(".")[-1]
+ assert (
+ test_extension == train_extension
+ ), "`test_file` should have the same extension (csv or json) as `train_file`."
+ data_files["test"] = data_args.test_file
+ else:
+ raise ValueError("Need either a GLUE task or a test file for `do_predict`.")
+
+ for key in data_files.keys():
+ logger.info(f"load a local file for {key}: {data_files[key]}")
+
+ if data_args.train_file.endswith(".csv"):
+ # Loading a dataset from local csv files
+ raw_datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir)
+ else:
+ # Loading a dataset from local json files
+ raw_datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir)
+ # See more about loading any type of standard or custom dataset at
+ # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+ # Labels
+ label_list = raw_datasets["train"].features["label"].names
+ num_labels = len(label_list)
+
+ # Load pretrained model and tokenizer
+ #
+ # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
+ # download model & vocab.
+ config = AutoConfig.from_pretrained(
+ model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+ num_labels=num_labels,
+ cache_dir=model_args.cache_dir,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+ # load tapex tokenizer
+ tokenizer = TapexTokenizer.from_pretrained(
+ model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+ cache_dir=model_args.cache_dir,
+ use_fast=model_args.use_fast_tokenizer,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ add_prefix_space=True,
+ )
+ model = BartForSequenceClassification.from_pretrained(
+ model_args.model_name_or_path,
+ from_tf=bool(".ckpt" in model_args.model_name_or_path),
+ config=config,
+ cache_dir=model_args.cache_dir,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+
+ # Padding strategy
+ if data_args.pad_to_max_length:
+ padding = "max_length"
+ else:
+ # We will pad later, dynamically at batch creation, to the max sequence length in each batch
+ padding = False
+
+ # Some models have set the order of the labels to use, so let's make sure we do use it.
+ model.config.label2id = {"Refused": 0, "Entailed": 1}
+ model.config.id2label = {0: "Refused", 1: "Entailed"}
+
+ if data_args.max_seq_length > tokenizer.model_max_length:
+ logger.warning(
+ f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+ f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+ )
+ max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+ def preprocess_tabfact_function(examples):
+ # Tokenize the texts
+ def _convert_table_text_to_pandas(_table_text):
+            """Builds a structured pandas DataFrame from _table_text.
+ An example _table_text can be: round#clubs remaining\nfirst round#156\n
+ """
+ _table_content = [_table_row.split("#") for _table_row in _table_text.strip("\n").split("\n")]
+ _table_pd = pd.DataFrame.from_records(_table_content[1:], columns=_table_content[0])
+ return _table_pd
+
+ questions = examples["statement"]
+ tables = list(map(_convert_table_text_to_pandas, examples["table_text"]))
+ result = tokenizer(tables, questions, padding=padding, max_length=max_seq_length, truncation=True)
+
+ result["label"] = examples["label"]
+ return result
+
+ with training_args.main_process_first(desc="dataset map pre-processing"):
+ raw_datasets = raw_datasets.map(
+ preprocess_tabfact_function,
+ batched=True,
+ load_from_cache_file=not data_args.overwrite_cache,
+ desc="Running tokenizer on dataset",
+ )
+ if training_args.do_train:
+ if "train" not in raw_datasets:
+ raise ValueError("--do_train requires a train dataset")
+ train_dataset = raw_datasets["train"]
+ if data_args.max_train_samples is not None:
+ train_dataset = train_dataset.select(range(data_args.max_train_samples))
+
+ if training_args.do_eval:
+ if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
+ raise ValueError("--do_eval requires a validation dataset")
+ eval_dataset = raw_datasets["validation"]
+ if data_args.max_eval_samples is not None:
+ eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+
+ if training_args.do_predict or data_args.test_file is not None:
+ if "test" not in raw_datasets and "test_matched" not in raw_datasets:
+ raise ValueError("--do_predict requires a test dataset")
+ predict_dataset = raw_datasets["test"]
+ if data_args.max_predict_samples is not None:
+ predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+
+ # Log a few random samples from the training set:
+ if training_args.do_train:
+ for index in random.sample(range(len(train_dataset)), 3):
+ logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+
+ # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
+ # predictions and label_ids field) and has to return a dictionary string to float.
+ def compute_metrics(p: EvalPrediction):
+ preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
+ preds = np.argmax(preds, axis=1)
+ return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}
+
+ # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
+ if data_args.pad_to_max_length:
+ data_collator = default_data_collator
+ elif training_args.fp16:
+ data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
+ else:
+ data_collator = None
+
+ # Initialize our Trainer
+ trainer = Trainer(
+ model=model,
+ args=training_args,
+ train_dataset=train_dataset if training_args.do_train else None,
+ eval_dataset=eval_dataset if training_args.do_eval else None,
+ compute_metrics=compute_metrics,
+ tokenizer=tokenizer,
+ data_collator=data_collator,
+ )
+
+ # Training
+ if training_args.do_train:
+ checkpoint = None
+ if training_args.resume_from_checkpoint is not None:
+ checkpoint = training_args.resume_from_checkpoint
+ elif last_checkpoint is not None:
+ checkpoint = last_checkpoint
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
+ metrics = train_result.metrics
+ max_train_samples = (
+ data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
+ )
+ metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+
+ trainer.save_model() # Saves the tokenizer too for easy upload
+
+ trainer.log_metrics("train", metrics)
+ trainer.save_metrics("train", metrics)
+ trainer.save_state()
+
+ # Evaluation
+ if training_args.do_eval:
+ logger.info("*** Evaluate ***")
+
+ metrics = trainer.evaluate(eval_dataset=eval_dataset)
+ max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+ metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
+
+ trainer.log_metrics("eval", metrics)
+ trainer.save_metrics("eval", metrics)
+
+ if training_args.do_predict:
+ logger.info("*** Predict ***")
+
+ # Removing the `label` columns because it contains -1 and Trainer won't like that.
+ predict_dataset = predict_dataset.remove_columns("label")
+ predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions
+ predictions = np.argmax(predictions, axis=1)
+
+ output_predict_file = os.path.join(training_args.output_dir, "predict_results_tabfact.txt")
+ if trainer.is_world_process_zero():
+ with open(output_predict_file, "w") as writer:
+ logger.info("***** Predict Results *****")
+ writer.write("index\tprediction\n")
+ for index, item in enumerate(predictions):
+ item = label_list[item]
+ writer.write(f"{index}\t{item}\n")
+
+ kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"}
+
+ if training_args.push_to_hub:
+ trainer.push_to_hub(**kwargs)
+ else:
+ trainer.create_model_card(**kwargs)
+
+
+def _mp_fn(index):
+ # For xla_spawn (TPUs)
+ main()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/research_projects/tapex/run_wikisql_with_tapex.py b/examples/research_projects/tapex/run_wikisql_with_tapex.py
new file mode 100644
index 000000000000..594c83cb6be5
--- /dev/null
+++ b/examples/research_projects/tapex/run_wikisql_with_tapex.py
@@ -0,0 +1,629 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2022 The Microsoft and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Fine-tuning the library models for tapex on table-based question answering tasks.
+Adapted from script: https://github.com/huggingface/transformers/blob/master/examples/pytorch/summarization/run_summarization.py
+"""
+
+import logging
+import os
+import sys
+from collections import defaultdict
+from copy import deepcopy
+from dataclasses import dataclass, field
+from functools import partial
+from typing import List, Optional
+
+import nltk # Here to have a nice missing dependency error message early on
+import numpy as np
+import pandas as pd
+from datasets import load_dataset
+
+import transformers
+from filelock import FileLock
+from transformers import (
+ AutoConfig,
+ BartForConditionalGeneration,
+ DataCollatorForSeq2Seq,
+ HfArgumentParser,
+ Seq2SeqTrainer,
+ Seq2SeqTrainingArguments,
+ TapexTokenizer,
+ set_seed,
+)
+from transformers.file_utils import is_offline_mode
+from transformers.trainer_utils import get_last_checkpoint, is_main_process
+from transformers.utils import check_min_version
+from wikisql_utils import _TYPE_CONVERTER, retrieve_wikisql_query_answer_tapas
+
+
+# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+check_min_version("4.17.0.dev0")
+
+logger = logging.getLogger(__name__)
+
+try:
+ nltk.data.find("tokenizers/punkt")
+except (LookupError, OSError):
+ if is_offline_mode():
+ raise LookupError(
+ "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files"
+ )
+ with FileLock(".lock") as lock:
+ nltk.download("punkt", quiet=True)
+
+
+@dataclass
+class ModelArguments:
+ """
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+ """
+
+ model_name_or_path: str = field(
+ metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"},
+ )
+ config_name: Optional[str] = field(
+ default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+ )
+ tokenizer_name: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": "Pretrained tokenizer name or path if not the same as model_name. "
+ "By default we use BART-large tokenizer for TAPEX-large."
+ },
+ )
+ cache_dir: Optional[str] = field(
+ default=None,
+ metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
+ )
+ use_fast_tokenizer: bool = field(
+ default=True,
+ metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+ )
+ model_revision: str = field(
+ default="main",
+ metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+ )
+ use_auth_token: bool = field(
+ default=False,
+ metadata={
+ "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
+ "with private models)."
+ },
+ )
+
+
+@dataclass
+class DataTrainingArguments:
+ """
+ Arguments pertaining to what data we are going to input our model for training and eval.
+ """
+
+ dataset_name: Optional[str] = field(
+ default="wikisql", metadata={"help": "The name of the dataset to use (via the datasets library)."}
+ )
+ dataset_config_name: Optional[str] = field(
+ default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+ )
+ train_file: Optional[str] = field(
+ default=None, metadata={"help": "The input training data file (a jsonlines or csv file)."}
+ )
+ validation_file: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": "An optional input evaluation data file to evaluate the metrics (rouge) on "
+ "(a jsonlines or csv file)."
+ },
+ )
+ test_file: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": "An optional input test data file to evaluate the metrics (rouge) on " "(a jsonlines or csv file)."
+ },
+ )
+ overwrite_cache: bool = field(
+ default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+ )
+ preprocessing_num_workers: Optional[int] = field(
+ default=None,
+ metadata={"help": "The number of processes to use for the preprocessing."},
+ )
+ max_source_length: Optional[int] = field(
+ default=1024,
+ metadata={
+ "help": "The maximum total input sequence length after tokenization. Sequences longer "
+ "than this will be truncated, sequences shorter will be padded."
+ },
+ )
+ max_target_length: Optional[int] = field(
+ default=128,
+ metadata={
+ "help": "The maximum total sequence length for target text after tokenization. Sequences longer "
+ "than this will be truncated, sequences shorter will be padded."
+ },
+ )
+ val_max_target_length: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": "The maximum total sequence length for validation target text after tokenization. Sequences longer "
+ "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
+ "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
+ "during ``evaluate`` and ``predict``."
+ },
+ )
+ pad_to_max_length: bool = field(
+ default=False,
+ metadata={
+ "help": "Whether to pad all samples to model maximum sentence length. "
+ "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
+ "efficient on GPU but very bad for TPU."
+ },
+ )
+ max_train_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+ "value if set."
+ },
+ )
+ max_eval_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+ "value if set."
+ },
+ )
+ max_predict_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
+ "value if set."
+ },
+ )
+ num_beams: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
+ "which is used during ``evaluate`` and ``predict``."
+ },
+ )
+ ignore_pad_token_for_loss: bool = field(
+ default=True,
+ metadata={
+ "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
+ },
+ )
+
+ def __post_init__(self):
+ if self.dataset_name is None and self.train_file is None and self.validation_file is None:
+ raise ValueError("Need either a dataset name or a training/validation file.")
+ else:
+ if self.train_file is not None:
+ extension = self.train_file.split(".")[-1]
+ assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
+ if self.validation_file is not None:
+ extension = self.validation_file.split(".")[-1]
+ assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
+ if self.val_max_target_length is None:
+ self.val_max_target_length = self.max_target_length
+
+
+def main():
+ # See all possible arguments in src/transformers/training_args.py
+ # or by passing the --help flag to this script.
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+ # If we pass only one argument to the script and it's the path to a json file,
+ # let's parse it to get our arguments.
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+ else:
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+ # Detecting last checkpoint.
+ last_checkpoint = None
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+ raise ValueError(
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+ "Use --overwrite_output_dir to overcome."
+ )
+ elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+ logger.info(
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+ )
+
+ # Setup logging
+ logging.basicConfig(
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+ datefmt="%m/%d/%Y %H:%M:%S",
+ handlers=[logging.StreamHandler(sys.stdout)],
+ )
+ logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
+
+ # Log on each process the small summary:
+ logger.warning(
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+        + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+ )
+ # Set the verbosity to info of the Transformers logger (on main process only):
+ if is_main_process(training_args.local_rank):
+ transformers.utils.logging.set_verbosity_info()
+ logger.info(f"Training/evaluation parameters {training_args}")
+
+ # Set seed before initializing model.
+ set_seed(training_args.seed)
+
+ # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
+ # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+ # (the dataset will be downloaded automatically from the datasets Hub).
+ #
+ # For JSON files, this script will use the `question` column for the input question and `table` column for the corresponding table.
+ #
+ # In distributed training, the load_dataset function guarantee that only one local process can concurrently
+ # download the dataset.
+ if data_args.dataset_name is not None:
+ # Downloading and loading a dataset from the hub.
+ datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+ else:
+ data_files = {}
+ if data_args.train_file is not None:
+ data_files["train"] = data_args.train_file
+ extension = data_args.train_file.split(".")[-1]
+ if data_args.validation_file is not None:
+ data_files["validation"] = data_args.validation_file
+ extension = data_args.validation_file.split(".")[-1]
+ if data_args.test_file is not None:
+ data_files["test"] = data_args.test_file
+ extension = data_args.test_file.split(".")[-1]
+ datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+
+ # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+ # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+ # Load pretrained model and tokenizer
+ #
+ # Distributed training:
+ # The .from_pretrained methods guarantee that only one local process can concurrently
+ # download model & vocab.
+
+ config = AutoConfig.from_pretrained(
+ model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+ cache_dir=model_args.cache_dir,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+
+ # IMPORTANT: the initial BART model's decoding is penalized by no_repeat_ngram_size, and thus
+ # we should disable it here to avoid problematic generation
+ config.no_repeat_ngram_size = 0
+ config.max_length = 1024
+ config.early_stopping = False
+
+ # load tapex tokenizer
+ tokenizer = TapexTokenizer.from_pretrained(
+ model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+ cache_dir=model_args.cache_dir,
+ use_fast=model_args.use_fast_tokenizer,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ add_prefix_space=True,
+ )
+
+ # load Bart based Tapex model (default tapex-large)
+ model = BartForConditionalGeneration.from_pretrained(
+ model_args.model_name_or_path,
+ from_tf=bool(".ckpt" in model_args.model_name_or_path),
+ config=config,
+ cache_dir=model_args.cache_dir,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+
+ if model.config.decoder_start_token_id is None:
+ raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
+
+ # Preprocessing the datasets.
+ # We need to tokenize inputs and targets.
+ if training_args.do_train:
+ column_names = datasets["train"].column_names
+ elif training_args.do_eval:
+ column_names = datasets["validation"].column_names
+ elif training_args.do_predict:
+ column_names = datasets["test"].column_names
+ else:
+ logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
+ return
+
+ # Temporarily set max_target_length for training.
+ max_target_length = data_args.max_target_length
+ padding = "max_length" if data_args.pad_to_max_length else False
+
+ if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"):
+ logger.warning(
+ "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for"
+ f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory"
+ )
+
+ def preprocess_tableqa_function(examples, is_training=False):
+ """
+ The is_training FLAG is used to identify if we could use the supervision
+ to truncate the table content if it is required.
+ """
+
+ # this function is specific for WikiSQL since the util function need the data structure
+ # to retrieve the WikiSQL answer for each question
+ def _convert_table_types(_table):
+ """Runs the type converter over the table cells."""
+ ret_table = deepcopy(_table)
+ types = ret_table["types"]
+ ret_table["real_rows"] = ret_table["rows"]
+ typed_rows = []
+ for row in ret_table["rows"]:
+ typed_row = []
+ for column, cell_value in enumerate(row):
+ typed_row.append(_TYPE_CONVERTER[types[column]](cell_value))
+ typed_rows.append(typed_row)
+ ret_table["rows"] = typed_rows
+ return ret_table
+
+ questions = [question.lower() for question in examples["question"]]
+ example_tables = examples["table"]
+ example_sqls = examples["sql"]
+ tables = [
+ pd.DataFrame.from_records(example_table["rows"], columns=example_table["header"])
+ for example_table in example_tables
+ ]
+
+ # using tapas utils to obtain wikisql answer
+ answers = []
+ for example_sql, example_table in zip(example_sqls, example_tables):
+ tapas_table = _convert_table_types(example_table)
+ answer_list: List[str] = retrieve_wikisql_query_answer_tapas(tapas_table, example_sql)
+ # you can choose other delimiters to split each answer
+ answers.append(answer_list)
+
+ # IMPORTANT: we cannot pass by answers during evaluation, answers passed during training are used to
+ # truncate large tables in the train set!
+ if is_training:
+ model_inputs = tokenizer(
+ table=tables,
+ query=questions,
+ answer=answers,
+ max_length=data_args.max_source_length,
+ padding=padding,
+ truncation=True,
+ )
+ else:
+ model_inputs = tokenizer(
+ table=tables, query=questions, max_length=data_args.max_source_length, padding=padding, truncation=True
+ )
+
+ with tokenizer.as_target_tokenizer():
+ labels = tokenizer(
+ answer=[", ".join(answer) for answer in answers],
+ max_length=max_target_length,
+ padding=padding,
+ truncation=True,
+ )
+
+ # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
+ # padding in the loss.
+ if padding == "max_length" and data_args.ignore_pad_token_for_loss:
+ labels["input_ids"] = [
+ [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
+ ]
+
+ model_inputs["labels"] = labels["input_ids"]
+
+ return model_inputs
+
+ # in training, we can use the answer as extra information to truncate large tables
+ preprocess_tableqa_function_training = partial(preprocess_tableqa_function, is_training=True)
+
+ if training_args.do_train:
+ if "train" not in datasets:
+ raise ValueError("--do_train requires a train dataset")
+ train_dataset = datasets["train"]
+ if data_args.max_train_samples is not None:
+ train_dataset = train_dataset.select(range(data_args.max_train_samples))
+ train_dataset = train_dataset.map(
+ preprocess_tableqa_function_training,
+ batched=True,
+ num_proc=data_args.preprocessing_num_workers,
+ remove_columns=column_names,
+ load_from_cache_file=not data_args.overwrite_cache,
+ )
+
+ if training_args.do_eval:
+ max_target_length = data_args.val_max_target_length
+ if "validation" not in datasets:
+ raise ValueError("--do_eval requires a validation dataset")
+ eval_dataset = datasets["validation"]
+ if data_args.max_eval_samples is not None:
+ eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+ eval_dataset = eval_dataset.map(
+ preprocess_tableqa_function,
+ batched=True,
+ num_proc=data_args.preprocessing_num_workers,
+ remove_columns=column_names,
+ load_from_cache_file=not data_args.overwrite_cache,
+ )
+
+ if training_args.do_predict:
+ max_target_length = data_args.val_max_target_length
+ if "test" not in datasets:
+ raise ValueError("--do_predict requires a test dataset")
+ predict_dataset = datasets["test"]
+ if data_args.max_predict_samples is not None:
+ predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+ predict_dataset = predict_dataset.map(
+ preprocess_tableqa_function,
+ batched=True,
+ num_proc=data_args.preprocessing_num_workers,
+ remove_columns=column_names,
+ load_from_cache_file=not data_args.overwrite_cache,
+ )
+
+ # Data collator
+ label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
+ data_collator = DataCollatorForSeq2Seq(
+ tokenizer,
+ model=model,
+ label_pad_token_id=label_pad_token_id,
+ pad_to_multiple_of=8 if training_args.fp16 else None,
+ )
+
+ def postprocess_text(preds, labels):
+ preds = [pred.strip() for pred in preds]
+ labels = [label.strip() for label in labels]
+
+ return preds, labels
+
+ def compute_metrics(eval_preds):
+ preds, labels = eval_preds
+ if isinstance(preds, tuple):
+ preds = preds[0]
+ decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
+ if data_args.ignore_pad_token_for_loss:
+ # Replace -100 in the labels as we can't decode them.
+ labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+ decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+ # Some simple post-processing
+ decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
+
+ delimiter = ", "
+
+ # define example evaluation
+ def evaluate_example(predict_str: str, ground_str: str):
+ predict_spans = predict_str.split(delimiter)
+ ground_spans = ground_str.split(delimiter)
+ predict_values = defaultdict(lambda: 0)
+ ground_values = defaultdict(lambda: 0)
+ for span in predict_spans:
+ try:
+ predict_values[float(span)] += 1
+ except ValueError:
+ predict_values[span.strip()] += 1
+ for span in ground_spans:
+ try:
+ ground_values[float(span)] += 1
+ except ValueError:
+ ground_values[span.strip()] += 1
+ is_correct = predict_values == ground_values
+ return is_correct
+
+ def get_denotation_accuracy(predictions: List[str], references: List[str]):
+ assert len(predictions) == len(references)
+ correct_num = 0
+ for predict_str, ground_str in zip(predictions, references):
+ is_correct = evaluate_example(predict_str.lower(), ground_str.lower())
+ if is_correct:
+ correct_num += 1
+ return correct_num / len(predictions)
+
+ accuracy = get_denotation_accuracy(decoded_preds, decoded_labels)
+ result = {"denotation_accuracy": accuracy}
+
+ return result
+
+ # Initialize our Trainer
+ trainer = Seq2SeqTrainer(
+ model=model,
+ args=training_args,
+ train_dataset=train_dataset if training_args.do_train else None,
+ eval_dataset=eval_dataset if training_args.do_eval else None,
+ tokenizer=tokenizer,
+ data_collator=data_collator,
+ compute_metrics=compute_metrics if training_args.predict_with_generate else None,
+ )
+
+ if training_args.do_train:
+ checkpoint = None
+ if training_args.resume_from_checkpoint is not None:
+ checkpoint = training_args.resume_from_checkpoint
+ elif last_checkpoint is not None:
+ checkpoint = last_checkpoint
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
+ trainer.save_model() # Saves the tokenizer too for easy upload
+
+ metrics = train_result.metrics
+ max_train_samples = (
+ data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
+ )
+ metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+
+ trainer.log_metrics("train", metrics)
+ trainer.save_metrics("train", metrics)
+ trainer.save_state()
+
+ # Evaluation
+ results = {}
+ if training_args.do_eval:
+ logger.info("*** Evaluate ***")
+
+ metrics = trainer.evaluate(
+ max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, metric_key_prefix="eval"
+ )
+ max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+ metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
+
+ trainer.log_metrics("eval", metrics)
+ trainer.save_metrics("eval", metrics)
+
+ if training_args.do_predict:
+ logger.info("*** Predict ***")
+
+ predict_results = trainer.predict(
+ predict_dataset,
+ metric_key_prefix="predict",
+ max_length=data_args.val_max_target_length,
+ num_beams=data_args.num_beams,
+ )
+ metrics = predict_results.metrics
+ max_predict_samples = (
+ data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
+ )
+ metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))
+
+ trainer.log_metrics("predict", metrics)
+ trainer.save_metrics("predict", metrics)
+
+ if trainer.is_world_process_zero():
+ if training_args.predict_with_generate:
+ predictions = tokenizer.batch_decode(
+ predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
+ )
+ predictions = [pred.strip() for pred in predictions]
+ output_prediction_file = os.path.join(training_args.output_dir, "tapex_predictions.txt")
+ with open(output_prediction_file, "w") as writer:
+ writer.write("\n".join(predictions))
+
+ return results
+
+
+def _mp_fn(index):
+ # For xla_spawn (TPUs)
+ main()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py b/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py
new file mode 100644
index 000000000000..4398309566a8
--- /dev/null
+++ b/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py
@@ -0,0 +1,605 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2022 The Microsoft and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Fine-tuning the library models for tapex on table-based question answering tasks.
+Adapted from script: https://github.com/huggingface/transformers/blob/master/examples/pytorch/summarization/run_summarization.py
+"""
+
+import logging
+import os
+import sys
+from collections import defaultdict
+from dataclasses import dataclass, field
+from functools import partial
+from typing import List, Optional
+
+import nltk # Here to have a nice missing dependency error message early on
+import numpy as np
+import pandas as pd
+from datasets import load_dataset
+
+import transformers
+from filelock import FileLock
+from transformers import (
+ AutoConfig,
+ BartForConditionalGeneration,
+ DataCollatorForSeq2Seq,
+ HfArgumentParser,
+ Seq2SeqTrainer,
+ Seq2SeqTrainingArguments,
+ TapexTokenizer,
+ set_seed,
+)
+from transformers.file_utils import is_offline_mode
+from transformers.trainer_utils import get_last_checkpoint, is_main_process
+from transformers.utils import check_min_version
+
+
+# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+check_min_version("4.17.0.dev0")
+
+logger = logging.getLogger(__name__)
+
+try:
+ nltk.data.find("tokenizers/punkt")
+except (LookupError, OSError):
+ if is_offline_mode():
+ raise LookupError(
+ "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files"
+ )
+ with FileLock(".lock") as lock:
+ nltk.download("punkt", quiet=True)
+
+
+@dataclass
+class ModelArguments:
+ """
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+ """
+
+ model_name_or_path: str = field(
+ metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"},
+ )
+ config_name: Optional[str] = field(
+ default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+ )
+ tokenizer_name: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": "Pretrained tokenizer name or path if not the same as model_name. "
+ "By default we use BART-large tokenizer for TAPEX-large."
+ },
+ )
+ cache_dir: Optional[str] = field(
+ default=None,
+ metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
+ )
+ use_fast_tokenizer: bool = field(
+ default=True,
+ metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+ )
+ model_revision: str = field(
+ default="main",
+ metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+ )
+ use_auth_token: bool = field(
+ default=False,
+ metadata={
+ "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
+ "with private models)."
+ },
+ )
+
+
+@dataclass
+class DataTrainingArguments:
+ """
+ Arguments pertaining to what data we are going to input our model for training and eval.
+ """
+
+ dataset_name: Optional[str] = field(
+ default="wikitablequestions", metadata={"help": "The name of the dataset to use (via the datasets library)."}
+ )
+ dataset_config_name: Optional[str] = field(
+ default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+ )
+ train_file: Optional[str] = field(
+ default=None, metadata={"help": "The input training data file (a jsonlines or csv file)."}
+ )
+ validation_file: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": "An optional input evaluation data file to evaluate the metrics (rouge) on "
+ "(a jsonlines or csv file)."
+ },
+ )
+ test_file: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": "An optional input test data file to evaluate the metrics (rouge) on " "(a jsonlines or csv file)."
+ },
+ )
+ overwrite_cache: bool = field(
+ default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+ )
+ preprocessing_num_workers: Optional[int] = field(
+ default=None,
+ metadata={"help": "The number of processes to use for the preprocessing."},
+ )
+ max_source_length: Optional[int] = field(
+ default=1024,
+ metadata={
+ "help": "The maximum total input sequence length after tokenization. Sequences longer "
+ "than this will be truncated, sequences shorter will be padded."
+ },
+ )
+ max_target_length: Optional[int] = field(
+ default=128,
+ metadata={
+ "help": "The maximum total sequence length for target text after tokenization. Sequences longer "
+ "than this will be truncated, sequences shorter will be padded."
+ },
+ )
+ val_max_target_length: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": "The maximum total sequence length for validation target text after tokenization. Sequences longer "
+ "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
+ "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
+ "during ``evaluate`` and ``predict``."
+ },
+ )
+ pad_to_max_length: bool = field(
+ default=False,
+ metadata={
+ "help": "Whether to pad all samples to model maximum sentence length. "
+ "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
+ "efficient on GPU but very bad for TPU."
+ },
+ )
+ max_train_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+ "value if set."
+ },
+ )
+ max_eval_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+ "value if set."
+ },
+ )
+ max_predict_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
+ "value if set."
+ },
+ )
+ num_beams: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
+ "which is used during ``evaluate`` and ``predict``."
+ },
+ )
+ ignore_pad_token_for_loss: bool = field(
+ default=True,
+ metadata={
+ "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
+ },
+ )
+
+ def __post_init__(self):
+ if self.dataset_name is None and self.train_file is None and self.validation_file is None:
+ raise ValueError("Need either a dataset name or a training/validation file.")
+ else:
+ if self.train_file is not None:
+ extension = self.train_file.split(".")[-1]
+ assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
+ if self.validation_file is not None:
+ extension = self.validation_file.split(".")[-1]
+ assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
+ if self.val_max_target_length is None:
+ self.val_max_target_length = self.max_target_length
+
+
+def main():
+ # See all possible arguments in src/transformers/training_args.py
+ # or by passing the --help flag to this script.
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+ # If we pass only one argument to the script and it's the path to a json file,
+ # let's parse it to get our arguments.
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+ else:
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+ # Detecting last checkpoint.
+ last_checkpoint = None
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+ raise ValueError(
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+ "Use --overwrite_output_dir to overcome."
+ )
+ elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+ logger.info(
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+ )
+
+ # Setup logging
+ logging.basicConfig(
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+ datefmt="%m/%d/%Y %H:%M:%S",
+ handlers=[logging.StreamHandler(sys.stdout)],
+ )
+ logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
+
+ # Log on each process the small summary:
+ logger.warning(
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+ )
+ # Set the verbosity to info of the Transformers logger (on main process only):
+ if is_main_process(training_args.local_rank):
+ transformers.utils.logging.set_verbosity_info()
+ logger.info(f"Training/evaluation parameters {training_args}")
+
+ # Set seed before initializing model.
+ set_seed(training_args.seed)
+
+ # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
+ # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+ # (the dataset will be downloaded automatically from the datasets Hub).
+ #
+ # For JSON files, this script will use the `question` column for the input question and `table` column for the corresponding table.
+ #
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+ # download the dataset.
+ if data_args.dataset_name is not None:
+ # Downloading and loading a dataset from the hub.
+ datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+ else:
+ data_files = {}
+ if data_args.train_file is not None:
+ data_files["train"] = data_args.train_file
+ extension = data_args.train_file.split(".")[-1]
+ if data_args.validation_file is not None:
+ data_files["validation"] = data_args.validation_file
+ extension = data_args.validation_file.split(".")[-1]
+ if data_args.test_file is not None:
+ data_files["test"] = data_args.test_file
+ extension = data_args.test_file.split(".")[-1]
+ datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+
+ # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+ # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+ # Load pretrained model and tokenizer
+ #
+ # Distributed training:
+ # The .from_pretrained methods guarantee that only one local process can concurrently
+ # download model & vocab.
+
+ config = AutoConfig.from_pretrained(
+ model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+ cache_dir=model_args.cache_dir,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+
+ # IMPORTANT: the initial BART model's decoding is penalized by no_repeat_ngram_size, and thus
+ # we should disable it here to avoid problematic generation
+ config.no_repeat_ngram_size = 0
+ config.max_length = 1024
+ config.early_stopping = False
+
+ # load tapex tokenizer
+ tokenizer = TapexTokenizer.from_pretrained(
+ model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+ cache_dir=model_args.cache_dir,
+ use_fast=model_args.use_fast_tokenizer,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ add_prefix_space=True,
+ )
+
+ # load Bart based Tapex model (default tapex-large)
+ model = BartForConditionalGeneration.from_pretrained(
+ model_args.model_name_or_path,
+ from_tf=bool(".ckpt" in model_args.model_name_or_path),
+ config=config,
+ cache_dir=model_args.cache_dir,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+
+ if model.config.decoder_start_token_id is None:
+ raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
+
+ # Preprocessing the datasets.
+ # We need to tokenize inputs and targets.
+ if training_args.do_train:
+ column_names = datasets["train"].column_names
+ elif training_args.do_eval:
+ column_names = datasets["validation"].column_names
+ elif training_args.do_predict:
+ column_names = datasets["test"].column_names
+ else:
+ logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
+ return
+
+ # Temporarily set max_target_length for training.
+ max_target_length = data_args.max_target_length
+ padding = "max_length" if data_args.pad_to_max_length else False
+
+ if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"):
+ logger.warning(
+ "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for"
+ f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory"
+ )
+
+ def preprocess_tableqa_function(examples, is_training=False):
+ """
+        The `is_training` flag indicates whether answer supervision is available, in which case it can be
+        used to truncate overly large table content when required.
+ """
+
+ questions = [question.lower() for question in examples["question"]]
+ example_tables = examples["table"]
+ tables = [
+ pd.DataFrame.from_records(example_table["rows"], columns=example_table["header"])
+ for example_table in example_tables
+ ]
+
+ # using wikitablequestion's answer set
+ answers = examples["answers"]
+
+        # IMPORTANT: we cannot pass the answers during evaluation; answers passed during training are only used
+        # to truncate large tables in the train set!
+ if is_training:
+ model_inputs = tokenizer(
+ table=tables,
+ query=questions,
+ answer=answers,
+ max_length=data_args.max_source_length,
+ padding=padding,
+ truncation=True,
+ )
+ else:
+ model_inputs = tokenizer(
+ table=tables, query=questions, max_length=data_args.max_source_length, padding=padding, truncation=True
+ )
+
+ with tokenizer.as_target_tokenizer():
+ labels = tokenizer(
+ answer=[", ".join(answer) for answer in answers],
+ max_length=max_target_length,
+ padding=padding,
+ truncation=True,
+ )
+
+ # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
+ # padding in the loss.
+ if padding == "max_length" and data_args.ignore_pad_token_for_loss:
+ labels["input_ids"] = [
+ [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
+ ]
+
+ model_inputs["labels"] = labels["input_ids"]
+
+ return model_inputs
+
+ # in training, we can use the answer as extra information to truncate large tables
+ preprocess_tableqa_function_training = partial(preprocess_tableqa_function, is_training=True)
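+    # For intuition, a toy call (hypothetical data, not executed) to the tokenizer used above:
+    #   toy_table = pd.DataFrame.from_records([["beijing", "1989"]], columns=["city", "year"])
+    #   tokenizer(table=[toy_table], query=["which city is listed?"], answer=[["beijing"]], truncation=True)
+    # i.e. the tokenizer linearizes the table together with the lowercased question (and, during
+    # training only, the answers used to truncate large tables).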
+
+ if training_args.do_train:
+ if "train" not in datasets:
+ raise ValueError("--do_train requires a train dataset")
+ train_dataset = datasets["train"]
+ if data_args.max_train_samples is not None:
+ train_dataset = train_dataset.select(range(data_args.max_train_samples))
+ train_dataset = train_dataset.map(
+ preprocess_tableqa_function_training,
+ batched=True,
+ num_proc=data_args.preprocessing_num_workers,
+ remove_columns=column_names,
+ load_from_cache_file=not data_args.overwrite_cache,
+ )
+
+ if training_args.do_eval:
+ max_target_length = data_args.val_max_target_length
+ if "validation" not in datasets:
+ raise ValueError("--do_eval requires a validation dataset")
+ eval_dataset = datasets["validation"]
+ if data_args.max_eval_samples is not None:
+ eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+ eval_dataset = eval_dataset.map(
+ preprocess_tableqa_function,
+ batched=True,
+ num_proc=data_args.preprocessing_num_workers,
+ remove_columns=column_names,
+ load_from_cache_file=not data_args.overwrite_cache,
+ )
+
+ if training_args.do_predict:
+ max_target_length = data_args.val_max_target_length
+ if "test" not in datasets:
+ raise ValueError("--do_predict requires a test dataset")
+ predict_dataset = datasets["test"]
+ if data_args.max_predict_samples is not None:
+ predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+ predict_dataset = predict_dataset.map(
+ preprocess_tableqa_function,
+ batched=True,
+ num_proc=data_args.preprocessing_num_workers,
+ remove_columns=column_names,
+ load_from_cache_file=not data_args.overwrite_cache,
+ )
+
+ # Data collator
+ label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
+ data_collator = DataCollatorForSeq2Seq(
+ tokenizer,
+ model=model,
+ label_pad_token_id=label_pad_token_id,
+ pad_to_multiple_of=8 if training_args.fp16 else None,
+ )
+
+ def postprocess_text(preds, labels):
+ preds = [pred.strip() for pred in preds]
+ labels = [label.strip() for label in labels]
+
+ return preds, labels
+
+ def compute_metrics(eval_preds):
+ preds, labels = eval_preds
+ if isinstance(preds, tuple):
+ preds = preds[0]
+ decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
+ if data_args.ignore_pad_token_for_loss:
+ # Replace -100 in the labels as we can't decode them.
+ labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+ decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+ # Some simple post-processing
+ decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
+
+ delimiter = ", "
+
+ # define example evaluation
+ def evaluate_example(predict_str: str, ground_str: str):
+ predict_spans = predict_str.split(delimiter)
+ ground_spans = ground_str.split(delimiter)
+ predict_values = defaultdict(lambda: 0)
+ ground_values = defaultdict(lambda: 0)
+ for span in predict_spans:
+ try:
+ predict_values[float(span)] += 1
+ except ValueError:
+ predict_values[span.strip()] += 1
+ for span in ground_spans:
+ try:
+ ground_values[float(span)] += 1
+ except ValueError:
+ ground_values[span.strip()] += 1
+ _is_correct = predict_values == ground_values
+ return _is_correct
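+        # For intuition (toy values): spans are compared as multisets and numeric spans are
+        # normalized through float(), so for instance
+        #   evaluate_example("2, alice", "alice, 2.0") -> True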
+
+ def get_denotation_accuracy(predictions: List[str], references: List[str]):
+ assert len(predictions) == len(references)
+ correct_num = 0
+ for predict_str, ground_str in zip(predictions, references):
+ is_correct = evaluate_example(predict_str.lower(), ground_str.lower())
+ if is_correct:
+ correct_num += 1
+ return correct_num / len(predictions)
+
+ accuracy = get_denotation_accuracy(decoded_preds, decoded_labels)
+ result = {"denotation_accuracy": accuracy}
+
+ return result
+
+ # Initialize our Trainer
+ trainer = Seq2SeqTrainer(
+ model=model,
+ args=training_args,
+ train_dataset=train_dataset if training_args.do_train else None,
+ eval_dataset=eval_dataset if training_args.do_eval else None,
+ tokenizer=tokenizer,
+ data_collator=data_collator,
+ compute_metrics=compute_metrics if training_args.predict_with_generate else None,
+ )
+
+ if training_args.do_train:
+ checkpoint = None
+ if training_args.resume_from_checkpoint is not None:
+ checkpoint = training_args.resume_from_checkpoint
+ elif last_checkpoint is not None:
+ checkpoint = last_checkpoint
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
+ trainer.save_model() # Saves the tokenizer too for easy upload
+
+ metrics = train_result.metrics
+ max_train_samples = (
+ data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
+ )
+ metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+
+ trainer.log_metrics("train", metrics)
+ trainer.save_metrics("train", metrics)
+ trainer.save_state()
+
+ # Evaluation
+ results = {}
+ if training_args.do_eval:
+ logger.info("*** Evaluate ***")
+
+ metrics = trainer.evaluate(
+ max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, metric_key_prefix="eval"
+ )
+ max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+ metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
+
+ trainer.log_metrics("eval", metrics)
+ trainer.save_metrics("eval", metrics)
+
+ if training_args.do_predict:
+ logger.info("*** Predict ***")
+
+ predict_results = trainer.predict(
+ predict_dataset,
+ metric_key_prefix="predict",
+ max_length=data_args.val_max_target_length,
+ num_beams=data_args.num_beams,
+ )
+ metrics = predict_results.metrics
+ max_predict_samples = (
+ data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
+ )
+ metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))
+
+ trainer.log_metrics("predict", metrics)
+ trainer.save_metrics("predict", metrics)
+
+ if trainer.is_world_process_zero():
+ if training_args.predict_with_generate:
+ predictions = tokenizer.batch_decode(
+ predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
+ )
+ predictions = [pred.strip() for pred in predictions]
+ output_prediction_file = os.path.join(training_args.output_dir, "tapex_predictions.txt")
+ with open(output_prediction_file, "w") as writer:
+ writer.write("\n".join(predictions))
+
+ return results
+
+
+def _mp_fn(index):
+ # For xla_spawn (TPUs)
+ main()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/research_projects/tapex/wikisql_utils.py b/examples/research_projects/tapex/wikisql_utils.py
new file mode 100644
index 000000000000..9147fdc882e4
--- /dev/null
+++ b/examples/research_projects/tapex/wikisql_utils.py
@@ -0,0 +1,259 @@
+# coding=utf-8
+# Copyright 2022 The Microsoft, The Google and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import dataclasses
+import enum
+import functools
+import math
+import re
+
+# The following script is adapted from the TaPas script.
+# Original: https://github.com/google-research/tapas/master/wikisql_utils.py
+from typing import Any, List, Text
+
+import six
+
+
+EMPTY_ANSWER = "none"
+EMPTY_ANSWER_AGG = "none"
+
+
+def _split_thousands(delimiter, value):
+ split = value.split(delimiter)
+ return len(split) > 1 and any(map(lambda x: len(x) == 3, split))
+
+
+def convert_to_float(value):
+ """Converts value to a float using a series of increasingly complex heuristics.
+ Args:
+ value: object that needs to be converted. Allowed types include
+ float/int/strings.
+ Returns:
+ A float interpretation of value.
+ Raises:
+ ValueError if the float conversion of value fails.
+ """
+ if isinstance(value, float):
+ return value
+ if isinstance(value, int):
+ return float(value)
+ if not isinstance(value, six.string_types):
+ raise ValueError("Argument value is not a string. Can't parse it as float")
+ sanitized = value
+
+ try:
+ # Example: 1,000.7
+ if "." in sanitized and "," in sanitized:
+ return float(sanitized.replace(",", ""))
+ # 1,000
+ if "," in sanitized and _split_thousands(",", sanitized):
+ return float(sanitized.replace(",", ""))
+ # 5,5556
+ if "," in sanitized and sanitized.count(",") == 1 and not _split_thousands(",", sanitized):
+ return float(sanitized.replace(",", "."))
+ # 0.0.0.1
+ if sanitized.count(".") > 1:
+ return float(sanitized.replace(".", ""))
+ # 0,0,0,1
+ if sanitized.count(",") > 1:
+ return float(sanitized.replace(",", ""))
+ return float(sanitized)
+ except ValueError:
+ # Avoid adding the sanitized value in the error message.
+ raise ValueError("Unable to convert value to float")
+
+
+def _normalize_float(answer):
+ if answer is None:
+ return None
+ try:
+ value = convert_to_float(answer)
+ if isinstance(value, float) and math.isnan(value):
+ return None
+ return value
+ except ValueError:
+ return answer.lower()
+
+
+_TYPE_CONVERTER = {
+ "text": lambda x: x,
+ "real": convert_to_float,
+}
+
+
+class _Aggregation(enum.Enum):
+ """Aggregations as defined by WikiSQL. Indexes match the data."""
+
+ NONE = 0
+ MAX = 1
+ MIN = 2
+ COUNT = 3
+ SUM = 4
+ AVERAGE = 5
+
+
+class _Operator(enum.Enum):
+ """The boolean operators used by WikiSQL. Indexes match the data."""
+
+ EQUALS = 0
+ GREATER = 1
+ LESSER = 2
+
+
+@dataclasses.dataclass
+class _Condition:
+ """Represents an SQL where clauses (e.g A = "a" or B > 5)."""
+
+ column: Text
+ operator: _Operator
+ cmp_value: Any
+
+
+_TOKENIZER = re.compile(r"\w+|[^\w\s]+", re.UNICODE | re.MULTILINE | re.DOTALL)
+
+
+def _normalize_for_match(x):
+ return [t for t in _TOKENIZER.findall(x.lower())]
+
+
+def _compare(operator, src, tgt):
+ if operator == _Operator.EQUALS:
+ return src == tgt
+ elif operator == _Operator.GREATER:
+ return src > tgt
+ elif operator == _Operator.LESSER:
+ return src < tgt
+ raise ValueError(f"Unknown operator: {operator}")
+
+
+def _parse_value(table, column, cell_value):
+ """Convert numeric values to floats and keeps everything else as string."""
+ types = table["types"]
+ return _TYPE_CONVERTER[types[column]](cell_value)
+
+
+def _is_string(x):
+ return isinstance(x, str)
+
+
+def _respect_conditions(table, row, conditions):
+ """True if 'row' satisfies all 'conditions'."""
+ for cond in conditions:
+ table_value = row[cond.column]
+
+ cmp_value = _parse_value(table, cond.column, cond.cmp_value)
+
+ if _is_string(table_value) and _is_string(cmp_value):
+ table_value = _normalize_for_match(table_value)
+ cmp_value = _normalize_for_match(cmp_value)
+
+ if not isinstance(table_value, type(cmp_value)):
+ raise ValueError("Type difference {} != {}".format(type(table_value), type(cmp_value)))
+
+ if not _compare(cond.operator, table_value, cmp_value):
+ return False
+ return True
+
+
+def _get_float_answer(table, answer_coordinates, aggregation_op):
+ """Applies operation to produce reference float answer."""
+ if not answer_coordinates:
+ if aggregation_op == _Aggregation.COUNT:
+ return 0.0
+ else:
+ return EMPTY_ANSWER_AGG
+
+ # Count can support non numeric answers.
+ if aggregation_op == _Aggregation.COUNT:
+ return float(len(answer_coordinates))
+
+    # If we have just one answer, try to convert it to a float and return it.
+ values = [table["rows"][i][j] for (i, j) in answer_coordinates]
+ if len(answer_coordinates) == 1:
+ try:
+ return convert_to_float(values[0])
+ except ValueError as e:
+ if aggregation_op != _Aggregation.NONE:
+ raise e
+
+ if aggregation_op == _Aggregation.NONE:
+ return None
+
+    # Other aggregations only support numeric values. Bail out if we have strings.
+ if not all((isinstance(v, (int, float)) for v in values)):
+ return None
+
+ if aggregation_op == _Aggregation.SUM:
+ return float(sum(values))
+ elif aggregation_op == _Aggregation.AVERAGE:
+ return sum(values) / len(answer_coordinates)
+ else:
+ raise ValueError(f"Unknown aggregation: {aggregation_op}")
+
+
+def _get_answer_coordinates(table, sql_query):
+ """Retrieves references coordinates by executing SQL."""
+ # MAX and MIN are automatically supported by the model.
+ aggregation_op_index = sql_query["agg"]
+ if aggregation_op_index >= 3:
+ aggregation_op = _Aggregation(aggregation_op_index)
+ else:
+ aggregation_op = _Aggregation.NONE
+
+ target_column = sql_query["sel"]
+ conditions = [
+ _Condition(column, _Operator(operator), cmp_value)
+ for column, operator, cmp_value in zip(
+ sql_query["conds"]["column_index"], sql_query["conds"]["operator_index"], sql_query["conds"]["condition"]
+ )
+ ]
+
+ indices = []
+ for row in range(len(table["rows"])):
+ if _respect_conditions(table, table["rows"][row], conditions):
+ indices.append((row, target_column))
+
+ if not indices:
+ return [], aggregation_op
+
+ if len(indices) == 1:
+ return indices, aggregation_op
+
+ # Parsing of MIN/MAX.
+ if aggregation_op_index in (1, 2):
+ operators = {2: min, 1: max}
+ values = [(table["rows"][i][j], index) for index, (i, j) in enumerate(indices)]
+ reduced = functools.reduce(operators[sql_query["agg"]], values)
+
+ ret = [indices[reduced[1]]]
+ return ret, _Aggregation.NONE
+
+ return indices, aggregation_op
+
+
+def _get_answer_text(table, answer_coordinates, float_answer):
+ if float_answer is not None:
+ return [str(float_answer)]
+ return [str(table["real_rows"][r][c]) for r, c in answer_coordinates]
+
+
+def retrieve_wikisql_query_answer_tapas(table, example) -> List:
+ answer_coordinates, aggregation_op = _get_answer_coordinates(table, example)
+ float_answer = _get_float_answer(table, answer_coordinates, aggregation_op)
+ answer_text = _get_answer_text(table, answer_coordinates, float_answer)
+    # keep the original data consistent with TaPas
+ if len(answer_text) == 0:
+ answer_text = [EMPTY_ANSWER]
+ return answer_text
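+
+
+# Rough usage sketch (toy, hypothetical values; the key names mirror what the helpers above read):
+#
+#   table = {
+#       "header": ["Player", "Score"],
+#       "types": ["text", "real"],
+#       "rows": [["alice", 10.0], ["bob", 12.0]],
+#       "real_rows": [["Alice", "10"], ["Bob", "12"]],
+#   }
+#   example = {
+#       "sel": 0,  # index of the selected column
+#       "agg": 0,  # aggregation index, see _Aggregation
+#       "conds": {"column_index": [1], "operator_index": [1], "condition": ["10"]},
+#   }
+#   retrieve_wikisql_query_answer_tapas(table, example)  # -> ["Bob"]  (rows where Score > 10)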
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 650009149116..dbdef5824d6a 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -242,6 +242,7 @@
"models.rag": ["RagConfig", "RagRetriever", "RagTokenizer"],
"models.realm": ["REALM_PRETRAINED_CONFIG_ARCHIVE_MAP", "RealmConfig", "RealmTokenizer"],
"models.reformer": ["REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "ReformerConfig"],
+ "models.regnet": ["REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "RegNetConfig"],
"models.rembert": ["REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RemBertConfig"],
"models.resnet": ["RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "ResNetConfig"],
"models.retribert": ["RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RetriBertConfig", "RetriBertTokenizer"],
@@ -266,6 +267,7 @@
"models.swin": ["SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP", "SwinConfig"],
"models.t5": ["T5_PRETRAINED_CONFIG_ARCHIVE_MAP", "T5Config"],
"models.tapas": ["TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP", "TapasConfig", "TapasTokenizer"],
+ "models.tapex": ["TapexTokenizer"],
"models.transfo_xl": [
"TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP",
"TransfoXLConfig",
@@ -1342,6 +1344,14 @@
"ReformerPreTrainedModel",
]
)
+ _import_structure["models.regnet"].extend(
+ [
+ "REGNET_PRETRAINED_MODEL_ARCHIVE_LIST",
+ "RegNetForImageClassification",
+ "RegNetModel",
+ "RegNetPreTrainedModel",
+ ]
+ )
_import_structure["models.rembert"].extend(
[
"REMBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -2612,6 +2622,7 @@
from .models.rag import RagConfig, RagRetriever, RagTokenizer
from .models.realm import REALM_PRETRAINED_CONFIG_ARCHIVE_MAP, RealmConfig, RealmTokenizer
from .models.reformer import REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ReformerConfig
+ from .models.regnet import REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP, RegNetConfig
from .models.rembert import REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RemBertConfig
from .models.resnet import RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ResNetConfig
from .models.retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig, RetriBertTokenizer
@@ -2633,6 +2644,7 @@
from .models.swin import SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP, SwinConfig
from .models.t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
from .models.tapas import TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP, TapasConfig, TapasTokenizer
+ from .models.tapex import TapexTokenizer
from .models.transfo_xl import (
TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
TransfoXLConfig,
@@ -3537,6 +3549,12 @@
ReformerModelWithLMHead,
ReformerPreTrainedModel,
)
+ from .models.regnet import (
+ REGNET_PRETRAINED_MODEL_ARCHIVE_LIST,
+ RegNetForImageClassification,
+ RegNetModel,
+ RegNetPreTrainedModel,
+ )
from .models.rembert import (
REMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
RemBertForCausalLM,
diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py
index 1599cfab8a28..0c9276e948f5 100644
--- a/src/transformers/data/data_collator.py
+++ b/src/transformers/data/data_collator.py
@@ -220,12 +220,11 @@ class DataCollatorWithPadding:
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
- - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence
- is provided).
+ - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
+ sequence is provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
acceptable input length for the model if that argument is not provided.
- - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
- lengths).
+ - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
pad_to_multiple_of (`int`, *optional*):
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 746bcfb9d2a4..c81ef06ebed8 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -2162,13 +2162,14 @@ def find_submodule_and_param_name(model, long_key):
# materialize state_dict entries one by one on CPU
for k in loaded_state_dict_keys:
- submodule, param_name = find_submodule_and_param_name(model, k)
- if submodule is not None:
- param_dtype = getattr(submodule, param_name).dtype
- new_val = state_dict[k].to(param_dtype)
- if isinstance(getattr(submodule, param_name), torch.nn.Parameter):
- new_val = torch.nn.Parameter(new_val)
- setattr(submodule, param_name, new_val)
+ if k in state_dict:
+ submodule, param_name = find_submodule_and_param_name(model, k)
+ if submodule is not None:
+ param_dtype = getattr(submodule, param_name).dtype
+ new_val = state_dict[k].to(param_dtype)
+ if isinstance(getattr(submodule, param_name), torch.nn.Parameter):
+ new_val = torch.nn.Parameter(new_val)
+ setattr(submodule, param_name, new_val)
del state_dict
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 7045f18c5561..d21f2789b3d7 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -96,6 +96,7 @@
rag,
realm,
reformer,
+ regnet,
rembert,
resnet,
retribert,
@@ -112,6 +113,7 @@
swin,
t5,
tapas,
+ tapex,
transfo_xl,
trocr,
unispeech,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 84e23f106e5b..88235528d6a6 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -29,6 +29,7 @@
CONFIG_MAPPING_NAMES = OrderedDict(
[
# Add configs here
+ ("tapex", "BartConfig"),
("dpt", "DPTConfig"),
("decision_transformer", "DecisionTransformerConfig"),
("glpn", "GLPNConfig"),
@@ -38,6 +39,7 @@
("convnext", "ConvNextConfig"),
("van", "VanConfig"),
("resnet", "ResNetConfig"),
+ ("regnet", "RegNetConfig"),
("yoso", "YosoConfig"),
("swin", "SwinConfig"),
("vilt", "ViltConfig"),
@@ -134,7 +136,7 @@
CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict(
[
- # Add archive maps here
+    # Add archive maps here
("dpt", "DPT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("glpn", "GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("maskformer", "MASKFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -142,6 +144,7 @@
("convnext", "CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("van", "VAN_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("resnet", "RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP"),
+ ("regnet", "REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("yoso", "YOSO_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("swin", "SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("vilt", "VILT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -226,6 +229,7 @@
MODEL_NAMES_MAPPING = OrderedDict(
[
# Add full (and cased) model names here
+ ("tapex", "TAPEX"),
("dpt", "DPT"),
("decision_transformer", "Decision Transformer"),
("glpn", "GLPN"),
@@ -234,6 +238,7 @@
("convnext", "ConvNext"),
("van", "VAN"),
("resnet", "ResNet"),
+ ("regnet", "RegNet"),
("yoso", "YOSO"),
("swin", "Swin"),
("vilt", "ViLT"),
diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py
index dc83fc133fad..dad7e165e8d7 100644
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@@ -54,6 +54,7 @@
("convnext", "ConvNextFeatureExtractor"),
("van", "ConvNextFeatureExtractor"),
("resnet", "ConvNextFeatureExtractor"),
+ ("regnet", "ConvNextFeatureExtractor"),
("poolformer", "PoolFormerFeatureExtractor"),
("maskformer", "MaskFormerFeatureExtractor"),
]
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index b0cfb4767249..fbe3b50c98a9 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -38,6 +38,7 @@
("convnext", "ConvNextModel"),
("van", "VanModel"),
("resnet", "ResNetModel"),
+ ("regnet", "RegNetModel"),
("yoso", "YosoModel"),
("swin", "SwinModel"),
("vilt", "ViltModel"),
@@ -303,6 +304,7 @@
("convnext", "ConvNextForImageClassification"),
("van", "VanForImageClassification"),
("resnet", "ResNetForImageClassification"),
+ ("regnet", "RegNetForImageClassification"),
("poolformer", "PoolFormerForImageClassification"),
]
)
@@ -388,6 +390,7 @@
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
[
# Model for Seq2Seq Causal LM mapping
+ ("tapex", "BartForConditionalGeneration"),
("plbart", "PLBartForConditionalGeneration"),
("bigbird_pegasus", "BigBirdPegasusForConditionalGeneration"),
("m2m_100", "M2M100ForConditionalGeneration"),
@@ -417,6 +420,7 @@
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
[
# Model for Sequence Classification mapping
+ ("tapex", "BartForSequenceClassification"),
("yoso", "YosoForSequenceClassification"),
("nystromformer", "NystromformerForSequenceClassification"),
("plbart", "PLBartForSequenceClassification"),
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index a60484134096..b3b0960b23f1 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -105,6 +105,7 @@
("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)),
("blenderbot-small", ("BlenderbotSmallTokenizer", None)),
("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")),
+ ("tapex", ("TapexTokenizer", None)),
("bart", ("BartTokenizer", "BartTokenizerFast")),
("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
diff --git a/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py b/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py
index 4eaffae3fa6e..048b1b9cb678 100644
--- a/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py
+++ b/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py
@@ -13,13 +13,16 @@
# limitations under the License.
"""
-This script can be used to convert a head-less TF2.x Bert model to PyTorch, as published on the official GitHub:
-https://github.com/tensorflow/models/tree/master/official/nlp/bert
+This script can be used to convert a head-less TF2.x Bert model to PyTorch, as published on the official (now
+deprecated) GitHub: https://github.com/tensorflow/models/tree/v2.3.0/official/nlp/bert
TF2.x uses different variable names from the original BERT (TF 1.4) implementation. The script re-maps the TF2.x Bert
weight names to the original names, so the model can be imported with Huggingface/transformer.
You may adapt this script to include classification/MLM/NSP/etc. heads.
+
+Note: This script only works with an older version of the TensorFlow models repository (<= v2.3.0).
+      Models trained with newer versions are not compatible with this script.
"""
import argparse
import os
diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py
index e5347cc19c24..c8cfa45359c1 100644
--- a/src/transformers/models/electra/modeling_electra.py
+++ b/src/transformers/models/electra/modeling_electra.py
@@ -1092,16 +1092,23 @@ def forward(
Examples:
```python
- >>> from transformers import ElectraTokenizer, ElectraForPreTraining
+ >>> from transformers import ElectraForPreTraining, ElectraTokenizerFast
>>> import torch
- >>> tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
- >>> model = ElectraForPreTraining.from_pretrained("google/electra-small-discriminator")
+ >>> discriminator = ElectraForPreTraining.from_pretrained("google/electra-small-discriminator")
+ >>> tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-small-discriminator")
- >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(
- ... 0
- >>> ) # Batch size 1
- >>> logits = model(input_ids).logits
+ >>> sentence = "The quick brown fox jumps over the lazy dog"
+ >>> fake_sentence = "The quick brown fox fake over the lazy dog"
+
+ >>> fake_tokens = tokenizer.tokenize(fake_sentence)
+ >>> fake_inputs = tokenizer.encode(fake_sentence, return_tensors="pt")
+ >>> discriminator_outputs = discriminator(fake_inputs)
+ >>> predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2)
+
+ >>> [print("%7s" % token, end="") for token in fake_tokens]
+
+ >>> [print("%7s" % int(prediction), end="") for prediction in predictions.squeeze().tolist()]
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1172,9 +1179,12 @@ def set_output_embeddings(self, word_embeddings):
@add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
- checkpoint=_CHECKPOINT_FOR_DOC,
+ checkpoint="google/electra-small-discriminator",
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
+ mask="[mask]",
+ expected_output="'paris'",
+ expected_loss=1.22,
)
def forward(
self,
@@ -1256,9 +1266,11 @@ def __init__(self, config):
@add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
- checkpoint=_CHECKPOINT_FOR_DOC,
+ checkpoint="bhadresh-savani/electra-base-discriminator-finetuned-conll03-english",
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
+ expected_output="['B-LOC', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'I-LOC']",
+ expected_loss=0.11,
)
def forward(
self,
@@ -1336,9 +1348,11 @@ def __init__(self, config):
@add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
- checkpoint=_CHECKPOINT_FOR_DOC,
+ checkpoint="deepset/electra-base-squad2",
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
+ expected_output="'a nice puppet'",
+ expected_loss=0.0,
)
def forward(
self,
diff --git a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py
index c6169c0ff5e7..bcf155ad7257 100644
--- a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py
+++ b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py
@@ -300,6 +300,10 @@ def main():
if args.config_file == "":
# Default config of megatron-bert 345m
config = MegatronBertConfig()
+
+ # different megatron-bert-*-345m models have different vocab sizes, so override the default
+ # config (which is for megatron-bert-cased-345m) with the actual vocab dimension
+ config.vocab_size = input_state_dict["model"]["lm_head"]["bias"].numel()
else:
config = MegatronBertConfig.from_json_file(args.config_file)
diff --git a/src/transformers/models/regnet/__init__.py b/src/transformers/models/regnet/__init__.py
new file mode 100644
index 000000000000..185ead37b640
--- /dev/null
+++ b/src/transformers/models/regnet/__init__.py
@@ -0,0 +1,52 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+# rely on isort to merge the imports
+from ...file_utils import _LazyModule, is_torch_available
+
+
+_import_structure = {
+ "configuration_regnet": ["REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "RegNetConfig"],
+}
+
+if is_torch_available():
+ _import_structure["modeling_regnet"] = [
+ "REGNET_PRETRAINED_MODEL_ARCHIVE_LIST",
+ "RegNetForImageClassification",
+ "RegNetModel",
+ "RegNetPreTrainedModel",
+ ]
+
+
+if TYPE_CHECKING:
+ from .configuration_regnet import REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP, RegNetConfig
+
+ if is_torch_available():
+ from .modeling_regnet import (
+ REGNET_PRETRAINED_MODEL_ARCHIVE_LIST,
+ RegNetForImageClassification,
+ RegNetModel,
+ RegNetPreTrainedModel,
+ )
+
+
+else:
+ import sys
+
+ sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
diff --git a/src/transformers/models/regnet/configuration_regnet.py b/src/transformers/models/regnet/configuration_regnet.py
new file mode 100644
index 000000000000..9bfe35ec9b3d
--- /dev/null
+++ b/src/transformers/models/regnet/configuration_regnet.py
@@ -0,0 +1,94 @@
+# coding=utf-8
+# Copyright 2022 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" RegNet model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+ "regnety-40": "https://huggingface.co/zuppif/regnety-040/blob/main/config.json",
+}
+
+
+class RegNetConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`RegNetModel`]. It is used to instantiate a RegNet
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+ defaults will yield a similar configuration to that of the
+ [facebook/regnet-y-40](https://huggingface.co/facebook/regnet-y-40) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ num_channels (`int`, *optional*, defaults to 3):
+ The number of input channels.
+        embedding_size (`int`, *optional*, defaults to 32):
+            Dimensionality (hidden size) for the embedding layer.
+        hidden_sizes (`List[int]`, *optional*, defaults to `[128, 192, 512, 1088]`):
+            Dimensionality (hidden size) at each stage.
+        depths (`List[int]`, *optional*, defaults to `[2, 6, 12, 2]`):
+            Depth (number of layers) for each stage.
+        groups_width (`int`, *optional*, defaults to 64):
+            Width (number of channels) of each group in the grouped convolutions.
+        layer_type (`str`, *optional*, defaults to `"y"`):
+            The layer to use, it can be either `"x"` or `"y"`. An `"x"` layer is a ResNet's BottleNeck layer with
+            `reduction` fixed to `1`, while a `"y"` layer is an `"x"` layer with squeeze and excitation. Please refer
+            to the paper for a detailed explanation of how these layers were constructed.
+ hidden_act (`str`, *optional*, defaults to `"relu"`):
+ The non-linear activation function in each block. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"`
+ are supported.
+ downsample_in_first_stage (`bool`, *optional*, defaults to `False`):
+ If `True`, the first stage will downsample the inputs using a `stride` of 2.
+
+ Example:
+ ```python
+ >>> from transformers import RegNetConfig, RegNetModel
+
+ >>> # Initializing a RegNet regnet-y-40 style configuration
+ >>> configuration = RegNetConfig()
+ >>> # Initializing a model from the regnet-y-40 style configuration
+ >>> model = RegNetModel(configuration)
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```
+ """
+ model_type = "regnet"
+ layer_types = ["x", "y"]
+
+ def __init__(
+ self,
+ num_channels=3,
+ embedding_size=32,
+ hidden_sizes=[128, 192, 512, 1088],
+ depths=[2, 6, 12, 2],
+ groups_width=64,
+ layer_type="y",
+ hidden_act="relu",
+ **kwargs
+ ):
+ super().__init__(**kwargs)
+ if layer_type not in self.layer_types:
+ raise ValueError(f"layer_type={layer_type} is not one of {','.join(self.layer_types)}")
+ self.num_channels = num_channels
+ self.embedding_size = embedding_size
+ self.hidden_sizes = hidden_sizes
+ self.depths = depths
+ self.groups_width = groups_width
+ self.layer_type = layer_type
+ self.hidden_act = hidden_act
+ # always downsample in the first stage
+ self.downsample_in_first_stage = True
diff --git a/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py b/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py
new file mode 100644
index 000000000000..8024ef679201
--- /dev/null
+++ b/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py
@@ -0,0 +1,301 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert RegNet 10B checkpoints vissl."""
+# You need to install a specific version of classy vision
+# pip install git+https://github.com/FrancescoSaverioZuppichini/ClassyVision.git@convert_weights
+
+import argparse
+import json
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass, field
+from functools import partial
+from pathlib import Path
+from pprint import pprint
+from typing import Dict, List, Tuple
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from classy_vision.models.regnet import RegNet, RegNetParams
+from huggingface_hub import cached_download, hf_hub_url
+from transformers import AutoFeatureExtractor, RegNetConfig, RegNetForImageClassification, RegNetModel
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging
+from vissl.models.model_helpers import get_trunk_forward_outputs
+
+
+logging.set_verbosity_info()
+logger = logging.get_logger()
+
+
+@dataclass
+class Tracker:
+ module: nn.Module
+ traced: List[nn.Module] = field(default_factory=list)
+ handles: list = field(default_factory=list)
+ name2module: Dict[str, nn.Module] = field(default_factory=OrderedDict)
+
+ def _forward_hook(self, m, inputs: Tensor, outputs: Tensor, name: str):
+ has_not_submodules = len(list(m.modules())) == 1 or isinstance(m, nn.Conv2d) or isinstance(m, nn.BatchNorm2d)
+ if has_not_submodules:
+ self.traced.append(m)
+ self.name2module[name] = m
+
+ def __call__(self, x: Tensor):
+ for name, m in self.module.named_modules():
+ self.handles.append(m.register_forward_hook(partial(self._forward_hook, name=name)))
+ self.module(x)
+ list(map(lambda x: x.remove(), self.handles))
+ return self
+
+ @property
+ def parametrized(self):
+ # check the len of the state_dict keys to see if we have learnable params
+ return {k: v for k, v in self.name2module.items() if len(list(v.state_dict().keys())) > 0}
+
+
+class FakeRegNetVisslWrapper(nn.Module):
+ """
+ Fake wrapper for RegNet that mimics what vissl does without the need to pass a config file.
+ """
+
+ def __init__(self, model: nn.Module):
+ super().__init__()
+
+ feature_blocks: List[Tuple[str, nn.Module]] = []
+ # - get the stem
+ feature_blocks.append(("conv1", model.stem))
+ # - get all the feature blocks
+ for k, v in model.trunk_output.named_children():
+ assert k.startswith("block"), f"Unexpected layer name {k}"
+ block_index = len(feature_blocks) + 1
+ feature_blocks.append((f"res{block_index}", v))
+
+ self._feature_blocks = nn.ModuleDict(feature_blocks)
+
+ def forward(self, x: Tensor):
+ return get_trunk_forward_outputs(
+ x,
+ out_feat_keys=None,
+ feature_blocks=self._feature_blocks,
+ )
+
+
+class FakeRegNetParams(RegNetParams):
+ """
+    Used to instantiate a RegNet model from classy vision with the same depth as the 10B one but with super small
+    parameters, so we can trace it in memory.
+ """
+
+ def get_expanded_params(self):
+ return [(8, 2, 2, 8, 1.0), (8, 2, 7, 8, 1.0), (8, 2, 17, 8, 1.0), (8, 2, 1, 8, 1.0)]
+
+
+def get_from_to_our_keys(model_name: str) -> Dict[str, str]:
+ """
+    Returns a dictionary that maps the original model's keys -> our implementation's keys.
+ """
+
+ # create our model (with small weights)
+ our_config = RegNetConfig(depths=[2, 7, 17, 1], hidden_sizes=[8, 8, 8, 8], groups_width=8)
+ if "in1k" in model_name:
+ our_model = RegNetForImageClassification(our_config)
+ else:
+ our_model = RegNetModel(our_config)
+ # create from model (with small weights)
+ from_model = FakeRegNetVisslWrapper(
+ RegNet(FakeRegNetParams(depth=27, group_width=1010, w_0=1744, w_a=620.83, w_m=2.52))
+ )
+
+ with torch.no_grad():
+ from_model = from_model.eval()
+ our_model = our_model.eval()
+
+ x = torch.randn((1, 3, 32, 32))
+ # trace both
+ dest_tracker = Tracker(our_model)
+ dest_traced = dest_tracker(x).parametrized
+
+ pprint(dest_tracker.name2module)
+ src_tracker = Tracker(from_model)
+ src_traced = src_tracker(x).parametrized
+
+ # convert the keys -> module dict to keys -> params
+ def to_params_dict(dict_with_modules):
+ params_dict = OrderedDict()
+ for name, module in dict_with_modules.items():
+ for param_name, param in module.state_dict().items():
+ params_dict[f"{name}.{param_name}"] = param
+ return params_dict
+
+ from_to_ours_keys = {}
+
+ src_state_dict = to_params_dict(src_traced)
+ dst_state_dict = to_params_dict(dest_traced)
+
+ for (src_key, src_param), (dest_key, dest_param) in zip(src_state_dict.items(), dst_state_dict.items()):
+ from_to_ours_keys[src_key] = dest_key
+ logger.info(f"{src_key} -> {dest_key}")
+ # if "in1k" was in the model_name it means it must have a classification head (was finetuned)
+ if "in1k" in model_name:
+ from_to_ours_keys["0.clf.0.weight"] = "classifier.1.weight"
+ from_to_ours_keys["0.clf.0.bias"] = "classifier.1.bias"
+
+ return from_to_ours_keys
+
+
+def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True):
+ filename = "imagenet-1k-id2label.json"
+ num_labels = 1000
+
+ repo_id = "datasets/huggingface/label-files"
+ num_labels = num_labels
+ id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename)), "r"))
+ id2label = {int(k): v for k, v in id2label.items()}
+
+ id2label = id2label
+ label2id = {v: k for k, v in id2label.items()}
+
+ ImageNetPreTrainedConfig = partial(RegNetConfig, num_labels=num_labels, id2label=id2label, label2id=label2id)
+
+ names_to_config = {
+ "regnet-y-10b-seer": ImageNetPreTrainedConfig(
+ depths=[2, 7, 17, 1], hidden_sizes=[2020, 4040, 11110, 28280], groups_width=1010
+ ),
+ # finetuned on imagenet
+ "regnet-y-10b-seer-in1k": ImageNetPreTrainedConfig(
+ depths=[2, 7, 17, 1], hidden_sizes=[2020, 4040, 11110, 28280], groups_width=1010
+ ),
+ }
+
+ # add seer weights logic
+ def load_using_classy_vision(checkpoint_url: str) -> Tuple[Dict, Dict]:
+ files = torch.hub.load_state_dict_from_url(checkpoint_url, model_dir=str(save_directory), map_location="cpu")
+ # check if we have a head, if yes add it
+ model_state_dict = files["classy_state_dict"]["base_model"]["model"]
+ return model_state_dict["trunk"], model_state_dict["heads"]
+
+ names_to_from_model = {
+ "regnet-y-10b-seer": partial(
+ load_using_classy_vision,
+ "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_regnet10B/model_iteration124500_conso.torch",
+ ),
+ "regnet-y-10b-seer-in1k": partial(
+ load_using_classy_vision,
+ "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_finetuned/seer_10b_finetuned_in1k_model_phase28_conso.torch",
+ ),
+ }
+
+ from_to_ours_keys = get_from_to_our_keys(model_name)
+
+ if not (save_directory / f"{model_name}.pth").exists():
+ logger.info("Loading original state_dict.")
+ from_state_dict_trunk, from_state_dict_head = names_to_from_model[model_name]()
+ from_state_dict = from_state_dict_trunk
+ if "in1k" in model_name:
+ # add the head
+ from_state_dict = {**from_state_dict_trunk, **from_state_dict_head}
+ logger.info("Done!")
+
+ converted_state_dict = {}
+
+ not_used_keys = list(from_state_dict.keys())
+ regex = r"\.block.-part."
+ # this is "interesting", so the original checkpoints have `block[0,1]-part` in each key name, we remove it
+ for key in from_state_dict.keys():
+ # remove the weird "block[0,1]-part" from the key
+ src_key = re.sub(regex, "", key)
+ # the cleaned src_key now matches the keys obtained by tracing the original model, so use it to look up the destination key
+ dest_key = from_to_ours_keys[src_key]
+ # store the parameter with our key
+ converted_state_dict[dest_key] = from_state_dict[key]
+ not_used_keys.remove(key)
+ # check that all keys have been updated
+ assert len(not_used_keys) == 0, f"Some keys were not used: {','.join(not_used_keys)}"
+
+ logger.info(f"The following keys were not used: {','.join(not_used_keys)}")
+
+ # save our state dict to disk
+ torch.save(converted_state_dict, save_directory / f"{model_name}.pth")
+
+ del converted_state_dict
+ else:
+ logger.info("The state_dict was already stored on disk.")
+ if push_to_hub:
+ logger.info(f"Token is {os.environ['HF_TOKEN']}")
+ logger.info("Loading our model.")
+ # create our model
+ our_config = names_to_config[model_name]
+ our_model_func = RegNetModel
+ if "in1k" in model_name:
+ our_model_func = RegNetForImageClassification
+ our_model = our_model_func(our_config)
+ # place our model to the meta device (so remove all the weights)
+ our_model.to(torch.device("meta"))
+ logger.info("Loading state_dict in our model.")
+ # load state dict
+ state_dict_keys = our_model.state_dict().keys()
+ PreTrainedModel._load_pretrained_model_low_mem(
+ our_model, state_dict_keys, [save_directory / f"{model_name}.pth"]
+ )
+ logger.info("Finally, pushing!")
+ # push it to hub
+ our_model.push_to_hub(
+ repo_path_or_name=save_directory / model_name,
+ commit_message="Add model",
+ output_dir=save_directory / model_name,
+ )
+ size = 384
+ # we can use the convnext one
+ feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/convnext-base-224-22k-1k", size=size)
+ feature_extractor.push_to_hub(
+ repo_path_or_name=save_directory / model_name,
+ commit_message="Add feature extractor",
+ output_dir=save_directory / model_name,
+ )
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ # Required parameters
+ parser.add_argument(
+ "--model_name",
+ default=None,
+ type=str,
+ help="The name of the model you wish to convert, it must be one of the supported regnet* architecture, currently: regnetx-*, regnety-*. If `None`, all of them will the converted.",
+ )
+ parser.add_argument(
+ "--pytorch_dump_folder_path",
+ default=None,
+ type=Path,
+ required=True,
+ help="Path to the output PyTorch model directory.",
+ )
+ parser.add_argument(
+ "--push_to_hub",
+ default=True,
+ type=bool,
+ required=False,
+ help="If True, push model and feature extractor to the hub.",
+ )
+
+ args = parser.parse_args()
+
+ pytorch_dump_folder_path: Path = args.pytorch_dump_folder_path
+ pytorch_dump_folder_path.mkdir(exist_ok=True, parents=True)
+ convert_weights_and_push(pytorch_dump_folder_path, args.model_name, args.push_to_hub)
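For reference, a minimal sketch of the key cleanup this script relies on; the regex is the one used above, while the sample key name is purely hypothetical:

```python
import re

regex = r"\.block.-part."  # strips the "block<n>-part<n>" fragment from checkpoint keys
src_key = "res2.block1-part0.0.proj.weight"  # hypothetical original key name
print(re.sub(regex, "", src_key))  # -> "res2.0.proj.weight"
```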
diff --git a/src/transformers/models/regnet/convert_regnet_to_pytorch.py b/src/transformers/models/regnet/convert_regnet_to_pytorch.py
new file mode 100644
index 000000000000..96e4ab700ab5
--- /dev/null
+++ b/src/transformers/models/regnet/convert_regnet_to_pytorch.py
@@ -0,0 +1,455 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert RegNet checkpoints from timm and vissl."""
+
+
+import argparse
+import json
+from dataclasses import dataclass, field
+from functools import partial
+from pathlib import Path
+from typing import Callable, Dict, List, Tuple
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+import timm
+from classy_vision.models.regnet import RegNet, RegNetParams, RegNetY32gf, RegNetY64gf, RegNetY128gf
+from huggingface_hub import cached_download, hf_hub_url
+from transformers import AutoFeatureExtractor, RegNetConfig, RegNetForImageClassification, RegNetModel
+from transformers.utils import logging
+from vissl.models.model_helpers import get_trunk_forward_outputs
+
+
+logging.set_verbosity_info()
+logger = logging.get_logger()
+
+
+@dataclass
+class Tracker:
+ module: nn.Module
+ traced: List[nn.Module] = field(default_factory=list)
+ handles: list = field(default_factory=list)
+
+ def _forward_hook(self, m, inputs: Tensor, outputs: Tensor):
+ has_not_submodules = len(list(m.modules())) == 1 or isinstance(m, nn.Conv2d) or isinstance(m, nn.BatchNorm2d)
+ if has_not_submodules:
+ self.traced.append(m)
+
+ def __call__(self, x: Tensor):
+ for m in self.module.modules():
+ self.handles.append(m.register_forward_hook(self._forward_hook))
+ self.module(x)
+ list(map(lambda x: x.remove(), self.handles))
+ return self
+
+ @property
+ def parametrized(self):
+ # check the len of the state_dict keys to see if we have learnable params
+ return list(filter(lambda x: len(list(x.state_dict().keys())) > 0, self.traced))
+
+
+@dataclass
+class ModuleTransfer:
+ src: nn.Module
+ dest: nn.Module
+ verbose: int = 1
+ src_skip: List = field(default_factory=list)
+ dest_skip: List = field(default_factory=list)
+ raise_if_mismatch: bool = True
+
+ def __call__(self, x: Tensor):
+ """
+ Transfer the weights of `self.src` to `self.dest` by performing a forward pass using `x` as input. Under the
+ hood we track all the operations in both modules.
+ """
+ dest_traced = Tracker(self.dest)(x).parametrized
+ src_traced = Tracker(self.src)(x).parametrized
+
+ src_traced = list(filter(lambda x: type(x) not in self.src_skip, src_traced))
+ dest_traced = list(filter(lambda x: type(x) not in self.dest_skip, dest_traced))
+
+ if len(dest_traced) != len(src_traced) and self.raise_if_mismatch:
+ raise Exception(
+ f"Numbers of operations are different. Source module has {len(src_traced)} operations while destination module has {len(dest_traced)}."
+ )
+
+ for dest_m, src_m in zip(dest_traced, src_traced):
+ dest_m.load_state_dict(src_m.state_dict())
+ if self.verbose == 1:
+ print(f"Transfered from={src_m} to={dest_m}")
+
+
+class FakeRegNetVisslWrapper(nn.Module):
+ """
+ Fake wrapper for RegNet that mimics what vissl does without the need to pass a config file.
+ """
+
+ def __init__(self, model: nn.Module):
+ super().__init__()
+
+ feature_blocks: List[Tuple[str, nn.Module]] = []
+ # - get the stem
+ feature_blocks.append(("conv1", model.stem))
+ # - get all the feature blocks
+ for k, v in model.trunk_output.named_children():
+ assert k.startswith("block"), f"Unexpected layer name {k}"
+ block_index = len(feature_blocks) + 1
+ feature_blocks.append((f"res{block_index}", v))
+
+ self._feature_blocks = nn.ModuleDict(feature_blocks)
+
+ def forward(self, x: Tensor):
+ return get_trunk_forward_outputs(
+ x,
+ out_feat_keys=None,
+ feature_blocks=self._feature_blocks,
+ )
+
+
+class NameToFromModelFuncMap(dict):
+ """
+ A dictionary with some additional logic to return a function that creates the correct original model.
+ """
+
+ def convert_name_to_timm(self, x: str) -> str:
+ x_split = x.split("-")
+ return x_split[0] + x_split[1] + "_" + "".join(x_split[2:])
+
+ def __getitem__(self, x: str) -> Callable[[], Tuple[nn.Module, Dict]]:
+ # default to timm!
+ if x not in self:
+ x = self.convert_name_to_timm(x)
+ val = partial(lambda: (timm.create_model(x, pretrained=True).eval(), None))
+
+ else:
+ val = super().__getitem__(x)
+
+ return val
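As a quick illustration of the timm name fallback above (a self-contained re-statement of `convert_name_to_timm`; the expected strings follow directly from the split-and-join logic):

```python
def convert_name_to_timm(x: str) -> str:
    # "regnet-x-002" -> "regnetx" + "_" + "002"
    x_split = x.split("-")
    return x_split[0] + x_split[1] + "_" + "".join(x_split[2:])

assert convert_name_to_timm("regnet-x-002") == "regnetx_002"
assert convert_name_to_timm("regnet-y-040") == "regnety_040"
```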
+
+
+class NameToOurModelFuncMap(dict):
+ """
+ A dictionary with some additional logic to return the correct Hugging Face RegNet class reference.
+ """
+
+ def __getitem__(self, x: str) -> Callable[[], nn.Module]:
+ if "seer" in x and "in1k" not in x:
+ val = RegNetModel
+ else:
+ val = RegNetForImageClassification
+ return val
+
+
+def manually_copy_vissl_head(from_state_dict, to_state_dict, keys: List[Tuple[str, str]]):
+ for from_key, to_key in keys:
+ to_state_dict[to_key] = from_state_dict[from_key].clone()
+ print(f"Copied key={from_key} to={to_key}")
+ return to_state_dict
+
+
+def convert_weight_and_push(
+ name: str,
+ from_model_func: Callable[[], nn.Module],
+ our_model_func: Callable[[], nn.Module],
+ config: RegNetConfig,
+ save_directory: Path,
+ push_to_hub: bool = True,
+):
+ print(f"Converting {name}...")
+ with torch.no_grad():
+ from_model, from_state_dict = from_model_func()
+ our_model = our_model_func(config).eval()
+ module_transfer = ModuleTransfer(src=from_model, dest=our_model, raise_if_mismatch=False)
+ x = torch.randn((1, 3, 224, 224))
+ module_transfer(x)
+
+ if from_state_dict is not None:
+ keys = []
+ # for seer - in1k finetuned we have to manually copy the head
+ if "seer" in name and "in1k" in name:
+ keys = [("0.clf.0.weight", "classifier.1.weight"), ("0.clf.0.bias", "classifier.1.bias")]
+ to_state_dict = manually_copy_vissl_head(from_state_dict, our_model.state_dict(), keys)
+ our_model.load_state_dict(to_state_dict)
+
+ our_outputs = our_model(x, output_hidden_states=True)
+ our_output = (
+ our_outputs.logits if isinstance(our_model, RegNetForImageClassification) else our_outputs.last_hidden_state
+ )
+
+ from_output = from_model(x)
+ from_output = from_output[-1] if type(from_output) is list else from_output
+
+ # since we don't use any config files, the vissl SEER model doesn't actually have a head, so we just check the last hidden state
+ if "seer" in name and "in1k" in name:
+ our_output = our_outputs.hidden_states[-1]
+
+ assert torch.allclose(from_output, our_output), "The model outputs don't match the original ones."
+
+ if push_to_hub:
+ our_model.push_to_hub(
+ repo_path_or_name=save_directory / name,
+ commit_message="Add model",
+ use_temp_dir=True,
+ )
+
+ size = 224 if "seer" not in name else 384
+ # we can use the convnext one
+ feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/convnext-base-224-22k-1k", size=size)
+ feature_extractor.push_to_hub(
+ repo_path_or_name=save_directory / name,
+ commit_message="Add feature extractor",
+ use_temp_dir=True,
+ )
+
+ print(f"Pushed {name}")
+
+
+def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True):
+ filename = "imagenet-1k-id2label.json"
+ num_labels = 1000
+ expected_shape = (1, num_labels)
+
+ repo_id = "datasets/huggingface/label-files"
+ num_labels = num_labels
+ id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename)), "r"))
+ id2label = {int(k): v for k, v in id2label.items()}
+
+ id2label = id2label
+ label2id = {v: k for k, v in id2label.items()}
+
+ ImageNetPreTrainedConfig = partial(RegNetConfig, num_labels=num_labels, id2label=id2label, label2id=label2id)
+
+ names_to_config = {
+ "regnet-x-002": ImageNetPreTrainedConfig(
+ depths=[1, 1, 4, 7], hidden_sizes=[24, 56, 152, 368], groups_width=8, layer_type="x"
+ ),
+ "regnet-x-004": ImageNetPreTrainedConfig(
+ depths=[1, 2, 7, 12], hidden_sizes=[32, 64, 160, 384], groups_width=16, layer_type="x"
+ ),
+ "regnet-x-006": ImageNetPreTrainedConfig(
+ depths=[1, 3, 5, 7], hidden_sizes=[48, 96, 240, 528], groups_width=24, layer_type="x"
+ ),
+ "regnet-x-008": ImageNetPreTrainedConfig(
+ depths=[1, 3, 7, 5], hidden_sizes=[64, 128, 288, 672], groups_width=16, layer_type="x"
+ ),
+ "regnet-x-016": ImageNetPreTrainedConfig(
+ depths=[2, 4, 10, 2], hidden_sizes=[72, 168, 408, 912], groups_width=24, layer_type="x"
+ ),
+ "regnet-x-032": ImageNetPreTrainedConfig(
+ depths=[2, 6, 15, 2], hidden_sizes=[96, 192, 432, 1008], groups_width=48, layer_type="x"
+ ),
+ "regnet-x-040": ImageNetPreTrainedConfig(
+ depths=[2, 5, 14, 2], hidden_sizes=[80, 240, 560, 1360], groups_width=40, layer_type="x"
+ ),
+ "regnet-x-064": ImageNetPreTrainedConfig(
+ depths=[2, 4, 10, 1], hidden_sizes=[168, 392, 784, 1624], groups_width=56, layer_type="x"
+ ),
+ "regnet-x-080": ImageNetPreTrainedConfig(
+ depths=[2, 5, 15, 1], hidden_sizes=[80, 240, 720, 1920], groups_width=120, layer_type="x"
+ ),
+ "regnet-x-120": ImageNetPreTrainedConfig(
+ depths=[2, 5, 11, 1], hidden_sizes=[224, 448, 896, 2240], groups_width=112, layer_type="x"
+ ),
+ "regnet-x-160": ImageNetPreTrainedConfig(
+ depths=[2, 6, 13, 1], hidden_sizes=[256, 512, 896, 2048], groups_width=128, layer_type="x"
+ ),
+ "regnet-x-320": ImageNetPreTrainedConfig(
+ depths=[2, 7, 13, 1], hidden_sizes=[336, 672, 1344, 2520], groups_width=168, layer_type="x"
+ ),
+ # y variant
+ "regnet-y-002": ImageNetPreTrainedConfig(depths=[1, 1, 4, 7], hidden_sizes=[24, 56, 152, 368], groups_width=8),
+ "regnet-y-004": ImageNetPreTrainedConfig(
+ depths=[1, 3, 6, 6], hidden_sizes=[48, 104, 208, 440], groups_width=8
+ ),
+ "regnet-y-006": ImageNetPreTrainedConfig(
+ depths=[1, 3, 7, 4], hidden_sizes=[48, 112, 256, 608], groups_width=16
+ ),
+ "regnet-y-008": ImageNetPreTrainedConfig(
+ depths=[1, 3, 8, 2], hidden_sizes=[64, 128, 320, 768], groups_width=16
+ ),
+ "regnet-y-016": ImageNetPreTrainedConfig(
+ depths=[2, 6, 17, 2], hidden_sizes=[48, 120, 336, 888], groups_width=24
+ ),
+ "regnet-y-032": ImageNetPreTrainedConfig(
+ depths=[2, 5, 13, 1], hidden_sizes=[72, 216, 576, 1512], groups_width=24
+ ),
+ "regnet-y-040": ImageNetPreTrainedConfig(
+ depths=[2, 6, 12, 2], hidden_sizes=[128, 192, 512, 1088], groups_width=64
+ ),
+ "regnet-y-064": ImageNetPreTrainedConfig(
+ depths=[2, 7, 14, 2], hidden_sizes=[144, 288, 576, 1296], groups_width=72
+ ),
+ "regnet-y-080": ImageNetPreTrainedConfig(
+ depths=[2, 4, 10, 1], hidden_sizes=[168, 448, 896, 2016], groups_width=56
+ ),
+ "regnet-y-120": ImageNetPreTrainedConfig(
+ depths=[2, 5, 11, 1], hidden_sizes=[224, 448, 896, 2240], groups_width=112
+ ),
+ "regnet-y-160": ImageNetPreTrainedConfig(
+ depths=[2, 4, 11, 1], hidden_sizes=[224, 448, 1232, 3024], groups_width=112
+ ),
+ "regnet-y-320": ImageNetPreTrainedConfig(
+ depths=[2, 5, 12, 1], hidden_sizes=[232, 696, 1392, 3712], groups_width=232
+ ),
+ # models created by SEER -> https://arxiv.org/abs/2202.08360
+ "regnet-y-320-seer": RegNetConfig(depths=[2, 5, 12, 1], hidden_sizes=[232, 696, 1392, 3712], groups_width=232),
+ "regnet-y-640-seer": RegNetConfig(depths=[2, 5, 12, 1], hidden_sizes=[328, 984, 1968, 4920], groups_width=328),
+ "regnet-y-1280-seer": RegNetConfig(
+ depths=[2, 7, 17, 1], hidden_sizes=[528, 1056, 2904, 7392], groups_width=264
+ ),
+ "regnet-y-2560-seer": RegNetConfig(
+ depths=[3, 7, 16, 1], hidden_sizes=[640, 1696, 2544, 5088], groups_width=640
+ ),
+ "regnet-y-10b-seer": ImageNetPreTrainedConfig(
+ depths=[2, 7, 17, 1], hidden_sizes=[2020, 4040, 11110, 28280], groups_width=1010
+ ),
+ # finetuned on imagenet
+ "regnet-y-320-seer-in1k": ImageNetPreTrainedConfig(
+ depths=[2, 5, 12, 1], hidden_sizes=[232, 696, 1392, 3712], groups_width=232
+ ),
+ "regnet-y-640-seer-in1k": ImageNetPreTrainedConfig(
+ depths=[2, 5, 12, 1], hidden_sizes=[328, 984, 1968, 4920], groups_width=328
+ ),
+ "regnet-y-1280-seer-in1k": ImageNetPreTrainedConfig(
+ depths=[2, 7, 17, 1], hidden_sizes=[528, 1056, 2904, 7392], groups_width=264
+ ),
+ "regnet-y-2560-seer-in1k": ImageNetPreTrainedConfig(
+ depths=[3, 7, 16, 1], hidden_sizes=[640, 1696, 2544, 5088], groups_width=640
+ ),
+ "regnet-y-10b-seer-in1k": ImageNetPreTrainedConfig(
+ depths=[2, 7, 17, 1], hidden_sizes=[2020, 4040, 11110, 28280], groups_width=1010
+ ),
+ }
+
+ names_to_ours_model_map = NameToOurModelFuncMap()
+ names_to_from_model_map = NameToFromModelFuncMap()
+ # add seer weights logic
+
+ def load_using_classy_vision(checkpoint_url: str, model_func: Callable[[], nn.Module]) -> Tuple[nn.Module, Dict]:
+ files = torch.hub.load_state_dict_from_url(checkpoint_url, model_dir=str(save_directory), map_location="cpu")
+ model = model_func()
+ # check if we have a head, if yes add it
+ model_state_dict = files["classy_state_dict"]["base_model"]["model"]
+ state_dict = model_state_dict["trunk"]
+ model.load_state_dict(state_dict)
+ return model.eval(), model_state_dict["heads"]
+
+ # pretrained
+ names_to_from_model_map["regnet-y-320-seer"] = partial(
+ load_using_classy_vision,
+ "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_regnet32d/seer_regnet32gf_model_iteration244000.torch",
+ lambda: FakeRegNetVisslWrapper(RegNetY32gf()),
+ )
+
+ names_to_from_model_map["regnet-y-640-seer"] = partial(
+ load_using_classy_vision,
+ "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_regnet64/seer_regnet64gf_model_final_checkpoint_phase0.torch",
+ lambda: FakeRegNetVisslWrapper(RegNetY64gf()),
+ )
+
+ names_to_from_model_map["regnet-y-1280-seer"] = partial(
+ load_using_classy_vision,
+ "https://dl.fbaipublicfiles.com/vissl/model_zoo/swav_ig1b_regnet128Gf_cnstant_bs32_node16_sinkhorn10_proto16k_syncBN64_warmup8k/model_final_checkpoint_phase0.torch",
+ lambda: FakeRegNetVisslWrapper(RegNetY128gf()),
+ )
+
+ names_to_from_model_map["regnet-y-10b-seer"] = partial(
+ load_using_classy_vision,
+ "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_regnet10B/model_iteration124500_conso.torch",
+ lambda: FakeRegNetVisslWrapper(
+ RegNet(RegNetParams(depth=27, group_width=1010, w_0=1744, w_a=620.83, w_m=2.52))
+ ),
+ )
+
+ # IN1K finetuned
+ names_to_from_model_map["regnet-y-320-seer-in1k"] = partial(
+ load_using_classy_vision,
+ "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_finetuned/seer_regnet32_finetuned_in1k_model_final_checkpoint_phase78.torch",
+ lambda: FakeRegNetVisslWrapper(RegNetY32gf()),
+ )
+
+ names_to_from_model_map["regnet-y-640-seer-in1k"] = partial(
+ load_using_classy_vision,
+ "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_finetuned/seer_regnet64_finetuned_in1k_model_final_checkpoint_phase78.torch",
+ lambda: FakeRegNetVisslWrapper(RegNetY64gf()),
+ )
+
+ names_to_from_model_map["regnet-y-1280-seer-in1k"] = partial(
+ load_using_classy_vision,
+ "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_finetuned/seer_regnet128_finetuned_in1k_model_final_checkpoint_phase78.torch",
+ lambda: FakeRegNetVisslWrapper(RegNetY128gf()),
+ )
+
+ names_to_from_model_map["regnet-y-10b-seer-in1k"] = partial(
+ load_using_classy_vision,
+ "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_finetuned/seer_10b_finetuned_in1k_model_phase28_conso.torch",
+ lambda: FakeRegNetVisslWrapper(
+ RegNet(RegNetParams(depth=27, group_width=1010, w_0=1744, w_a=620.83, w_m=2.52))
+ ),
+ )
+
+ if model_name:
+ convert_weight_and_push(
+ model_name,
+ names_to_from_model_map[model_name],
+ names_to_ours_model_map[model_name],
+ names_to_config[model_name],
+ save_directory,
+ push_to_hub,
+ )
+ else:
+ for model_name, config in names_to_config.items():
+ convert_weight_and_push(
+ model_name,
+ names_to_from_model_map[model_name],
+ names_to_ours_model_map[model_name],
+ config,
+ save_directory,
+ push_to_hub,
+ )
+ return config, expected_shape
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ # Required parameters
+ parser.add_argument(
+ "--model_name",
+ default=None,
+ type=str,
+ help="The name of the model you wish to convert, it must be one of the supported regnet* architecture, currently: regnetx-*, regnety-*. If `None`, all of them will the converted.",
+ )
+ parser.add_argument(
+ "--pytorch_dump_folder_path",
+ default=None,
+ type=Path,
+ required=True,
+ help="Path to the output PyTorch model directory.",
+ )
+ parser.add_argument(
+ "--push_to_hub",
+ default=True,
+ type=bool,
+ required=False,
+ help="If True, push model and feature extractor to the hub.",
+ )
+
+ args = parser.parse_args()
+
+ pytorch_dump_folder_path: Path = args.pytorch_dump_folder_path
+ pytorch_dump_folder_path.mkdir(exist_ok=True, parents=True)
+ convert_weights_and_push(pytorch_dump_folder_path, args.model_name, args.push_to_hub)
diff --git a/src/transformers/models/regnet/modeling_regnet.py b/src/transformers/models/regnet/modeling_regnet.py
new file mode 100644
index 000000000000..0ebd05a25ce1
--- /dev/null
+++ b/src/transformers/models/regnet/modeling_regnet.py
@@ -0,0 +1,444 @@
+# coding=utf-8
+# Copyright 2022 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch RegNet model."""
+
+from typing import Optional
+
+import torch
+import torch.utils.checkpoint
+from torch import Tensor, nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
+from ...modeling_outputs import (
+ BaseModelOutputWithNoAttention,
+ BaseModelOutputWithPoolingAndNoAttention,
+ ImageClassifierOutputWithNoAttention,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import logging
+from .configuration_regnet import RegNetConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "RegNetConfig"
+_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "facebook/regnet-y-040"
+_EXPECTED_OUTPUT_SHAPE = [1, 1088, 7, 7]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "facebook/regnet-y-040"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "'tabby, tabby cat'"
+
+REGNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
+ "facebook/regnet-y-040",
+ # See all regnet models at https://huggingface.co/models?filter=regnet
+]
+
+
+class RegNetConvLayer(nn.Module):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: int = 3,
+ stride: int = 1,
+ groups: int = 1,
+ activation: Optional[str] = "relu",
+ ):
+ super().__init__()
+ self.convolution = nn.Conv2d(
+ in_channels,
+ out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=kernel_size // 2,
+ groups=groups,
+ bias=False,
+ )
+ self.normalization = nn.BatchNorm2d(out_channels)
+ self.activation = ACT2FN[activation] if activation is not None else nn.Identity()
+
+ def forward(self, hidden_state):
+ hidden_state = self.convolution(hidden_state)
+ hidden_state = self.normalization(hidden_state)
+ hidden_state = self.activation(hidden_state)
+ return hidden_state
+
+
+class RegNetEmbeddings(nn.Module):
+ """
+ RegNet Embeddings (stem) composed of a single aggressive convolution.
+ """
+
+ def __init__(self, config: RegNetConfig):
+ super().__init__()
+ self.embedder = RegNetConvLayer(
+ config.num_channels, config.embedding_size, kernel_size=3, stride=2, activation=config.hidden_act
+ )
+
+ def forward(self, hidden_state):
+ hidden_state = self.embedder(hidden_state)
+ return hidden_state
+
+
+# Copied from transformers.models.resnet.modeling_resnet.ResNetShortCut with ResNet->RegNet
+class RegNetShortCut(nn.Sequential):
+ """
+ RegNet shortcut, used to project the residual features to the correct size. If needed, it is also used to
+ downsample the input using `stride=2`.
+ """
+
+ def __init__(self, in_channels: int, out_channels: int, stride: int = 2):
+ super().__init__()
+ self.convolution = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False)
+ self.normalization = nn.BatchNorm2d(out_channels)
+
+
+class RegNetSELayer(nn.Module):
+ """
+ Squeeze and Excitation layer (SE) proposed in [Squeeze-and-Excitation Networks](https://arxiv.org/abs/1709.01507).
+ """
+
+ def __init__(self, in_channels: int, reduced_channels: int):
+ super().__init__()
+
+ self.pooler = nn.AdaptiveAvgPool2d((1, 1))
+ self.attention = nn.Sequential(
+ nn.Conv2d(in_channels, reduced_channels, kernel_size=1),
+ nn.ReLU(),
+ nn.Conv2d(reduced_channels, in_channels, kernel_size=1),
+ nn.Sigmoid(),
+ )
+
+ def forward(self, hidden_state):
+ # b c h w -> b c 1 1
+ pooled = self.pooler(hidden_state)
+ attention = self.attention(pooled)
+ hidden_state = hidden_state * attention
+ return hidden_state
+
+
+class RegNetXLayer(nn.Module):
+ """
+ RegNet's layer composed of three convolutions (a `1x1`, a grouped `3x3` and a `1x1`), same as a ResNet bottleneck layer with reduction = 1.
+ """
+
+ def __init__(self, config: RegNetConfig, in_channels: int, out_channels: int, stride: int = 1):
+ super().__init__()
+ should_apply_shortcut = in_channels != out_channels or stride != 1
+ groups = max(1, out_channels // config.groups_width)
+ self.shortcut = (
+ RegNetShortCut(in_channels, out_channels, stride=stride) if should_apply_shortcut else nn.Identity()
+ )
+ self.layer = nn.Sequential(
+ RegNetConvLayer(in_channels, out_channels, kernel_size=1, activation=config.hidden_act),
+ RegNetConvLayer(out_channels, out_channels, stride=stride, groups=groups, activation=config.hidden_act),
+ RegNetConvLayer(out_channels, out_channels, kernel_size=1, activation=None),
+ )
+ self.activation = ACT2FN[config.hidden_act]
+
+ def forward(self, hidden_state):
+ residual = hidden_state
+ hidden_state = self.layer(hidden_state)
+ residual = self.shortcut(residual)
+ hidden_state += residual
+ hidden_state = self.activation(hidden_state)
+ return hidden_state
+
+
+class RegNetYLayer(nn.Module):
+ """
+ RegNet's Y layer: an X layer with Squeeze and Excitation.
+ """
+
+ def __init__(self, config: RegNetConfig, in_channels: int, out_channels: int, stride: int = 1):
+ super().__init__()
+ should_apply_shortcut = in_channels != out_channels or stride != 1
+ groups = max(1, out_channels // config.groups_width)
+ self.shortcut = (
+ RegNetShortCut(in_channels, out_channels, stride=stride) if should_apply_shortcut else nn.Identity()
+ )
+ self.layer = nn.Sequential(
+ RegNetConvLayer(in_channels, out_channels, kernel_size=1, activation=config.hidden_act),
+ RegNetConvLayer(out_channels, out_channels, stride=stride, groups=groups, activation=config.hidden_act),
+ RegNetSELayer(out_channels, reduced_channels=int(round(in_channels / 4))),
+ RegNetConvLayer(out_channels, out_channels, kernel_size=1, activation=None),
+ )
+ self.activation = ACT2FN[config.hidden_act]
+
+ def forward(self, hidden_state):
+ residual = hidden_state
+ hidden_state = self.layer(hidden_state)
+ residual = self.shortcut(residual)
+ hidden_state += residual
+ hidden_state = self.activation(hidden_state)
+ return hidden_state
+
+
+class RegNetStage(nn.Module):
+ """
+ A RegNet stage composed of stacked layers.
+ """
+
+ def __init__(
+ self,
+ config: RegNetConfig,
+ in_channels: int,
+ out_channels: int,
+ stride: int = 2,
+ depth: int = 2,
+ ):
+ super().__init__()
+
+ layer = RegNetXLayer if config.layer_type == "x" else RegNetYLayer
+
+ self.layers = nn.Sequential(
+ # downsampling is done in the first layer with stride of 2
+ layer(
+ config,
+ in_channels,
+ out_channels,
+ stride=stride,
+ ),
+ *[layer(config, out_channels, out_channels) for _ in range(depth - 1)],
+ )
+
+ def forward(self, hidden_state):
+ hidden_state = self.layers(hidden_state)
+ return hidden_state
+
+
+class RegNetEncoder(nn.Module):
+ def __init__(self, config: RegNetConfig):
+ super().__init__()
+ self.stages = nn.ModuleList([])
+ # based on `downsample_in_first_stage`, the first layer of the first stage may or may not downsample the input
+ self.stages.append(
+ RegNetStage(
+ config,
+ config.embedding_size,
+ config.hidden_sizes[0],
+ stride=2 if config.downsample_in_first_stage else 1,
+ depth=config.depths[0],
+ )
+ )
+ in_out_channels = zip(config.hidden_sizes, config.hidden_sizes[1:])
+ for (in_channels, out_channels), depth in zip(in_out_channels, config.depths[1:]):
+ self.stages.append(RegNetStage(config, in_channels, out_channels, depth=depth))
+
+ def forward(
+ self, hidden_state: Tensor, output_hidden_states: bool = False, return_dict: bool = True
+ ) -> BaseModelOutputWithNoAttention:
+ hidden_states = () if output_hidden_states else None
+
+ for stage_module in self.stages:
+ if output_hidden_states:
+ hidden_states = hidden_states + (hidden_state,)
+
+ hidden_state = stage_module(hidden_state)
+
+ if output_hidden_states:
+ hidden_states = hidden_states + (hidden_state,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_state, hidden_states] if v is not None)
+
+ return BaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=hidden_states)
+
+
+# Copied from transformers.models.resnet.modeling_resnet.ResNetPreTrainedModel with ResNet->RegNet,resnet->regnet
+class RegNetPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = RegNetConfig
+ base_model_prefix = "regnet"
+ main_input_name = "pixel_values"
+ supports_gradient_checkpointing = True
+
+ def _init_weights(self, module):
+ if isinstance(module, nn.Conv2d):
+ nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+ elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
+ nn.init.constant_(module.weight, 1)
+ nn.init.constant_(module.bias, 0)
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, RegNetModel):
+ module.gradient_checkpointing = value
+
+
+REGNET_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
+ as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`RegNetConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+REGNET_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
+ [`AutoFeatureExtractor.__call__`] for details.
+
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare RegNet model outputting raw features without any specific head on top.",
+ REGNET_START_DOCSTRING,
+)
+# Copied from transformers.models.resnet.modeling_resnet.ResNetModel with RESNET->REGNET,ResNet->RegNet
+class RegNetModel(RegNetPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.config = config
+ self.embedder = RegNetEmbeddings(config)
+ self.encoder = RegNetEncoder(config)
+ self.pooler = nn.AdaptiveAvgPool2d((1, 1))
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(REGNET_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ processor_class=_FEAT_EXTRACTOR_FOR_DOC,
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutputWithPoolingAndNoAttention,
+ config_class=_CONFIG_FOR_DOC,
+ modality="vision",
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
+ )
+ def forward(
+ self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
+ ) -> BaseModelOutputWithPoolingAndNoAttention:
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ embedding_output = self.embedder(pixel_values)
+
+ encoder_outputs = self.encoder(
+ embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict
+ )
+
+ last_hidden_state = encoder_outputs[0]
+
+ pooled_output = self.pooler(last_hidden_state)
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPoolingAndNoAttention(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ )
+
+
+@add_start_docstrings(
+ """
+ RegNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
+ ImageNet.
+ """,
+ REGNET_START_DOCSTRING,
+)
+# Copied from transformers.models.resnet.modeling_resnet.ResNetForImageClassification with RESNET->REGNET,ResNet->RegNet,resnet->regnet
+class RegNetForImageClassification(RegNetPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.regnet = RegNetModel(config)
+ # classification head
+ self.classifier = nn.Sequential(
+ nn.Flatten(),
+ nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity(),
+ )
+ # initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(REGNET_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ processor_class=_FEAT_EXTRACTOR_FOR_DOC,
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
+ output_type=ImageClassifierOutputWithNoAttention,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+ )
+ def forward(
+ self,
+ pixel_values: Tensor = None,
+ labels: Tensor = None,
+ output_hidden_states: bool = None,
+ return_dict: bool = None,
+ ) -> ImageClassifierOutputWithNoAttention:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.regnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
+
+ pooled_output = outputs.pooler_output if return_dict else outputs[1]
+
+ logits = self.classifier(pooled_output)
+
+ loss = None
+
+ if labels is not None:
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(logits, labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return (loss,) + output if loss is not None else output
+
+ return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
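A short, hedged usage sketch for the classification head above. It assumes the `facebook/regnet-y-040` checkpoint referenced in the docstrings is available on the Hub and uses an arbitrary sample image URL:

```python
import requests
import torch
from PIL import Image
from transformers import AutoFeatureExtractor, RegNetForImageClassification

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # any RGB image works
image = Image.open(requests.get(url, stream=True).raw)

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/regnet-y-040")
model = RegNetForImageClassification.from_pretrained("facebook/regnet-y-040")

inputs = feature_extractor(image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # shape (1, num_labels)
print(model.config.id2label[logits.argmax(-1).item()])
```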
diff --git a/src/transformers/models/resnet/modeling_resnet.py b/src/transformers/models/resnet/modeling_resnet.py
index 00c589d68f7b..f2f555d7f519 100644
--- a/src/transformers/models/resnet/modeling_resnet.py
+++ b/src/transformers/models/resnet/modeling_resnet.py
@@ -66,12 +66,14 @@ def __init__(
class ResNetEmbeddings(nn.Sequential):
"""
- ResNet Embedddings (stem) composed of a single aggressive convolution.
+ ResNet Embeddings (stem) composed of a single aggressive convolution.
"""
- def __init__(self, num_channels: int, out_channels: int, activation: str = "relu"):
+ def __init__(self, config: ResNetConfig):
super().__init__()
- self.embedder = ResNetConvLayer(num_channels, out_channels, kernel_size=7, stride=2, activation=activation)
+ self.embedder = ResNetConvLayer(
+ config.num_channels, config.embedding_size, kernel_size=7, stride=2, activation=config.hidden_act
+ )
self.pooler = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
@@ -267,7 +269,7 @@ class ResNetModel(ResNetPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
- self.embedder = ResNetEmbeddings(config.num_channels, config.embedding_size, config.hidden_act)
+ self.embedder = ResNetEmbeddings(config)
self.encoder = ResNetEncoder(config)
self.pooler = nn.AdaptiveAvgPool2d((1, 1))
# Initialize weights and apply final processing
diff --git a/src/transformers/models/tapex/__init__.py b/src/transformers/models/tapex/__init__.py
new file mode 100644
index 000000000000..36c5938d23c9
--- /dev/null
+++ b/src/transformers/models/tapex/__init__.py
@@ -0,0 +1,36 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+# rely on isort to merge the imports
+from ...file_utils import _LazyModule
+
+
+_import_structure = {
+ "tokenization_tapex": ["TapexTokenizer"],
+}
+
+
+if TYPE_CHECKING:
+ from .tokenization_tapex import TapexTokenizer
+
+
+else:
+ import sys
+
+ sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
diff --git a/src/transformers/models/tapex/tokenization_tapex.py b/src/transformers/models/tapex/tokenization_tapex.py
new file mode 100644
index 000000000000..0b5c1241415a
--- /dev/null
+++ b/src/transformers/models/tapex/tokenization_tapex.py
@@ -0,0 +1,1512 @@
+# coding=utf-8
+# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for TAPEX."""
+
+import json
+import os
+import random
+from contextlib import contextmanager
+from functools import lru_cache
+from typing import Dict, List, Optional, Tuple, Union
+
+import regex as re
+
+from ...file_utils import ExplicitEnum, PaddingStrategy, TensorType, add_end_docstrings, is_pandas_available
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_utils_base import ENCODE_KWARGS_DOCSTRING, BatchEncoding, TextInput, TruncationStrategy
+from ...utils import logging
+
+
+if is_pandas_available():
+ import pandas as pd
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+ "vocab_file": {
+ "microsoft/tapex-base": "https://huggingface.co/microsoft/tapex-base/resolve/main/vocab.json",
+ },
+ "merges_file": {
+ "microsoft/tapex-base": "https://huggingface.co/microsoft/tapex-base/resolve/main/merges.txt",
+ },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+ "microsoft/tapex-base": 512,
+}
+
+PRETRAINED_INIT_CONFIGURATION = {
+ "microsoft/tapex-base": {"do_lower_case": True},
+}
+
+
+class TapexTruncationStrategy(ExplicitEnum):
+ """
+ Possible values for the `truncation` argument in [`~TapexTokenizer.__call__`]. Useful for tab-completion in an IDE.
+ """
+
+ DROP_ROWS_TO_FIT = "drop_rows_to_fit"
+
+
+class TokenizerStrategy(ExplicitEnum):
+
+ TOKENIZE_SOURCE = "tokenize_source"
+ TOKENIZE_TARGET = "tokenize_target"
+
+
+TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
+ add_special_tokens (`bool`, *optional*, defaults to `True`):
+ Whether or not to encode the sequences with the special tokens relative to their model.
+ padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
+ Activates and controls padding. Accepts the following values:
+
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+ sequence is provided).
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+ acceptable input length for the model if that argument is not provided.
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+ lengths).
+ truncation (`bool`, `str`, [`TapexTruncationStrategy`] or [`~tokenization_utils_base.TruncationStrategy`],
+ *optional*, defaults to `False`):
+
+ Activates and controls truncation. Accepts the following values:
+
+ - `'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument `max_length` or to the
+ maximum acceptable input length for the model if that argument is not provided. This will truncate
+ row by row, removing rows from the table.
+ - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
+ to the maximum acceptable input length for the model if that argument is not provided. This will
+ truncate token by token, removing a token from the longest sequence in the pair if a pair of
+ sequences (or a batch of pairs) is provided.
+ - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
+ maximum acceptable input length for the model if that argument is not provided. This will only
+ truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+ - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
+ maximum acceptable input length for the model if that argument is not provided. This will only
+ truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+ - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+ greater than the model maximum admissible input size).
+ max_length (`int`, *optional*):
+ Controls the maximum length to use by one of the truncation/padding parameters. If left unset or set to
+ `None`, this will use the predefined model maximum length if a maximum length is required by one of the
+ truncation/padding parameters. If the model has no specific maximum input length (like XLNet)
+ truncation/padding to a maximum length will be deactivated.
+ stride (`int`, *optional*, defaults to 0):
+ If set to a number along with `max_length`, the overflowing tokens returned when
+ `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
+ returned to provide some overlap between truncated and overflowing sequences. The value of this
+ argument defines the number of overlapping tokens.
+ pad_to_multiple_of (`int`, *optional*):
+ If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
+ the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
+ return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
+ If set, will return tensors instead of list of python integers. Acceptable values are:
+
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return Numpy `np.ndarray` objects.
+"""
+
+
+@lru_cache()
+def bytes_to_unicode():
+ """
+ Returns a list of utf-8 bytes and a mapping to unicode strings. We specifically avoid mapping to whitespace/control
+ characters that the bpe code barfs on. The reversible bpe codes work on unicode strings. This means you need a large #
+ of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset
+ you end up needing around 5K for decent coverage. This is a significant percentage of your normal, say, 32K bpe
+ vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+ """
+ bs = (
+ list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
+ )
+ cs = bs[:]
+ n = 0
+ for b in range(2**8):
+ if b not in bs:
+ bs.append(b)
+ cs.append(2**8 + n)
+ n += 1
+ cs = [chr(n) for n in cs]
+ return dict(zip(bs, cs))
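A small sanity check of the mapping above (a sketch; the asserted values follow from the construction: printable bytes map to themselves, while bytes such as the space are shifted into unused code points):

```python
byte_encoder = bytes_to_unicode()
assert len(byte_encoder) == 256       # every byte value gets a unique unicode character
assert byte_encoder[ord("A")] == "A"  # printable bytes keep their own character
assert byte_encoder[ord(" ")] == "Ġ"  # the space byte is remapped to the familiar "Ġ" marker
```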
+
+
+def get_pairs(word):
+ """
+ Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length
+ strings).
+ """
+ pairs = set()
+ prev_char = word[0]
+ for char in word[1:]:
+ pairs.add((prev_char, char))
+ prev_char = char
+ return pairs
+
+
+class IndexedRowTableLinearize:
+ """
+ FORMAT: col : col1 | col2 | col3 row 1 : val1 | val2 | val3 row 2 : ...
+ """
+
+ def process_table(self, table_content: Dict):
+ """
+ Given a table, TableLinearize aims at converting it into a flattened sequence with special symbols.
+ """
+ assert "header" in table_content and "rows" in table_content, self.PROMPT_MESSAGE
+ # process header
+ table_str = self.process_header(table_content["header"]) + " "
+ # process rows
+ for i, row_example in enumerate(table_content["rows"]):
+ # NOTE: the row should start from row 1 instead of 0
+ table_str += self.process_row(row_example, row_index=i + 1) + " "
+ return table_str.strip()
+
+ def process_header(self, headers: List):
+ """
+ Given a list of headers, TableLinearize aims at converting it into a flattened sequence with special symbols.
+ """
+ return "col : " + " | ".join(headers)
+
+ def process_row(self, row: List, row_index: int):
+ """
+ Given a row, TableLinearize aims at converting it into a flattened sequence with special symbols.
+ """
+ row_str = ""
+ row_cell_values = []
+ for cell_value in row:
+ if isinstance(cell_value, int):
+ row_cell_values.append(str(cell_value))
+ else:
+ row_cell_values.append(cell_value)
+ row_str += " | ".join(row_cell_values)
+ return "row " + str(row_index) + " : " + row_str
+
+
+class TapexTokenizer(PreTrainedTokenizer):
+ r"""
+ Construct a TAPEX tokenizer. Based on byte-level Byte-Pair-Encoding (BPE).
+
+ This tokenizer can be used to flatten one or more table(s) and concatenate them with one or more related sentences
+ to be used by TAPEX models. The format that the TAPEX tokenizer creates is the following:
+
+ sentence col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...
+
+ The tokenizer supports a single table + single query, a single table and multiple queries (in which case the table
+ will be duplicated for every query), a single query and multiple tables (in which case the query will be duplicated
+ for every table), and multiple tables and queries. In other words, you can provide a batch of tables + questions to
+ the tokenizer, for instance, to prepare them for the model.
+
+ Tokenization itself is based on the BPE algorithm. It is identical to the one used by BART, RoBERTa and GPT-2.
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+ this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ merges_file (`str`):
+ Path to the merges file.
+ do_lower_case (`bool`, *optional*, defaults to `True`):
+ Whether or not to lowercase the input when tokenizing.
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
+ The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+
+ <Tip>
+
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+
+ </Tip>
+
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
+ The end of sequence token.
+
+ <Tip>
+
+ When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+ The token used is the `sep_token`.
+
+ </Tip>
+
+ sep_token (`str`, *optional*, defaults to `"</s>"`):
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+ sequence classification or for a text and a question for question answering. It is also used as the last
+ token of a sequence built with special tokens.
+ cls_token (`str`, *optional*, defaults to `"<s>"`):
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
+ The token used for padding, for example when batching sequences of different lengths.
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
+ The token used for masking values. This is the token used when training this model with masked language
+ modeling. This is the token which the model will try to predict.
+ add_prefix_space (`bool`, *optional*, defaults to `False`):
+ Whether or not to add an initial space to the input. This allows treating the leading word just like any
+ other word. (The BART tokenizer detects the beginning of words by the preceding space.)
+ max_cell_length (`int`, *optional*, defaults to 15):
+ Maximum number of characters per cell when linearizing a table. If this number is exceeded, truncation
+ takes place.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+ merges_file,
+ do_lower_case=True,
+ errors="replace",
+ bos_token="",
+ eos_token="",
+ sep_token="",
+ cls_token="",
+ unk_token="",
+ pad_token="",
+ mask_token="",
+ add_prefix_space=False,
+ max_cell_length=15,
+ **kwargs
+ ):
+ bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+ eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+ sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
+ cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
+ unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+
+ # Mask token behaves like a normal word, i.e. it includes the space before it
+ mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+ super().__init__(
+ vocab_file=vocab_file,
+ merges_file=merges_file,
+ do_lower_case=do_lower_case,
+ errors=errors,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ add_prefix_space=add_prefix_space,
+ max_cell_length=max_cell_length,
+ **kwargs,
+ )
+
+ with open(vocab_file, encoding="utf-8") as vocab_handle:
+ self.encoder = json.load(vocab_handle)
+ self.decoder = {v: k for k, v in self.encoder.items()}
+ self.errors = errors # how to handle errors in decoding
+ self.byte_encoder = bytes_to_unicode()
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+ with open(merges_file, encoding="utf-8") as merges_handle:
+ bpe_merges = merges_handle.read().split("\n")[1:-1]
+ bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
+ self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+ self.cache = {}
+ self.add_prefix_space = add_prefix_space
+ self.do_lower_case = do_lower_case
+
+ # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+ self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+
+ # additional properties
+ self.max_cell_length = max_cell_length
+ self.table_linearize = IndexedRowTableLinearize()
+
+ # property to decide using which call function
+ self.current_tokenizer = TokenizerStrategy.TOKENIZE_SOURCE
+
+ def build_inputs_with_special_tokens(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ """
+ Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
+ adding special tokens. A TAPEX sequence has the following format:
+ - single sequence: `<s> X </s>`
+ - pair of sequences: `<s> A </s></s> B </s>`
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs to which the special tokens will be added.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ Returns:
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+ """
+ if token_ids_1 is None:
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+ cls = [self.cls_token_id]
+ sep = [self.sep_token_id]
+ return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+ def get_special_tokens_mask(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+ ) -> List[int]:
+ """
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+ special tokens using the tokenizer `prepare_for_model` method.
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not the token list is already formatted with special tokens for the model.
+ Returns:
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ """
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+ )
+
+ if token_ids_1 is None:
+ return [1] + ([0] * len(token_ids_0)) + [1]
+ return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ """
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. TAPEX does not
+ make use of token type ids, therefore a list of zeros is returned.
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ Returns:
+ `List[int]`: List of zeros.
+ """
+ sep = [self.sep_token_id]
+ cls = [self.cls_token_id]
+
+ if token_ids_1 is None:
+ return len(cls + token_ids_0 + sep) * [0]
+ return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+ add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
+ if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
+ text = " " + text
+ return (text, kwargs)
+
+ @property
+ def vocab_size(self):
+ return len(self.encoder)
+
+ def get_vocab(self):
+ return dict(self.encoder, **self.added_tokens_encoder)
+
+ def bpe(self, token):
+ if token in self.cache:
+ return self.cache[token]
+ word = tuple(token)
+ pairs = get_pairs(word)
+
+ if not pairs:
+ return token
+
+ while True:
+ bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+ if bigram not in self.bpe_ranks:
+ break
+ first, second = bigram
+ new_word = []
+ i = 0
+ while i < len(word):
+ try:
+ j = word.index(first, i)
+ except ValueError:
+ new_word.extend(word[i:])
+ break
+ else:
+ new_word.extend(word[i:j])
+ i = j
+
+ if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+ new_word.append(first + second)
+ i += 2
+ else:
+ new_word.append(word[i])
+ i += 1
+ new_word = tuple(new_word)
+ word = new_word
+ if len(word) == 1:
+ break
+ else:
+ pairs = get_pairs(word)
+ word = " ".join(word)
+ self.cache[token] = word
+ return word
+
+ def _tokenize(self, text):
+ """Tokenize a string."""
+ bpe_tokens = []
+ for token in re.findall(self.pat, text):
+ token = "".join(
+ self.byte_encoder[b] for b in token.encode("utf-8")
+ ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+ bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
+ return bpe_tokens
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) to an id using the vocab."""
+ return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) to a token (str) using the vocab."""
+ return self.decoder.get(index)
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (string) to a single string."""
+ text = "".join(tokens)
+ text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
+ return text
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+ merge_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
+ )
+
+ with open(vocab_file, "w", encoding="utf-8") as f:
+ f.write(json.dumps(self.encoder, ensure_ascii=False))
+
+ index = 0
+ with open(merge_file, "w", encoding="utf-8") as writer:
+ writer.write("#version: 0.2\n")
+ for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+ if index != token_index:
+ logger.warning(
+ f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+ " Please check that the tokenizer is not corrupted!"
+ )
+ index = token_index
+ writer.write(" ".join(bpe_tokens) + "\n")
+ index += 1
+
+ return vocab_file, merge_file
+
+ @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+ def __call__(
+ self,
+ table: Union["pd.DataFrame", List["pd.DataFrame"]] = None,
+ query: Optional[Union[TextInput, List[TextInput]]] = None,
+ answer: Union[str, List[str]] = None,
+ add_special_tokens: bool = True,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Union[bool, str, TruncationStrategy] = False,
+ max_length: Optional[int] = None,
+ stride: int = 0,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs
+ ) -> BatchEncoding:
+ """
+ Main method to tokenize and prepare for the model one or several table-sequence pair(s).
+
+ Args:
+ table (`pd.DataFrame`, `List[pd.DataFrame]`):
+ Table(s) containing tabular data.
+ query (`str` or `List[str]`, *optional*):
+ Sentence or batch of sentences related to one or more table(s) to be encoded. Note that the number of
+ sentences must match the number of tables.
+ answer (`str` or `List[str]`, *optional*):
+ Optionally, the corresponding answer to the questions as supervision.
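+
+ Example (a minimal usage sketch; it assumes this tokenizer is exposed as `TapexTokenizer` and that a
+ `microsoft/tapex-base` checkpoint is available on the Hub):
+
+ ```python
+ >>> import pandas as pd
+ >>> from transformers import TapexTokenizer
+
+ >>> tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base")
+ >>> data = {"year": [1896, 1900], "city": ["athens", "paris"]}
+ >>> table = pd.DataFrame.from_dict(data)
+ >>> # linearize the table, prepend the query and return PyTorch tensors
+ >>> encoding = tokenizer(table=table, query="in which year did paris host the games?", return_tensors="pt")
+ ```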
+ """
+
+ if self.current_tokenizer == TokenizerStrategy.TOKENIZE_SOURCE:
+ if table is None:
+ raise ValueError("Please make sure to provide a table if you use TAPEX to encode the source.")
+ return self.source_call_func(
+ table=table,
+ query=query,
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ stride=stride,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+ else:
+ if answer is None:
+ raise ValueError("Please make sure to provide an answer if you use TAPEX to encode the target.")
+ return self.target_call_func(
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ stride=stride,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ def source_call_func(
+ self,
+ table: Union["pd.DataFrame", List["pd.DataFrame"]],
+ query: Optional[Union[TextInput, List[TextInput]]] = None,
+ answer: Union[str, List[str]] = None,
+ add_special_tokens: bool = True,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Union[bool, str, TruncationStrategy] = False,
+ max_length: Optional[int] = None,
+ stride: int = 0,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs
+ ) -> BatchEncoding:
+ # Input type checking for clearer error
+ valid_table = False
+ valid_query = False
+
+ # Check that table has a valid type
+ if isinstance(table, pd.DataFrame):
+ valid_table = True
+ elif isinstance(table, (list, tuple)) and isinstance(table[0], pd.DataFrame):
+ valid_table = True
+
+ # Check that query has a valid type
+ if query is None or isinstance(query, str):
+ valid_query = True
+ elif isinstance(query, (list, tuple)):
+ if len(query) == 0 or isinstance(query[0], str):
+ valid_query = True
+
+ if not valid_table:
+ raise ValueError(
+ "table input must be of type `pd.DataFrame` (single example) or `List[pd.DataFrame]` (batch of examples)."
+ )
+ if not valid_query:
+ raise ValueError("query input must be of type `str` (single example) or `List[str]` (batch of examples).")
+ is_batched = isinstance(table, (list, tuple)) or isinstance(query, (list, tuple))
+
+ if is_batched:
+ return self.batch_encode_plus(
+ table=table,
+ query=query,
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+ else:
+ return self.encode_plus(
+ table=table,
+ query=query,
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+ def batch_encode_plus(
+ self,
+ table: Union["pd.DataFrame", List["pd.DataFrame"]],
+ query: Optional[List[TextInput]] = None,
+ answer: List[str] = None,
+ add_special_tokens: bool = True,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Union[bool, str] = False,
+ max_length: Optional[int] = None,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs
+ ) -> BatchEncoding:
+ """
+ This method is deprecated, `__call__` should be used instead.
+ """
+ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+ padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ return self._batch_encode_plus(
+ table=table,
+ query=query,
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding_strategy=padding_strategy,
+ truncation_strategy=truncation_strategy,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ def _batch_encode_plus(
+ self,
+ table: Union["pd.DataFrame", List["pd.DataFrame"]],
+ query: Optional[List[TextInput]] = None,
+ answer: Optional[List[str]] = None,
+ add_special_tokens: bool = True,
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+ max_length: Optional[int] = None,
+ stride: int = 0,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs
+ ) -> BatchEncoding:
+
+ if return_offsets_mapping:
+ raise NotImplementedError(
+ "return_offset_mapping is not available when using Python tokenizers. "
+ "To use this feature, change your tokenizer to one deriving from "
+ "transformers.PreTrainedTokenizerFast."
+ )
+
+ if isinstance(table, pd.DataFrame) and isinstance(query, (list, tuple)):
+ # single table, many queries case
+ # duplicate table for every query
+ table = [table] * len(query)
+ if isinstance(table, (list, tuple)) and isinstance(query, str):
+ # many tables, single query case
+ # duplicate query for every table
+ query = [query] * len(table)
+
+ batch_outputs = self._batch_prepare_for_model(
+ table=table,
+ query=query,
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding_strategy=padding_strategy,
+ truncation_strategy=truncation_strategy,
+ max_length=max_length,
+ stride=stride,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_attention_mask=return_attention_mask,
+ return_token_type_ids=return_token_type_ids,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_length=return_length,
+ return_tensors=return_tensors,
+ verbose=verbose,
+ )
+
+ return BatchEncoding(batch_outputs)
+
+ @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+ def _batch_prepare_for_model(
+ self,
+ table: Union["pd.DataFrame", List["pd.DataFrame"]],
+ query: Optional[Union[TextInput, List[TextInput]]] = None,
+ answer: Optional[Union[str, List[str]]] = None,
+ add_special_tokens: bool = True,
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+ max_length: Optional[int] = None,
+ stride: int = 0,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[str] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ ) -> BatchEncoding:
+ """
+ This method adds special tokens, truncates sequences if overflowing while taking into account the special
+ tokens and manages a moving window (with user defined stride) for overflowing tokens.
+ """
+ batch_outputs = {}
+ if answer is None:
+ answer = [None] * len(table)
+ for _table, _query, _answer in zip(table, query, answer):
+ text = self.prepare_table_query(
+ _table, _query, _answer, truncation_strategy=truncation_strategy, max_length=max_length
+ )
+
+ if self.do_lower_case:
+ text = text.lower()
+
+ tokens = self.tokenize(text)
+ outputs = self.prepare_for_model(
+ ids=self.convert_tokens_to_ids(tokens),
+ add_special_tokens=add_special_tokens,
+ padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterwards
+ truncation=truncation_strategy.value,
+ max_length=max_length,
+ stride=stride,
+ pad_to_multiple_of=None, # we pad in batch afterwards
+ return_attention_mask=False, # we pad in batch afterwards
+ return_token_type_ids=return_token_type_ids,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_length=return_length,
+ return_tensors=None, # We convert the whole batch to tensors at the end
+ prepend_batch_axis=False,
+ verbose=verbose,
+ )
+
+ for key, value in outputs.items():
+ if key not in batch_outputs:
+ batch_outputs[key] = []
+ batch_outputs[key].append(value)
+
+ batch_outputs = self.pad(
+ batch_outputs,
+ padding=padding_strategy.value,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_attention_mask=return_attention_mask,
+ )
+
+ batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
+
+ return batch_outputs
+
+ @add_end_docstrings(ENCODE_KWARGS_DOCSTRING)
+ def encode(
+ self,
+ table: "pd.DataFrame",
+ query: Optional[TextInput] = None,
+ answer: Optional[str] = None,
+ add_special_tokens: bool = True,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Union[bool, str, TruncationStrategy, TapexTruncationStrategy] = False,
+ max_length: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ **kwargs
+ ) -> List[int]:
+ """
+ Prepare a table, a string and possible answer for the model. This method does not return token type IDs,
+ attention masks, etc. which are necessary for the model to work correctly. Use this method if you want to build
+ your processing on your own, otherwise refer to `__call__`.
+ """
+ encoded_inputs = self.encode_plus(
+ table,
+ query=query,
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ return_tensors=return_tensors,
+ **kwargs,
+ )
+
+ return encoded_inputs["input_ids"]
+
+ @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+ def encode_plus(
+ self,
+ table: "pd.DataFrame",
+ query: Optional[TextInput] = None,
+ answer: Optional[str] = None,
+ add_special_tokens: bool = True,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Union[bool, str] = False,
+ max_length: Optional[int] = None,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs
+ ) -> BatchEncoding:
+ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+ padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ return self._encode_plus(
+ table=table,
+ query=query,
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding_strategy=padding_strategy,
+ truncation_strategy=truncation_strategy,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ def _encode_plus(
+ self,
+ table: "pd.DataFrame",
+ query: Optional[TextInput] = None,
+ answer: Optional[str] = None,
+ add_special_tokens: bool = True,
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+ max_length: Optional[int] = None,
+ stride: int = 0,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs
+ ) -> BatchEncoding:
+ if return_offsets_mapping:
+ raise NotImplementedError(
+ "return_offset_mapping is not available when using Python tokenizers. "
+ "To use this feature, change your tokenizer to one deriving from "
+ "transformers.PreTrainedTokenizerFast. "
+ "More information on available tokenizers at "
+ "https://github.com/huggingface/transformers/pull/2674"
+ )
+
+ text = self.prepare_table_query(
+ table, query, answer, truncation_strategy=truncation_strategy, max_length=max_length
+ )
+
+ # if necessary, perform lower case
+ if self.do_lower_case:
+ text = text.lower()
+
+ tokens = self.tokenize(text)
+
+ return self.prepare_for_model(
+ ids=self.convert_tokens_to_ids(tokens),
+ add_special_tokens=add_special_tokens,
+ padding=padding_strategy.value,
+ truncation=truncation_strategy.value,
+ max_length=max_length,
+ stride=stride,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ prepend_batch_axis=True,
+ return_attention_mask=return_attention_mask,
+ return_token_type_ids=return_token_type_ids,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_length=return_length,
+ verbose=verbose,
+ )
+
+ def target_call_func(
+ self,
+ answer: Union[str, List[str]],
+ add_special_tokens: bool = True,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Union[bool, str, TruncationStrategy] = False,
+ max_length: Optional[int] = None,
+ stride: int = 0,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs
+ ) -> BatchEncoding:
+ """
+ This method tokenizes and prepares the answer label for the model.
+
+ Args:
+ answer (`str` or `List[str]`):
+ Corresponding answer supervision to the queries for training the model.
+ """
+ is_batched = isinstance(answer, (list, tuple))
+
+ if is_batched:
+ return self.target_batch_encode_plus(
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+ else:
+ return self.target_encode_plus(
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ def target_batch_encode_plus(
+ self,
+ answer: List[str],
+ add_special_tokens: bool = True,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Union[bool, str] = False,
+ max_length: Optional[int] = None,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs
+ ) -> BatchEncoding:
+ """
+ Prepare answer strings for the model.
+
+ Args:
+ answer (`List[str]`):
+ Corresponding answer supervision to the queries for training the model.
+ """
+ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+ padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ return self._target_batch_encode_plus(
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding_strategy=padding_strategy,
+ truncation_strategy=truncation_strategy,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ def _target_batch_encode_plus(
+ self,
+ answer: List[str],
+ add_special_tokens: bool = True,
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+ max_length: Optional[int] = None,
+ stride: int = 0,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs
+ ) -> BatchEncoding:
+
+ batch_outputs = {}
+ for text in answer:
+
+ if self.do_lower_case:
+ text = text.lower()
+
+ tokens = self.tokenize(text)
+ outputs = self.prepare_for_model(
+ ids=self.convert_tokens_to_ids(tokens),
+ add_special_tokens=add_special_tokens,
+ padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterwards
+ truncation=truncation_strategy.value,
+ max_length=max_length,
+ stride=stride,
+ pad_to_multiple_of=None, # we pad in batch afterwards
+ return_attention_mask=False, # we pad in batch afterwards
+ return_token_type_ids=return_token_type_ids,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_length=return_length,
+ return_tensors=None, # We convert the whole batch to tensors at the end
+ prepend_batch_axis=False,
+ verbose=verbose,
+ )
+
+ for key, value in outputs.items():
+ if key not in batch_outputs:
+ batch_outputs[key] = []
+ batch_outputs[key].append(value)
+
+ batch_outputs = self.pad(
+ batch_outputs,
+ padding=padding_strategy.value,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_attention_mask=return_attention_mask,
+ )
+
+ batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
+
+ return batch_outputs
+
+ def target_encode(
+ self,
+ answer: str,
+ add_special_tokens: bool = True,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Union[bool, str, TruncationStrategy, TapexTruncationStrategy] = False,
+ max_length: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ **kwargs
+ ) -> List[int]:
+ """
+ Prepare the answer string for the model. This method does not return token type IDs, attention masks, etc.
+ which are necessary for the model to work correctly. Use this method if you want to build your processing on
+ your own, otherwise refer to `__call__`.
+
+ Args:
+ answer (`str`):
+ Corresponding answer supervision to the queries for training the model.
+ """
+ encoded_outputs = self.target_encode_plus(
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ return_tensors=return_tensors,
+ **kwargs,
+ )
+
+ return encoded_outputs["input_ids"]
+
+ def target_encode_plus(
+ self,
+ answer: str,
+ add_special_tokens: bool = True,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Union[bool, str] = False,
+ max_length: Optional[int] = None,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs
+ ) -> BatchEncoding:
+ """
+ Prepare an answer string for the model.
+
+ Args:
+ answer (`str`):
+ Corresponding answer supervision to the queries for training the model.
+ """
+ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+ padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ return self._target_encode_plus(
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding_strategy=padding_strategy,
+ truncation_strategy=truncation_strategy,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ def _target_encode_plus(
+ self,
+ answer: str,
+ add_special_tokens: bool = True,
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+ max_length: Optional[int] = None,
+ stride: int = 0,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs
+ ) -> BatchEncoding:
+ if return_offsets_mapping:
+ raise NotImplementedError(
+ "return_offset_mapping is not available when using Python tokenizers. "
+ "To use this feature, change your tokenizer to one deriving from "
+ "transformers.PreTrainedTokenizerFast. "
+ "More information on available tokenizers at "
+ "https://github.com/huggingface/transformers/pull/2674"
+ )
+
+ text = answer
+
+ # if necessary, perform lower case
+ if self.do_lower_case:
+ text = text.lower()
+
+ tokens = self.tokenize(text)
+
+ return self.prepare_for_model(
+ ids=self.convert_tokens_to_ids(tokens),
+ add_special_tokens=add_special_tokens,
+ padding=padding_strategy.value,
+ truncation=truncation_strategy.value,
+ max_length=max_length,
+ stride=stride,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ prepend_batch_axis=True,
+ return_attention_mask=return_attention_mask,
+ return_token_type_ids=return_token_type_ids,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_length=return_length,
+ verbose=verbose,
+ )
+
+ @contextmanager
+ def as_target_tokenizer(self):
+ """
+ Temporarily sets the tokenizer for encoding the targets. Useful for tokenizers associated with
+ sequence-to-sequence models that need a slightly different processing for the labels.
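+
+ Example (a minimal sketch; `tokenizer` is assumed to be an instance of this tokenizer class):
+
+ ```python
+ >>> with tokenizer.as_target_tokenizer():
+ ...     labels = tokenizer(answer=["2004"], padding=True, return_tensors="pt")["input_ids"]
+ ```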
+ """
+ self.current_tokenizer = TokenizerStrategy.TOKENIZE_TARGET
+ yield
+ # restore the call function
+ self.current_tokenizer = TokenizerStrategy.TOKENIZE_SOURCE
+
+ def prepare_table_query(
+ self,
+ table,
+ query,
+ answer=None,
+ truncation_strategy: Union[str, TruncationStrategy, TapexTruncationStrategy] = TruncationStrategy.DO_NOT_TRUNCATE,
+ max_length=None,
+ ):
+ """
+ This method can be used to linearize a table and add a corresponding query.
+
+ Optionally, it also handles truncation of the table (cells).
+
+ An answer can be provided for more precise truncation.
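+
+ For illustration, a table with the columns `year` and `city` and the rows `1896 / athens` and `1900 / paris`
+ is linearized roughly as `col : year | city row 1 : 1896 | athens row 2 : 1900 | paris` (the exact format is
+ defined by `IndexedRowTableLinearize`), and the query, if any, is prepended to this string with a single
+ space as separator.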
+ """
+ if not table.empty:
+ # step 1: create table dictionary
+ table_content = {"header": list(table.columns), "rows": [list(row.values) for i, row in table.iterrows()]}
+
+ # step 2: modify table internally
+ # always truncate table cells based on self.max_cell_length
+ # optionally truncate rows if truncation_strategy is set to it
+ self.truncate_table_cells(table_content, query, answer)
+ if truncation_strategy == TapexTruncationStrategy.DROP_ROWS_TO_FIT:
+ self.truncate_table_rows(table_content, query, answer, max_length=max_length)
+
+ # step 3: linearize table
+ linear_table = self.table_linearize.process_table(table_content)
+ else:
+ linear_table = ""
+
+ if linear_table == "":
+ logger.warning(
+ "You provided an empty table, or all cells contain too many tokens (e.g., >= 1024 tokens). "
+ + f"Please carefully check the corresponding table with the query: {query}."
+ )
+ if query == "":
+ logger.warning("You provided nothing to query with respect to the table.")
+ # step 4: concatenate query with linear_table
+ separator = " " if query and linear_table else ""
+ joint_input = (query + separator + linear_table) if query else linear_table
+
+ return joint_input
+
+ def truncate_table_cells(self, table_content: Dict, question: str, answer: List):
+ # TODO (Qian): is it possible to revert the original cell if it is in the final answer?
+ cell_mapping = {}
+ for row in table_content["rows"]:
+ for i, cell in enumerate(row):
+ truncate_cell = self.truncate_cell(cell)
+ if truncate_cell is not None:
+ cell_mapping[cell] = truncate_cell
+ row[i] = truncate_cell
+
+ # modify the answer list
+ if answer is not None:
+ for i, case in enumerate(answer):
+ if case in cell_mapping.keys():
+ answer[i] = cell_mapping[case]
+
+ def truncate_cell(self, cell_value):
+ # do not process numeric cell values
+ if isinstance(cell_value, (int, float)):
+ return cell_value
+ if cell_value.strip() != "":
+ try_tokens = self.tokenize(cell_value)
+ if len(try_tokens) >= self.max_cell_length:
+ retain_tokens = try_tokens[: self.max_cell_length]
+ retain_cell_value = self.convert_tokens_to_string(retain_tokens)
+ return retain_cell_value
+ else:
+ return None
+ else:
+ return cell_value
+
+ def truncate_table_rows(
+ self, table_content: Dict, question: str, answer: Optional[Union[str, List[str]]] = None, max_length=None
+ ):
+ """
+ Args:
+
+ table_content:
+ {"header": xxx, "rows": xxx, "id" (Optionally): xxx}
+
+ question:
+ natural language sentence
+
+ answer:
+ the answer supervision when training; otherwise it will be empty
+ """
+ delete_ratio, remain_token_len = self.estimate_delete_ratio(table_content, question, max_length)
+ # randomly delete unrelated rows
+ self.delete_unrelated_rows(table_content, question, answer, delete_ratio)
+ # guarantee the result < max_length
+ maximum_keep_rows = 0
+ for ind, row_example in enumerate(table_content["rows"]):
+ value_string = self.table_linearize.process_row(row_example, ind + 1)
+ value_token_len = len(self.tokenize(value_string))
+ # over the size limit, and take action
+ if value_token_len > remain_token_len:
+ break
+ remain_token_len -= value_token_len
+ maximum_keep_rows += 1
+ del table_content["rows"][maximum_keep_rows:]
+
+ def estimate_delete_ratio(self, table_content: Dict, question: str, max_length=None):
+ if "header" not in table_content or "rows" not in table_content:
+ raise ValueError("The table content should contain both 'header' and 'rows' keys.")
+ # tokenize the question; special tokens will only be prepended to the question
+ question_tokens = self.tokenize(question, add_special_tokens=True)
+ # calculate the tokens of header
+ header_string = self.table_linearize.process_header(table_content["header"])
+ header_tokens = self.tokenize(header_string, add_special_tokens=False)
+ # split all cell values into tokens and see how many can be accommodated
+ used_token_len = len(question_tokens) + len(header_tokens)
+ # remaining token space for rows
+ remain_token_len = max_length - used_token_len
+
+ value_string = ""
+ for _, row_example in enumerate(table_content["rows"]):
+ # use a general index to roughly estimate the overall token len
+ value_string += self.table_linearize.process_row(row_example, 100) + " "
+ value_token_len = len(self.tokenize(value_string))
+
+ if value_token_len < remain_token_len:
+ # no row will be deleted
+ return 0.0, remain_token_len
+ else:
+ # compute a rough delete ratio
+ return 1.0 - remain_token_len / value_token_len, remain_token_len
+
+ def delete_unrelated_rows(self, table_content: Dict, question: str, answer: List, delete_ratio: float):
+ """
+ The argument answer is used only during training.
+ """
+ truncated_unrelated_indices = []
+ related_indices = []
+ if answer is None or len(answer) == 0:
+ answer_set = set([])
+ else:
+ answer_set = set([ans_ex.lower() for ans_ex in answer])
+ # add question key words into answer set
+ if question is not None:
+ answer_set.update(question.split())
+ question_set = set(question.strip("?!.,").split(" "))
+ row_max_len = len(table_content["rows"])
+ for _row_idx, row in enumerate(table_content["rows"]):
+ lower_row = set([str(cell).lower() for cell in row])
+ if len(lower_row & answer_set) == 0 and len(lower_row & question_set) == 0:
+ truncated_unrelated_indices.append(_row_idx)
+ else:
+ # add neighbours to preserve information aggressively
+ related_indices.extend([_row_idx - 2, _row_idx - 1, _row_idx, _row_idx + 1, _row_idx + 2])
+
+ # do not drop rows that are related or neighbours of related rows
+ truncated_unrelated_indices = [
+ _row_idx for _row_idx in truncated_unrelated_indices if _row_idx not in related_indices
+ ]
+ # select some cases to drop
+ drop_items = min(len(truncated_unrelated_indices), int(len(table_content["rows"]) * delete_ratio))
+ drop_row_indices = random.choices(truncated_unrelated_indices, k=drop_items)
+
+ for _row_idx in reversed(range(row_max_len)):
+ if _row_idx in drop_row_indices:
+ del table_content["rows"][_row_idx]
+
+ # log a warning when rows have been dropped from an identified table
+ if "id" in table_content and len(drop_row_indices) > 0:
+ logger.warning("Deleted {} rows in table {}".format(len(drop_row_indices), table_content["id"]))
diff --git a/src/transformers/models/vit/modeling_tf_vit.py b/src/transformers/models/vit/modeling_tf_vit.py
index cbf935f4f743..e3c039ca83e1 100644
--- a/src/transformers/models/vit/modeling_tf_vit.py
+++ b/src/transformers/models/vit/modeling_tf_vit.py
@@ -33,14 +33,23 @@
unpack_inputs,
)
from ...tf_utils import shape_list
-from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_vit import ViTConfig
logger = logging.get_logger(__name__)
+# General docstring
_CONFIG_FOR_DOC = "ViTConfig"
-_CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224"
+_FEAT_EXTRACTOR_FOR_DOC = "ViTFeatureExtractor"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224-in21k"
+_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "google/vit-base-patch16-224"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat"
# Inspired by
@@ -645,7 +654,14 @@ def __init__(self, config: ViTConfig, *inputs, add_pooling_layer=True, **kwargs)
@unpack_inputs
@add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
- @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
+ @add_code_sample_docstrings(
+ processor_class=_FEAT_EXTRACTOR_FOR_DOC,
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFBaseModelOutputWithPooling,
+ config_class=_CONFIG_FOR_DOC,
+ modality="vision",
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
+ )
def call(
self,
pixel_values: Optional[TFModelInputType] = None,
@@ -656,26 +672,6 @@ def call(
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
- r"""
- Returns:
-
- Examples:
-
- ```python
- >>> from transformers import ViTFeatureExtractor, TFViTModel
- >>> from PIL import Image
- >>> import requests
-
- >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
- >>> image = Image.open(requests.get(url, stream=True).raw)
-
- >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
- >>> model = TFViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
-
- >>> inputs = feature_extractor(images=image, return_tensors="tf")
- >>> outputs = model(**inputs)
- >>> last_hidden_states = outputs.last_hidden_state
- ```"""
outputs = self.vit(
pixel_values=pixel_values,
@@ -744,7 +740,13 @@ def __init__(self, config: ViTConfig, *inputs, **kwargs):
@unpack_inputs
@add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
- @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
+ @add_code_sample_docstrings(
+ processor_class=_FEAT_EXTRACTOR_FOR_DOC,
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
+ output_type=TFSequenceClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+ )
def call(
self,
pixel_values: Optional[TFModelInputType] = None,
@@ -761,30 +763,7 @@ def call(
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-
- Returns:
-
- Examples:
-
- ```python
- >>> from transformers import ViTFeatureExtractor, TFViTForImageClassification
- >>> import tensorflow as tf
- >>> from PIL import Image
- >>> import requests
-
- >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
- >>> image = Image.open(requests.get(url, stream=True).raw)
-
- >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
- >>> model = TFViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
-
- >>> inputs = feature_extractor(images=image, return_tensors="tf")
- >>> outputs = model(**inputs)
- >>> logits = outputs.logits
- >>> # model predicts one of the 1000 ImageNet classes
- >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]
- >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
- ```"""
+ """
outputs = self.vit(
pixel_values=pixel_values,
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index a2fb10b9e040..0c8a8a88d85d 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -2437,10 +2437,13 @@ def evaluation_loop(
losses_host = None
preds_host = None
labels_host = None
+ inputs_host = None
+
# losses/preds/labels on CPU (final containers)
all_losses = None
all_preds = None
all_labels = None
+ all_inputs = None
# Will be useful when we have an iterable dataset so don't know its length.
observed_num_examples = 0
@@ -2456,6 +2459,7 @@ def evaluation_loop(
# Prediction step
loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
+ inputs_decode = inputs["input_ids"] if args.include_inputs_for_metrics else None
if is_torch_tpu_available():
xm.mark_step()
@@ -2468,6 +2472,14 @@ def evaluation_loop(
labels = self._pad_across_processes(labels)
labels = self._nested_gather(labels)
labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100)
+ if inputs_decode is not None:
+ inputs_decode = self._pad_across_processes(inputs_decode)
+ inputs_decode = self._nested_gather(inputs_decode)
+ inputs_host = (
+ inputs_decode
+ if inputs_host is None
+ else nested_concat(inputs_host, inputs_decode, padding_index=-100)
+ )
if logits is not None:
logits = self._pad_across_processes(logits)
logits = self._nested_gather(logits)
@@ -2484,6 +2496,13 @@ def evaluation_loop(
if preds_host is not None:
logits = nested_numpify(preds_host)
all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)
+ if inputs_host is not None:
+ inputs_decode = nested_numpify(inputs_host)
+ all_inputs = (
+ inputs_decode
+ if all_inputs is None
+ else nested_concat(all_inputs, inputs_decode, padding_index=-100)
+ )
if labels_host is not None:
labels = nested_numpify(labels_host)
all_labels = (
@@ -2491,7 +2510,7 @@ def evaluation_loop(
)
# Set back to None to begin a new accumulation
- losses_host, preds_host, labels_host = None, None, None
+ losses_host, preds_host, inputs_host, labels_host = None, None, None, None
if args.past_index and hasattr(self, "_past"):
# Clean the state at the end of the evaluation loop
@@ -2504,6 +2523,11 @@ def evaluation_loop(
if preds_host is not None:
logits = nested_numpify(preds_host)
all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)
+ if inputs_host is not None:
+ inputs_decode = nested_numpify(inputs_host)
+ all_inputs = (
+ inputs_decode if all_inputs is None else nested_concat(all_inputs, inputs_decode, padding_index=-100)
+ )
if labels_host is not None:
labels = nested_numpify(labels_host)
all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100)
@@ -2529,10 +2553,17 @@ def evaluation_loop(
all_preds = nested_truncate(all_preds, num_samples)
if all_labels is not None:
all_labels = nested_truncate(all_labels, num_samples)
+ if all_inputs is not None:
+ all_inputs = nested_truncate(all_inputs, num_samples)
# Metrics!
if self.compute_metrics is not None and all_preds is not None and all_labels is not None:
- metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
+ if args.include_inputs_for_metrics:
+ metrics = self.compute_metrics(
+ EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs)
+ )
+ else:
+ metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
else:
metrics = {}
@@ -2913,7 +2944,6 @@ def prediction_loop(
# if eval is called w/o train init deepspeed here
if args.deepspeed and not self.deepspeed:
-
# XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval
# from the checkpoint eventually
deepspeed_engine, _, _ = deepspeed_init(self, num_training_steps=0, resume_from_checkpoint=None)
@@ -2944,6 +2974,7 @@ def prediction_loop(
losses_host: torch.Tensor = None
preds_host: Union[torch.Tensor, List[torch.Tensor]] = None
labels_host: Union[torch.Tensor, List[torch.Tensor]] = None
+ inputs_host: Union[torch.Tensor, List[torch.Tensor]] = None
world_size = max(1, args.world_size)
@@ -2956,6 +2987,7 @@ def prediction_loop(
make_multiple_of = dataloader.sampler.batch_size
preds_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of)
labels_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of)
+ inputs_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of)
model.eval()
@@ -2969,6 +3001,8 @@ def prediction_loop(
for step, inputs in enumerate(dataloader):
loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
+ inputs_decode = inputs["input_ids"] if args.include_inputs_for_metrics else None
+
if loss is not None:
losses = loss.repeat(batch_size)
losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0)
@@ -2976,6 +3010,12 @@ def prediction_loop(
preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
if labels is not None:
labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100)
+ if inputs_decode is not None:
+ inputs_host = (
+ inputs_decode
+ if inputs_host is None
+ else nested_concat(inputs_host, inputs_decode, padding_index=-100)
+ )
self.control = self.callback_handler.on_prediction_step(args, self.state, self.control)
# Gather all tensors and put them back on the CPU if we have done enough accumulation steps.
@@ -2984,9 +3024,10 @@ def prediction_loop(
if not prediction_loss_only:
preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds"))
labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids"))
+ inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids"))
# Set back to None to begin a new accumulation
- losses_host, preds_host, labels_host = None, None, None
+ losses_host, preds_host, labels_host, inputs_host = None, None, None, None
if args.past_index and hasattr(self, "_past"):
# Clean the state at the end of the evaluation loop
@@ -2997,13 +3038,20 @@ def prediction_loop(
if not prediction_loss_only:
preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds"))
labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids"))
+ inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids"))
eval_loss = eval_losses_gatherer.finalize()
preds = preds_gatherer.finalize() if not prediction_loss_only else None
label_ids = labels_gatherer.finalize() if not prediction_loss_only else None
+ inputs_ids = inputs_gatherer.finalize() if not prediction_loss_only else None
if self.compute_metrics is not None and preds is not None and label_ids is not None:
- metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
+ if args.include_inputs_for_metrics:
+ metrics = self.compute_metrics(
+ EvalPrediction(predictions=preds, label_ids=label_ids, inputs=inputs_ids)
+ )
+ else:
+ metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
else:
metrics = {}
diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py
index d8d15affb9ed..4450bfde646e 100644
--- a/src/transformers/trainer_utils.py
+++ b/src/transformers/trainer_utils.py
@@ -63,17 +63,43 @@ def set_seed(seed: int):
tf.random.set_seed(seed)
-class EvalPrediction(NamedTuple):
+class EvalPrediction:
"""
Evaluation output (always contains labels), to be used to compute metrics.
Parameters:
predictions (`np.ndarray`): Predictions of the model.
label_ids (`np.ndarray`): Targets to be matched.
+ inputs (`np.ndarray`, *optional*): Input data that was passed to the model.
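+
+ Example (an illustrative `compute_metrics` sketch; `inputs` is only populated when
+ `include_inputs_for_metrics=True` is set in `TrainingArguments`):
+
+ ```python
+ >>> import numpy as np
+
+ >>> def compute_metrics(eval_pred):
+ ...     accuracy = (np.argmax(eval_pred.predictions, axis=-1) == eval_pred.label_ids).mean()
+ ...     metrics = {"accuracy": float(accuracy)}
+ ...     if eval_pred.inputs is not None:
+ ...         # the metric name below is illustrative, not part of the library
+ ...         metrics["num_input_values"] = int(np.prod(eval_pred.inputs.shape))
+ ...     return metrics
+ ```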
"""
- predictions: Union[np.ndarray, Tuple[np.ndarray]]
- label_ids: Union[np.ndarray, Tuple[np.ndarray]]
+ def __init__(
+ self,
+ predictions: Union[np.ndarray, Tuple[np.ndarray]],
+ label_ids: Union[np.ndarray, Tuple[np.ndarray]],
+ inputs: Optional[Union[np.ndarray, Tuple[np.ndarray]]] = None,
+ ):
+ self.predictions = predictions
+ self.label_ids = label_ids
+ self.inputs = inputs
+
+ def __iter__(self):
+ if self.inputs is not None:
+ return iter((self.predictions, self.label_ids, self.inputs))
+ else:
+ return iter((self.predictions, self.label_ids))
+
+ def __getitem__(self, idx):
+ if idx < 0 or idx > 2:
+ raise IndexError("tuple index out of range")
+ if idx == 2 and self.inputs is None:
+ raise IndexError("tuple index out of range")
+ if idx == 0:
+ return self.predictions
+ elif idx == 1:
+ return self.label_ids
+ elif idx == 2:
+ return self.inputs
class EvalLoopOutput(NamedTuple):
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 6176794ab032..6d6203a3ad44 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -416,6 +416,9 @@ class TrainingArguments:
`huggingface-cli login`.
gradient_checkpointing (`bool`, *optional*, defaults to `False`):
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
+ include_inputs_for_metrics (`bool`, *optional*, defaults to `False`):
+ Whether or not the inputs will be passed to the `compute_metrics` function. This is intended for metrics
+ that need inputs, predictions and references for scoring.
"""
output_dir: str = field(
@@ -739,6 +742,9 @@ class TrainingArguments:
"help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
},
)
+ include_inputs_for_metrics: bool = field(
+ default=False, metadata={"help": "Whether or not the inputs will be passed to the `compute_metrics` function."}
+ )
# Deprecated arguments
fp16_backend: str = field(
default="auto",
diff --git a/src/transformers/utils/doc.py b/src/transformers/utils/doc.py
index eaf59ba50215..39508e18d222 100644
--- a/src/transformers/utils/doc.py
+++ b/src/transformers/utils/doc.py
@@ -207,7 +207,8 @@ def _prepare_output_docstrings(output_type, config_class, min_indent=None):
```python
>>> # target is "nice puppet"
- >>> target_start_index, target_end_index = torch.tensor([14]), torch.tensor([15])
+ >>> target_start_index = torch.tensor([{qa_target_start_index}])
+ >>> target_end_index = torch.tensor([{qa_target_end_index}])
>>> outputs = model(**inputs, start_positions=target_start_index, end_positions=target_end_index)
>>> loss = outputs.loss
@@ -269,9 +270,10 @@ def _prepare_output_docstrings(output_type, config_class, min_indent=None):
```python
>>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
>>> num_labels = len(model.config.id2label)
- >>> model = {model_class}.from_pretrained("{checkpoint}", num_labels=num_labels)
+ >>> model = {model_class}.from_pretrained(
+ ... "{checkpoint}", num_labels=num_labels, problem_type="multi_label_classification"
+ ... )
- >>> num_labels = len(model.config.id2label)
>>> labels = torch.nn.functional.one_hot(torch.tensor([predicted_class_id]), num_classes=num_labels).to(
... torch.float
... )
@@ -666,7 +668,8 @@ def _prepare_output_docstrings(output_type, config_class, min_indent=None):
```python
>>> # target is "nice puppet"
- >>> target_start_index, target_end_index = tf.constant([14]), tf.constant([15])
+ >>> target_start_index = tf.constant([{qa_target_start_index}])
+ >>> target_end_index = tf.constant([{qa_target_end_index}])
>>> outputs = model(**inputs, start_positions=target_start_index, end_positions=target_end_index)
>>> loss = tf.math.reduce_mean(outputs.loss)
@@ -1053,6 +1056,8 @@ def add_code_sample_docstrings(
output_type=None,
config_class=None,
mask="[MASK]",
+ qa_target_start_index=14,
+ qa_target_end_index=15,
model_cls=None,
modality=None,
expected_output="",
@@ -1077,6 +1082,8 @@ def docstring_decorator(fn):
processor_class=processor_class,
checkpoint=checkpoint,
mask=mask,
+ qa_target_start_index=qa_target_start_index,
+ qa_target_end_index=qa_target_end_index,
expected_output=expected_output,
expected_loss=expected_loss,
)
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 2bd71277b520..3233b44d896c 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -3350,6 +3350,30 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
+REGNET_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class RegNetForImageClassification(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class RegNetModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class RegNetPreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
REMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
diff --git a/tests/auto/test_modeling_auto.py b/tests/auto/test_modeling_auto.py
index ae04501ea2dd..02ecb08e1e2f 100644
--- a/tests/auto/test_modeling_auto.py
+++ b/tests/auto/test_modeling_auto.py
@@ -74,12 +74,9 @@
MODEL_FOR_MASKED_LM_MAPPING,
MODEL_FOR_PRETRAINING_MAPPING,
MODEL_FOR_QUESTION_ANSWERING_MAPPING,
- MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
- MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
MODEL_MAPPING,
- MODEL_WITH_LM_HEAD_MAPPING,
)
from transformers.models.bert.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_LIST
from transformers.models.gpt2.modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_LIST
@@ -251,40 +248,6 @@ def test_from_pretrained_with_tuple_values(self):
model = AutoModel.from_pretrained(tmp_dir)
self.assertIsInstance(model, FunnelBaseModel)
- def test_parents_and_children_in_mappings(self):
- # Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered
- # by the parents and will return the wrong configuration type when using auto models
-
- mappings = (
- MODEL_MAPPING,
- MODEL_FOR_PRETRAINING_MAPPING,
- MODEL_FOR_QUESTION_ANSWERING_MAPPING,
- MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
- MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
- MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
- MODEL_WITH_LM_HEAD_MAPPING,
- MODEL_FOR_CAUSAL_LM_MAPPING,
- MODEL_FOR_MASKED_LM_MAPPING,
- MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
- )
-
- for mapping in mappings:
- mapping = tuple(mapping.items())
- for index, (child_config, child_model) in enumerate(mapping[1:]):
- for parent_config, parent_model in mapping[: index + 1]:
- assert not issubclass(
- child_config, parent_config
- ), f"{child_config.__name__} is child of {parent_config.__name__}"
-
- # Tuplify child_model and parent_model since some of them could be tuples.
- if not isinstance(child_model, (list, tuple)):
- child_model = (child_model,)
- if not isinstance(parent_model, (list, tuple)):
- parent_model = (parent_model,)
-
- for child, parent in [(a, b) for a in child_model for b in parent_model]:
- assert not issubclass(child, parent), f"{child.__name__} is child of {parent.__name__}"
-
def test_from_pretrained_dynamic_model_local(self):
try:
AutoConfig.register("custom", CustomConfig)
diff --git a/tests/auto/test_modeling_tf_auto.py b/tests/auto/test_modeling_tf_auto.py
index 04f2b4862cd7..a803a3451107 100644
--- a/tests/auto/test_modeling_tf_auto.py
+++ b/tests/auto/test_modeling_tf_auto.py
@@ -58,12 +58,9 @@
TF_MODEL_FOR_MASKED_LM_MAPPING,
TF_MODEL_FOR_PRETRAINING_MAPPING,
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
- TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
- TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
TF_MODEL_MAPPING,
- TF_MODEL_WITH_LM_HEAD_MAPPING,
)
from transformers.models.bert.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST
from transformers.models.gpt2.modeling_tf_gpt2 import TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST
@@ -218,38 +215,6 @@ def test_from_pretrained_with_tuple_values(self):
model = TFAutoModel.from_pretrained(tmp_dir)
self.assertIsInstance(model, TFFunnelBaseModel)
- def test_parents_and_children_in_mappings(self):
- # Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered
- # by the parents and will return the wrong configuration type when using auto models
- mappings = (
- TF_MODEL_MAPPING,
- TF_MODEL_FOR_PRETRAINING_MAPPING,
- TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
- TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
- TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
- TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
- TF_MODEL_WITH_LM_HEAD_MAPPING,
- TF_MODEL_FOR_CAUSAL_LM_MAPPING,
- TF_MODEL_FOR_MASKED_LM_MAPPING,
- TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
- )
-
- for mapping in mappings:
- mapping = tuple(mapping.items())
- for index, (child_config, child_model) in enumerate(mapping[1:]):
- for parent_config, parent_model in mapping[: index + 1]:
- with self.subTest(msg=f"Testing if {child_config.__name__} is child of {parent_config.__name__}"):
- self.assertFalse(issubclass(child_config, parent_config))
-
- # Tuplify child_model and parent_model since some of them could be tuples.
- if not isinstance(child_model, (list, tuple)):
- child_model = (child_model,)
- if not isinstance(parent_model, (list, tuple)):
- parent_model = (parent_model,)
-
- for child, parent in [(a, b) for a in child_model for b in parent_model]:
- assert not issubclass(child, parent), f"{child.__name__} is child of {parent.__name__}"
-
def test_new_model_registration(self):
try:
AutoConfig.register("new-model", NewModelConfig)
diff --git a/tests/auto/test_tokenization_auto.py b/tests/auto/test_tokenization_auto.py
index ae4e5896508d..57041e583029 100644
--- a/tests/auto/test_tokenization_auto.py
+++ b/tests/auto/test_tokenization_auto.py
@@ -151,19 +151,6 @@ def test_tokenizer_identifier_non_existent(self):
):
_ = tokenizer_class.from_pretrained("julien-c/herlolip-not-exists")
- def test_parents_and_children_in_mappings(self):
- # Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered
- # by the parents and will return the wrong configuration type when using auto models
-
- mappings = (TOKENIZER_MAPPING,)
-
- for mapping in mappings:
- mapping = tuple(mapping.items())
- for index, (child_config, _) in enumerate(mapping[1:]):
- for parent_config, _ in mapping[: index + 1]:
- with self.subTest(msg=f"Testing if {child_config.__name__} is child of {parent_config.__name__}"):
- self.assertFalse(issubclass(child_config, parent_config))
-
def test_model_name_edge_cases_in_mappings(self):
# tests: https://github.com/huggingface/transformers/pull/13251
# 1. models with `-`, e.g. xlm-roberta -> xlm_roberta
diff --git a/tests/regnet/__init__.py b/tests/regnet/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/regnet/test_modeling_regnet.py b/tests/regnet/test_modeling_regnet.py
new file mode 100644
index 000000000000..331e45296bc3
--- /dev/null
+++ b/tests/regnet/test_modeling_regnet.py
@@ -0,0 +1,271 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the PyTorch RegNet model. """
+
+
+import inspect
+import unittest
+
+from transformers import RegNetConfig
+from transformers.file_utils import cached_property, is_torch_available, is_vision_available
+from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+
+from ..test_configuration_common import ConfigTester
+from ..test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
+
+
+if is_torch_available():
+ import torch
+ from torch import nn
+
+ from transformers import RegNetForImageClassification, RegNetModel
+ from transformers.models.regnet.modeling_regnet import REGNET_PRETRAINED_MODEL_ARCHIVE_LIST
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from transformers import AutoFeatureExtractor
+
+
+class RegNetModelTester:
+ def __init__(
+ self,
+ parent,
+ batch_size=3,
+ image_size=32,
+ num_channels=3,
+ embeddings_size=10,
+ hidden_sizes=[10, 20, 30, 40],
+ depths=[1, 1, 2, 1],
+ is_training=True,
+ use_labels=True,
+ hidden_act="relu",
+ num_labels=3,
+ scope=None,
+ ):
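+ # a deliberately tiny configuration (small hidden sizes and depths) keeps the common tests fast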
+ self.parent = parent
+ self.batch_size = batch_size
+ self.image_size = image_size
+ self.num_channels = num_channels
+ self.embeddings_size = embeddings_size
+ self.hidden_sizes = hidden_sizes
+ self.depths = depths
+ self.is_training = is_training
+ self.use_labels = use_labels
+ self.hidden_act = hidden_act
+ self.num_labels = num_labels
+ self.scope = scope
+ self.num_stages = len(hidden_sizes)
+
+ def prepare_config_and_inputs(self):
+ pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
+
+ labels = None
+ if self.use_labels:
+ labels = ids_tensor([self.batch_size], self.num_labels)
+
+ config = self.get_config()
+
+ return config, pixel_values, labels
+
+ def get_config(self):
+ return RegNetConfig(
+ num_channels=self.num_channels,
+ embeddings_size=self.embeddings_size,
+ hidden_sizes=self.hidden_sizes,
+ depths=self.depths,
+ hidden_act=self.hidden_act,
+ num_labels=self.num_labels,
+ )
+
+ def create_and_check_model(self, config, pixel_values, labels):
+ model = RegNetModel(config=config)
+ model.to(torch_device)
+ model.eval()
+ result = model(pixel_values)
+ # expected last hidden states: B, C, H // 32, W // 32
+ self.parent.assertEqual(
+ result.last_hidden_state.shape,
+ (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32),
+ )
+
+ def create_and_check_for_image_classification(self, config, pixel_values, labels):
+ config.num_labels = self.num_labels
+ model = RegNetForImageClassification(config)
+ model.to(torch_device)
+ model.eval()
+ result = model(pixel_values, labels=labels)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
+
+ def prepare_config_and_inputs_for_common(self):
+ config_and_inputs = self.prepare_config_and_inputs()
+ config, pixel_values, labels = config_and_inputs
+ inputs_dict = {"pixel_values": pixel_values}
+ return config, inputs_dict
+
+
+@require_torch
+class RegNetModelTest(ModelTesterMixin, unittest.TestCase):
+ """
+ Here we also overwrite some of the tests of test_modeling_common.py, as RegNet does not use input_ids, inputs_embeds,
+ attention_mask and seq_length.
+ """
+
+ all_model_classes = (RegNetModel, RegNetForImageClassification) if is_torch_available() else ()
+
+ test_pruning = False
+ test_torchscript = False
+ test_resize_embeddings = False
+ test_head_masking = False
+ has_attentions = False
+
+ def setUp(self):
+ self.model_tester = RegNetModelTester(self)
+ self.config_tester = ConfigTester(self, config_class=RegNetConfig, has_text_modality=False)
+
+ def test_config(self):
+ self.create_and_test_config_common_properties()
+ self.config_tester.create_and_test_config_to_json_string()
+ self.config_tester.create_and_test_config_to_json_file()
+ self.config_tester.create_and_test_config_from_and_save_pretrained()
+ self.config_tester.create_and_test_config_with_num_labels()
+ self.config_tester.check_config_can_be_init_without_params()
+ self.config_tester.check_config_arguments_init()
+
+ def create_and_test_config_common_properties(self):
+ return
+
+ @unittest.skip(reason="RegNet does not use inputs_embeds")
+ def test_inputs_embeds(self):
+ pass
+
+ @unittest.skip(reason="RegNet does not support input and output embeddings")
+ def test_model_common_attributes(self):
+ pass
+
+ def test_forward_signature(self):
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+ for model_class in self.all_model_classes:
+ model = model_class(config)
+ signature = inspect.signature(model.forward)
+ # signature.parameters is an OrderedDict => so arg_names order is deterministic
+ arg_names = [*signature.parameters.keys()]
+
+ expected_arg_names = ["pixel_values"]
+ self.assertListEqual(arg_names[:1], expected_arg_names)
+
+ def test_model(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_model(*config_and_inputs)
+
+ def test_initialization(self):
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+ for model_class in self.all_model_classes:
+ model = model_class(config=config)
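+ # batch norm / group norm layers should start out with weight 1 and bias 0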
+ for name, module in model.named_modules():
+ if isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
+ self.assertTrue(
+ torch.all(module.weight == 1),
+ msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+ )
+ self.assertTrue(
+ torch.all(module.bias == 0),
+ msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+ )
+
+ def test_hidden_states_output(self):
+ def check_hidden_states_output(inputs_dict, config, model_class):
+ model = model_class(config)
+ model.to(torch_device)
+ model.eval()
+
+ with torch.no_grad():
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+ hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
+
+ expected_num_stages = self.model_tester.num_stages
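+ # one hidden state per stage, plus the initial embedder (stem) output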
+ self.assertEqual(len(hidden_states), expected_num_stages + 1)
+
+ # RegNet's feature maps are of shape (batch_size, num_channels, height, width)
+ self.assertListEqual(
+ list(hidden_states[0].shape[-2:]),
+ [self.model_tester.image_size // 2, self.model_tester.image_size // 2],
+ )
+
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+ layers_type = ["basic", "bottleneck"]
+ for model_class in self.all_model_classes:
+ for layer_type in layers_type:
+ config.layer_type = layer_type
+ inputs_dict["output_hidden_states"] = True
+ check_hidden_states_output(inputs_dict, config, model_class)
+
+ # check that output_hidden_states also work using config
+ del inputs_dict["output_hidden_states"]
+ config.output_hidden_states = True
+
+ check_hidden_states_output(inputs_dict, config, model_class)
+
+ def test_for_image_classification(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
+
+ @slow
+ def test_model_from_pretrained(self):
+ for model_name in REGNET_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+ model = RegNetModel.from_pretrained(model_name)
+ self.assertIsNotNone(model)
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+ image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+ return image
+
+
+@require_torch
+@require_vision
+class RegNetModelIntegrationTest(unittest.TestCase):
+ @cached_property
+ def default_feature_extractor(self):
+ return (
+ AutoFeatureExtractor.from_pretrained(REGNET_PRETRAINED_MODEL_ARCHIVE_LIST[0])
+ if is_vision_available()
+ else None
+ )
+
+ @slow
+ def test_inference_image_classification_head(self):
+ model = RegNetForImageClassification.from_pretrained(REGNET_PRETRAINED_MODEL_ARCHIVE_LIST[0]).to(torch_device)
+
+ feature_extractor = self.default_feature_extractor
+ image = prepare_img()
+ inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
+
+ # forward pass
+ with torch.no_grad():
+ outputs = model(**inputs)
+
+ # verify the logits
+ expected_shape = torch.Size((1, 1000))
+ self.assertEqual(outputs.logits.shape, expected_shape)
+
+ expected_slice = torch.tensor([-0.4180, -1.5051, -3.4836]).to(torch_device)
+
+ self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
diff --git a/tests/tapex/__init__.py b/tests/tapex/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/tapex/test_tokenization_tapex.py b/tests/tapex/test_tokenization_tapex.py
new file mode 100644
index 000000000000..dd9f3d4bcf25
--- /dev/null
+++ b/tests/tapex/test_tokenization_tapex.py
@@ -0,0 +1,909 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import json
+import os
+import shutil
+import tempfile
+import unittest
+from typing import List
+
+import pandas as pd
+
+from transformers import AddedToken, TapexTokenizer
+from transformers.models.tapex.tokenization_tapex import VOCAB_FILES_NAMES
+from transformers.testing_utils import is_pt_tf_cross_test, require_pandas, slow
+
+from ..test_tokenization_common import TokenizerTesterMixin
+
+
+@require_pandas
+class TapexTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+ tokenizer_class = TapexTokenizer
+ test_rust_tokenizer = False
+ from_pretrained_kwargs = {"cls_token": "<s>"}
+ test_seq2seq = False
+
+ def setUp(self):
+ super().setUp()
+
+ # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
+ # fmt: off
+ vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "\u0120", "\u0120l", "\u0120n", "\u0120lo", "\u0120low", "er", "\u0120lowest", "\u0120newer", "\u0120wider", ""] # noqa: E231
+ # fmt: on
+ vocab_tokens = dict(zip(vocab, range(len(vocab))))
+ merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
+ self.special_tokens_map = {"unk_token": "<unk>"}
+
+ self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+ self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+ with open(self.vocab_file, "w", encoding="utf-8") as fp:
+ fp.write(json.dumps(vocab_tokens) + "\n")
+ with open(self.merges_file, "w", encoding="utf-8") as fp:
+ fp.write("\n".join(merges))
+
+ def get_table(self, tokenizer, length=5):
+ toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))]
+
+ if length == 0:
+ data = {}
+ else:
+ data = {toks[0]: [toks[tok] for tok in range(1, length)]}
+
+ table = pd.DataFrame.from_dict(data)
+
+ return table
+
+ def get_table_and_query(self, tokenizer, length=5):
+ toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))]
+ table = self.get_table(tokenizer, length=length - 3)
+ query = " ".join(toks[:3])
+
+ return table, query
+
+ def get_clean_sequence(
+ self,
+ tokenizer,
+ with_prefix_space=False,
+ max_length=20,
+ min_length=5,
+ empty_table: bool = False,
+ add_special_tokens: bool = True,
+ return_table_and_query: bool = False,
+ ):
+
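+ # build a small table and query from the test vocab so the generic mixin tests have a clean TAPEX input to encode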
+ toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))]
+
+ if empty_table:
+ table = pd.DataFrame.from_dict({})
+ query = " ".join(toks[:min_length])
+ else:
+ data = {toks[0]: [toks[tok] for tok in range(1, min_length - 3)]}
+ table = pd.DataFrame.from_dict(data)
+ query = " ".join(toks[:3])
+
+ output_ids = tokenizer.encode(table, query, add_special_tokens=add_special_tokens)
+ output_txt = tokenizer.decode(output_ids)
+
+ if len(output_ids) < min_length:
+ raise ValueError("Update the code to generate the sequences so that they are larger")
+ if len(output_ids) > max_length:
+ raise ValueError("Update the code to generate the sequences so that they are smaller")
+
+ if return_table_and_query:
+ return output_txt, output_ids, table, query
+
+ return output_txt, output_ids
+
+ def get_tokenizer(self, **kwargs):
+ kwargs.update(self.special_tokens_map)
+ return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+
+ def get_input_output_texts(self, tokenizer):
+ input_text = "lower newer"
+ output_text = "lower newer"
+ return input_text, output_text
+
+ def test_full_tokenizer_roberta(self):
+ tokenizer = self.tokenizer_class(self.vocab_file, self.merges_file, **self.special_tokens_map)
+ text = "lower newer"
+ bpe_tokens = ["l", "o", "w", "er", "\u0120", "n", "e", "w", "er"]
+ tokens = tokenizer.tokenize(text)
+ self.assertListEqual(tokens, bpe_tokens)
+
+ input_tokens = tokens + [tokenizer.unk_token]
+ input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19]
+ self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+
+ def roberta_dict_integration_testing(self):
+ tokenizer = self.get_tokenizer()
+
+ self.assertListEqual(tokenizer.encode("Hello world!", add_special_tokens=False), [0, 31414, 232, 328, 2])
+ self.assertListEqual(
+ tokenizer.encode("Hello world! cécé herlolip 418", add_special_tokens=False),
+ [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2],
+ )
+
+ def test_add_tokens_tokenizer(self):
+ tokenizers: List[TapexTokenizer] = self.get_tokenizers(do_lower_case=False)
+ for tokenizer in tokenizers:
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
+ table = self.get_table(tokenizer, length=0)
+ vocab_size = tokenizer.vocab_size
+ all_size = len(tokenizer)
+
+ self.assertNotEqual(vocab_size, 0)
+
+ # We usually have added tokens from the start in tests because our vocab fixtures are
+ # smaller than the original vocabs - let's not assert this
+ # self.assertEqual(vocab_size, all_size)
+
+ new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
+ added_toks = tokenizer.add_tokens(new_toks)
+ vocab_size_2 = tokenizer.vocab_size
+ all_size_2 = len(tokenizer)
+
+ self.assertNotEqual(vocab_size_2, 0)
+ self.assertEqual(vocab_size, vocab_size_2)
+ self.assertEqual(added_toks, len(new_toks))
+ self.assertEqual(all_size_2, all_size + len(new_toks))
+
+ tokens = tokenizer.encode(table, "aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)
+
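+ # newly added tokens are assigned ids at or above the original vocab size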
+ self.assertGreaterEqual(len(tokens), 4)
+ self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
+ self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
+
+ new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
+ added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
+ vocab_size_3 = tokenizer.vocab_size
+ all_size_3 = len(tokenizer)
+
+ self.assertNotEqual(vocab_size_3, 0)
+ self.assertEqual(vocab_size, vocab_size_3)
+ self.assertEqual(added_toks_2, len(new_toks_2))
+ self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
+
+ tokens = tokenizer.encode(
+ table,
+ ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l",
+ add_special_tokens=False,
+ )
+
+ self.assertGreaterEqual(len(tokens), 6)
+ self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
+ self.assertGreater(tokens[0], tokens[1])
+ self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
+ self.assertGreater(tokens[-2], tokens[-3])
+ self.assertEqual(tokens[0], tokenizer.eos_token_id)
+ self.assertEqual(tokens[-2], tokenizer.pad_token_id)
+
+ def test_token_type_ids(self):
+ tokenizers = self.get_tokenizers()
+ for tokenizer in tokenizers:
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
+ empty_table = self.get_table(tokenizer, length=0)
+ seq_0 = "Test this method."
+
+ # We want sequence 0 and sequence 1 to be tagged with token type ids 0 and 1
+ # respectively (regardless of whether the model uses token type ids).
+ # We rely on this assumption in the QA pipeline, among other places.
+ output = tokenizer(empty_table, seq_0, return_token_type_ids=True)
+
+ # Assert that the token type IDs have the same length as the input IDs
+ self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"]))
+ self.assertIn(0, output["token_type_ids"])
+
+ def test_add_special_tokens(self):
+ tokenizers: List[TapexTokenizer] = self.get_tokenizers(do_lower_case=False)
+ for tokenizer in tokenizers:
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
+ input_table = self.get_table(tokenizer, length=0)
+
+ special_token = "[SPECIAL_TOKEN]"
+
+ tokenizer.add_special_tokens({"cls_token": special_token})
+ encoded_special_token = tokenizer.encode(input_table, special_token, add_special_tokens=False)
+ self.assertEqual(len(encoded_special_token), 1)
+
+ decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True)
+ self.assertTrue(special_token not in decoded)
+
+ def test_batch_encode_plus_overflowing_tokens(self):
+ tokenizers = self.get_tokenizers(do_lower_case=False)
+ for tokenizer in tokenizers:
+ table = self.get_table(tokenizer, length=10)
+ string_sequences = ["Testing the prepare_for_model method.", "Test"]
+
+ if tokenizer.pad_token is None:
+ tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+
+ tokenizer.batch_encode_plus(
+ table, string_sequences, return_overflowing_tokens=True, truncation=True, padding=True, max_length=3
+ )
+
+ @is_pt_tf_cross_test
+ def test_batch_encode_plus_tensors(self):
+ tokenizers = self.get_tokenizers(do_lower_case=False)
+ for tokenizer in tokenizers:
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
+ sequences = [
+ "Testing batch encode plus",
+ "Testing batch encode plus with different sequence lengths",
+ "Testing batch encode plus with different sequence lengths correctly pads",
+ ]
+
+ table = self.get_table(tokenizer, length=0)
+
+ # A tensor cannot be built from sequences that are not all the same size
+ self.assertRaises(ValueError, tokenizer.batch_encode_plus, table, sequences, return_tensors="pt")
+ self.assertRaises(ValueError, tokenizer.batch_encode_plus, table, sequences, return_tensors="tf")
+
+ if tokenizer.pad_token_id is None:
+ self.assertRaises(
+ ValueError,
+ tokenizer.batch_encode_plus,
+ table,
+ sequences,
+ padding=True,
+ return_tensors="pt",
+ )
+ self.assertRaises(
+ ValueError,
+ tokenizer.batch_encode_plus,
+ table,
+ sequences,
+ padding="longest",
+ return_tensors="tf",
+ )
+ else:
+ pytorch_tensor = tokenizer.batch_encode_plus(table, sequences, padding=True, return_tensors="pt")
+ tensorflow_tensor = tokenizer.batch_encode_plus(
+ table, sequences, padding="longest", return_tensors="tf"
+ )
+ encoded_sequences = tokenizer.batch_encode_plus(table, sequences, padding=True)
+
+ for key in encoded_sequences.keys():
+ pytorch_value = pytorch_tensor[key].tolist()
+ tensorflow_value = tensorflow_tensor[key].numpy().tolist()
+ encoded_value = encoded_sequences[key]
+
+ self.assertEqual(pytorch_value, tensorflow_value, encoded_value)
+
+ def test_call(self):
+ # Tests that calling the tokenizer wraps encode_plus and batch_encode_plus
+ tokenizers = self.get_tokenizers(do_lower_case=False)
+ for tokenizer in tokenizers:
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
+ sequences = [
+ "Testing batch encode plus",
+ "Testing batch encode plus with different sequence lengths",
+ "Testing batch encode plus with different sequence lengths correctly pads",
+ ]
+
+ # Test not batched
+ table = self.get_table(tokenizer, length=0)
+ encoded_sequences_1 = tokenizer.encode_plus(table, sequences[0])
+ encoded_sequences_2 = tokenizer(table, sequences[0])
+ self.assertEqual(encoded_sequences_1, encoded_sequences_2)
+
+ # Test not batched pairs
+ table = self.get_table(tokenizer, length=10)
+ encoded_sequences_1 = tokenizer.encode_plus(table, sequences[1])
+ encoded_sequences_2 = tokenizer(table, sequences[1])
+ self.assertEqual(encoded_sequences_1, encoded_sequences_2)
+
+ # Test batched
+ table = self.get_table(tokenizer, length=0)
+ encoded_sequences_1 = tokenizer.batch_encode_plus(table, sequences)
+ encoded_sequences_2 = tokenizer(table, sequences)
+ self.assertEqual(encoded_sequences_1, encoded_sequences_2)
+
+ def test_internal_consistency(self):
+ tokenizers = self.get_tokenizers()
+ for tokenizer in tokenizers:
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
+ table = self.get_table(tokenizer, length=0)
+ input_text, output_text = self.get_input_output_texts(tokenizer)
+
+ tokens = tokenizer.tokenize(input_text)
+ ids = tokenizer.convert_tokens_to_ids(tokens)
+ ids_2 = tokenizer.encode(table, input_text, add_special_tokens=False)
+ self.assertListEqual(ids, ids_2)
+
+ tokens_2 = tokenizer.convert_ids_to_tokens(ids)
+ self.assertNotEqual(len(tokens_2), 0)
+ text_2 = tokenizer.decode(ids)
+ self.assertIsInstance(text_2, str)
+
+ self.assertEqual(text_2, output_text)
+
+ def test_save_and_load_tokenizer(self):
+ # safety check on max_len default value so we are sure the test works
+ tokenizers = self.get_tokenizers()
+ for tokenizer in tokenizers:
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
+ self.assertNotEqual(tokenizer.model_max_length, 42)
+
+ # Now let's start the test
+ tokenizers = self.get_tokenizers()
+ for tokenizer in tokenizers:
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
+ # Isolate this from the other tests because we save additional tokens/etc
+ table = self.get_table(tokenizer, length=0)
+ tmpdirname = tempfile.mkdtemp()
+
+ sample_text = " He is very happy, UNwant\u00E9d,running"
+ before_tokens = tokenizer.encode(table, sample_text, add_special_tokens=False)
+ before_vocab = tokenizer.get_vocab()
+ tokenizer.save_pretrained(tmpdirname)
+
+ after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
+ after_tokens = after_tokenizer.encode(table, sample_text, add_special_tokens=False)
+ after_vocab = after_tokenizer.get_vocab()
+ self.assertListEqual(before_tokens, after_tokens)
+ self.assertDictEqual(before_vocab, after_vocab)
+
+ shutil.rmtree(tmpdirname)
+
+ def test_number_of_added_tokens(self):
+ tokenizers = self.get_tokenizers(do_lower_case=False)
+ for tokenizer in tokenizers:
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
+
+ table, query = self.get_table_and_query(tokenizer)
+
+ sequences = tokenizer.encode(table, query, add_special_tokens=False)
+ attached_sequences = tokenizer.encode(table, query, add_special_tokens=True)
+
+ self.assertEqual(2, len(attached_sequences) - len(sequences))
+
+ @unittest.skip("TAPEX cannot handle `prepare_for_model` without passing by `encode_plus` or `batch_encode_plus`")
+ def test_prepare_for_model(self):
+ pass
+
+ @unittest.skip("TAPEX tokenizer does not support pairs.")
+ def test_maximum_encoding_length_pair_input(self):
+ pass
+
+ @unittest.skip("TAPEX tokenizer does not support pairs.")
+ def test_maximum_encoding_length_single_input(self):
+ pass
+
+ @unittest.skip("Not implemented")
+ def test_right_and_left_truncation(self):
+ pass
+
+ def test_encode_decode_with_spaces(self):
+ tokenizers = self.get_tokenizers(do_lower_case=False)
+ for tokenizer in tokenizers:
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
+ table = self.get_table(tokenizer, length=0)
+
+ new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)]
+ tokenizer.add_tokens(new_toks)
+ input = "[ABC][DEF][ABC][DEF]"
+ if self.space_between_special_tokens:
+ output = "[ABC] [DEF] [ABC] [DEF]"
+ else:
+ output = input
+ encoded = tokenizer.encode(table, input, add_special_tokens=False)
+ decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
+ self.assertIn(decoded, [output, output.lower()])
+
+ def test_tokenize_special_tokens(self):
+ """Test `tokenize` with special tokens."""
+ tokenizers = self.get_tokenizers(fast=True, do_lower_case=True)
+ for tokenizer in tokenizers:
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
+ SPECIAL_TOKEN_1 = "[SPECIAL_TOKEN_1]"
+ SPECIAL_TOKEN_2 = "[SPECIAL_TOKEN_2]"
+
+ # TODO:
+ # Can we combine `unique_no_split_tokens` and `all_special_tokens` (and the related properties)
+ # into one variable (property) for better maintainability?
+
+ # `add_tokens` method stores special tokens only in `tokenizer.unique_no_split_tokens`. (in tokenization_utils.py)
+ tokenizer.add_tokens([SPECIAL_TOKEN_1], special_tokens=True)
+ # `add_special_tokens` method stores special tokens in `tokenizer.additional_special_tokens`,
+ # which also occur in `tokenizer.all_special_tokens`. (in tokenization_utils_base.py)
+ tokenizer.add_special_tokens({"additional_special_tokens": [SPECIAL_TOKEN_2]})
+
+ token_1 = tokenizer.tokenize(SPECIAL_TOKEN_1)
+ token_2 = tokenizer.tokenize(SPECIAL_TOKEN_2)
+
+ self.assertEqual(len(token_1), 1)
+ self.assertEqual(len(token_2), 1)
+ self.assertEqual(token_1[0], SPECIAL_TOKEN_1)
+ self.assertEqual(token_2[0], SPECIAL_TOKEN_2)
+
+ def test_special_tokens_mask(self):
+ tokenizers = self.get_tokenizers(do_lower_case=False)
+ for tokenizer in tokenizers:
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
+ table = self.get_table(tokenizer, length=0)
+ sequence_0 = "Encode this."
+ # Testing single inputs
+ encoded_sequence = tokenizer.encode(table, sequence_0, add_special_tokens=False)
+ encoded_sequence_dict = tokenizer.encode_plus(
+ table, sequence_0, add_special_tokens=True, return_special_tokens_mask=True
+ )
+ encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
+ special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
+ self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
+
+ filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]]
+ self.assertEqual(encoded_sequence, filtered_sequence)
+
+ def test_padding_to_max_length(self):
+ """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated"""
+ tokenizers = self.get_tokenizers(do_lower_case=False)
+ for tokenizer in tokenizers:
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
+ table = self.get_table(tokenizer)
+ sequence = "Sequence"
+ padding_size = 10
+
+ # check correct behaviour if no pad_token_id exists, and add one if needed
+ self._check_no_pad_token_padding(tokenizer, sequence)
+
+ padding_idx = tokenizer.pad_token_id
+
+ # Check that it correctly pads when a maximum length is specified along with the padding flag set to True
+ tokenizer.padding_side = "right"
+ encoded_sequence = tokenizer.encode(table, sequence)
+ sequence_length = len(encoded_sequence)
+ padded_sequence = tokenizer.encode(
+ table,
+ sequence,
+ max_length=sequence_length + padding_size,
+ pad_to_max_length=True,
+ )
+ padded_sequence_length = len(padded_sequence)
+ self.assertEqual(sequence_length + padding_size, padded_sequence_length)
+ self.assertListEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)
+
+ # Check that nothing is done when a maximum length is not specified
+ encoded_sequence = tokenizer.encode(table, sequence)
+ sequence_length = len(encoded_sequence)
+
+ tokenizer.padding_side = "right"
+ padded_sequence_right = tokenizer.encode(table, sequence, pad_to_max_length=True)
+ padded_sequence_right_length = len(padded_sequence_right)
+ self.assertEqual(sequence_length, padded_sequence_right_length)
+ self.assertListEqual(encoded_sequence, padded_sequence_right)
+
+ def test_padding_to_multiple_of(self):
+ tokenizers = self.get_tokenizers()
+ for tokenizer in tokenizers:
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
+ table = self.get_table(tokenizer, length=0)
+ if tokenizer.pad_token is None:
+ self.skipTest("No padding token.")
+ else:
+ empty_tokens = tokenizer(table, padding=True, pad_to_multiple_of=8)
+ normal_tokens = tokenizer(table, "This is a sample input", padding=True, pad_to_multiple_of=8)
+ for key, value in empty_tokens.items():
+ self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
+ for key, value in normal_tokens.items():
+ self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
+
+ normal_tokens = tokenizer(table, "This", pad_to_multiple_of=8)
+ for key, value in normal_tokens.items():
+ self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
+
+ # Should also work with truncation
+ normal_tokens = tokenizer(table, "This", padding=True, truncation=True, pad_to_multiple_of=8)
+ for key, value in normal_tokens.items():
+ self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
+
+ def test_right_and_left_padding(self):
+ tokenizers = self.get_tokenizers(do_lower_case=False)
+ for tokenizer in tokenizers:
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
+ table = self.get_table(tokenizer, length=0)
+ sequence = "Sequence"
+ padding_size = 10
+
+ # check correct behaviour if no pad_token_id exists, and add one if needed
+ self._check_no_pad_token_padding(tokenizer, sequence)
+
+ padding_idx = tokenizer.pad_token_id
+
+ # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
+ tokenizer.padding_side = "right"
+ encoded_sequence = tokenizer.encode(table, sequence)
+ sequence_length = len(encoded_sequence)
+ padded_sequence = tokenizer.encode(
+ table, sequence, max_length=sequence_length + padding_size, padding="max_length"
+ )
+ padded_sequence_length = len(padded_sequence)
+ self.assertEqual(sequence_length + padding_size, padded_sequence_length)
+ self.assertListEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)
+
+ # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
+ tokenizer.padding_side = "left"
+ encoded_sequence = tokenizer.encode(table, sequence)
+ sequence_length = len(encoded_sequence)
+ padded_sequence = tokenizer.encode(
+ table, sequence, max_length=sequence_length + padding_size, padding="max_length"
+ )
+ padded_sequence_length = len(padded_sequence)
+ self.assertEqual(sequence_length + padding_size, padded_sequence_length)
+ self.assertListEqual([padding_idx] * padding_size + encoded_sequence, padded_sequence)
+
+ # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding'
+ encoded_sequence = tokenizer.encode(table, sequence)
+ sequence_length = len(encoded_sequence)
+
+ tokenizer.padding_side = "right"
+ padded_sequence_right = tokenizer.encode(table, sequence, padding=True)
+ padded_sequence_right_length = len(padded_sequence_right)
+ self.assertEqual(sequence_length, padded_sequence_right_length)
+ self.assertListEqual(encoded_sequence, padded_sequence_right)
+
+ tokenizer.padding_side = "left"
+ padded_sequence_left = tokenizer.encode(table, sequence, padding="longest")
+ padded_sequence_left_length = len(padded_sequence_left)
+ self.assertEqual(sequence_length, padded_sequence_left_length)
+ self.assertListEqual(encoded_sequence, padded_sequence_left)
+
+ tokenizer.padding_side = "right"
+ padded_sequence_right = tokenizer.encode(table, sequence)
+ padded_sequence_right_length = len(padded_sequence_right)
+ self.assertEqual(sequence_length, padded_sequence_right_length)
+ self.assertListEqual(encoded_sequence, padded_sequence_right)
+
+ tokenizer.padding_side = "left"
+ padded_sequence_left = tokenizer.encode(table, sequence, padding=False)
+ padded_sequence_left_length = len(padded_sequence_left)
+ self.assertEqual(sequence_length, padded_sequence_left_length)
+ self.assertListEqual(encoded_sequence, padded_sequence_left)
+
+ def test_encode_plus_with_padding(self):
+ tokenizers = self.get_tokenizers(do_lower_case=False)
+ for tokenizer in tokenizers:
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
+ table = self.get_table(tokenizer, length=0)
+ sequence = "Sequence"
+
+ # check correct behaviour if no pad_token_id exists, and add one if needed
+ self._check_no_pad_token_padding(tokenizer, sequence)
+
+ padding_size = 10
+ padding_idx = tokenizer.pad_token_id
+ token_type_padding_idx = tokenizer.pad_token_type_id
+
+ encoded_sequence = tokenizer.encode_plus(table, sequence, return_special_tokens_mask=True)
+ input_ids = encoded_sequence["input_ids"]
+ special_tokens_mask = encoded_sequence["special_tokens_mask"]
+ sequence_length = len(input_ids)
+
+ # Test 'longest' and 'no_padding' don't do anything
+ tokenizer.padding_side = "right"
+
+ not_padded_sequence = tokenizer.encode_plus(
+ table,
+ sequence,
+ padding=False,
+ return_special_tokens_mask=True,
+ )
+ not_padded_input_ids = not_padded_sequence["input_ids"]
+
+ not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
+ not_padded_sequence_length = len(not_padded_input_ids)
+
+ self.assertEqual(sequence_length, not_padded_sequence_length)
+ self.assertListEqual(input_ids, not_padded_input_ids)
+ self.assertListEqual(special_tokens_mask, not_padded_special_tokens_mask)
+
+ not_padded_sequence = tokenizer.encode_plus(
+ table,
+ sequence,
+ padding=False,
+ return_special_tokens_mask=True,
+ )
+ not_padded_input_ids = not_padded_sequence["input_ids"]
+
+ not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
+ not_padded_sequence_length = len(not_padded_input_ids)
+
+ self.assertEqual(sequence_length, not_padded_sequence_length)
+ self.assertListEqual(input_ids, not_padded_input_ids)
+ self.assertListEqual(special_tokens_mask, not_padded_special_tokens_mask)
+
+ # Test right padding
+ tokenizer.padding_side = "right"
+
+ right_padded_sequence = tokenizer.encode_plus(
+ table,
+ sequence,
+ max_length=sequence_length + padding_size,
+ padding="max_length",
+ return_special_tokens_mask=True,
+ )
+ right_padded_input_ids = right_padded_sequence["input_ids"]
+
+ right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
+ right_padded_sequence_length = len(right_padded_input_ids)
+
+ self.assertEqual(sequence_length + padding_size, right_padded_sequence_length)
+ self.assertListEqual(input_ids + [padding_idx] * padding_size, right_padded_input_ids)
+ self.assertListEqual(special_tokens_mask + [1] * padding_size, right_padded_special_tokens_mask)
+
+ # Test left padding
+ tokenizer.padding_side = "left"
+ left_padded_sequence = tokenizer.encode_plus(
+ table,
+ sequence,
+ max_length=sequence_length + padding_size,
+ padding="max_length",
+ return_special_tokens_mask=True,
+ )
+ left_padded_input_ids = left_padded_sequence["input_ids"]
+ left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
+ left_padded_sequence_length = len(left_padded_input_ids)
+
+ self.assertEqual(sequence_length + padding_size, left_padded_sequence_length)
+ self.assertListEqual([padding_idx] * padding_size + input_ids, left_padded_input_ids)
+ self.assertListEqual([1] * padding_size + special_tokens_mask, left_padded_special_tokens_mask)
+
+ if "token_type_ids" in tokenizer.model_input_names:
+ token_type_ids = encoded_sequence["token_type_ids"]
+ left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
+ right_padded_token_type_ids = right_padded_sequence["token_type_ids"]
+
+ self.assertListEqual(
+ token_type_ids + [[token_type_padding_idx] * 7] * padding_size, right_padded_token_type_ids
+ )
+ self.assertListEqual(
+ [[token_type_padding_idx] * 7] * padding_size + token_type_ids, left_padded_token_type_ids
+ )
+
+ if "attention_mask" in tokenizer.model_input_names:
+ attention_mask = encoded_sequence["attention_mask"]
+ right_padded_attention_mask = right_padded_sequence["attention_mask"]
+ left_padded_attention_mask = left_padded_sequence["attention_mask"]
+
+ self.assertListEqual(attention_mask + [0] * padding_size, right_padded_attention_mask)
+ self.assertListEqual([0] * padding_size + attention_mask, left_padded_attention_mask)
+
+ def test_batch_encode_plus_padding(self):
+ # Test that padded sequences are equivalent between batch_encode_plus and encode_plus
+
+ # Right padding tests
+ tokenizers = self.get_tokenizers(do_lower_case=False)
+ for tokenizer in tokenizers:
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
+ table = self.get_table(tokenizer, length=0)
+ sequences = [
+ "Testing batch encode plus",
+ "Testing batch encode plus with different sequence lengths",
+ "Testing batch encode plus with different sequence lengths correctly pads",
+ ]
+
+ max_length = 100
+
+ # check correct behaviour if no pad_token_id exists, and add one if needed
+ self._check_no_pad_token_padding(tokenizer, sequences)
+
+ encoded_sequences = [
+ tokenizer.encode_plus(table, sequence, max_length=max_length, padding="max_length")
+ for sequence in sequences
+ ]
+ encoded_sequences_batch = tokenizer.batch_encode_plus(
+ table, sequences, max_length=max_length, padding="max_length"
+ )
+ self.assertListEqual(
+ encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
+ )
+
+ # Left padding tests
+ tokenizers = self.get_tokenizers(do_lower_case=False)
+ for tokenizer in tokenizers:
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
+ tokenizer.padding_side = "left"
+ sequences = [
+ "Testing batch encode plus",
+ "Testing batch encode plus with different sequence lengths",
+ "Testing batch encode plus with different sequence lengths correctly pads",
+ ]
+
+ max_length = 100
+
+ # check correct behaviour if no pad_token_id exists, and add one if needed
+ self._check_no_pad_token_padding(tokenizer, sequences)
+
+ encoded_sequences = [
+ tokenizer.encode_plus(table, sequence, max_length=max_length, padding="max_length")
+ for sequence in sequences
+ ]
+ encoded_sequences_batch = tokenizer.batch_encode_plus(
+ table, sequences, max_length=max_length, padding="max_length"
+ )
+ self.assertListEqual(
+ encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
+ )
+
+ def test_batch_encode_plus_batch_sequence_length(self):
+ # Tests that all encoded values have the correct size
+ tokenizers = self.get_tokenizers(do_lower_case=False)
+ for tokenizer in tokenizers:
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
+ table = self.get_table(tokenizer, length=0)
+ sequences = [
+ "Testing batch encode plus",
+ "Testing batch encode plus with different sequence lengths",
+ "Testing batch encode plus with different sequence lengths correctly pads",
+ ]
+
+ encoded_sequences = [tokenizer.encode_plus(table, sequence) for sequence in sequences]
+ encoded_sequences_batch = tokenizer.batch_encode_plus(table, sequences, padding=False)
+ self.assertListEqual(
+ encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
+ )
+
+ maximum_length = len(
+ max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len)
+ )
+
+ # check correct behaviour if no pad_token_id exists, and add one if needed
+ self._check_no_pad_token_padding(tokenizer, sequences)
+
+ encoded_sequences_padded = [
+ tokenizer.encode_plus(table, sequence, max_length=maximum_length, padding="max_length")
+ for sequence in sequences
+ ]
+
+ encoded_sequences_batch_padded = tokenizer.batch_encode_plus(table, sequences, padding=True)
+ self.assertListEqual(
+ encoded_sequences_padded,
+ self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch_padded),
+ )
+
+ # check that 'longest' is insensitive to the max length
+ encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(table, sequences, padding=True)
+ encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
+ table, sequences, max_length=maximum_length + 10, padding="longest"
+ )
+ for key in encoded_sequences_batch_padded_1.keys():
+ self.assertListEqual(
+ encoded_sequences_batch_padded_1[key],
+ encoded_sequences_batch_padded_2[key],
+ )
+
+ # check that 'no_padding' is insensitive to the max length
+ encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(table, sequences, padding=False)
+ encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
+ table, sequences, max_length=maximum_length + 10, padding=False
+ )
+ for key in encoded_sequences_batch_padded_1.keys():
+ self.assertListEqual(
+ encoded_sequences_batch_padded_1[key],
+ encoded_sequences_batch_padded_2[key],
+ )
+
+ def test_special_tokens_mask_input_pairs(self):
+ tokenizers = self.get_tokenizers(do_lower_case=False)
+ for tokenizer in tokenizers:
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
+ sequence_0 = "Encode this."
+ empty_table = self.get_table(tokenizer, length=0)
+ table = self.get_table(tokenizer, length=10)
+ encoded_sequence = tokenizer.encode(empty_table, sequence_0, add_special_tokens=False)
+ number_of_tokens = len(encoded_sequence)
+ encoded_sequence += tokenizer.encode(table, "", add_special_tokens=False)
+ encoded_sequence_dict = tokenizer.encode_plus(
+ table,
+ sequence_0,
+ add_special_tokens=True,
+ return_special_tokens_mask=True,
+ )
+ encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
+ special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
+ self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
+
+ filtered_sequence = [
+ (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
+ ]
+ # NOTE: as TAPEX adds a space between a table and a sequence, we need to remove it
+ # in order to have equivalent results with encoding an empty table or empty sequence
+ del filtered_sequence[number_of_tokens + 1]
+ filtered_sequence = [x for x in filtered_sequence if x is not None]
+ print("Encoded sequence:", encoded_sequence)
+ print("Filtered sequence:", filtered_sequence)
+ self.assertEqual(encoded_sequence, filtered_sequence)
+
+ @slow
+ def test_full_tokenizer(self):
+ question = "Greece held its last Summer Olympics in 2004"
+ table_dict = {
+ "header": ["Year", "City", "Country", "Nations"],
+ "rows": [
+ [1896, "Athens", "Greece", 14],
+ [1900, "Paris", "France", 24],
+ [1904, "St. Louis", "USA", 12],
+ [2004, "Athens", "Greece", 201],
+ [2008, "Beijing", "China", 204],
+ [2012, "London", "UK", 204],
+ ],
+ }
+ table = pd.DataFrame.from_dict(table_dict["rows"])
+ table.columns = table_dict["header"]
+
+ tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
+ encoding = tokenizer(table, question)
+
+ # fmt: off
+ expected_results = {'input_ids': [0, 821, 5314, 1755, 547, 63, 94, 1035, 1021, 31434, 2857, 11, 4482, 11311, 4832, 76, 1721, 343, 1721, 247, 1721, 3949, 3236, 112, 4832, 42773, 1721, 23, 27859, 1721, 821, 5314, 1755, 1721, 501, 3236, 132, 4832, 23137, 1721, 2242, 354, 1721, 6664, 2389, 1721, 706, 3236, 155, 4832, 42224, 1721, 1690, 4, 26120, 354, 1721, 201, 102, 1721, 316, 3236, 204, 4832, 4482, 1721, 23, 27859, 1721, 821, 5314, 1755, 1721, 21458, 3236, 195, 4832, 2266, 1721, 28, 40049, 1721, 1855, 1243, 1721, 28325, 3236, 231, 4832, 1125, 1721, 784, 24639, 1721, 1717, 330, 1721, 28325, 2]}
+ # fmt: on
+
+ self.assertListEqual(encoding.input_ids, expected_results["input_ids"])
+
+ def test_tokenizer_as_target(self):
+ # by default, the tokenizer applies do_lower_case
+ tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base")
+ answer_text = "tapex is a good model!"
+ expected_src_tokens = [0, 90, 5776, 1178, 16, 10, 205, 1421, 328, 2]
+ with tokenizer.as_target_tokenizer():
+ answer_encoding = tokenizer(answer=answer_text)
+ self.assertListEqual(answer_encoding.input_ids, expected_src_tokens)
+
+ @slow
+ def test_tokenizer_lower_case(self):
+ cased_tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base", do_lower_case=False)
+ uncased_tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base", do_lower_case=True)
+ answer_text = "Beijing, London, Paris"
+ answer_text_lower = "beijing, london, paris"
+
+ with cased_tokenizer.as_target_tokenizer():
+ with uncased_tokenizer.as_target_tokenizer():
+ self.assertNotEqual(
+ cased_tokenizer(answer=answer_text).input_ids, uncased_tokenizer(answer=answer_text).input_ids
+ )
+ self.assertEqual(
+ cased_tokenizer(answer=answer_text_lower).input_ids,
+ uncased_tokenizer(answer=answer_text).input_ids,
+ )
+ # batched encoding assert
+ self.assertNotEqual(
+ cased_tokenizer(answer=[answer_text]).input_ids, uncased_tokenizer(answer=[answer_text]).input_ids
+ )
+ self.assertEqual(
+ cased_tokenizer(answer=[answer_text_lower]).input_ids,
+ uncased_tokenizer(answer=[answer_text]).input_ids,
+ )
+ # test input encoding lowercase
+ question = "Greece held its last Summer Olympics in 2004"
+ table_dict = {
+ "header": ["Year", "City", "Country", "Nations"],
+ "rows": [
+ [1896, "Athens", "Greece", 14],
+ [1900, "Paris", "France", 24],
+ [1904, "St. Louis", "USA", 12],
+ [2004, "Athens", "Greece", 201],
+ [2008, "Beijing", "China", 204],
+ [2012, "London", "UK", 204],
+ ],
+ }
+ table = pd.DataFrame.from_dict(table_dict["rows"])
+ table.columns = table_dict["header"]
+
+ self.assertNotEqual(
+ cased_tokenizer(table=table, query=question).input_ids,
+ uncased_tokenizer(table=table, query=question).input_ids,
+ )
diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt
index 115026b2da1f..98fccf281718 100644
--- a/utils/documentation_tests.txt
+++ b/utils/documentation_tests.txt
@@ -1,6 +1,7 @@
docs/source/en/quicktour.mdx
docs/source/en/task_summary.mdx
docs/source/en/model_doc/speech_to_text.mdx
+docs/source/en/model_doc/tapex.mdx
src/transformers/generation_utils.py
src/transformers/models/bart/modeling_bart.py
src/transformers/models/beit/modeling_beit.py
@@ -10,7 +11,8 @@ src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
src/transformers/models/convnext/modeling_convnext.py
src/transformers/models/data2vec/modeling_data2vec_audio.py
src/transformers/models/deit/modeling_deit.py
src/transformers/models/dpt/modeling_dpt.py
+src/transformers/models/electra/modeling_electra.py
src/transformers/models/glpn/modeling_glpn.py
src/transformers/models/hubert/modeling_hubert.py
src/transformers/models/marian/modeling_marian.py
@@ -35,6 +38,7 @@ src/transformers/models/van/modeling_van.py
src/transformers/models/vilt/modeling_vilt.py
src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
src/transformers/models/vit/modeling_vit.py
+src/transformers/models/vit/modeling_tf_vit.py
src/transformers/models/vit_mae/modeling_vit_mae.py
src/transformers/models/wav2vec2/modeling_wav2vec2.py
src/transformers/models/wav2vec2/tokenization_wav2vec2.py
diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py
index 5ad7b4b1f788..16bf6348d387 100644
--- a/utils/tests_fetcher.py
+++ b/utils/tests_fetcher.py
@@ -466,6 +466,7 @@ def infer_tests_to_run(output_file, diff_with_last_commit=False, filters=None):
# Example files are tested separately
elif f.startswith("examples/pytorch"):
test_files_to_run.append("examples/pytorch/test_pytorch_examples.py")
+ test_files_to_run.append("examples/pytorch/test_accelerate_examples.py")
elif f.startswith("examples/flax"):
test_files_to_run.append("examples/flax/test_flax_examples.py")
else: