From f87ba2705c063a60f0dce98222c5f08808fcb9eb Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 09:25:02 +0100 Subject: [PATCH 01/88] First draft --- README.md | 1 + README_es.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.mdx | 2 + docs/source/en/model_doc/bit.mdx | 47 + src/transformers/__init__.py | 16 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/feature_extraction_auto.py | 1 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 3 + src/transformers/models/bit/__init__.py | 58 ++ .../models/bit/configuration_bit.py | 108 +++ .../models/bit/convert_bit_to_pytorch.py | 156 ++++ src/transformers/models/bit/modeling_bit.py | 846 ++++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 24 + tests/models/bit/__init__.py | 0 tests/models/bit/test_modeling_bit.py | 308 +++++++ 21 files changed, 1580 insertions(+) create mode 100644 docs/source/en/model_doc/bit.mdx create mode 100644 src/transformers/models/bit/__init__.py create mode 100644 src/transformers/models/bit/configuration_bit.py create mode 100644 src/transformers/models/bit/convert_bit_to_pytorch.py create mode 100644 src/transformers/models/bit/modeling_bit.py create mode 100644 tests/models/bit/__init__.py create mode 100644 tests/models/bit/test_modeling_bit.py diff --git a/README.md b/README.md index 22056f739579..eddb41abe852 100644 --- a/README.md +++ b/README.md @@ -272,6 +272,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from ) released with the paper []() by . 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. 
**[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). diff --git a/README_es.md b/README_es.md index f132aad1e4da..c5a40704e6e3 100644 --- a/README_es.md +++ b/README_es.md @@ -272,6 +272,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from ) released with the paper []() by . 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). diff --git a/README_ja.md b/README_ja.md index 6062c458aca7..3af7a69af663 100644 --- a/README_ja.md +++ b/README_ja.md @@ -307,6 +307,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. 
**[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from ) released with the paper []() by . 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). diff --git a/README_ko.md b/README_ko.md index d63ebc359252..3050b14e95ed 100644 --- a/README_ko.md +++ b/README_ko.md @@ -222,6 +222,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from ) released with the paper []() by . 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. 
**[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). diff --git a/README_zh-hans.md b/README_zh-hans.md index e226ddf2fa47..dbbbf27ca2a9 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -246,6 +246,7 @@ conda install -c huggingface transformers 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (来自 VinAI Research) 伴随论文 [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) 由 Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen 发布。 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。 +1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from ) released with the paper []() by . 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). diff --git a/README_zh-hant.md b/README_zh-hant.md index 237afce47a74..cd5c1f920633 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -258,6 +258,7 @@ conda install -c huggingface transformers 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. 
**[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from ) released with the paper []() by . 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index fe3a2d20d903..38444d5deaa0 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -60,6 +60,7 @@ The documentation is organized into five sections: 1. **[BERTweet](model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[BiT](model_doc/bit)** (from ) released with the paper []() by . 1. **[Blenderbot](model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BLOOM](model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). @@ -229,6 +230,7 @@ Flax), PyTorch, and/or TensorFlow. 
| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ | | BigBird | ✅ | ✅ | ✅ | ❌ | ✅ | | BigBird-Pegasus | ❌ | ❌ | ✅ | ❌ | ❌ | +| BiT | ❌ | ❌ | ✅ | ❌ | ❌ | | Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ | | BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ | | BLOOM | ❌ | ✅ | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/bit.mdx b/docs/source/en/model_doc/bit.mdx new file mode 100644 index 000000000000..325c30669f16 --- /dev/null +++ b/docs/source/en/model_doc/bit.mdx @@ -0,0 +1,47 @@ + + +# BiT + +## Overview + +The BiT model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). + + +## BitConfig + +[[autodoc]] BitConfig + + +## BitModel + +[[autodoc]] BitModel + - forward + + +## BitForImageClassification + +[[autodoc]] BitForImageClassification + - forward + diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index bf630e8a9cf0..acdf7ea6c531 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -160,6 +160,7 @@ "BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP", "BigBirdPegasusConfig", ], + "models.bit": ["BIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BitConfig"], "models.blenderbot": ["BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BlenderbotConfig", "BlenderbotTokenizer"], "models.blenderbot_small": [ "BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -1046,6 +1047,14 @@ "BigBirdPegasusPreTrainedModel", ] ) + _import_structure["models.bit"].extend( + [ + "BIT_PRETRAINED_MODEL_ARCHIVE_LIST", + "BitForImageClassification", + "BitModel", + "BitPreTrainedModel", + ] + ) _import_structure["models.blenderbot"].extend( [ "BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3392,6 +3401,7 @@ from .models.bertweet import BertweetTokenizer from .models.big_bird import BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP, BigBirdConfig from .models.bigbird_pegasus import BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, BigBirdPegasusConfig + from .models.bit import BIT_PRETRAINED_CONFIG_ARCHIVE_MAP, BitConfig from .models.blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig, BlenderbotTokenizer from .models.blenderbot_small import ( BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -4166,6 +4176,12 @@ BigBirdPegasusModel, BigBirdPegasusPreTrainedModel, ) + from .models.bit import ( + BIT_PRETRAINED_MODEL_ARCHIVE_LIST, + BitForImageClassification, + BitModel, + BitPreTrainedModel, + ) from .models.blenderbot import ( BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST, BlenderbotForCausalLM, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 774ac3eb1ebc..e30b66a6f23f 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -30,6 +30,7 @@ bertweet, big_bird, bigbird_pegasus, + bit, blenderbot, blenderbot_small, bloom, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index d2c322c5b1f4..307af71c3763 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -37,6 +37,7 @@ ("bert-generation", "BertGenerationConfig"), ("big_bird", "BigBirdConfig"), ("bigbird_pegasus", "BigBirdPegasusConfig"), + ("bit", "BitConfig"), ("blenderbot", "BlenderbotConfig"), ("blenderbot-small", "BlenderbotSmallConfig"), ("bloom", "BloomConfig"), @@ -189,6 +190,7 @@ ("bert", "BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("big_bird", 
"BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("bigbird_pegasus", "BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("bit", "BIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("blenderbot", "BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("blenderbot-small", "BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("bloom", "BLOOM_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -331,6 +333,7 @@ ("bertweet", "BERTweet"), ("big_bird", "BigBird"), ("bigbird_pegasus", "BigBird-Pegasus"), + ("bit", "BiT"), ("blenderbot", "Blenderbot"), ("blenderbot-small", "BlenderbotSmall"), ("bloom", "BLOOM"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index a5c25a7023f6..c2b98de13999 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,6 +39,7 @@ [ ("audio-spectrogram-transformer", "ASTFeatureExtractor"), ("beit", "BeitFeatureExtractor"), + ("bit", "ConvNextFeatureExtractor"), ("chinese_clip", "ChineseCLIPFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), ("clipseg", "ViTFeatureExtractor"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index c088083b875d..58393ce0b824 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -38,6 +38,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict( [ ("beit", "BeitImageProcessor"), + ("bit", "ConvNextImageProcessor"), ("chinese_clip", "ChineseCLIPImageProcessor"), ("clip", "CLIPImageProcessor"), ("clipseg", "ViTImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index bb67dbfc5d31..a0af6df9ab6c 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -36,6 +36,7 @@ ("bert-generation", "BertGenerationEncoder"), ("big_bird", "BigBirdModel"), ("bigbird_pegasus", "BigBirdPegasusModel"), + ("bit", "BitModel"), ("blenderbot", "BlenderbotModel"), ("blenderbot-small", "BlenderbotSmallModel"), ("bloom", "BloomModel"), @@ -370,6 +371,7 @@ [ # Model for Image Classification mapping ("beit", "BeitForImageClassification"), + ("bit", "BitForImageClassification"), ("convnext", "ConvNextForImageClassification"), ("cvt", "CvtForImageClassification"), ("data2vec-vision", "Data2VecVisionForImageClassification"), @@ -852,6 +854,7 @@ MODEL_FOR_BACKBONE_MAPPING_NAMES = OrderedDict( [ # Backbone mapping + ("bit", "BitBackbone"), ("maskformer-swin", "MaskFormerSwinBackbone"), ("resnet", "ResNetBackbone"), ] diff --git a/src/transformers/models/bit/__init__.py b/src/transformers/models/bit/__init__.py new file mode 100644 index 000000000000..44c25e154ad4 --- /dev/null +++ b/src/transformers/models/bit/__init__.py @@ -0,0 +1,58 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +# rely on isort to merge the imports +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available + + +_import_structure = {"configuration_bit": ["BIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BitConfig", "BitOnnxConfig"]} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_bit"] = [ + "BIT_PRETRAINED_MODEL_ARCHIVE_LIST", + "BitForImageClassification", + "BitModel", + "BitPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_bit import BIT_PRETRAINED_CONFIG_ARCHIVE_MAP, BitConfig, BitOnnxConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_bit import ( + BIT_PRETRAINED_MODEL_ARCHIVE_LIST, + BitForImageClassification, + BitModel, + BitPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure) diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py new file mode 100644 index 000000000000..d165140b8150 --- /dev/null +++ b/src/transformers/models/bit/configuration_bit.py @@ -0,0 +1,108 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BiT model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +RESNETV2_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "google/resnetv2-50": "https://huggingface.co/google/resnetv2-50/resolve/main/config.json", +} + + +class BitConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BitModel`]. It is used to instantiate an BiT + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the BiT + [google/resnetnv2-50](https://huggingface.co/google/resnetnv2-50) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + embedding_size (`int`, *optional*, defaults to 64): + Dimensionality (hidden size) for the embedding layer. 
+ hidden_sizes (`List[int]`, *optional*, defaults to `[256, 512, 1024, 2048]`): + Dimensionality (hidden size) at each stage. + depths (`List[int]`, *optional*, defaults to `[3, 4, 6, 3]`): + Depth (number of layers) for each stage. + layer_type (`str`, *optional*, defaults to `"preactivation"`): + The layer to use, it can be either `"preactivation"` or `"bottleneck"`. + hidden_act (`str`, *optional*, defaults to `"relu"`): + The non-linear activation function in each block. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` + are supported. + downsample_in_first_stage (`bool`, *optional*, defaults to `False`): + If `True`, the first stage will downsample the inputs using a `stride` of 2. + drop_path_rate (`float`, *optional*, defaults to 0.0): + The drop path rate for the stochastic depth. + output_stride (`int`, *optional*, defaults to 32): + The output stride of the model. + width_factor (`int`, *optional*, defaults to 1): + The width factor for the model. + + Example: + ```python + >>> from transformers import BitConfig, BitModel + + >>> # Initializing a BiT resnetv2-50 style configuration + >>> configuration = BitConfig() + + >>> # Initializing a model (with random weights) from the resnetv2-50 style configuration + >>> model = BitModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + model_type = "resnetv2" + layer_types = ["preactivation", "bottleneck"] + + def __init__( + self, + num_channels=3, + embedding_size=64, + hidden_sizes=[256, 512, 1024, 2048], + depths=[3, 4, 6, 3], + stem_type="", + layer_type="preactivation", + hidden_act="relu", + downsample_in_first_stage=False, + drop_path_rate=0.0, + output_stride=32, + width_factor=1, + conv_layer="", + **kwargs + ): + super().__init__(**kwargs) + if layer_type not in self.layer_types: + raise ValueError(f"layer_type={layer_type} is not one of {','.join(self.layer_types)}") + self.num_channels = num_channels + self.embedding_size = embedding_size + self.hidden_sizes = hidden_sizes + self.depths = depths + self.stem_type = stem_type + self.layer_type = layer_type + self.hidden_act = hidden_act + self.downsample_in_first_stage = downsample_in_first_stage + self.drop_path_rate = drop_path_rate + self.output_stride = output_stride + self.width_factor = width_factor + self.conv_layer = conv_layer diff --git a/src/transformers/models/bit/convert_bit_to_pytorch.py b/src/transformers/models/bit/convert_bit_to_pytorch.py new file mode 100644 index 000000000000..040ed96b4c4e --- /dev/null +++ b/src/transformers/models/bit/convert_bit_to_pytorch.py @@ -0,0 +1,156 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
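As a rough illustration (an editor's sketch, not part of this patch): the `BitConfig` arguments above, together with the comments in `get_config()` below, suggest that plain BiT checkpoints use pre-activation blocks with weight-standardized convolutions, while the ViT-hybrid backbones additionally need `layer_type="bottleneck"`, `conv_layer="std_conv_same"` and `stem_type="same"`. Assuming those string values are the ones this draft's `BitEncoder`/`BitEmbeddings` expect, the two variants would be configured roughly as follows:

```python
from transformers import BitConfig

# Plain BiT (ResNet-v2) backbone: pre-activation bottleneck blocks with
# weight-standardized convolutions.
bit_config = BitConfig(layer_type="preactivation", conv_layer="std_conv")

# ViT-hybrid style backbone (per the comments in get_config() below):
# non pre-activation bottleneck blocks, TF-"SAME"-padded std convs and stem.
hybrid_config = BitConfig(
    layer_type="bottleneck",
    conv_layer="std_conv_same",
    stem_type="same",
)
```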
+"""Convert BiT checkpoints from the timm library.""" + + +import argparse +import json +from pathlib import Path + +import torch +from PIL import Image + +import requests +from huggingface_hub import hf_hub_download +from timm import create_model + +# from timm.data import resolve_data_config +# from timm.data.transforms_factory import create_transform +from transformers import BitConfig, BitForImageClassification +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def get_config(model_name): + repo_id = "huggingface/label-files" + filename = "imagenet-1k-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) + id2label = {int(k): v for k, v in id2label.items()} + label2id = {v: k for k, v in id2label.items()} + + conv_layer = "std_conv" if "bit" in model_name else False + # for the ViT-hybrid checkpoints, one needs to additionally set config.layer_type = "bottleneck" + # and use a different conv_layer, namely StdConv2dSame + # and "stem_type": "same" in the data config + config = BitConfig( + conv_layer=conv_layer, + num_labels=1000, + id2label=id2label, + label2id=label2id, + ) + + return config + + +def rename_key(name): + if "stem.conv" in name: + name = name.replace("stem.conv", "resnetv2.embedder.convolution") + if "blocks" in name: + name = name.replace("blocks", "layers") + if "head.fc" in name: + name = name.replace("head.fc", "classifier.1") + if name.startswith("norm"): + name = "resnetv2." + name + if "resnetv2" not in name and "classifier" not in name: + name = "resnetv2.encoder." + name + + return name + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@torch.no_grad() +def convert_resnetv2_checkpoint(model_name, pytorch_dump_folder_path): + """ + Copy/paste/tweak model's weights to our BiT structure. 
+ """ + + # define default BiT configuration + config = get_config(model_name) + + # load original model from timm + timm_model = create_model(model_name, pretrained=True) + timm_model.eval() + + # load state_dict of original model + state_dict = timm_model.state_dict() + for key in state_dict.copy().keys(): + val = state_dict.pop(key) + state_dict[rename_key(key)] = val.squeeze() if "head" in key else val + + # load HuggingFace model + model = BitForImageClassification(config) + model.eval() + model.load_state_dict(state_dict) + + # TODO verify logits + # transform = create_transform(**resolve_data_config({}, model=model)) + # url = "http://images.cocodataset.org/val2017/000000039769.jpg" + # image = Image.open(requests.get(url, stream=True).raw) + # weird bug: we don't get the same pixel values as in Colab + # load pixel values from the hub for the moment + # pixel_values = transform(image).unsqueeze(0) + + from huggingface_hub import hf_hub_download + + pixel_values = torch.load( + hf_hub_download("nielsr/dummy-pixel-values", repo_type="dataset", filename="pixel_values.pt") + ) + + print("Shape of pixel values:", pixel_values.shape) + print("First values of pixel values:", pixel_values[0, 0, :3, :3]) + + with torch.no_grad(): + outputs = model(pixel_values) + logits = outputs.logits + + print("Logits:", logits[0, :3]) + print("Predicted class:", model.config.id2label[logits.argmax(-1).item()]) + if model_name == "resnetv2_50x1_bitm": + expected_slice = torch.tensor([0.4306, -0.0052, -0.6205]) + assert torch.allclose(logits[0, :3], expected_slice, atol=1e-3) + print("Looks ok!") + + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + # print(f"Saving feature extractor to {pytorch_dump_folder_path}") + # feature_extractor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default="resnetv2_50x1_bitm", + type=str, + help="Name of the BiT timm model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + + args = parser.parse_args() + convert_resnetv2_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py new file mode 100644 index 000000000000..5084be8bc209 --- /dev/null +++ b/src/transformers/models/bit/modeling_bit.py @@ -0,0 +1,846 @@ +# coding=utf-8 +# Copyright 2022 Google AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch BiT model. 
Also supports backbones for ViT hybrid.""" + +import collections +import math +from functools import partial +from typing import List, Optional, Tuple + +import torch +import torch.utils.checkpoint +from torch import Tensor, nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutputWithNoAttention, + BaseModelOutputWithPoolingAndNoAttention, + ImageClassifierOutputWithNoAttention, +) +from ...modeling_utils import PreTrainedModel +from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_bit import BitConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "BitConfig" +_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "google/resnetnv2-50" +_EXPECTED_OUTPUT_SHAPE = [1, 2048, 7, 7] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "google/resnetnv2-50" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tiger cat" + +BIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "google/resnetnv2-50", + # See all BiT models at https://huggingface.co/models?filter=resnetv2 +] + + +# Can SAME padding for given args be done statically? +def is_static_pad(kernel_size: int, stride: int = 1, dilation: int = 1, **_): + return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0 + + +def get_padding_value(padding, kernel_size, **kwargs) -> Tuple[Tuple, bool]: + dynamic = False + if isinstance(padding, str): + # for any string padding, the padding will be calculated for you, one of three ways + padding = padding.lower() + if padding == "same": + # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact + if is_static_pad(kernel_size, **kwargs): + # static case, no extra overhead + padding = get_padding(kernel_size, **kwargs) + else: + # dynamic 'SAME' padding, has runtime/GPU memory overhead + padding = 0 + dynamic = True + elif padding == "valid": + # 'VALID' padding, same as padding=0 + padding = 0 + else: + # Default to PyTorch style 'same'-ish symmetric padding + padding = get_padding(kernel_size, **kwargs) + return padding, dynamic + + +class StdConv2dSame(nn.Conv2d): + """Conv2d with Weight Standardization. TF compatible SAME padding. Used for ViT Hybrid model. 
+ Paper: `Micro-Batch Training with Batch-Channel Normalization and Weight Standardization` - + https://arxiv.org/abs/1903.10520v2 + """ + + def __init__( + self, + in_channel, + out_channels, + kernel_size, + stride=1, + padding="SAME", + dilation=1, + groups=1, + bias=False, + eps=1e-6, + ): + padding, is_dynamic = get_padding_value(padding, kernel_size, stride=stride, dilation=dilation) + super().__init__( + in_channel, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + ) + self.same_pad = is_dynamic + self.eps = eps + + def forward(self, x): + if self.same_pad: + x = pad_same(x, self.kernel_size, self.stride, self.dilation) + weight = nn.functional.batch_norm( + self.weight.reshape(1, self.out_channels, -1), None, None, training=True, momentum=0.0, eps=self.eps + ).reshape_as(self.weight) + x = nn.functional.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups) + return x + + +def _num_groups(num_channels, num_groups, group_size): + if group_size: + assert num_channels % group_size == 0 + return num_channels // group_size + return num_groups + + +class BitGroupNormActivation(nn.GroupNorm): + # NOTE num_channel and num_groups order flipped for easier layer swaps / binding of fixed args + def __init__( + self, + num_channels, + num_groups=32, + eps=1e-5, + affine=True, + group_size=None, + apply_act=True, + act_layer=nn.ReLU, + inplace=True, + drop_layer=None, + ): + super(BitGroupNormActivation, self).__init__( + _num_groups(num_channels, num_groups, group_size), num_channels, eps=eps, affine=affine + ) + self.drop = drop_layer() if drop_layer is not None else nn.Identity() + # act_layer = get_act_layer(act_layer) # string -> nn.Module + if act_layer is not None and apply_act: + act_args = dict(inplace=True) if inplace else {} + self.act = act_layer(**act_args) + else: + self.act = nn.Identity() + self._fast_norm = False # TODO add support for fast norm + + def forward(self, x): + # if self._fast_norm: + # x = fast_group_norm(x, self.num_groups, self.weight, self.bias, self.eps) + # else: + x = nn.functional.group_norm(x, self.num_groups, self.weight, self.bias, self.eps) + x = self.drop(x) + x = self.act(x) + return x + + +# Calculate symmetric padding for a convolution +def get_padding(kernel_size: int, stride: int = 1, dilation: int = 1, **_) -> int: + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 + return padding + + +class StdConv2d(nn.Conv2d): + """Conv2d with Weight Standardization. Used for BiT ResNet-V2 models. 
+ + Paper: `Micro-Batch Training with Batch-Channel Normalization and Weight Standardization` - + https://arxiv.org/abs/1903.10520v2 + """ + + def __init__( + self, in_channel, out_channels, kernel_size, stride=1, padding=None, dilation=1, groups=1, bias=False, eps=1e-6 + ): + if padding is None: + padding = get_padding(kernel_size, stride, dilation) + super().__init__( + in_channel, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + ) + self.eps = eps + + def forward(self, x): + weight = nn.functional.batch_norm( + self.weight.reshape(1, self.out_channels, -1), None, None, training=True, momentum=0.0, eps=self.eps + ).reshape_as(self.weight) + x = nn.functional.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups) + return x + + +# Calculate asymmetric TensorFlow-like 'SAME' padding for a convolution +def get_same_padding(x: int, k: int, s: int, d: int): + return max((math.ceil(x / s) - 1) * s + (k - 1) * d + 1 - x, 0) + + +# Dynamically pad input x with 'SAME' padding for conv with specified args +def pad_same(x, k: List[int], s: List[int], d: List[int] = (1, 1), value: float = 0): + ih, iw = x.size()[-2:] + pad_h, pad_w = get_same_padding(ih, k[0], s[0], d[0]), get_same_padding(iw, k[1], s[1], d[1]) + if pad_h > 0 or pad_w > 0: + x = nn.functional.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2], value=value) + return x + + +class MaxPool2dSame(nn.MaxPool2d): + """Tensorflow like 'SAME' wrapper for 2D max pooling""" + + def __init__(self, kernel_size: int, stride=None, dilation=1, ceil_mode=False): + kernel_size = kernel_size if isinstance(kernel_size, collections.abc.Iterable) else (kernel_size, kernel_size) + stride = stride if isinstance(stride, collections.abc.Iterable) else (stride, stride) + dilation = dilation if isinstance(dilation, collections.abc.Iterable) else (dilation, dilation) + super(MaxPool2dSame, self).__init__(kernel_size, stride, (0, 0), dilation, ceil_mode) + + def forward(self, x): + x = pad_same(x, self.kernel_size, self.stride, value=-float("inf")) + return nn.functional.max_pool2d(x, self.kernel_size, self.stride, (0, 0), self.dilation, self.ceil_mode) + + +class BitEmbeddings(nn.Module): + """ + BiT Embeddings (stem) composed of a single aggressive convolution. + """ + + def __init__(self, config: BitConfig): + super().__init__() + self.convolution = nn.Conv2d( + config.num_channels, config.embedding_size, kernel_size=7, stride=2, padding=3, bias=False + ) + if config.stem_type == "same": + self.pooler = MaxPool2dSame(kernel_size=3, stride=2) + else: + self.pooler = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.num_channels = config.num_channels + + def forward(self, pixel_values: Tensor) -> Tensor: + num_channels = pixel_values.shape[1] + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + + embedding = self.convolution(pixel_values) + embedding = self.pooler(embedding) + + return embedding + + +# Copied from transformers.models.convnext.modeling_convnext.drop_path +def drop_path(input, drop_prob: float = 0.0, training: bool = False): + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.convnext.modeling_convnext.ConvNextDropPath +class BitDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +def make_div(v, divisor=8): + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class BitPreActivationBottleneckLayer(nn.Module): + """Pre-activation (v2) bottleneck block. + Follows the implementation of "Identity Mappings in Deep Residual Networks": + https://github.com/KaimingHe/resnet-1k-layers/blob/master/resnet-pre-act.lua + + Except it puts the stride on 3x3 conv when available. 
+ """ + + def __init__( + self, + in_channels, + out_channels=None, + bottle_ratio=0.25, + stride=1, + dilation=1, + first_dilation=None, + groups=1, + act_layer=None, + conv_layer=None, + norm_layer=None, + proj_layer=None, + drop_path_rate=0.0, + ): + super().__init__() + + first_dilation = first_dilation or dilation + conv_layer = conv_layer or StdConv2d + norm_layer = norm_layer or partial(BitGroupNormActivation, num_groups=32) + out_channels = out_channels or in_channels + mid_channels = make_div(out_channels * bottle_ratio) + + if proj_layer is not None: + self.downsample = proj_layer( + in_channels, + out_channels, + stride=stride, + dilation=dilation, + first_dilation=first_dilation, + preact=True, + conv_layer=conv_layer, + norm_layer=norm_layer, + ) + else: + self.downsample = None + + self.norm1 = norm_layer(in_channels) + self.conv1 = conv_layer(in_channels, mid_channels, 1) + self.norm2 = norm_layer(mid_channels) + self.conv2 = conv_layer(mid_channels, mid_channels, 3, stride=stride, dilation=first_dilation, groups=groups) + self.norm3 = norm_layer(mid_channels) + self.conv3 = conv_layer(mid_channels, out_channels, 1) + self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() + + def forward(self, x, print_values=False): + x_preact = self.norm1(x) + + if print_values: + print("Hidden states after first norm:", x_preact[0, 0, :3, :3]) + + # shortcut branch + shortcut = x + if self.downsample is not None: + shortcut = self.downsample(x_preact, print_values) + + if print_values: + print("Hidden states after downsample:", shortcut[0, 0, :3, :3]) + + # residual branch + x = self.conv1(x_preact) + x = self.conv2(self.norm2(x)) + x = self.conv3(self.norm3(x)) + x = self.drop_path(x) + return x + shortcut + + +class BitDownsampleConv(nn.Module): + def __init__( + self, + in_channels, + out_channels, + stride=1, + dilation=1, + first_dilation=None, + preact=True, + conv_layer=None, + norm_layer=None, + ): + super(BitDownsampleConv, self).__init__() + self.conv_layer = conv_layer + self.conv = conv_layer(in_channels, out_channels, 1, stride=stride) + self.norm = nn.Identity() if preact else norm_layer(out_channels, apply_act=False) + + def forward(self, x, print_values=False): + if print_values: + print("Conv layer:", self.conv_layer) + print("Hidden states before downsample conv:", x[0, 0, :3, :3]) + + z = self.conv(x) + + if print_values: + print("Hidden states after downsample conv:", z[0, 0, :3, :3]) + + return self.norm(self.conv(x)) + + +class BitConvLayer(nn.Module): + def __init__( + self, in_channels: int, out_channels: int, kernel_size: int = 3, stride: int = 1, activation: str = "relu" + ): + super().__init__() + self.convolution = nn.Conv2d( + in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=kernel_size // 2, bias=False + ) + self.normalization = nn.BatchNorm2d(out_channels) + self.activation = ACT2FN[activation] if activation is not None else nn.Identity() + + def forward(self, input: Tensor) -> Tensor: + hidden_state = self.convolution(input) + hidden_state = self.normalization(hidden_state) + hidden_state = self.activation(hidden_state) + return hidden_state + + +class BitShortCut(nn.Module): + """ + ResNet shortcut, used to project the residual features to the correct size. If needed, it is also used to + downsample the input using `stride=2`. 
+ """ + + def __init__(self, in_channels: int, out_channels: int, stride: int = 2): + super().__init__() + self.convolution = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False) + self.normalization = nn.BatchNorm2d(out_channels) + + def forward(self, input: Tensor) -> Tensor: + hidden_state = self.convolution(input) + hidden_state = self.normalization(hidden_state) + return hidden_state + + +class BitBottleneckLayer(nn.Module): + """Non Pre-activation bottleneck block, equivalent to V1.5/V1b bottleneck. Used for ViT.""" + + def __init__( + self, + in_channels, + out_channels=None, + bottle_ratio=0.25, + stride=1, + dilation=1, + first_dilation=None, + groups=1, + act_layer=None, + conv_layer=None, + norm_layer=None, + proj_layer=None, + drop_path_rate=0.0, + ): + super().__init__() + first_dilation = first_dilation or dilation + act_layer = act_layer or nn.ReLU + conv_layer = conv_layer or StdConv2d + norm_layer = norm_layer or partial(BitGroupNormActivation, num_groups=32) + out_channels = out_channels or in_channels + mid_chs = make_div(out_channels * bottle_ratio) + + if proj_layer is not None: + self.downsample = proj_layer( + in_channels, + out_channels, + stride=stride, + dilation=dilation, + preact=False, + conv_layer=conv_layer, + norm_layer=norm_layer, + ) + else: + self.downsample = None + + self.conv1 = conv_layer(in_channels, mid_chs, 1) + self.norm1 = norm_layer(mid_chs) + self.conv2 = conv_layer(mid_chs, mid_chs, 3, stride=stride, dilation=first_dilation, groups=groups) + self.norm2 = norm_layer(mid_chs) + self.conv3 = conv_layer(mid_chs, out_channels, 1) + self.norm3 = norm_layer(out_channels, apply_act=False) + self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() + self.act3 = act_layer(inplace=True) + + def forward(self, x): + # shortcut branch + shortcut = x + if self.downsample is not None: + shortcut = self.downsample(x) + + # residual + x = self.conv1(x) + x = self.norm1(x) + x = self.conv2(x) + x = self.norm2(x) + x = self.conv3(x) + x = self.norm3(x) + x = self.drop_path(x) + x = self.act3(x + shortcut) + return x + + +class BitStage(nn.Module): + """ + A ResNet v2 stage composed by stacked layers. 
+ """ + + def __init__( + self, + in_channels, + out_channels, + stride, + dilation, + depth, + bottle_ratio=0.25, + groups=1, + avg_down=False, + layer_dpr=None, + layer_fn=BitPreActivationBottleneckLayer, + act_layer=None, + conv_layer=None, + norm_layer=None, + **layer_kwargs + ): + super().__init__() + + first_dilation = 1 if dilation in (1, 2) else 2 + layer_kwargs = dict(act_layer=act_layer, conv_layer=conv_layer, norm_layer=norm_layer) + if avg_down: + # TODO add support for avg_down + raise NotImplementedError("avg_down is not implemented") + proj_layer = BitDownsampleConv + prev_chs = in_channels + self.layers = nn.Sequential() + for layer_idx in range(depth): + drop_path_rate = layer_dpr[layer_idx] if layer_dpr else 0.0 + stride = stride if layer_idx == 0 else 1 + self.layers.add_module( + str(layer_idx), + layer_fn( + prev_chs, + out_channels, + stride=stride, + dilation=dilation, + bottle_ratio=bottle_ratio, + groups=groups, + first_dilation=first_dilation, + proj_layer=proj_layer, + drop_path_rate=drop_path_rate, + **layer_kwargs, + ), + ) + prev_chs = out_channels + first_dilation = dilation + proj_layer = None + + def forward(self, input: Tensor, print_values=False) -> Tensor: + hidden_state = input + for idx, layer in enumerate(self.layers): + if idx == 0 and print_values: + print(f"Hidden states before block {idx}", hidden_state[0, 0, :3, :3]) + hidden_state = layer(hidden_state, print_values=idx == 0) + if idx == 0 and print_values: + print(f"Hidden states after block {idx}", hidden_state[0, 0, :3, :3]) + return hidden_state + + +class BitEncoder(nn.Module): + def __init__(self, config: BitConfig): + super().__init__() + self.stages = nn.ModuleList([]) + + act_layer = nn.ReLU + if config.conv_layer == "std_conv": + conv_layer = partial(StdConv2d, eps=1e-8) + elif config.conv_layer == "std_conv_same": + conv_layer = partial(StdConv2dSame, eps=1e-8) + + norm_layer = partial(BitGroupNormActivation, num_groups=32) + + prev_chs = config.embedding_size + curr_stride = 4 + dilation = 1 + block_dprs = [ + x.tolist() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths)).split(config.depths) + ] + if config.layer_type == "bottleneck": + block_fn = BitBottleneckLayer + elif config.layer_type == "preactivation": + block_fn = BitPreActivationBottleneckLayer + else: + raise ValueError("Unknown layer type: {}".format(config.layer_type)) + + for stage_idx, (d, c, bdpr) in enumerate(zip(config.depths, config.hidden_sizes, block_dprs)): + out_channels = make_div(c * config.width_factor) + stride = 1 if stage_idx == 0 else 2 + if curr_stride >= config.output_stride: + dilation *= stride + stride = 1 + stage = BitStage( + prev_chs, + out_channels, + stride=stride, + dilation=dilation, + depth=d, + avg_down=False, + act_layer=act_layer, + conv_layer=conv_layer, + norm_layer=norm_layer, + block_dpr=bdpr, + block_fn=block_fn, + ) + prev_chs = out_channels + curr_stride *= stride + self.stages.add_module(str(stage_idx), stage) + + def forward( + self, hidden_state: Tensor, output_hidden_states: bool = False, return_dict: bool = True + ) -> BaseModelOutputWithNoAttention: + hidden_states = () if output_hidden_states else None + + for idx, stage_module in enumerate(self.stages): + if output_hidden_states: + hidden_states = hidden_states + (hidden_state,) + + hidden_state = stage_module(hidden_state, print_values=idx == 0) + + print(f"Hidden states after stage {idx}: ", hidden_state.shape) + print(f"Hidden states after stage {idx}: ", hidden_state[0, 0, :3, :3]) + + if 
output_hidden_states: + hidden_states = hidden_states + (hidden_state,) + + if not return_dict: + return tuple(v for v in [hidden_state, hidden_states] if v is not None) + + return BaseModelOutputWithNoAttention( + last_hidden_state=hidden_state, + hidden_states=hidden_states, + ) + + +# Copied from transformers.models.resnet.modeling_resnet.ResNetPreTrainedModel with ResNet->Bit,resnet->resnetv2 +class BitPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BitConfig + base_model_prefix = "resnetv2" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + if isinstance(module, nn.Conv2d): + nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") + elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(module.weight, 1) + nn.init.constant_(module.bias, 0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, BitModel): + module.gradient_checkpointing = value + + +BIT_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`BitConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BIT_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See + [`AutoFeatureExtractor.__call__`] for details. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The bare BiT model outputting raw features without any specific head on top.", + BIT_START_DOCSTRING, +) +class BitModel(BitPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.embedder = BitEmbeddings(config) + + self.encoder = BitEncoder(config) + norm_layer = partial(BitGroupNormActivation, num_groups=32) + self.norm = norm_layer(config.hidden_sizes[-1]) if config.layer_type == "preactivation" else nn.Identity() + + self.pooler = nn.AdaptiveAvgPool2d((1, 1)) + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndNoAttention, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None + ) -> BaseModelOutputWithPoolingAndNoAttention: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + embedding_output = self.embedder(pixel_values) + + print("Shape of embeddings:", embedding_output.shape) + print("First values of embeddings:", embedding_output[0, 0, :3, :3]) + + encoder_outputs = self.encoder( + embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict + ) + + last_hidden_state = encoder_outputs[0] + + last_hidden_state = self.norm(last_hidden_state) + + print("Shape of final embeddings:", last_hidden_state.shape) + print("Final embeddings:", last_hidden_state[0, 0, :3, :3]) + + pooled_output = self.pooler(last_hidden_state) + + print("Pooled output:", pooled_output.shape) + print("Pool output:", pooled_output[0, 0, :3, :3]) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndNoAttention( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + ) + + +@add_start_docstrings( + """ + BiT Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for + ImageNet. 
+ """, + BIT_START_DOCSTRING, +) +class BitForImageClassification(BitPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.resnetv2 = BitModel(config) + # classification head + self.classifier = nn.Sequential( + nn.Flatten(), + nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity(), + ) + # initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=ImageClassifierOutputWithNoAttention, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> ImageClassifierOutputWithNoAttention: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.resnetv2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + + pooled_output = outputs.pooler_output if return_dict else outputs[1] + + logits = self.classifier(pooled_output) + + loss = None + + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return (loss,) + output if loss is not None else output + + return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index daaefd5297fa..f405dd6e401b 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -995,6 +995,30 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +BIT_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class BitForImageClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class BitModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class BitPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + 
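The dummy objects added above give users a clear error message when the torch backend is missing, rather than an opaque ImportError at import time. A rough, self-contained sketch of that pattern follows; the helper is a simplified stand-in for transformers.utils.requires_backends and BitModelPlaceholder is a hypothetical name used only for illustration.

import importlib.util

def requires_backends(obj, backends):
    # Simplified stand-in: raise a descriptive error when a required backend is unavailable.
    missing = [b for b in backends if importlib.util.find_spec(b) is None]
    if missing:
        raise ImportError(f"{type(obj).__name__} requires the following backends: {', '.join(missing)}")

class BitModelPlaceholder:
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        # Mirrors the dummy classes above: fail loudly only when torch cannot be imported.
        requires_backends(self, ["torch"])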
BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/models/bit/__init__.py b/tests/models/bit/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/bit/test_modeling_bit.py b/tests/models/bit/test_modeling_bit.py new file mode 100644 index 000000000000..7cb26e81d654 --- /dev/null +++ b/tests/models/bit/test_modeling_bit.py @@ -0,0 +1,308 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Bit model. """ + + +import inspect +import unittest + +from transformers import BitConfig +from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import BitBackbone, BitForImageClassification, BitModel + from transformers.models.bit.modeling_bit import BIT_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoFeatureExtractor + + +class BitModelTester: + def __init__( + self, + parent, + batch_size=3, + image_size=32, + num_channels=3, + embeddings_size=10, + hidden_sizes=[10, 20, 30, 40], + depths=[1, 1, 2, 1], + is_training=True, + use_labels=True, + hidden_act="relu", + num_labels=3, + scope=None, + out_features=["stage2", "stage3", "stage4"], + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.num_channels = num_channels + self.embeddings_size = embeddings_size + self.hidden_sizes = hidden_sizes + self.depths = depths + self.is_training = is_training + self.use_labels = use_labels + self.hidden_act = hidden_act + self.num_labels = num_labels + self.scope = scope + self.num_stages = len(hidden_sizes) + self.out_features = out_features + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return BitConfig( + num_channels=self.num_channels, + embeddings_size=self.embeddings_size, + hidden_sizes=self.hidden_sizes, + depths=self.depths, + hidden_act=self.hidden_act, + num_labels=self.num_labels, + out_features=self.out_features, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = BitModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + # expected last hidden states: B, C, H // 32, W // 32 + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32), + ) + 
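As a quick sanity check of the shape assertion above, a small sketch assuming the tester defaults (batch_size=3, image_size=32, hidden_sizes=[10, 20, 30, 40]): the stem downsamples by 4 and each of the three later stages by 2, giving the overall stride of 32 used in the expected shape.

batch_size, image_size, hidden_sizes = 3, 32, [10, 20, 30, 40]
overall_stride = 4 * 2 * 2 * 2  # stem stride times the strides of the three non-initial stages
expected_shape = (batch_size, hidden_sizes[-1], image_size // overall_stride, image_size // overall_stride)
print(expected_shape)  # (3, 40, 1, 1)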
+ def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = BitForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_backbone(self, config, pixel_values, labels): + model = BitBackbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify hidden states + self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) + self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.hidden_sizes[1], 4, 4]) + + # verify channels + self.parent.assertEqual(len(model.channels), len(config.out_features)) + self.parent.assertListEqual(model.channels, config.hidden_sizes[1:]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class BitModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as Bit does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = ( + ( + BitModel, + BitForImageClassification, + BitBackbone, + ) + if is_torch_available() + else () + ) + + fx_compatible = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + has_attentions = False + + def setUp(self): + self.model_tester = BitModelTester(self) + self.config_tester = ConfigTester(self, config_class=BitConfig, has_text_modality=False) + + def test_config(self): + self.create_and_test_config_common_properties() + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + self.config_tester.check_config_arguments_init() + + def create_and_test_config_common_properties(self): + return + + @unittest.skip(reason="Bit does not output attentions") + def test_attention_outputs(self): + pass + + @unittest.skip(reason="Bit does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Bit does not support input and output embeddings") + def test_model_common_attributes(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_backbone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_backbone(*config_and_inputs) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in 
self.all_model_classes: + model = model_class(config=config) + for name, module in model.named_modules(): + if isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)): + self.assertTrue( + torch.all(module.weight == 1), + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + self.assertTrue( + torch.all(module.bias == 0), + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_stages = self.model_tester.num_stages + self.assertEqual(len(hidden_states), expected_num_stages + 1) + + # Bit's feature maps are of shape (batch_size, num_channels, height, width) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [self.model_tester.image_size // 4, self.model_tester.image_size // 4], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + layers_type = ["basic", "bottleneck"] + for model_class in self.all_model_classes: + for layer_type in layers_type: + config.layer_type = layer_type + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + @unittest.skip(reason="Bit does not use feedforward chunking") + def test_feed_forward_chunking(self): + pass + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in BIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = BitModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class BitModelIntegrationTest(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return ( + AutoFeatureExtractor.from_pretrained(BIT_PRETRAINED_MODEL_ARCHIVE_LIST[0]) + if is_vision_available() + else None + ) + + @slow + def test_inference_image_classification_head(self): + model = BitForImageClassification.from_pretrained(BIT_PRETRAINED_MODEL_ARCHIVE_LIST[0]).to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([-11.1069, -9.7877, -8.3777]).to(torch_device) + + self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) From ae44ca32bf6c449763cd221ef751278befa17e1c Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 09:32:47 +0100 Subject: [PATCH 02/88] More improvements --- 
src/transformers/models/bit/configuration_bit.py | 2 +- src/transformers/models/bit/modeling_bit.py | 7 +++++-- src/transformers/models/bit/test.py | 8 ++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) create mode 100644 src/transformers/models/bit/test.py diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index d165140b8150..dbd0559f3748 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -88,7 +88,7 @@ def __init__( drop_path_rate=0.0, output_stride=32, width_factor=1, - conv_layer="", + conv_layer="std_conv", **kwargs ): super().__init__(**kwargs) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 5084be8bc209..3a6431fb9953 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -85,8 +85,9 @@ def get_padding_value(padding, kernel_size, **kwargs) -> Tuple[Tuple, bool]: class StdConv2dSame(nn.Conv2d): """Conv2d with Weight Standardization. TF compatible SAME padding. Used for ViT Hybrid model. - Paper: `Micro-Batch Training with Batch-Channel Normalization and Weight Standardization` - - https://arxiv.org/abs/1903.10520v2 + + Paper: [Micro-Batch Training with Batch-Channel Normalization and Weight + Standardization](https://arxiv.org/abs/1903.10520v2) """ def __init__( @@ -244,6 +245,8 @@ def __init__(self, config: BitConfig): self.convolution = nn.Conv2d( config.num_channels, config.embedding_size, kernel_size=7, stride=2, padding=3, bias=False ) + if not config.layer_type == "preactivation": + self.norm = partial(BitGroupNormActivation, num_groups=32)(config.embedding_size) if config.stem_type == "same": self.pooler = MaxPool2dSame(kernel_size=3, stride=2) else: diff --git a/src/transformers/models/bit/test.py b/src/transformers/models/bit/test.py new file mode 100644 index 000000000000..cbaa9d39a581 --- /dev/null +++ b/src/transformers/models/bit/test.py @@ -0,0 +1,8 @@ +from transformers import BitConfig, BitForImageClassification + +config = BitConfig(layer_type="bottleneck", stem_type="same", conv_layer="std_conv_same") + +model = BitForImageClassification(config) + +for name, param in model.named_parameters(): + print(name, param.shape) \ No newline at end of file From f14fd838a3439527b2fbc6e27412ca5d1b2cf575 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 10:09:26 +0100 Subject: [PATCH 03/88] Add backbone, first draft of ViT hybrid --- src/transformers/__init__.py | 18 + src/transformers/models/__init__.py | 1 + src/transformers/models/bit/__init__.py | 2 + .../models/bit/configuration_bit.py | 11 + src/transformers/models/bit/modeling_bit.py | 188 +++-- src/transformers/models/bit/test.py | 3 +- .../models/vit_hybrid/__init__.py | 63 ++ .../vit_hybrid/configuration_vit_hybrid.py | 123 ++++ .../convert_vit_hybrid_timm_to_pytorch.py | 202 ++++++ .../models/vit_hybrid/modeling_vit_hybrid.py | 648 ++++++++++++++++++ src/transformers/models/vit_hybrid/test.py | 9 + src/transformers/utils/dummy_pt_objects.py | 31 + 12 files changed, 1230 insertions(+), 69 deletions(-) create mode 100644 src/transformers/models/vit_hybrid/__init__.py create mode 100644 src/transformers/models/vit_hybrid/configuration_vit_hybrid.py create mode 100644 src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py create mode 100644 src/transformers/models/vit_hybrid/modeling_vit_hybrid.py create mode 100644 
src/transformers/models/vit_hybrid/test.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index acdf7ea6c531..9e7cc0e65c63 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -417,6 +417,7 @@ "models.vision_text_dual_encoder": ["VisionTextDualEncoderConfig", "VisionTextDualEncoderProcessor"], "models.visual_bert": ["VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "VisualBertConfig"], "models.vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig"], + "models.vit_hybrid": ["VIT_HYBRID_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTHybridConfig"], "models.vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"], "models.vit_msn": ["VIT_MSN_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMSNConfig"], "models.wav2vec2": [ @@ -1050,6 +1051,7 @@ _import_structure["models.bit"].extend( [ "BIT_PRETRAINED_MODEL_ARCHIVE_LIST", + "BitBackbone", "BitForImageClassification", "BitModel", "BitPreTrainedModel", @@ -2219,6 +2221,14 @@ "ViTPreTrainedModel", ] ) + _import_structure["models.vit_hybrid"].extend( + [ + "VIT_HYBRID_PRETRAINED_MODEL_ARCHIVE_LIST", + "ViTHybridForImageClassification", + "ViTHybridModel", + "ViTHybridPreTrainedModel", + ] + ) _import_structure["models.vit_mae"].extend( [ "VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3634,6 +3644,7 @@ from .models.vision_text_dual_encoder import VisionTextDualEncoderConfig, VisionTextDualEncoderProcessor from .models.visual_bert import VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, VisualBertConfig from .models.vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig + from .models.vit_hybrid import VIT_HYBRID_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTHybridConfig from .models.vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig from .models.vit_msn import VIT_MSN_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMSNConfig from .models.wav2vec2 import ( @@ -4178,6 +4189,7 @@ ) from .models.bit import ( BIT_PRETRAINED_MODEL_ARCHIVE_LIST, + BitBackbone, BitForImageClassification, BitModel, BitPreTrainedModel, @@ -5124,6 +5136,12 @@ ViTModel, ViTPreTrainedModel, ) + from .models.vit_hybrid import ( + VIT_HYBRID_PRETRAINED_MODEL_ARCHIVE_LIST, + ViTHybirdModel, + ViTHybridForImageClassification, + ViTHybridPreTrainedModel, + ) from .models.vit_mae import ( VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST, ViTMAEForPreTraining, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index e30b66a6f23f..149933365bf7 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -166,6 +166,7 @@ vision_text_dual_encoder, visual_bert, vit, + vit_hybrid, vit_mae, vit_msn, wav2vec2, diff --git a/src/transformers/models/bit/__init__.py b/src/transformers/models/bit/__init__.py index 44c25e154ad4..ae7763201aaa 100644 --- a/src/transformers/models/bit/__init__.py +++ b/src/transformers/models/bit/__init__.py @@ -34,6 +34,7 @@ "BitForImageClassification", "BitModel", "BitPreTrainedModel", + "BitBackbone", ] if TYPE_CHECKING: @@ -47,6 +48,7 @@ else: from .modeling_bit import ( BIT_PRETRAINED_MODEL_ARCHIVE_LIST, + BitBackbone, BitForImageClassification, BitModel, BitPreTrainedModel, diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index dbd0559f3748..49f500d9e2a2 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -89,6 +89,7 @@ def __init__( output_stride=32, width_factor=1, conv_layer="std_conv", + out_features=None, **kwargs ): super().__init__(**kwargs) @@ 
-106,3 +107,13 @@ def __init__( self.output_stride = output_stride self.width_factor = width_factor self.conv_layer = conv_layer + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] + if out_features is not None: + if not isinstance(out_features, list): + raise ValueError("out_features should be a list") + for feature in out_features: + if feature not in self.stage_names: + raise ValueError( + f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}" + ) + self.out_features = out_features diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 3a6431fb9953..46b01e46bbaa 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch BiT model. Also supports backbones for ViT hybrid.""" +""" PyTorch BiT model. Also supports backbone for ViT hybrid.""" import collections import math @@ -24,14 +24,20 @@ from torch import Tensor, nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from ...activations import ACT2FN from ...modeling_outputs import ( + BackboneOutput, BaseModelOutputWithNoAttention, BaseModelOutputWithPoolingAndNoAttention, ImageClassifierOutputWithNoAttention, ) from ...modeling_utils import PreTrainedModel -from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) from .configuration_bit import BitConfig @@ -385,71 +391,6 @@ def forward(self, x, print_values=False): return x + shortcut -class BitDownsampleConv(nn.Module): - def __init__( - self, - in_channels, - out_channels, - stride=1, - dilation=1, - first_dilation=None, - preact=True, - conv_layer=None, - norm_layer=None, - ): - super(BitDownsampleConv, self).__init__() - self.conv_layer = conv_layer - self.conv = conv_layer(in_channels, out_channels, 1, stride=stride) - self.norm = nn.Identity() if preact else norm_layer(out_channels, apply_act=False) - - def forward(self, x, print_values=False): - if print_values: - print("Conv layer:", self.conv_layer) - print("Hidden states before downsample conv:", x[0, 0, :3, :3]) - - z = self.conv(x) - - if print_values: - print("Hidden states after downsample conv:", z[0, 0, :3, :3]) - - return self.norm(self.conv(x)) - - -class BitConvLayer(nn.Module): - def __init__( - self, in_channels: int, out_channels: int, kernel_size: int = 3, stride: int = 1, activation: str = "relu" - ): - super().__init__() - self.convolution = nn.Conv2d( - in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=kernel_size // 2, bias=False - ) - self.normalization = nn.BatchNorm2d(out_channels) - self.activation = ACT2FN[activation] if activation is not None else nn.Identity() - - def forward(self, input: Tensor) -> Tensor: - hidden_state = self.convolution(input) - hidden_state = self.normalization(hidden_state) - hidden_state = self.activation(hidden_state) - return hidden_state - - -class BitShortCut(nn.Module): - """ - ResNet shortcut, used to project the residual features to the correct size. If needed, it is also used to - downsample the input using `stride=2`. 
- """ - - def __init__(self, in_channels: int, out_channels: int, stride: int = 2): - super().__init__() - self.convolution = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False) - self.normalization = nn.BatchNorm2d(out_channels) - - def forward(self, input: Tensor) -> Tensor: - hidden_state = self.convolution(input) - hidden_state = self.normalization(hidden_state) - return hidden_state - - class BitBottleneckLayer(nn.Module): """Non Pre-activation bottleneck block, equivalent to V1.5/V1b bottleneck. Used for ViT.""" @@ -516,6 +457,36 @@ def forward(self, x): return x +class BitDownsampleConv(nn.Module): + def __init__( + self, + in_channels, + out_channels, + stride=1, + dilation=1, + first_dilation=None, + preact=True, + conv_layer=None, + norm_layer=None, + ): + super(BitDownsampleConv, self).__init__() + self.conv_layer = conv_layer + self.conv = conv_layer(in_channels, out_channels, 1, stride=stride) + self.norm = nn.Identity() if preact else norm_layer(out_channels, apply_act=False) + + def forward(self, x, print_values=False): + if print_values: + print("Conv layer:", self.conv_layer) + print("Hidden states before downsample conv:", x[0, 0, :3, :3]) + + z = self.conv(x) + + if print_values: + print("Hidden states after downsample conv:", z[0, 0, :3, :3]) + + return self.norm(self.conv(x)) + + class BitStage(nn.Module): """ A ResNet v2 stage composed by stacked layers. @@ -847,3 +818,84 @@ def forward( return (loss,) + output if loss is not None else output return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) + + +@add_start_docstrings( + """ + BiT backbone, to be used with frameworks like DETR and MaskFormer. + """, + BIT_START_DOCSTRING, +) +class BitBackbone(BitPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.stage_names = config.stage_names + self.resnetv2 = BitModel(config) + + self.out_features = config.out_features + + out_feature_channels = {} + out_feature_channels["stem"] = config.embedding_size + for idx, stage in enumerate(self.stage_names[1:]): + out_feature_channels[stage] = config.hidden_sizes[idx] + + self.out_feature_channels = out_feature_channels + + # initialize weights and apply final processing + self.post_init() + + @property + def channels(self): + return [self.out_feature_channels[name] for name in self.out_features] + + @add_start_docstrings_to_model_forward(BIT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None + ) -> BackboneOutput: + """ + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50") + >>> model = AutoBackbone.from_pretrained("microsoft/resnet-50") + + >>> inputs = processor(image, return_tensors="pt") + >>> outputs = model(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs = self.resnetv2(pixel_values, output_hidden_states=True, 
return_dict=True) + + hidden_states = outputs.hidden_states + + feature_maps = () + for idx, stage in enumerate(self.stage_names): + if stage in self.out_features: + feature_maps += (hidden_states[idx],) + + if not return_dict: + output = (feature_maps,) + if output_hidden_states: + output += (outputs.hidden_states,) + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=None, + ) diff --git a/src/transformers/models/bit/test.py b/src/transformers/models/bit/test.py index cbaa9d39a581..7fe1aeaeee3d 100644 --- a/src/transformers/models/bit/test.py +++ b/src/transformers/models/bit/test.py @@ -1,8 +1,9 @@ from transformers import BitConfig, BitForImageClassification + config = BitConfig(layer_type="bottleneck", stem_type="same", conv_layer="std_conv_same") model = BitForImageClassification(config) for name, param in model.named_parameters(): - print(name, param.shape) \ No newline at end of file + print(name, param.shape) diff --git a/src/transformers/models/vit_hybrid/__init__.py b/src/transformers/models/vit_hybrid/__init__.py new file mode 100644 index 000000000000..82432632a460 --- /dev/null +++ b/src/transformers/models/vit_hybrid/__init__.py @@ -0,0 +1,63 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
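The new vit_hybrid/__init__.py below registers its public symbols in _import_structure and hands them to _LazyModule, so torch-dependent modules are only imported on first access. A minimal, self-contained sketch of the same lazy-loading idea using a module-level __getattr__ (PEP 562), for illustration only and not the transformers implementation; it assumes it lives inside a package __init__.py.

import importlib

_import_structure = {"modeling_vit_hybrid": ["ViTHybridModel", "ViTHybridForImageClassification"]}

def __getattr__(name):
    # Resolve a public name to its submodule on first access, deferring heavy imports.
    for module_name, symbols in _import_structure.items():
        if name in symbols:
            module = importlib.import_module(f".{module_name}", __name__)
            return getattr(module, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")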
+from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, +) + + +_import_structure = {"configuration_vit_hybrid": ["VIT_HYBRID_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTHybridConfig"]} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_vit_hybrid"] = [ + "VIT_HYBRID_PRETRAINED_MODEL_ARCHIVE_LIST", + "ViTHybridForImageClassification", + "ViTHybridModel", + "ViTHybridPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_vit_hybrid import VIT_HYBRID_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTHybridConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_vit_hybrid import ( + VIT_HYBRID_PRETRAINED_MODEL_ARCHIVE_LIST, + ViTHybridForImageClassification, + ViTHybridModel, + ViTHybridPreTrainedModel, + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) \ No newline at end of file diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py new file mode 100644 index 000000000000..3b9d59fe1240 --- /dev/null +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -0,0 +1,123 @@ +# coding=utf-8 +# Copyright 2021 Google AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" ViT Hybrid model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VIT_HYBRID_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "google/vit-base-patch16-224": "https://huggingface.co/vit-base-patch16-224/resolve/main/config.json", + # See all ViT models at https://huggingface.co/models?filter=vit +} + + +class ViTHybridConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ViTModel`]. It is used to instantiate an ViT + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the ViT + [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. 
+ intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to `224`): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to `1`): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to `3`): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + encoder_stride (`int`, `optional`, defaults to 16): + Factor to increase the spatial resolution by in the decoder head for masked image modeling. + + Example: + + ```python + >>> from transformers import ViTHybridConfig, ViTModel + + >>> # Initializing a ViT Hybrid vit-base-patch16-224 style configuration + >>> configuration = ViTHybridConfig() + + >>> # Initializing a model (with random weights) from the vit-base-patch16-224 style configuration + >>> model = ViTModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "vit" + + def __init__( + self, + backbone_config=None, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + is_encoder_decoder=False, + image_size=224, + patch_size=1, + num_channels=3, + qkv_bias=True, + encoder_stride=16, + **kwargs + ): + super().__init__(**kwargs) + + self.backbone_config = backbone_config + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.encoder_stride = encoder_stride diff --git a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py new file mode 100644 index 000000000000..e9290b2a4fff --- /dev/null +++ b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py @@ -0,0 +1,202 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert ViT hybrid checkpoints from the timm library.""" + + +import argparse +import json +from pathlib import Path + +import torch +from PIL import Image + +import requests +import timm +from huggingface_hub import hf_hub_download +from transformers import BitConfig, ViTHybridConfig, ViTFeatureExtractor, ViTHybridForImageClassification, ViTHybridModel +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +# here we list all keys to be renamed (original name on the left, our name on the right) +def create_rename_keys(config, base_model=False): + rename_keys = [] + for i in range(config.num_hidden_layers): + # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms + rename_keys.append((f"blocks.{i}.norm1.weight", f"vit.encoder.layer.{i}.layernorm_before.weight")) + rename_keys.append((f"blocks.{i}.norm1.bias", f"vit.encoder.layer.{i}.layernorm_before.bias")) + rename_keys.append((f"blocks.{i}.attn.proj.weight", f"vit.encoder.layer.{i}.attention.output.dense.weight")) + rename_keys.append((f"blocks.{i}.attn.proj.bias", f"vit.encoder.layer.{i}.attention.output.dense.bias")) + rename_keys.append((f"blocks.{i}.norm2.weight", f"vit.encoder.layer.{i}.layernorm_after.weight")) + rename_keys.append((f"blocks.{i}.norm2.bias", f"vit.encoder.layer.{i}.layernorm_after.bias")) + rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"vit.encoder.layer.{i}.intermediate.dense.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"vit.encoder.layer.{i}.intermediate.dense.bias")) + rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"vit.encoder.layer.{i}.output.dense.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"vit.encoder.layer.{i}.output.dense.bias")) + + # projection layer + position embeddings + rename_keys.extend( + [ + ("cls_token", "vit.embeddings.cls_token"), + ("patch_embed.proj.weight", "vit.embeddings.patch_embeddings.projection.weight"), + ("patch_embed.proj.bias", "vit.embeddings.patch_embeddings.projection.bias"), + ("pos_embed", "vit.embeddings.position_embeddings"), + ] + ) + + if base_model: + # layernorm + pooler + rename_keys.extend( + [ + ("norm.weight", "layernorm.weight"), + ("norm.bias", "layernorm.bias"), + ("pre_logits.fc.weight", "pooler.dense.weight"), + ("pre_logits.fc.bias", "pooler.dense.bias"), + ] + ) + + # if just the base model, we should remove "vit" from all keys that start with "vit" + rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("vit") else pair for pair in rename_keys] + else: + # layernorm + classification head + rename_keys.extend( + [ + ("norm.weight", "vit.layernorm.weight"), + ("norm.bias", "vit.layernorm.bias"), + ("head.weight", "classifier.weight"), + ("head.bias", "classifier.bias"), + ] + ) + + return rename_keys + + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config, base_model=False): + for i in range(config.num_hidden_layers): + if base_model: + prefix = "" + else: + prefix = "vit." 
+ # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ + : config.hidden_size, : + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ + config.hidden_size : config.hidden_size * 2, : + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ + config.hidden_size : config.hidden_size * 2 + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ + -config.hidden_size :, : + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] + + +def remove_classification_head_(state_dict): + ignore_keys = ["head.weight", "head.bias"] + for k in ignore_keys: + state_dict.pop(k, None) + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@torch.no_grad() +def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): + """ + Copy/paste/tweak model's weights to our ViT structure. + """ + + # define default ViT hybrid configuration + backbone_config = BitConfig() + config = ViTHybridConfig(backbone_config=backbone_config, image_size=384, num_labels=1000) + base_model = False + + # load original model from timm + timm_model = timm.create_model(vit_name, pretrained=True) + timm_model.eval() + + # load state_dict of original model, remove and rename some keys + state_dict = timm_model.state_dict() + if base_model: + remove_classification_head_(state_dict) + rename_keys = create_rename_keys(config, base_model) + for src, dest in rename_keys: + rename_key(state_dict, src, dest) + read_in_q_k_v(state_dict, config, base_model) + + # load HuggingFace model + if vit_name[-5:] == "in21k": + model = ViTHybridModel(config).eval() + else: + model = ViTHybridForImageClassification(config).eval() + model.load_state_dict(state_dict) + + # Check outputs on an image, prepared by ViTFeatureExtractor + feature_extractor = ViTFeatureExtractor(size=config.image_size) + encoding = feature_extractor(images=prepare_img(), return_tensors="pt") + pixel_values = encoding["pixel_values"] + outputs = model(pixel_values) + + if base_model: + timm_pooled_output = timm_model.forward_features(pixel_values) + assert timm_pooled_output.shape == outputs.pooler_output.shape + assert torch.allclose(timm_pooled_output, outputs.pooler_output, atol=1e-3) + else: + timm_logits = timm_model(pixel_values) + assert timm_logits.shape == outputs.logits.shape + assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) + + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model {vit_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving feature extractor to {pytorch_dump_folder_path}") + feature_extractor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = 
argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--vit_name", + default="vit_base_r50_s16_384", + type=str, + help="Name of the ViT timm model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + + args = parser.parse_args() + convert_vit_checkpoint(args.vit_name, args.pytorch_dump_folder_path) \ No newline at end of file diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py new file mode 100644 index 000000000000..c6db1dfa8517 --- /dev/null +++ b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -0,0 +1,648 @@ +# coding=utf-8 +# Copyright 2021 Google AI, Ross Wightman, The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch ViT Hybrid model.""" + + +import collections.abc +import math +from typing import Dict, List, Optional, Set, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_vit_hybrid import ViTHybridConfig + +from ..auto import AutoBackbone + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "ViTHybridConfig" +_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224-in21k" +_EXPECTED_OUTPUT_SHAPE = [1, 197, 768] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "google/vit-base-patch16-224" +_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat" + + +VIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "google/vit-base-patch16-224", + # See all ViTHybrid models at https://huggingface.co/models?filter=vit +] + + +class ViTHybridEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. 
+ """ + + def __init__(self, config: ViTHybridConfig): + super().__init__() + + self.cls_token = nn.Parameter( + nn.init.trunc_normal_( + torch.zeros(1, 1, config.hidden_size, dtype=torch.float32), mean=0.0, std=config.initializer_range + ) + ) + self.patch_embeddings = ViTHybridPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter( + nn.init.trunc_normal_( + torch.zeros(1, num_patches + 1, config.hidden_size, dtype=torch.float32), + mean=0.0, + std=config.initializer_range, + ) + ) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.config = config + + def forward(self, pixel_values: torch.Tensor,) -> torch.Tensor: + embeddings = self.patch_embeddings(pixel_values) + + # add the [CLS] token to the embedded patch tokens + batch_size = embeddings.shape[0] + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings) + + return embeddings + + +class ViTHybridPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.backbone = AutoBackbone.from_config(config.backbone_config) + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + features = self.backbone(pixel_values).feature_maps + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + embeddings = self.projection(features).flatten(2).transpose(1, 2) + return embeddings + + +class ViTHybridSelfAttention(nn.Module): + def __init__(self, config: ViTHybridConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size,} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." 
+ ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class ViTHybridSelfOutput(nn.Module): + """ + The residual connection is defined in ViTHybridLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. 
+ """ + + def __init__(self, config: ViTHybridConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class ViTHybridAttention(nn.Module): + def __init__(self, config: ViTHybridConfig) -> None: + super().__init__() + self.attention = ViTHybridSelfAttention(config) + self.output = ViTHybridSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class ViTHybridIntermediate(nn.Module): + def __init__(self, config: ViTHybridConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + +class ViTHybridOutput(nn.Module): + def __init__(self, config: ViTHybridConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states + input_tensor + + return hidden_states + + +class ViTHybridLayer(nn.Module): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: ViTHybridConfig) -> None: + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = ViTHybridAttention(config) + self.intermediate = ViTHybridIntermediate(config) + self.output = ViTHybridOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = 
nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_attention_outputs = self.attention( + self.layernorm_before(hidden_states), # in ViTHybrid, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = attention_output + hidden_states + + # in ViTHybrid, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_states) + + outputs = (layer_output,) + outputs + + return outputs + + +class ViTHybridEncoder(nn.Module): + def __init__(self, config: ViTHybridConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([ViTHybridLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + layer_head_mask, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class ViTHybridPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = ViTHybridConfig + base_model_prefix = "vit" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = [] + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module: ViTHybridEncoder, value: bool = False) -> None: + if isinstance(module, ViTHybridEncoder): + module.gradient_checkpointing = value + + +VIT_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`ViTHybridConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +VIT_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See + [`AutoFeatureExtractor.__call__`] for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare ViT Hybrid Model transformer outputting raw hidden-states without any specific head on top.", + VIT_START_DOCSTRING, +) +class ViTHybridModel(ViTHybridPreTrainedModel): + def __init__(self, config: ViTHybridConfig, add_pooling_layer: bool = True): + super().__init__(config) + self.config = config + + self.embeddings = ViTHybridEmbeddings(config) + self.encoder = ViTHybridEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.pooler = ViTHybridPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> ViTHybridPatchEmbeddings: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None: + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?) + expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype + if pixel_values.dtype != expected_dtype: + pixel_values = pixel_values.to(expected_dtype) + + embedding_output = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class ViTHybridPooler(nn.Module): + def __init__(self, config: ViTHybridConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +@add_start_docstrings( + """ + ViT Hybrid Model transformer with an image classification head on top (a linear layer on top of the final hidden + state of the [CLS] token) e.g. for ImageNet. 
+ """, + VIT_START_DOCSTRING, +) +class ViTHybridForImageClassification(ViTHybridPreTrainedModel): + def __init__(self, config: ViTHybridConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.vit = ViTHybridModel(config, add_pooling_layer=False) + + # Classifier head + self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=ImageClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.vit( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.classifier(sequence_output[:, 0, :]) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) \ No newline at end of file diff --git a/src/transformers/models/vit_hybrid/test.py b/src/transformers/models/vit_hybrid/test.py new file mode 100644 index 000000000000..231747f4af8b --- /dev/null +++ b/src/transformers/models/vit_hybrid/test.py @@ -0,0 +1,9 @@ +from transformers import BitConfig, ViTHybridConfig, ViTHybridForImageClassification + +backbone_config = BitConfig() +config = ViTHybridConfig(backbone_config=backbone_config) + +model = ViTHybridForImageClassification(config) + +for name, param in model.named_parameters(): + print(name, param.shape) \ No newline at end of file diff --git 
a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index f405dd6e401b..785e2fd08f5d 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -998,6 +998,13 @@ def __init__(self, *args, **kwargs): BIT_PRETRAINED_MODEL_ARCHIVE_LIST = None +class BitBackbone(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class BitForImageClassification(metaclass=DummyObject): _backends = ["torch"] @@ -5811,6 +5818,30 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +VIT_HYBRID_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class ViTHybirdModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ViTHybridForImageClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ViTHybridPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST = None From 0f47ac9546e7285e9ef6ccabebd80726522ce0ca Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 10:45:10 +0100 Subject: [PATCH 04/88] Add AutoBackbone --- .../models/vit_hybrid/__init__.py | 8 +- .../convert_vit_hybrid_timm_to_pytorch.py | 120 +++++++++++------- .../models/vit_hybrid/modeling_vit_hybrid.py | 54 +++++--- src/transformers/models/vit_hybrid/test.py | 20 ++- 4 files changed, 128 insertions(+), 74 deletions(-) diff --git a/src/transformers/models/vit_hybrid/__init__.py b/src/transformers/models/vit_hybrid/__init__.py index 82432632a460..5b86bef38cb7 100644 --- a/src/transformers/models/vit_hybrid/__init__.py +++ b/src/transformers/models/vit_hybrid/__init__.py @@ -17,11 +17,7 @@ # limitations under the License. 
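(Illustrative aside, not part of the patch: the AutoBackbone wiring added in this commit is what later lets the hybrid patch embedding build its convolutional stem from a config. A minimal usage sketch, assuming the BitConfig arguments and the BitBackbone output format used elsewhere in this series:)

import torch
from transformers import AutoBackbone, BitConfig

# Config values below are assumptions copied from the test script in this series;
# out_features selects which stages the backbone returns.
backbone_config = BitConfig(stem_type="same", layer_type="bottleneck", depths=(3, 4, 9), out_features=["stage3"])
backbone = AutoBackbone.from_config(backbone_config)  # resolves to BitBackbone for a BitConfig

pixel_values = torch.randn(1, 3, 384, 384)
feature_map = backbone(pixel_values).feature_maps[-1]  # last requested stage; shape depends on the config
print(feature_map.shape)
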
from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_torch_available, -) +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available _import_structure = {"configuration_vit_hybrid": ["VIT_HYBRID_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTHybridConfig"]} @@ -60,4 +56,4 @@ else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) \ No newline at end of file + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py index e9290b2a4fff..646e24831111 100644 --- a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py +++ b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py @@ -24,8 +24,13 @@ import requests import timm -from huggingface_hub import hf_hub_download -from transformers import BitConfig, ViTHybridConfig, ViTFeatureExtractor, ViTHybridForImageClassification, ViTHybridModel +from transformers import ( + BitConfig, + ViTFeatureExtractor, + ViTHybridConfig, + ViTHybridForImageClassification, + ViTHybridModel, +) from transformers.utils import logging @@ -36,52 +41,68 @@ # here we list all keys to be renamed (original name on the left, our name on the right) def create_rename_keys(config, base_model=False): rename_keys = [] - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"vit.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"vit.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"vit.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"vit.encoder.layer.{i}.attention.output.dense.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"vit.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"vit.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"vit.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"vit.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"vit.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"vit.encoder.layer.{i}.output.dense.bias")) - - # projection layer + position embeddings - rename_keys.extend( - [ - ("cls_token", "vit.embeddings.cls_token"), - ("patch_embed.proj.weight", "vit.embeddings.patch_embeddings.projection.weight"), - ("patch_embed.proj.bias", "vit.embeddings.patch_embeddings.projection.bias"), - ("pos_embed", "vit.embeddings.position_embeddings"), - ] - ) - if base_model: - # layernorm + pooler - rename_keys.extend( - [ - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ("pre_logits.fc.weight", "pooler.dense.weight"), - ("pre_logits.fc.bias", "pooler.dense.bias"), - ] - ) + # stem: + rename_keys.append(("cls_token", "vit.embeddings.cls_token")) + rename_keys.append(("pos_embed", "vit.embeddings.position_embeddings")) - # if just the base model, we should remove "vit" from all keys that start with "vit" - rename_keys = [(pair[0], pair[1][4:]) if 
pair[1].startswith("vit") else pair for pair in rename_keys] - else: - # layernorm + classification head - rename_keys.extend( - [ - ("norm.weight", "vit.layernorm.weight"), - ("norm.bias", "vit.layernorm.bias"), - ("head.weight", "classifier.weight"), - ("head.bias", "classifier.bias"), - ] + rename_keys.append(("patch_embed.proj.weight", "vit.embeddings.patch_embeddings.projection.weight")) + rename_keys.append(("patch_embed.proj.bias", "vit.embeddings.patch_embeddings.projection.bias")) + + # backbone + rename_keys.append( + ( + "patch_embed.backbone.stem.conv.weight", + "vit.embeddings.patch_embeddings.backbone.resnetv2.embedder.convolution.weight", ) + ) + rename_keys.append( + ( + "patch_embed.backbone.stem.norm.weight", + "vit.embeddings.patch_embeddings.backbone.resnetv2.embedder.norm.weight", + ) + ) + rename_keys.append( + ("patch_embed.backbone.stem.norm.bias", "vit.embeddings.patch_embeddings.backbone.resnetv2.embedder.norm.bias") + ) + + # rename_keys = [] + # for i in range(config.num_hidden_layers): + # # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms + # rename_keys.append((f"blocks.{i}.norm1.weight", f"vit.encoder.layer.{i}.layernorm_before.weight")) + # rename_keys.append((f"blocks.{i}.norm1.bias", f"vit.encoder.layer.{i}.layernorm_before.bias")) + # rename_keys.append((f"blocks.{i}.attn.proj.weight", f"vit.encoder.layer.{i}.attention.output.dense.weight")) + # rename_keys.append((f"blocks.{i}.attn.proj.bias", f"vit.encoder.layer.{i}.attention.output.dense.bias")) + # rename_keys.append((f"blocks.{i}.norm2.weight", f"vit.encoder.layer.{i}.layernorm_after.weight")) + # rename_keys.append((f"blocks.{i}.norm2.bias", f"vit.encoder.layer.{i}.layernorm_after.bias")) + # rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"vit.encoder.layer.{i}.intermediate.dense.weight")) + # rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"vit.encoder.layer.{i}.intermediate.dense.bias")) + # rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"vit.encoder.layer.{i}.output.dense.weight")) + # rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"vit.encoder.layer.{i}.output.dense.bias")) + + # if base_model: + # # layernorm + pooler + # rename_keys.extend( + # [ + # ("norm.weight", "layernorm.weight"), + # ("norm.bias", "layernorm.bias"), + # ("pre_logits.fc.weight", "pooler.dense.weight"), + # ("pre_logits.fc.bias", "pooler.dense.bias"), + # ] + # ) + + # # if just the base model, we should remove "vit" from all keys that start with "vit" + # rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("vit") else pair for pair in rename_keys] + # else: + # # layernorm + classification head + # rename_keys.extend( + # [ + # ("norm.weight", "vit.layernorm.weight"), + # ("norm.bias", "vit.layernorm.bias"), + # ("head.weight", "classifier.weight"), + # ("head.bias", "classifier.bias"), + # ] + # ) return rename_keys @@ -138,7 +159,7 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): """ # define default ViT hybrid configuration - backbone_config = BitConfig() + backbone_config = BitConfig(stem_type="same", layer_type="bottleneck", depths=(3, 4, 9), out_features=["stage3"]) config = ViTHybridConfig(backbone_config=backbone_config, image_size=384, num_labels=1000) base_model = False @@ -146,6 +167,9 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): timm_model = timm.create_model(vit_name, pretrained=True) timm_model.eval() + for name, param in timm_model.named_parameters(): + print(name, param.shape) + # load state_dict of original 
model, remove and rename some keys state_dict = timm_model.state_dict() if base_model: @@ -160,7 +184,7 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): model = ViTHybridModel(config).eval() else: model = ViTHybridForImageClassification(config).eval() - model.load_state_dict(state_dict) + missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) # Check outputs on an image, prepared by ViTFeatureExtractor feature_extractor = ViTFeatureExtractor(size=config.image_size) @@ -199,4 +223,4 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): ) args = parser.parse_args() - convert_vit_checkpoint(args.vit_name, args.pytorch_dump_folder_path) \ No newline at end of file + convert_vit_checkpoint(args.vit_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py index c6db1dfa8517..778cb0d832d0 100644 --- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -28,15 +28,9 @@ from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import ( - add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - logging, -) -from .configuration_vit_hybrid import ViTHybridConfig - +from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from ..auto import AutoBackbone +from .configuration_vit_hybrid import ViTHybridConfig logger = logging.get_logger(__name__) @@ -85,7 +79,10 @@ def __init__(self, config: ViTHybridConfig): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.config = config - def forward(self, pixel_values: torch.Tensor,) -> torch.Tensor: + def forward( + self, + pixel_values: torch.Tensor, + ) -> torch.Tensor: embeddings = self.patch_embeddings(pixel_values) # add the [CLS] token to the embedded patch tokens @@ -108,25 +105,50 @@ class ViTHybridPatchEmbeddings(nn.Module): Transformer. 
""" - def __init__(self, config): + def __init__(self, config, feature_size=None): super().__init__() image_size, patch_size = config.image_size, config.patch_size num_channels, hidden_size = config.num_channels, config.hidden_size image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + + self.backbone = AutoBackbone.from_config(config.backbone_config) + feature_dim = self.backbone.channels[-1] + + if feature_size is None: + with torch.no_grad(): + # NOTE Most reliable way of determining output dims is to run forward pass + training = self.backbone.training + if training: + self.backbone.eval() + feature_map = self.backbone(torch.zeros(1, num_channels, image_size[0], image_size[1])).feature_maps[ + -1 + ] + feature_size = feature_map.shape[-2:] + feature_dim = feature_map.shape[1] + self.backbone.train(training) + else: + feature_size = ( + feature_size if isinstance(feature_size, collections.abc.Iterable) else (feature_size, feature_size) + ) + if hasattr(self.backbone, "feature_info"): + feature_dim = self.backbone.feature_info.channels()[-1] + else: + feature_dim = self.backbone.num_features + + assert feature_size[0] % patch_size[0] == 0 and feature_size[1] % patch_size[1] == 0 + self.grid_size = (feature_size[0] // patch_size[0], feature_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] self.image_size = image_size self.patch_size = patch_size self.num_channels = num_channels - self.num_patches = num_patches - self.backbone = AutoBackbone.from_config(config.backbone_config) - self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + self.projection = nn.Conv2d(feature_dim, hidden_size, kernel_size=patch_size, stride=patch_size) def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: batch_size, num_channels, height, width = pixel_values.shape - features = self.backbone(pixel_values).feature_maps + features = self.backbone(pixel_values).feature_maps[-1] if num_channels != self.num_channels: raise ValueError( "Make sure that the channel dimension of the pixel values match with the one set in the configuration." 
@@ -645,4 +667,4 @@ def forward( logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, - ) \ No newline at end of file + ) diff --git a/src/transformers/models/vit_hybrid/test.py b/src/transformers/models/vit_hybrid/test.py index 231747f4af8b..47371ac6a18a 100644 --- a/src/transformers/models/vit_hybrid/test.py +++ b/src/transformers/models/vit_hybrid/test.py @@ -1,9 +1,21 @@ -from transformers import BitConfig, ViTHybridConfig, ViTHybridForImageClassification +import torch -backbone_config = BitConfig() -config = ViTHybridConfig(backbone_config=backbone_config) +from transformers import BitBackbone, BitConfig, ViTHybridConfig, ViTHybridForImageClassification + + +backbone_config = BitConfig(stem_type="same", layer_type="bottleneck", depths=(3, 4, 9), out_features=["stage3"]) +config = ViTHybridConfig(backbone_config=backbone_config, image_size=384) + +# model = BitBackbone(config=backbone_config) + +# for name, param in model.named_parameters(): +# print(name, param.shape) + +# outputs = model(torch.randn(1, 3, 384, 384)) + +# print(outputs.feature_maps[0].shape) model = ViTHybridForImageClassification(config) for name, param in model.named_parameters(): - print(name, param.shape) \ No newline at end of file + print(name, param.shape) From 3d2945468be49ee7966cfa94ded651cc595f4ada Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 10:57:02 +0100 Subject: [PATCH 05/88] More improvements --- .../convert_vit_hybrid_timm_to_pytorch.py | 59 +++++++++++-------- .../models/vit_hybrid/modeling_vit_hybrid.py | 3 + 2 files changed, 38 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py index 646e24831111..6c306da704a5 100644 --- a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py +++ b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py @@ -42,6 +42,7 @@ def create_rename_keys(config, base_model=False): rename_keys = [] + # fmt: off # stem: rename_keys.append(("cls_token", "vit.embeddings.cls_token")) rename_keys.append(("pos_embed", "vit.embeddings.position_embeddings")) @@ -50,21 +51,15 @@ def create_rename_keys(config, base_model=False): rename_keys.append(("patch_embed.proj.bias", "vit.embeddings.patch_embeddings.projection.bias")) # backbone - rename_keys.append( - ( - "patch_embed.backbone.stem.conv.weight", - "vit.embeddings.patch_embeddings.backbone.resnetv2.embedder.convolution.weight", - ) - ) - rename_keys.append( - ( - "patch_embed.backbone.stem.norm.weight", - "vit.embeddings.patch_embeddings.backbone.resnetv2.embedder.norm.weight", - ) - ) - rename_keys.append( - ("patch_embed.backbone.stem.norm.bias", "vit.embeddings.patch_embeddings.backbone.resnetv2.embedder.norm.bias") - ) + rename_keys.append(("patch_embed.backbone.stem.conv.weight", "vit.embeddings.patch_embeddings.backbone.resnetv2.embedder.convolution.weight")) + rename_keys.append(("patch_embed.backbone.stem.norm.weight", "vit.embeddings.patch_embeddings.backbone.resnetv2.embedder.norm.weight")) + rename_keys.append(("patch_embed.backbone.stem.norm.bias", "vit.embeddings.patch_embeddings.backbone.resnetv2.embedder.norm.bias")) + + for stage_idx in range(len(config.backbone_config.depths)): + for layer_idx in range(config.backbone_config.depths[i]): + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.downsample.conv.weight", 
f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.downsample.conv.weight")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.downsample.norm.weight", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.downsample.norm.weight")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.downsample.norm.bias", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.downsample.norm.bias")) # rename_keys = [] # for i in range(config.num_hidden_layers): @@ -103,6 +98,7 @@ def create_rename_keys(config, base_model=False): # ("head.bias", "classifier.bias"), # ] # ) + # fmt: on return rename_keys @@ -167,9 +163,6 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): timm_model = timm.create_model(vit_name, pretrained=True) timm_model.eval() - for name, param in timm_model.named_parameters(): - print(name, param.shape) - # load state_dict of original model, remove and rename some keys state_dict = timm_model.state_dict() if base_model: @@ -186,10 +179,28 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): model = ViTHybridForImageClassification(config).eval() missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - # Check outputs on an image, prepared by ViTFeatureExtractor - feature_extractor = ViTFeatureExtractor(size=config.image_size) - encoding = feature_extractor(images=prepare_img(), return_tensors="pt") - pixel_values = encoding["pixel_values"] + # Check outputs on an image + # TODO use feature extractor + # from huggingface_hub import hf_hub_download + + # pixel_values = torch.load( + # hf_hub_download("nielsr/dummy-pixel-values", repo_type="dataset", filename="pixel_values.pt") + # ) + from timm.data import resolve_data_config + from timm.data.transforms_factory import create_transform + + transform = create_transform( + **resolve_data_config({}, model=timm_model) + ) + + # load image + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + pixel_values = transform(image).unsqueeze(0) + + print("First values of pixel values:", pixel_values.shape) + print(pixel_values[0, :5, :5, :5]) + outputs = model(pixel_values) if base_model: @@ -205,8 +216,8 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): Path(pytorch_dump_folder_path).mkdir(exist_ok=True) print(f"Saving model {vit_name} to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving feature extractor to {pytorch_dump_folder_path}") - feature_extractor.save_pretrained(pytorch_dump_folder_path) + # print(f"Saving feature extractor to {pytorch_dump_folder_path}") + # feature_extractor.save_pretrained(pytorch_dump_folder_path) if __name__ == "__main__": diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py index 778cb0d832d0..281757999771 100644 --- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -154,6 +154,9 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: "Make sure that the channel dimension of the pixel values match with the one set in the configuration." 
) embeddings = self.projection(features).flatten(2).transpose(1, 2) + + print("Shape of embeddings:", embeddings.shape) + return embeddings From 0f36a6bf6e1af09610394fa1b1316ee3791b359f Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 11:06:23 +0100 Subject: [PATCH 06/88] Fix bug --- src/transformers/models/bit/modeling_bit.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 46b01e46bbaa..32a00d553d28 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -341,6 +341,8 @@ def __init__( ): super().__init__() + print("hey what's up") + first_dilation = first_dilation or dilation conv_layer = conv_layer or StdConv2d norm_layer = norm_layer or partial(BitGroupNormActivation, num_groups=32) @@ -472,6 +474,7 @@ def __init__( super(BitDownsampleConv, self).__init__() self.conv_layer = conv_layer self.conv = conv_layer(in_channels, out_channels, 1, stride=stride) + print("Preact:", preact) self.norm = nn.Identity() if preact else norm_layer(out_channels, apply_act=False) def forward(self, x, print_values=False): @@ -568,17 +571,17 @@ def __init__(self, config: BitConfig): prev_chs = config.embedding_size curr_stride = 4 dilation = 1 - block_dprs = [ + layer_dprs = [ x.tolist() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths)).split(config.depths) ] if config.layer_type == "bottleneck": - block_fn = BitBottleneckLayer + layer_fn = BitBottleneckLayer elif config.layer_type == "preactivation": - block_fn = BitPreActivationBottleneckLayer + layer_fn = BitPreActivationBottleneckLayer else: raise ValueError("Unknown layer type: {}".format(config.layer_type)) - for stage_idx, (d, c, bdpr) in enumerate(zip(config.depths, config.hidden_sizes, block_dprs)): + for stage_idx, (d, c, bdpr) in enumerate(zip(config.depths, config.hidden_sizes, layer_dprs)): out_channels = make_div(c * config.width_factor) stride = 1 if stage_idx == 0 else 2 if curr_stride >= config.output_stride: @@ -594,8 +597,8 @@ def __init__(self, config: BitConfig): act_layer=act_layer, conv_layer=conv_layer, norm_layer=norm_layer, - block_dpr=bdpr, - block_fn=block_fn, + layer_dpr=bdpr, + layer_fn=layer_fn, ) prev_chs = out_channels curr_stride *= stride From 07fa0059e7ad458c514d3ef267dfcf5648f42a63 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 11:31:31 +0100 Subject: [PATCH 07/88] More improvements --- .../models/bit/convert_bit_to_pytorch.py | 2 +- src/transformers/models/bit/modeling_bit.py | 63 ++++++++++--------- src/transformers/models/bit/test.py | 6 +- .../convert_vit_hybrid_timm_to_pytorch.py | 20 ++++-- .../models/vit_hybrid/modeling_vit_hybrid.py | 1 + 5 files changed, 53 insertions(+), 39 deletions(-) diff --git a/src/transformers/models/bit/convert_bit_to_pytorch.py b/src/transformers/models/bit/convert_bit_to_pytorch.py index 040ed96b4c4e..3b69437e0c4b 100644 --- a/src/transformers/models/bit/convert_bit_to_pytorch.py +++ b/src/transformers/models/bit/convert_bit_to_pytorch.py @@ -127,7 +127,7 @@ def convert_resnetv2_checkpoint(model_name, pytorch_dump_folder_path): print("Logits:", logits[0, :3]) print("Predicted class:", model.config.id2label[logits.argmax(-1).item()]) if model_name == "resnetv2_50x1_bitm": - expected_slice = torch.tensor([0.4306, -0.0052, -0.6205]) + expected_slice = torch.tensor([ 0.1665, -0.2718, -1.1446]) assert torch.allclose(logits[0, :3], expected_slice, 
atol=1e-3) print("Looks ok!") diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 32a00d553d28..e8320797b6dd 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -248,11 +248,19 @@ class BitEmbeddings(nn.Module): def __init__(self, config: BitConfig): super().__init__() - self.convolution = nn.Conv2d( + if config.conv_layer == "std_conv": + conv_layer = partial(StdConv2d, eps=1e-8) + elif config.conv_layer == "std_conv_same": + conv_layer = partial(StdConv2dSame, eps=1e-8) + + self.convolution = conv_layer( config.num_channels, config.embedding_size, kernel_size=7, stride=2, padding=3, bias=False ) + + self.norm = None if not config.layer_type == "preactivation": self.norm = partial(BitGroupNormActivation, num_groups=32)(config.embedding_size) + if config.stem_type == "same": self.pooler = MaxPool2dSame(kernel_size=3, stride=2) else: @@ -267,8 +275,18 @@ def forward(self, pixel_values: Tensor) -> Tensor: ) embedding = self.convolution(pixel_values) + + print("Shape of embeddings after conv2d:", embedding.shape) + print("First values:", embedding[0, 0, :3, :3]) + + if self.norm is not None: + embedding = self.norm(embedding) + embedding = self.pooler(embedding) + print("Shape of BiT embeddings:", embedding.shape) + print("First values of BiT embeddings:", embedding[0,0,:3,:3]) + return embedding @@ -341,8 +359,6 @@ def __init__( ): super().__init__() - print("hey what's up") - first_dilation = first_dilation or dilation conv_layer = conv_layer or StdConv2d norm_layer = norm_layer or partial(BitGroupNormActivation, num_groups=32) @@ -374,16 +390,16 @@ def __init__( def forward(self, x, print_values=False): x_preact = self.norm1(x) - if print_values: - print("Hidden states after first norm:", x_preact[0, 0, :3, :3]) + # if print_values: + # print("Hidden states after first norm:", x_preact[0, 0, :3, :3]) # shortcut branch shortcut = x if self.downsample is not None: shortcut = self.downsample(x_preact, print_values) - if print_values: - print("Hidden states after downsample:", shortcut[0, 0, :3, :3]) + # if print_values: + # print("Hidden states after downsample:", shortcut[0, 0, :3, :3]) # residual branch x = self.conv1(x_preact) @@ -441,7 +457,7 @@ def __init__( self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() self.act3 = act_layer(inplace=True) - def forward(self, x): + def forward(self, x, print_values=False): # shortcut branch shortcut = x if self.downsample is not None: @@ -474,18 +490,17 @@ def __init__( super(BitDownsampleConv, self).__init__() self.conv_layer = conv_layer self.conv = conv_layer(in_channels, out_channels, 1, stride=stride) - print("Preact:", preact) self.norm = nn.Identity() if preact else norm_layer(out_channels, apply_act=False) def forward(self, x, print_values=False): - if print_values: - print("Conv layer:", self.conv_layer) - print("Hidden states before downsample conv:", x[0, 0, :3, :3]) + # if print_values: + # print("Conv layer:", self.conv_layer) + # print("Hidden states before downsample conv:", x[0, 0, :3, :3]) z = self.conv(x) - if print_values: - print("Hidden states after downsample conv:", z[0, 0, :3, :3]) + # if print_values: + # print("Hidden states after downsample conv:", z[0, 0, :3, :3]) return self.norm(self.conv(x)) @@ -547,11 +562,11 @@ def __init__( def forward(self, input: Tensor, print_values=False) -> Tensor: hidden_state = input for idx, layer in enumerate(self.layers): - if idx == 0 and 
print_values: - print(f"Hidden states before block {idx}", hidden_state[0, 0, :3, :3]) + # if idx == 0 and print_values: + # print(f"Hidden states before block {idx}", hidden_state[0, 0, :3, :3]) hidden_state = layer(hidden_state, print_values=idx == 0) - if idx == 0 and print_values: - print(f"Hidden states after block {idx}", hidden_state[0, 0, :3, :3]) + # if idx == 0 and print_values: + # print(f"Hidden states after block {idx}", hidden_state[0, 0, :3, :3]) return hidden_state @@ -615,9 +630,6 @@ def forward( hidden_state = stage_module(hidden_state, print_values=idx == 0) - print(f"Hidden states after stage {idx}: ", hidden_state.shape) - print(f"Hidden states after stage {idx}: ", hidden_state[0, 0, :3, :3]) - if output_hidden_states: hidden_states = hidden_states + (hidden_state,) @@ -716,9 +728,6 @@ def forward( embedding_output = self.embedder(pixel_values) - print("Shape of embeddings:", embedding_output.shape) - print("First values of embeddings:", embedding_output[0, 0, :3, :3]) - encoder_outputs = self.encoder( embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict ) @@ -727,14 +736,8 @@ def forward( last_hidden_state = self.norm(last_hidden_state) - print("Shape of final embeddings:", last_hidden_state.shape) - print("Final embeddings:", last_hidden_state[0, 0, :3, :3]) - pooled_output = self.pooler(last_hidden_state) - print("Pooled output:", pooled_output.shape) - print("Pool output:", pooled_output[0, 0, :3, :3]) - if not return_dict: return (last_hidden_state, pooled_output) + encoder_outputs[1:] diff --git a/src/transformers/models/bit/test.py b/src/transformers/models/bit/test.py index 7fe1aeaeee3d..d8e43bf91f74 100644 --- a/src/transformers/models/bit/test.py +++ b/src/transformers/models/bit/test.py @@ -1,9 +1,9 @@ -from transformers import BitConfig, BitForImageClassification +from transformers import BitConfig, BitBackbone -config = BitConfig(layer_type="bottleneck", stem_type="same", conv_layer="std_conv_same") +backbone_config = BitConfig(stem_type="same", layer_type="bottleneck", depths=(3, 4, 9), out_features=["stage3"]) -model = BitForImageClassification(config) +model = BitBackbone(backbone_config) for name, param in model.named_parameters(): print(name, param.shape) diff --git a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py index 6c306da704a5..1ada191b5f8a 100644 --- a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py +++ b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py @@ -56,10 +56,20 @@ def create_rename_keys(config, base_model=False): rename_keys.append(("patch_embed.backbone.stem.norm.bias", "vit.embeddings.patch_embeddings.backbone.resnetv2.embedder.norm.bias")) for stage_idx in range(len(config.backbone_config.depths)): - for layer_idx in range(config.backbone_config.depths[i]): - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.downsample.conv.weight", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.downsample.conv.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.downsample.norm.weight", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.downsample.norm.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.downsample.norm.bias", 
f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.downsample.norm.bias")) + for layer_idx in range(config.backbone_config.depths[stage_idx]): + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv1.weight", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.conv1.weight")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm1.weight", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.norm1.weight")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm1.bias", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.norm1.bias")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv2.weight", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.conv2.weight")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm2.weight", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.norm2.weight")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm2.bias", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.norm2.bias")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv3.weight", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.conv3.weight")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm3.weight", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.norm3.weight")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm3.bias", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.norm3.bias")) + + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.conv.weight", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.0.downsample.conv.weight")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.norm.weight", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.0.downsample.norm.weight")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.norm.bias", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.0.downsample.norm.bias")) # rename_keys = [] # for i in range(config.num_hidden_layers): @@ -155,7 +165,7 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): """ # define default ViT hybrid configuration - backbone_config = BitConfig(stem_type="same", layer_type="bottleneck", depths=(3, 4, 9), out_features=["stage3"]) + backbone_config = BitConfig(stem_type="same", conv_layer="std_conv_same", layer_type="bottleneck", depths=(3, 4, 9), out_features=["stage3"]) config = ViTHybridConfig(backbone_config=backbone_config, image_size=384, num_labels=1000) base_model = False diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py index 281757999771..8f28c51642ad 100644 --- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py +++ 
b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -156,6 +156,7 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: embeddings = self.projection(features).flatten(2).transpose(1, 2) print("Shape of embeddings:", embeddings.shape) + print("First values of embeddings:", embeddings[0,:3,:3]) return embeddings From bc64ea75474ce50ba85ebd054e231ddb47b80f17 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 11:37:49 +0100 Subject: [PATCH 08/88] More improvements --- src/transformers/models/bit/modeling_bit.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index e8320797b6dd..0daeb3eef00e 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -253,9 +253,9 @@ def __init__(self, config: BitConfig): elif config.conv_layer == "std_conv_same": conv_layer = partial(StdConv2dSame, eps=1e-8) - self.convolution = conv_layer( - config.num_channels, config.embedding_size, kernel_size=7, stride=2, padding=3, bias=False - ) + print("Conv layer:", conv_layer) + + self.convolution = conv_layer(config.num_channels, config.embedding_size, kernel_size=7, stride=2) self.norm = None if not config.layer_type == "preactivation": @@ -274,6 +274,9 @@ def forward(self, pixel_values: Tensor) -> Tensor: "Make sure that the channel dimension of the pixel values match with the one set in the configuration." ) + print("Shape of pixel_values:", pixel_values.shape) + print("First vaues of pixel values:", pixel_values[0,0,:3,:3]) + embedding = self.convolution(pixel_values) print("Shape of embeddings after conv2d:", embedding.shape) From 36bc9f5d2fca3e0b8ec7ac5ba91d92336372e4c4 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 11:44:00 +0100 Subject: [PATCH 09/88] Convert ViT-hybrid --- src/transformers/models/bit/modeling_bit.py | 6 -- .../convert_vit_hybrid_timm_to_pytorch.py | 83 ++++++++++--------- 2 files changed, 44 insertions(+), 45 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 0daeb3eef00e..9f0a1ab10cf7 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -279,17 +279,11 @@ def forward(self, pixel_values: Tensor) -> Tensor: embedding = self.convolution(pixel_values) - print("Shape of embeddings after conv2d:", embedding.shape) - print("First values:", embedding[0, 0, :3, :3]) - if self.norm is not None: embedding = self.norm(embedding) embedding = self.pooler(embedding) - print("Shape of BiT embeddings:", embedding.shape) - print("First values of BiT embeddings:", embedding[0,0,:3,:3]) - return embedding diff --git a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py index 1ada191b5f8a..4faac12e8cda 100644 --- a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py +++ b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py @@ -71,43 +71,43 @@ def create_rename_keys(config, base_model=False): rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.norm.weight", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.0.downsample.norm.weight")) rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.norm.bias", 
f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.0.downsample.norm.bias")) - # rename_keys = [] - # for i in range(config.num_hidden_layers): - # # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - # rename_keys.append((f"blocks.{i}.norm1.weight", f"vit.encoder.layer.{i}.layernorm_before.weight")) - # rename_keys.append((f"blocks.{i}.norm1.bias", f"vit.encoder.layer.{i}.layernorm_before.bias")) - # rename_keys.append((f"blocks.{i}.attn.proj.weight", f"vit.encoder.layer.{i}.attention.output.dense.weight")) - # rename_keys.append((f"blocks.{i}.attn.proj.bias", f"vit.encoder.layer.{i}.attention.output.dense.bias")) - # rename_keys.append((f"blocks.{i}.norm2.weight", f"vit.encoder.layer.{i}.layernorm_after.weight")) - # rename_keys.append((f"blocks.{i}.norm2.bias", f"vit.encoder.layer.{i}.layernorm_after.bias")) - # rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"vit.encoder.layer.{i}.intermediate.dense.weight")) - # rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"vit.encoder.layer.{i}.intermediate.dense.bias")) - # rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"vit.encoder.layer.{i}.output.dense.weight")) - # rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"vit.encoder.layer.{i}.output.dense.bias")) - - # if base_model: - # # layernorm + pooler - # rename_keys.extend( - # [ - # ("norm.weight", "layernorm.weight"), - # ("norm.bias", "layernorm.bias"), - # ("pre_logits.fc.weight", "pooler.dense.weight"), - # ("pre_logits.fc.bias", "pooler.dense.bias"), - # ] - # ) - - # # if just the base model, we should remove "vit" from all keys that start with "vit" - # rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("vit") else pair for pair in rename_keys] - # else: - # # layernorm + classification head - # rename_keys.extend( - # [ - # ("norm.weight", "vit.layernorm.weight"), - # ("norm.bias", "vit.layernorm.bias"), - # ("head.weight", "classifier.weight"), - # ("head.bias", "classifier.bias"), - # ] - # ) + # transformer encoder + for i in range(config.num_hidden_layers): + # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms + rename_keys.append((f"blocks.{i}.norm1.weight", f"vit.encoder.layer.{i}.layernorm_before.weight")) + rename_keys.append((f"blocks.{i}.norm1.bias", f"vit.encoder.layer.{i}.layernorm_before.bias")) + rename_keys.append((f"blocks.{i}.attn.proj.weight", f"vit.encoder.layer.{i}.attention.output.dense.weight")) + rename_keys.append((f"blocks.{i}.attn.proj.bias", f"vit.encoder.layer.{i}.attention.output.dense.bias")) + rename_keys.append((f"blocks.{i}.norm2.weight", f"vit.encoder.layer.{i}.layernorm_after.weight")) + rename_keys.append((f"blocks.{i}.norm2.bias", f"vit.encoder.layer.{i}.layernorm_after.bias")) + rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"vit.encoder.layer.{i}.intermediate.dense.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"vit.encoder.layer.{i}.intermediate.dense.bias")) + rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"vit.encoder.layer.{i}.output.dense.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"vit.encoder.layer.{i}.output.dense.bias")) + + if base_model: + # layernorm + pooler + rename_keys.extend( + [ + ("norm.weight", "layernorm.weight"), + ("norm.bias", "layernorm.bias"), + ("pre_logits.fc.weight", "pooler.dense.weight"), + ("pre_logits.fc.bias", "pooler.dense.bias"), + ] + ) + + # if just the base model, we should remove "vit" from all keys that start with "vit" + rename_keys = [(pair[0], 
pair[1][4:]) if pair[1].startswith("vit") else pair for pair in rename_keys] + else: + # layernorm + classification head + rename_keys.extend( + [ + ("norm.weight", "vit.layernorm.weight"), + ("norm.bias", "vit.layernorm.bias"), + ("head.weight", "classifier.weight"), + ("head.bias", "classifier.bias"), + ] + ) # fmt: on return rename_keys @@ -187,7 +187,7 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): model = ViTHybridModel(config).eval() else: model = ViTHybridForImageClassification(config).eval() - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) + model.load_state_dict(state_dict) # Check outputs on an image # TODO use feature extractor @@ -211,7 +211,11 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): print("First values of pixel values:", pixel_values.shape) print(pixel_values[0, :5, :5, :5]) - outputs = model(pixel_values) + with torch.no_grad(): + outputs = model(pixel_values) + logits = outputs.logits + + print("Predicted class:", logits.argmax(-1).item()) if base_model: timm_pooled_output = timm_model.forward_features(pixel_values) @@ -221,6 +225,7 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): timm_logits = timm_model(pixel_values) assert timm_logits.shape == outputs.logits.shape assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) + print("Looks ok!") if pytorch_dump_folder_path is not None: Path(pytorch_dump_folder_path).mkdir(exist_ok=True) From 137baf0d0d0f8a60e0ba3255e756863f9d484812 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 11:52:15 +0100 Subject: [PATCH 10/88] More improvements --- .../models/bit/convert_bit_to_pytorch.py | 28 ++++++++----------- src/transformers/models/bit/modeling_bit.py | 7 +++-- .../convert_vit_hybrid_timm_to_pytorch.py | 15 +++------- 3 files changed, 21 insertions(+), 29 deletions(-) diff --git a/src/transformers/models/bit/convert_bit_to_pytorch.py b/src/transformers/models/bit/convert_bit_to_pytorch.py index 3b69437e0c4b..2378315d0ec2 100644 --- a/src/transformers/models/bit/convert_bit_to_pytorch.py +++ b/src/transformers/models/bit/convert_bit_to_pytorch.py @@ -25,6 +25,8 @@ import requests from huggingface_hub import hf_hub_download from timm import create_model +from timm.data import resolve_data_config +from timm.data.transforms_factory import create_transform # from timm.data import resolve_data_config # from timm.data.transforms_factory import create_transform @@ -104,21 +106,12 @@ def convert_resnetv2_checkpoint(model_name, pytorch_dump_folder_path): model.load_state_dict(state_dict) # TODO verify logits - # transform = create_transform(**resolve_data_config({}, model=model)) - # url = "http://images.cocodataset.org/val2017/000000039769.jpg" - # image = Image.open(requests.get(url, stream=True).raw) - # weird bug: we don't get the same pixel values as in Colab - # load pixel values from the hub for the moment - # pixel_values = transform(image).unsqueeze(0) - - from huggingface_hub import hf_hub_download - - pixel_values = torch.load( - hf_hub_download("nielsr/dummy-pixel-values", repo_type="dataset", filename="pixel_values.pt") - ) + transform = create_transform(**resolve_data_config({}, model=timm_model)) + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + pixel_values = transform(image).unsqueeze(0) print("Shape of pixel values:", pixel_values.shape) - print("First values of pixel values:", pixel_values[0, 0, :3, :3]) with torch.no_grad(): 
outputs = model(pixel_values) @@ -126,9 +119,12 @@ def convert_resnetv2_checkpoint(model_name, pytorch_dump_folder_path): print("Logits:", logits[0, :3]) print("Predicted class:", model.config.id2label[logits.argmax(-1).item()]) - if model_name == "resnetv2_50x1_bitm": - expected_slice = torch.tensor([ 0.1665, -0.2718, -1.1446]) - assert torch.allclose(logits[0, :3], expected_slice, atol=1e-3) + # if model_name == "resnetv2_50x1_bitm": + # expected_slice = torch.tensor([ 0.1665, -0.2718, -1.1446]) + timm_logits = timm_model(pixel_values) + assert timm_logits.shape == outputs.logits.shape + assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) + # assert torch.allclose(logits[0, :3], expected_slice, atol=1e-3) print("Looks ok!") if pytorch_dump_folder_path is not None: diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 9f0a1ab10cf7..115f164ee5a2 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -253,8 +253,6 @@ def __init__(self, config: BitConfig): elif config.conv_layer == "std_conv_same": conv_layer = partial(StdConv2dSame, eps=1e-8) - print("Conv layer:", conv_layer) - self.convolution = conv_layer(config.num_channels, config.embedding_size, kernel_size=7, stride=2) self.norm = None @@ -284,6 +282,9 @@ def forward(self, pixel_values: Tensor) -> Tensor: embedding = self.pooler(embedding) + print("Shape of embedding:", embedding.shape) + print("First values of embedding:", embedding[0,0,:3,:3]) + return embedding @@ -626,6 +627,8 @@ def forward( hidden_states = hidden_states + (hidden_state,) hidden_state = stage_module(hidden_state, print_values=idx == 0) + print("Shape of hidden states after stage", idx, hidden_state.shape) + print("Hidden states after stage", idx, hidden_state[0, 0, :3, :3]) if output_hidden_states: hidden_states = hidden_states + (hidden_state,) diff --git a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py index 4faac12e8cda..a854d780719c 100644 --- a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py +++ b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py @@ -16,14 +16,16 @@ import argparse -import json from pathlib import Path +import requests import torch from PIL import Image -import requests import timm +from timm.data import resolve_data_config +from timm.data.transforms_factory import create_transform + from transformers import ( BitConfig, ViTFeatureExtractor, @@ -190,15 +192,6 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): model.load_state_dict(state_dict) # Check outputs on an image - # TODO use feature extractor - # from huggingface_hub import hf_hub_download - - # pixel_values = torch.load( - # hf_hub_download("nielsr/dummy-pixel-values", repo_type="dataset", filename="pixel_values.pt") - # ) - from timm.data import resolve_data_config - from timm.data.transforms_factory import create_transform - transform = create_transform( **resolve_data_config({}, model=timm_model) ) From 6284a7ca2d8f2d708153dca439414565a31b0268 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 30 Nov 2022 12:29:33 +0000 Subject: [PATCH 11/88] add patch bit --- src/transformers/models/bit/modeling_bit.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 115f164ee5a2..6f7141ed2e16 
100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -259,10 +259,12 @@ def __init__(self, config: BitConfig): if not config.layer_type == "preactivation": self.norm = partial(BitGroupNormActivation, num_groups=32)(config.embedding_size) + self.pad = nn.ConstantPad2d(padding=(1, 1, 1, 1), value=0.0) + if config.stem_type == "same": self.pooler = MaxPool2dSame(kernel_size=3, stride=2) else: - self.pooler = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.pooler = nn.MaxPool2d(kernel_size=3, stride=2) self.num_channels = config.num_channels def forward(self, pixel_values: Tensor) -> Tensor: @@ -276,6 +278,7 @@ def forward(self, pixel_values: Tensor) -> Tensor: print("First vaues of pixel values:", pixel_values[0,0,:3,:3]) embedding = self.convolution(pixel_values) + embedding = self.pad(embedding) if self.norm is not None: embedding = self.norm(embedding) From 2e0be65625455e52c8abda8be9e8abd5c4802e11 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 13:40:23 +0100 Subject: [PATCH 12/88] Fix style --- .../models/bit/convert_bit_to_pytorch.py | 7 +------ src/transformers/models/bit/modeling_bit.py | 6 +++--- src/transformers/models/bit/test.py | 2 +- .../convert_vit_hybrid_timm_to_pytorch.py | 15 +++++++++------ .../models/vit_hybrid/modeling_vit_hybrid.py | 2 +- 5 files changed, 15 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/bit/convert_bit_to_pytorch.py b/src/transformers/models/bit/convert_bit_to_pytorch.py index 2378315d0ec2..0fd0634369ab 100644 --- a/src/transformers/models/bit/convert_bit_to_pytorch.py +++ b/src/transformers/models/bit/convert_bit_to_pytorch.py @@ -105,26 +105,21 @@ def convert_resnetv2_checkpoint(model_name, pytorch_dump_folder_path): model.eval() model.load_state_dict(state_dict) - # TODO verify logits + # verify logits transform = create_transform(**resolve_data_config({}, model=timm_model)) url = "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(url, stream=True).raw) pixel_values = transform(image).unsqueeze(0) - print("Shape of pixel values:", pixel_values.shape) - with torch.no_grad(): outputs = model(pixel_values) logits = outputs.logits print("Logits:", logits[0, :3]) print("Predicted class:", model.config.id2label[logits.argmax(-1).item()]) - # if model_name == "resnetv2_50x1_bitm": - # expected_slice = torch.tensor([ 0.1665, -0.2718, -1.1446]) timm_logits = timm_model(pixel_values) assert timm_logits.shape == outputs.logits.shape assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) - # assert torch.allclose(logits[0, :3], expected_slice, atol=1e-3) print("Looks ok!") if pytorch_dump_folder_path is not None: diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 6f7141ed2e16..22612c699072 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -254,7 +254,7 @@ def __init__(self, config: BitConfig): conv_layer = partial(StdConv2dSame, eps=1e-8) self.convolution = conv_layer(config.num_channels, config.embedding_size, kernel_size=7, stride=2) - + self.norm = None if not config.layer_type == "preactivation": self.norm = partial(BitGroupNormActivation, num_groups=32)(config.embedding_size) @@ -275,7 +275,7 @@ def forward(self, pixel_values: Tensor) -> Tensor: ) print("Shape of pixel_values:", pixel_values.shape) - print("First vaues of pixel values:", pixel_values[0,0,:3,:3]) + print("First vaues of pixel 
values:", pixel_values[0, 0, :3, :3]) embedding = self.convolution(pixel_values) embedding = self.pad(embedding) @@ -286,7 +286,7 @@ def forward(self, pixel_values: Tensor) -> Tensor: embedding = self.pooler(embedding) print("Shape of embedding:", embedding.shape) - print("First values of embedding:", embedding[0,0,:3,:3]) + print("First values of embedding:", embedding[0, 0, :3, :3]) return embedding diff --git a/src/transformers/models/bit/test.py b/src/transformers/models/bit/test.py index d8e43bf91f74..de1d3d965a17 100644 --- a/src/transformers/models/bit/test.py +++ b/src/transformers/models/bit/test.py @@ -1,4 +1,4 @@ -from transformers import BitConfig, BitBackbone +from transformers import BitBackbone, BitConfig backbone_config = BitConfig(stem_type="same", layer_type="bottleneck", depths=(3, 4, 9), out_features=["stage3"]) diff --git a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py index a854d780719c..1a27937907e9 100644 --- a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py +++ b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py @@ -17,15 +17,14 @@ import argparse from pathlib import Path -import requests import torch from PIL import Image +import requests import timm from timm.data import resolve_data_config from timm.data.transforms_factory import create_transform - from transformers import ( BitConfig, ViTFeatureExtractor, @@ -167,7 +166,13 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): """ # define default ViT hybrid configuration - backbone_config = BitConfig(stem_type="same", conv_layer="std_conv_same", layer_type="bottleneck", depths=(3, 4, 9), out_features=["stage3"]) + backbone_config = BitConfig( + stem_type="same", + conv_layer="std_conv_same", + layer_type="bottleneck", + depths=(3, 4, 9), + out_features=["stage3"], + ) config = ViTHybridConfig(backbone_config=backbone_config, image_size=384, num_labels=1000) base_model = False @@ -192,9 +197,7 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): model.load_state_dict(state_dict) # Check outputs on an image - transform = create_transform( - **resolve_data_config({}, model=timm_model) - ) + transform = create_transform(**resolve_data_config({}, model=timm_model)) # load image url = "http://images.cocodataset.org/val2017/000000039769.jpg" diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py index 8f28c51642ad..4f84c8905850 100644 --- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -156,7 +156,7 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: embeddings = self.projection(features).flatten(2).transpose(1, 2) print("Shape of embeddings:", embeddings.shape) - print("First values of embeddings:", embeddings[0,:3,:3]) + print("First values of embeddings:", embeddings[0, :3, :3]) return embeddings From 98a0a6c819a28e6a7df63707b5a7260ae023693b Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 13:55:05 +0100 Subject: [PATCH 13/88] Improve code --- .../models/bit/convert_bit_to_pytorch.py | 13 +++----- src/transformers/models/bit/modeling_bit.py | 33 ++++++------------- src/transformers/models/bit/test.py | 6 ++-- .../convert_vit_hybrid_timm_to_pytorch.py | 8 +---- src/transformers/models/vit_hybrid/test.py | 13 +------- 5 files changed, 21 insertions(+), 52 
deletions(-) diff --git a/src/transformers/models/bit/convert_bit_to_pytorch.py b/src/transformers/models/bit/convert_bit_to_pytorch.py index 0fd0634369ab..8844f2ff33ae 100644 --- a/src/transformers/models/bit/convert_bit_to_pytorch.py +++ b/src/transformers/models/bit/convert_bit_to_pytorch.py @@ -27,9 +27,6 @@ from timm import create_model from timm.data import resolve_data_config from timm.data.transforms_factory import create_transform - -# from timm.data import resolve_data_config -# from timm.data.transforms_factory import create_transform from transformers import BitConfig, BitForImageClassification from transformers.utils import logging @@ -46,9 +43,10 @@ def get_config(model_name): label2id = {v: k for k, v in id2label.items()} conv_layer = "std_conv" if "bit" in model_name else False - # for the ViT-hybrid checkpoints, one needs to additionally set config.layer_type = "bottleneck" - # and use a different conv_layer, namely StdConv2dSame - # and "stem_type": "same" in the data config + + # note that when using BiT as backbone for ViT-hybrid checkpoints, + # one needs to additionally set config.layer_type = "bottleneck", config.stem_type = "same", + # config.conv_layer = "std_conv_same" config = BitConfig( conv_layer=conv_layer, num_labels=1000, @@ -107,8 +105,7 @@ def convert_resnetv2_checkpoint(model_name, pytorch_dump_folder_path): # verify logits transform = create_transform(**resolve_data_config({}, model=timm_model)) - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) + image = prepare_img() pixel_values = transform(image).unsqueeze(0) with torch.no_grad(): diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 22612c699072..e8d47190e349 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -61,7 +61,6 @@ ] -# Can SAME padding for given args be done statically? def is_static_pad(kernel_size: int, stride: int = 1, dilation: int = 1, **_): return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0 @@ -255,16 +254,17 @@ def __init__(self, config: BitConfig): self.convolution = conv_layer(config.num_channels, config.embedding_size, kernel_size=7, stride=2) - self.norm = None - if not config.layer_type == "preactivation": - self.norm = partial(BitGroupNormActivation, num_groups=32)(config.embedding_size) - - self.pad = nn.ConstantPad2d(padding=(1, 1, 1, 1), value=0.0) - + self.pad = None if config.stem_type == "same": self.pooler = MaxPool2dSame(kernel_size=3, stride=2) else: + self.pad = nn.ConstantPad2d(padding=(1, 1, 1, 1), value=0.0) self.pooler = nn.MaxPool2d(kernel_size=3, stride=2) + + self.norm = None + if not config.layer_type == "preactivation": + self.norm = partial(BitGroupNormActivation, num_groups=32)(config.embedding_size) + self.num_channels = config.num_channels def forward(self, pixel_values: Tensor) -> Tensor: @@ -274,20 +274,16 @@ def forward(self, pixel_values: Tensor) -> Tensor: "Make sure that the channel dimension of the pixel values match with the one set in the configuration." 
) - print("Shape of pixel_values:", pixel_values.shape) - print("First vaues of pixel values:", pixel_values[0, 0, :3, :3]) - embedding = self.convolution(pixel_values) - embedding = self.pad(embedding) + + if self.pad is not None: + embedding = self.pad(embedding) if self.norm is not None: embedding = self.norm(embedding) embedding = self.pooler(embedding) - print("Shape of embedding:", embedding.shape) - print("First values of embedding:", embedding[0, 0, :3, :3]) - return embedding @@ -494,15 +490,6 @@ def __init__( self.norm = nn.Identity() if preact else norm_layer(out_channels, apply_act=False) def forward(self, x, print_values=False): - # if print_values: - # print("Conv layer:", self.conv_layer) - # print("Hidden states before downsample conv:", x[0, 0, :3, :3]) - - z = self.conv(x) - - # if print_values: - # print("Hidden states after downsample conv:", z[0, 0, :3, :3]) - return self.norm(self.conv(x)) diff --git a/src/transformers/models/bit/test.py b/src/transformers/models/bit/test.py index de1d3d965a17..a6596073103f 100644 --- a/src/transformers/models/bit/test.py +++ b/src/transformers/models/bit/test.py @@ -1,3 +1,5 @@ +import torch + from transformers import BitBackbone, BitConfig @@ -5,5 +7,5 @@ model = BitBackbone(backbone_config) -for name, param in model.named_parameters(): - print(name, param.shape) +outputs = model(torch.rand(1, 3, 224, 224)) +print(outputs.feature_maps[-1].shape) diff --git a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py index 1a27937907e9..92cc12595d6f 100644 --- a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py +++ b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py @@ -25,13 +25,7 @@ import timm from timm.data import resolve_data_config from timm.data.transforms_factory import create_transform -from transformers import ( - BitConfig, - ViTFeatureExtractor, - ViTHybridConfig, - ViTHybridForImageClassification, - ViTHybridModel, -) +from transformers import BitConfig, ViTHybridConfig, ViTHybridForImageClassification, ViTHybridModel from transformers.utils import logging diff --git a/src/transformers/models/vit_hybrid/test.py b/src/transformers/models/vit_hybrid/test.py index 47371ac6a18a..861ce31c82f9 100644 --- a/src/transformers/models/vit_hybrid/test.py +++ b/src/transformers/models/vit_hybrid/test.py @@ -1,20 +1,9 @@ -import torch - -from transformers import BitBackbone, BitConfig, ViTHybridConfig, ViTHybridForImageClassification +from transformers import BitConfig, ViTHybridConfig, ViTHybridForImageClassification backbone_config = BitConfig(stem_type="same", layer_type="bottleneck", depths=(3, 4, 9), out_features=["stage3"]) config = ViTHybridConfig(backbone_config=backbone_config, image_size=384) -# model = BitBackbone(config=backbone_config) - -# for name, param in model.named_parameters(): -# print(name, param.shape) - -# outputs = model(torch.randn(1, 3, 384, 384)) - -# print(outputs.feature_maps[0].shape) - model = ViTHybridForImageClassification(config) for name, param in model.named_parameters(): From 3047a7171e297169a66781e9c6e2f8fdbf06e711 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 30 Nov 2022 14:23:06 +0000 Subject: [PATCH 14/88] cleaned v1 --- .../models/bit/configuration_bit.py | 4 + src/transformers/models/bit/modeling_bit.py | 313 ++++++++++-------- tests/models/bit/test_modeling_bit.py | 7 +- 3 files changed, 183 insertions(+), 141 deletions(-) diff --git 
a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index 49f500d9e2a2..3f159fbe60aa 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -49,6 +49,8 @@ class BitConfig(PretrainedConfig): hidden_act (`str`, *optional*, defaults to `"relu"`): The non-linear activation function in each block. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. + num_groups (`int`, *optional*, defaults to `32`): + Number of groups used for the `BitGroupNormActivation` layers downsample_in_first_stage (`bool`, *optional*, defaults to `False`): If `True`, the first stage will downsample the inputs using a `stride` of 2. drop_path_rate (`float`, *optional*, defaults to 0.0): @@ -89,6 +91,7 @@ def __init__( output_stride=32, width_factor=1, conv_layer="std_conv", + num_groups=32, out_features=None, **kwargs ): @@ -102,6 +105,7 @@ def __init__( self.stem_type = stem_type self.layer_type = layer_type self.hidden_act = hidden_act + self.num_groups = num_groups self.downsample_in_first_stage = downsample_in_first_stage self.drop_path_rate = drop_path_rate self.output_stride = output_stride diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index e8d47190e349..aa469f4631d3 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -17,13 +17,14 @@ import collections import math from functools import partial -from typing import List, Optional, Tuple +from typing import Optional, Tuple import torch import torch.utils.checkpoint from torch import Tensor, nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from ...activations import ACT2FN from ...modeling_outputs import ( BackboneOutput, BaseModelOutputWithNoAttention, @@ -61,20 +62,30 @@ ] -def is_static_pad(kernel_size: int, stride: int = 1, dilation: int = 1, **_): - return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0 +def get_padding_value(padding=None, kernel_size=7, stride=1, dilation=1) -> Tuple[Tuple, bool]: + r""" + Utility function to get the tuple padding value given the kernel_size and padding - -def get_padding_value(padding, kernel_size, **kwargs) -> Tuple[Tuple, bool]: + Args: + padding, Union[`str`, `int`]: + padding value, can be either `"same"`, `"valid"`. If a different value is provided the default padding from + PyTorch is used. + kernel_size, `int`: + Kernel size of the convolution layers. 
+ """ dynamic = False + if padding is None: + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 + return padding, dynamic + if isinstance(padding, str): # for any string padding, the padding will be calculated for you, one of three ways padding = padding.lower() if padding == "same": # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact - if is_static_pad(kernel_size, **kwargs): + if stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0: # static case, no extra overhead - padding = get_padding(kernel_size, **kwargs) + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 else: # dynamic 'SAME' padding, has runtime/GPU memory overhead padding = 0 @@ -84,7 +95,7 @@ def get_padding_value(padding, kernel_size, **kwargs) -> Tuple[Tuple, bool]: padding = 0 else: # Default to PyTorch style 'same'-ish symmetric padding - padding = get_padding(kernel_size, **kwargs) + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 return padding, dynamic @@ -118,12 +129,15 @@ def __init__( groups=groups, bias=bias, ) - self.same_pad = is_dynamic + if is_dynamic: + self.pad = DynamicPad2d(kernel_size, stride, dilation) + else: + self.pad = None self.eps = eps def forward(self, x): - if self.same_pad: - x = pad_same(x, self.kernel_size, self.stride, self.dilation) + if self.pad is not None: + x = self.pad(x) weight = nn.functional.batch_norm( self.weight.reshape(1, self.out_channels, -1), None, None, training=True, momentum=0.0, eps=self.eps ).reshape_as(self.weight) @@ -133,7 +147,8 @@ def forward(self, x): def _num_groups(num_channels, num_groups, group_size): if group_size: - assert num_channels % group_size == 0 + if num_channels % group_size == 0: + raise ValueError("num_channels must divide group_size") return num_channels // group_size return num_groups @@ -142,44 +157,31 @@ class BitGroupNormActivation(nn.GroupNorm): # NOTE num_channel and num_groups order flipped for easier layer swaps / binding of fixed args def __init__( self, + config, num_channels, num_groups=32, eps=1e-5, affine=True, group_size=None, apply_act=True, - act_layer=nn.ReLU, - inplace=True, drop_layer=None, ): super(BitGroupNormActivation, self).__init__( _num_groups(num_channels, num_groups, group_size), num_channels, eps=eps, affine=affine ) self.drop = drop_layer() if drop_layer is not None else nn.Identity() - # act_layer = get_act_layer(act_layer) # string -> nn.Module - if act_layer is not None and apply_act: - act_args = dict(inplace=True) if inplace else {} - self.act = act_layer(**act_args) + if apply_act: + self.activation = ACT2FN[config.hidden_act] else: - self.act = nn.Identity() - self._fast_norm = False # TODO add support for fast norm + self.activation = nn.Identity() def forward(self, x): - # if self._fast_norm: - # x = fast_group_norm(x, self.num_groups, self.weight, self.bias, self.eps) - # else: x = nn.functional.group_norm(x, self.num_groups, self.weight, self.bias, self.eps) x = self.drop(x) - x = self.act(x) + x = self.activation(x) return x -# Calculate symmetric padding for a convolution -def get_padding(kernel_size: int, stride: int = 1, dilation: int = 1, **_) -> int: - padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 - return padding - - class StdConv2d(nn.Conv2d): """Conv2d with Weight Standardization. Used for BiT ResNet-V2 models. 
@@ -191,7 +193,7 @@ def __init__( self, in_channel, out_channels, kernel_size, stride=1, padding=None, dilation=1, groups=1, bias=False, eps=1e-6 ): if padding is None: - padding = get_padding(kernel_size, stride, dilation) + padding, _ = get_padding_value(padding, kernel_size, stride, dilation) super().__init__( in_channel, out_channels, @@ -212,32 +214,72 @@ def forward(self, x): return x -# Calculate asymmetric TensorFlow-like 'SAME' padding for a convolution -def get_same_padding(x: int, k: int, s: int, d: int): - return max((math.ceil(x / s) - 1) * s + (k - 1) * d + 1 - x, 0) - +class DynamicPad2d(nn.Module): + r""" + A module that wraps dynamic padding of any input, given the parameters of the convolutional layer and the input + hidden states. + """ -# Dynamically pad input x with 'SAME' padding for conv with specified args -def pad_same(x, k: List[int], s: List[int], d: List[int] = (1, 1), value: float = 0): - ih, iw = x.size()[-2:] - pad_h, pad_w = get_same_padding(ih, k[0], s[0], d[0]), get_same_padding(iw, k[1], s[1], d[1]) - if pad_h > 0 or pad_w > 0: - x = nn.functional.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2], value=value) - return x + def __init__(self, kernel_size, stride, dilation, value=-float("inf")): + super().__init__(self) + # Safety checkers + if isinstance(kernel_size, int): + kernel_size = (kernel_size, kernel_size) + + if isinstance(stride, int): + stride = (stride, stride) + + if isinstance(dilation, int): + dilation = (dilation, dilation) + + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + self.value = value + + def compute_padding(x, kernel_size, stride, dilation): + return max((math.ceil(x / stride) - 1) * stride + (kernel_size - 1) * dilation + 1 - x, 0) + + self.compute_padding = compute_padding + + def __call__(self, input): + # Get width and height + input_height, input_width = input.size()[-2:] + + # Compute the padding values + padding_height = self.compute_padding(input_height, self.kernel_size[0], self.stride[0], self.dilation[0]) + padding_width = self.compute_padding(input_width, self.kernel_size[1], self.stride[1], self.dilation[1]) + + # apply pad + if padding_height > 0 or padding_width > 0: + x = nn.functional.pad( + input, + [ + padding_width // 2, + padding_width - padding_width // 2, + padding_height // 2, + padding_height - padding_height // 2, + ], + value=self.value, + ) + return x class MaxPool2dSame(nn.MaxPool2d): """Tensorflow like 'SAME' wrapper for 2D max pooling""" - def __init__(self, kernel_size: int, stride=None, dilation=1, ceil_mode=False): + def __init__( + self, kernel_size: int, stride=None, dilation=1, ceil_mode=False, padding=(0, 0), padding_value=-float("inf") + ): kernel_size = kernel_size if isinstance(kernel_size, collections.abc.Iterable) else (kernel_size, kernel_size) stride = stride if isinstance(stride, collections.abc.Iterable) else (stride, stride) dilation = dilation if isinstance(dilation, collections.abc.Iterable) else (dilation, dilation) - super(MaxPool2dSame, self).__init__(kernel_size, stride, (0, 0), dilation, ceil_mode) + super(MaxPool2dSame, self).__init__(kernel_size, stride, padding, dilation, ceil_mode) + self.pad = DynamicPad2d(kernel_size, stride, dilation, padding_value) def forward(self, x): - x = pad_same(x, self.kernel_size, self.stride, value=-float("inf")) - return nn.functional.max_pool2d(x, self.kernel_size, self.stride, (0, 0), self.dilation, self.ceil_mode) + x = self.pad(x) + return nn.functional.max_pool2d(x, 
self.kernel_size, self.stride, self.padding, self.dilation, self.ceil_mode) class BitEmbeddings(nn.Module): @@ -254,16 +296,17 @@ def __init__(self, config: BitConfig): self.convolution = conv_layer(config.num_channels, config.embedding_size, kernel_size=7, stride=2) - self.pad = None if config.stem_type == "same": self.pooler = MaxPool2dSame(kernel_size=3, stride=2) + self.pad = nn.Identity() else: self.pad = nn.ConstantPad2d(padding=(1, 1, 1, 1), value=0.0) self.pooler = nn.MaxPool2d(kernel_size=3, stride=2) - self.norm = None if not config.layer_type == "preactivation": - self.norm = partial(BitGroupNormActivation, num_groups=32)(config.embedding_size) + self.norm = partial(BitGroupNormActivation, num_groups=config.num_groups)(config.embedding_size) + else: + self.norm = nn.Identity() self.num_channels = config.num_channels @@ -274,14 +317,16 @@ def forward(self, pixel_values: Tensor) -> Tensor: "Make sure that the channel dimension of the pixel values match with the one set in the configuration." ) + # Conv embedding = self.convolution(pixel_values) - if self.pad is not None: - embedding = self.pad(embedding) + # Eventually pad + embedding = self.pad(embedding) - if self.norm is not None: - embedding = self.norm(embedding) + # Eventually use BitGroupNorm + embedding = self.norm(embedding) + # and pool embedding = self.pooler(embedding) return embedding @@ -323,12 +368,12 @@ def extra_repr(self) -> str: return "p={}".format(self.drop_prob) -def make_div(v, divisor=8): +def make_div(value, divisor=8): min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - if new_v < 0.9 * v: - new_v += divisor - return new_v + new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) + if new_value < 0.9 * value: + new_value += divisor + return new_value class BitPreActivationBottleneckLayer(nn.Module): @@ -348,27 +393,25 @@ def __init__( dilation=1, first_dilation=None, groups=1, - act_layer=None, conv_layer=None, norm_layer=None, - proj_layer=None, drop_path_rate=0.0, + num_groups=32, + is_first_layer=False, ): super().__init__() first_dilation = first_dilation or dilation conv_layer = conv_layer or StdConv2d - norm_layer = norm_layer or partial(BitGroupNormActivation, num_groups=32) + norm_layer = norm_layer or partial(BitGroupNormActivation, num_groups=num_groups) out_channels = out_channels or in_channels mid_channels = make_div(out_channels * bottle_ratio) - if proj_layer is not None: - self.downsample = proj_layer( + if is_first_layer: + self.downsample = BitDownsampleConv( in_channels, out_channels, stride=stride, - dilation=dilation, - first_dilation=first_dilation, preact=True, conv_layer=conv_layer, norm_layer=norm_layer, @@ -376,27 +419,21 @@ def __init__( else: self.downsample = None - self.norm1 = norm_layer(in_channels) + self.norm1 = norm_layer(num_channels=in_channels) self.conv1 = conv_layer(in_channels, mid_channels, 1) - self.norm2 = norm_layer(mid_channels) + self.norm2 = norm_layer(num_channels=mid_channels) self.conv2 = conv_layer(mid_channels, mid_channels, 3, stride=stride, dilation=first_dilation, groups=groups) - self.norm3 = norm_layer(mid_channels) + self.norm3 = norm_layer(num_channels=mid_channels) self.conv3 = conv_layer(mid_channels, out_channels, 1) self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() - def forward(self, x, print_values=False): + def forward(self, x): x_preact = self.norm1(x) - # if print_values: - # print("Hidden states after first norm:", x_preact[0, 0, :3, :3]) - # 
shortcut branch shortcut = x if self.downsample is not None: - shortcut = self.downsample(x_preact, print_values) - - # if print_values: - # print("Hidden states after downsample:", shortcut[0, 0, :3, :3]) + shortcut = self.downsample(x_preact) # residual branch x = self.conv1(x_preact) @@ -411,6 +448,7 @@ class BitBottleneckLayer(nn.Module): def __init__( self, + config, in_channels, out_channels=None, bottle_ratio=0.25, @@ -418,26 +456,24 @@ def __init__( dilation=1, first_dilation=None, groups=1, - act_layer=None, conv_layer=None, norm_layer=None, - proj_layer=None, drop_path_rate=0.0, + num_groups=32, + is_first_layer=False, ): super().__init__() first_dilation = first_dilation or dilation - act_layer = act_layer or nn.ReLU conv_layer = conv_layer or StdConv2d - norm_layer = norm_layer or partial(BitGroupNormActivation, num_groups=32) + norm_layer = norm_layer or partial(BitGroupNormActivation, num_groups=num_groups) out_channels = out_channels or in_channels mid_chs = make_div(out_channels * bottle_ratio) - if proj_layer is not None: - self.downsample = proj_layer( + if is_first_layer: + self.downsample = BitDownsampleConv( in_channels, out_channels, stride=stride, - dilation=dilation, preact=False, conv_layer=conv_layer, norm_layer=norm_layer, @@ -452,9 +488,9 @@ def __init__( self.conv3 = conv_layer(mid_chs, out_channels, 1) self.norm3 = norm_layer(out_channels, apply_act=False) self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() - self.act3 = act_layer(inplace=True) + self.activation = ACT2FN[config.hidden_act] - def forward(self, x, print_values=False): + def forward(self, x): # shortcut branch shortcut = x if self.downsample is not None: @@ -463,12 +499,18 @@ def forward(self, x, print_values=False): # residual x = self.conv1(x) x = self.norm1(x) + + # second step x = self.conv2(x) x = self.norm2(x) + + # third step x = self.conv3(x) x = self.norm3(x) + + # final step x = self.drop_path(x) - x = self.act3(x + shortcut) + x = self.activation(x + shortcut) return x @@ -478,8 +520,6 @@ def __init__( in_channels, out_channels, stride=1, - dilation=1, - first_dilation=None, preact=True, conv_layer=None, norm_layer=None, @@ -489,7 +529,7 @@ def __init__( self.conv = conv_layer(in_channels, out_channels, 1, stride=stride) self.norm = nn.Identity() if preact else norm_layer(out_channels, apply_act=False) - def forward(self, x, print_values=False): + def forward(self, x): return self.norm(self.conv(x)) @@ -500,6 +540,7 @@ class BitStage(nn.Module): def __init__( self, + config, in_channels, out_channels, stride, @@ -507,22 +548,36 @@ def __init__( depth, bottle_ratio=0.25, groups=1, - avg_down=False, layer_dpr=None, - layer_fn=BitPreActivationBottleneckLayer, - act_layer=None, - conv_layer=None, - norm_layer=None, - **layer_kwargs ): super().__init__() first_dilation = 1 if dilation in (1, 2) else 2 - layer_kwargs = dict(act_layer=act_layer, conv_layer=conv_layer, norm_layer=norm_layer) - if avg_down: - # TODO add support for avg_down - raise NotImplementedError("avg_down is not implemented") - proj_layer = BitDownsampleConv + + # Step 1: Get the layer type + if config.layer_type == "bottleneck": + layer_fn = BitBottleneckLayer + elif config.layer_type == "preactivation": + layer_fn = BitPreActivationBottleneckLayer + else: + raise ValueError( + f"Unknown layer type: {config.layer_type}. 
Please use one of the following: [`'bottleneck'`," + " `'preactivation`]" + ) + + # Step 2: Getting the convolution type + if config.conv_layer == "std_conv": + conv_layer = partial(StdConv2d, eps=1e-8) + elif config.conv_layer == "std_conv_same": + conv_layer = partial(StdConv2dSame, eps=1e-8) + else: + raise ValueError( + f"Convolutional layer {config.conv_layer} not supported! Please use one of the following:" + " [`'std_conv'`, `'std_conv_same`]" + ) + + norm_layer = partial(BitGroupNormActivation, config=config, num_groups=config.num_groups) + prev_chs = in_channels self.layers = nn.Sequential() for layer_idx in range(depth): @@ -538,23 +593,19 @@ def __init__( bottle_ratio=bottle_ratio, groups=groups, first_dilation=first_dilation, - proj_layer=proj_layer, drop_path_rate=drop_path_rate, - **layer_kwargs, + conv_layer=conv_layer, + norm_layer=norm_layer, + is_first_layer=(layer_idx == 0), ), ) prev_chs = out_channels first_dilation = dilation - proj_layer = None - def forward(self, input: Tensor, print_values=False) -> Tensor: + def forward(self, input: Tensor) -> Tensor: hidden_state = input - for idx, layer in enumerate(self.layers): - # if idx == 0 and print_values: - # print(f"Hidden states before block {idx}", hidden_state[0, 0, :3, :3]) - hidden_state = layer(hidden_state, print_values=idx == 0) - # if idx == 0 and print_values: - # print(f"Hidden states after block {idx}", hidden_state[0, 0, :3, :3]) + for _, layer in enumerate(self.layers): + hidden_state = layer(hidden_state) return hidden_state @@ -563,45 +614,27 @@ def __init__(self, config: BitConfig): super().__init__() self.stages = nn.ModuleList([]) - act_layer = nn.ReLU - if config.conv_layer == "std_conv": - conv_layer = partial(StdConv2d, eps=1e-8) - elif config.conv_layer == "std_conv_same": - conv_layer = partial(StdConv2dSame, eps=1e-8) - - norm_layer = partial(BitGroupNormActivation, num_groups=32) - prev_chs = config.embedding_size curr_stride = 4 dilation = 1 layer_dprs = [ x.tolist() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths)).split(config.depths) ] - if config.layer_type == "bottleneck": - layer_fn = BitBottleneckLayer - elif config.layer_type == "preactivation": - layer_fn = BitPreActivationBottleneckLayer - else: - raise ValueError("Unknown layer type: {}".format(config.layer_type)) - for stage_idx, (d, c, bdpr) in enumerate(zip(config.depths, config.hidden_sizes, layer_dprs)): + for stage_idx, (current_depth, c, bdpr) in enumerate(zip(config.depths, config.hidden_sizes, layer_dprs)): out_channels = make_div(c * config.width_factor) stride = 1 if stage_idx == 0 else 2 if curr_stride >= config.output_stride: dilation *= stride stride = 1 stage = BitStage( + config, prev_chs, out_channels, stride=stride, dilation=dilation, - depth=d, - avg_down=False, - act_layer=act_layer, - conv_layer=conv_layer, - norm_layer=norm_layer, + depth=current_depth, layer_dpr=bdpr, - layer_fn=layer_fn, ) prev_chs = out_channels curr_stride *= stride @@ -612,13 +645,11 @@ def forward( ) -> BaseModelOutputWithNoAttention: hidden_states = () if output_hidden_states else None - for idx, stage_module in enumerate(self.stages): + for stage_module in self.stages: if output_hidden_states: hidden_states = hidden_states + (hidden_state,) - hidden_state = stage_module(hidden_state, print_values=idx == 0) - print("Shape of hidden states after stage", idx, hidden_state.shape) - print("Hidden states after stage", idx, hidden_state[0, 0, :3, :3]) + hidden_state = stage_module(hidden_state) if output_hidden_states: 
hidden_states = hidden_states + (hidden_state,) @@ -692,8 +723,12 @@ def __init__(self, config): self.embedder = BitEmbeddings(config) self.encoder = BitEncoder(config) - norm_layer = partial(BitGroupNormActivation, num_groups=32) - self.norm = norm_layer(config.hidden_sizes[-1]) if config.layer_type == "preactivation" else nn.Identity() + norm_layer = partial(BitGroupNormActivation, num_groups=config.num_groups) + self.norm = ( + norm_layer(config, num_channels=config.hidden_sizes[-1]) + if config.layer_type == "preactivation" + else nn.Identity() + ) self.pooler = nn.AdaptiveAvgPool2d((1, 1)) # Initialize weights and apply final processing @@ -864,8 +899,8 @@ def forward( >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50") - >>> model = AutoBackbone.from_pretrained("microsoft/resnet-50") + >>> processor = AutoImageProcessor.from_pretrained("google/resnetnv2-50") + >>> model = AutoBackbone.from_pretrained("google/resnetnv2-50") >>> inputs = processor(image, return_tensors="pt") >>> outputs = model(**inputs) diff --git a/tests/models/bit/test_modeling_bit.py b/tests/models/bit/test_modeling_bit.py index 7cb26e81d654..35d2871422d8 100644 --- a/tests/models/bit/test_modeling_bit.py +++ b/tests/models/bit/test_modeling_bit.py @@ -48,7 +48,7 @@ def __init__( image_size=32, num_channels=3, embeddings_size=10, - hidden_sizes=[10, 20, 30, 40], + hidden_sizes=[8, 16, 32, 64], depths=[1, 1, 2, 1], is_training=True, use_labels=True, @@ -56,6 +56,7 @@ def __init__( num_labels=3, scope=None, out_features=["stage2", "stage3", "stage4"], + num_groups=1, ): self.parent = parent self.batch_size = batch_size @@ -71,6 +72,7 @@ def __init__( self.scope = scope self.num_stages = len(hidden_sizes) self.out_features = out_features + self.num_groups = num_groups def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -92,6 +94,7 @@ def get_config(self): hidden_act=self.hidden_act, num_labels=self.num_labels, out_features=self.out_features, + num_groups=self.num_groups, ) def create_and_check_model(self, config, pixel_values, labels): @@ -242,7 +245,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): ) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - layers_type = ["basic", "bottleneck"] + layers_type = ["preactivation", "bottleneck"] for model_class in self.all_model_classes: for layer_type in layers_type: config.layer_type = layer_type From fc17d5a590f07b008f0a0a0dd7c8d7066b467249 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 30 Nov 2022 14:47:37 +0000 Subject: [PATCH 15/88] more cleaning --- src/transformers/models/bit/modeling_bit.py | 120 +++++++++++++------- 1 file changed, 80 insertions(+), 40 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index aa469f4631d3..31a17a111606 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -386,6 +386,7 @@ class BitPreActivationBottleneckLayer(nn.Module): def __init__( self, + config, in_channels, out_channels=None, bottle_ratio=0.25, @@ -393,17 +394,25 @@ def __init__( dilation=1, first_dilation=None, groups=1, - conv_layer=None, - norm_layer=None, drop_path_rate=0.0, - num_groups=32, is_first_layer=False, ): super().__init__() first_dilation = 
first_dilation or dilation - conv_layer = conv_layer or StdConv2d - norm_layer = norm_layer or partial(BitGroupNormActivation, num_groups=num_groups) + + if config.conv_layer == "std_conv": + conv_layer = partial(StdConv2d, eps=1e-8) + elif config.conv_layer == "std_conv_same": + conv_layer = partial(StdConv2dSame, eps=1e-8) + else: + raise ValueError( + f"Convolutional layer {config.conv_layer} not supported! Please use one of the following:" + " [`'std_conv'`, `'std_conv_same`]" + ) + + norm_layer = partial(BitGroupNormActivation, config=config, num_groups=config.num_groups) + out_channels = out_channels or in_channels mid_channels = make_div(out_channels * bottle_ratio) @@ -457,15 +466,25 @@ def __init__( first_dilation=None, groups=1, conv_layer=None, - norm_layer=None, drop_path_rate=0.0, - num_groups=32, is_first_layer=False, ): super().__init__() first_dilation = first_dilation or dilation - conv_layer = conv_layer or StdConv2d - norm_layer = norm_layer or partial(BitGroupNormActivation, num_groups=num_groups) + + # Getting the convolution type + if config.conv_layer == "std_conv": + conv_layer = partial(StdConv2d, eps=1e-8) + elif config.conv_layer == "std_conv_same": + conv_layer = partial(StdConv2dSame, eps=1e-8) + else: + raise ValueError( + f"Convolutional layer {config.conv_layer} not supported! Please use one of the following:" + " [`'std_conv'`, `'std_conv_same`]" + ) + + norm_layer = partial(BitGroupNormActivation, config=config, num_groups=config.num_groups) + out_channels = out_channels or in_channels mid_chs = make_div(out_channels * bottle_ratio) @@ -483,10 +502,13 @@ def __init__( self.conv1 = conv_layer(in_channels, mid_chs, 1) self.norm1 = norm_layer(mid_chs) + self.conv2 = conv_layer(mid_chs, mid_chs, 3, stride=stride, dilation=first_dilation, groups=groups) self.norm2 = norm_layer(mid_chs) + self.conv3 = conv_layer(mid_chs, out_channels, 1) self.norm3 = norm_layer(out_channels, apply_act=False) + self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() self.activation = ACT2FN[config.hidden_act] @@ -547,8 +569,7 @@ def __init__( dilation, depth, bottle_ratio=0.25, - groups=1, - layer_dpr=None, + layer_dropout=None, ): super().__init__() @@ -565,43 +586,47 @@ def __init__( " `'preactivation`]" ) - # Step 2: Getting the convolution type - if config.conv_layer == "std_conv": - conv_layer = partial(StdConv2d, eps=1e-8) - elif config.conv_layer == "std_conv_same": - conv_layer = partial(StdConv2dSame, eps=1e-8) - else: - raise ValueError( - f"Convolutional layer {config.conv_layer} not supported! 
Please use one of the following:" - " [`'std_conv'`, `'std_conv_same`]" - ) - - norm_layer = partial(BitGroupNormActivation, config=config, num_groups=config.num_groups) - prev_chs = in_channels self.layers = nn.Sequential() for layer_idx in range(depth): - drop_path_rate = layer_dpr[layer_idx] if layer_dpr else 0.0 - stride = stride if layer_idx == 0 else 1 + # Get the current hyper-parameters + stride, drop_path_rate, is_first_layer = self._get_updated_hyperparameters( + layer_idx, stride, layer_dropout + ) + self.layers.add_module( str(layer_idx), layer_fn( + config, prev_chs, out_channels, stride=stride, dilation=dilation, bottle_ratio=bottle_ratio, - groups=groups, first_dilation=first_dilation, drop_path_rate=drop_path_rate, - conv_layer=conv_layer, - norm_layer=norm_layer, - is_first_layer=(layer_idx == 0), + is_first_layer=is_first_layer, ), ) prev_chs = out_channels first_dilation = dilation + def _get_updated_hyperparameters(self, layer_idx, stride, layer_dropout): + r""" + Get the new hyper-parameters with respect to the previous ones and the index of the current layer. + """ + if layer_dropout: + drop_path_rate = layer_dropout[layer_idx] + else: + drop_path_rate = 0.0 + + if layer_idx != 0: + stride = 1 + + is_first_layer = layer_idx == 0 + + return stride, drop_path_rate, is_first_layer + def forward(self, input: Tensor) -> Tensor: hidden_state = input for _, layer in enumerate(self.layers): @@ -615,18 +640,23 @@ def __init__(self, config: BitConfig): self.stages = nn.ModuleList([]) prev_chs = config.embedding_size - curr_stride = 4 + + # These needs to stay hardcoded + current_stride = 4 dilation = 1 - layer_dprs = [ + + layer_dropouts = [ x.tolist() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths)).split(config.depths) ] - for stage_idx, (current_depth, c, bdpr) in enumerate(zip(config.depths, config.hidden_sizes, layer_dprs)): - out_channels = make_div(c * config.width_factor) - stride = 1 if stage_idx == 0 else 2 - if curr_stride >= config.output_stride: - dilation *= stride - stride = 1 + for stage_idx, (current_depth, current_hidden_size, layer_dropout) in enumerate( + zip(config.depths, config.hidden_sizes, layer_dropouts) + ): + # Get the updated hyper params + out_channels, stride, dilation = self._get_updated_hyperparameters( + stage_idx, current_stride, current_hidden_size, dilation, config + ) + stage = BitStage( config, prev_chs, @@ -634,12 +664,22 @@ def __init__(self, config: BitConfig): stride=stride, dilation=dilation, depth=current_depth, - layer_dpr=bdpr, + layer_dropout=layer_dropout, ) + prev_chs = out_channels - curr_stride *= stride + current_stride *= stride + self.stages.add_module(str(stage_idx), stage) + def _get_updated_hyperparameters(self, stage_idx, current_stride, current_hidden_size, dilation, config): + out_channels = make_div(current_hidden_size * config.width_factor) + stride = 1 if stage_idx == 0 else 2 + if current_stride >= config.output_stride: + dilation *= stride + stride = 1 + return out_channels, stride, dilation + def forward( self, hidden_state: Tensor, output_hidden_states: bool = False, return_dict: bool = True ) -> BaseModelOutputWithNoAttention: From d7e34e71e55add71a054d5cc6e946478f0b46b17 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 30 Nov 2022 14:58:22 +0000 Subject: [PATCH 16/88] more refactoring --- src/transformers/models/bit/modeling_bit.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py 
b/src/transformers/models/bit/modeling_bit.py index 31a17a111606..b8542fcd6bcc 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -304,7 +304,9 @@ def __init__(self, config: BitConfig): self.pooler = nn.MaxPool2d(kernel_size=3, stride=2) if not config.layer_type == "preactivation": - self.norm = partial(BitGroupNormActivation, num_groups=config.num_groups)(config.embedding_size) + self.norm = partial(BitGroupNormActivation, config=config, num_groups=config.num_groups)( + num_channels=config.embedding_size + ) else: self.norm = nn.Identity() @@ -501,13 +503,13 @@ def __init__( self.downsample = None self.conv1 = conv_layer(in_channels, mid_chs, 1) - self.norm1 = norm_layer(mid_chs) + self.norm1 = norm_layer(num_channels=mid_chs) self.conv2 = conv_layer(mid_chs, mid_chs, 3, stride=stride, dilation=first_dilation, groups=groups) - self.norm2 = norm_layer(mid_chs) + self.norm2 = norm_layer(num_channels=mid_chs) self.conv3 = conv_layer(mid_chs, out_channels, 1) - self.norm3 = norm_layer(out_channels, apply_act=False) + self.norm3 = norm_layer(num_channels=out_channels, apply_act=False) self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() self.activation = ACT2FN[config.hidden_act] @@ -549,7 +551,7 @@ def __init__( super(BitDownsampleConv, self).__init__() self.conv_layer = conv_layer self.conv = conv_layer(in_channels, out_channels, 1, stride=stride) - self.norm = nn.Identity() if preact else norm_layer(out_channels, apply_act=False) + self.norm = nn.Identity() if preact else norm_layer(num_channels=out_channels, apply_act=False) def forward(self, x): return self.norm(self.conv(x)) From 33ec655f43aec4efd66751810a0765fab3698c04 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 14:29:55 +0100 Subject: [PATCH 17/88] Improve models, add tests --- .../models/bit/configuration_bit.py | 1 - .../convert_vit_hybrid_timm_to_pytorch.py | 4 ---- .../models/vit_hybrid/modeling_vit_hybrid.py | 20 ++++++------------- tests/models/bit/test_modeling_bit.py | 15 ++++---------- 4 files changed, 10 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index 3f159fbe60aa..fd69e5d8ad11 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -86,7 +86,6 @@ def __init__( stem_type="", layer_type="preactivation", hidden_act="relu", - downsample_in_first_stage=False, drop_path_rate=0.0, output_stride=32, width_factor=1, diff --git a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py index 92cc12595d6f..9ff005521aa1 100644 --- a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py +++ b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py @@ -198,15 +198,11 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): image = Image.open(requests.get(url, stream=True).raw) pixel_values = transform(image).unsqueeze(0) - print("First values of pixel values:", pixel_values.shape) - print(pixel_values[0, :5, :5, :5]) - with torch.no_grad(): outputs = model(pixel_values) logits = outputs.logits print("Predicted class:", logits.argmax(-1).item()) - if base_model: timm_pooled_output = timm_model.forward_features(pixel_values) assert timm_pooled_output.shape == outputs.pooler_output.shape diff --git 
a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py index 4f84c8905850..8d91faa61211 100644 --- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2021 Google AI, Ross Wightman, The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 Google AI, Ross Wightman, The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -79,10 +79,7 @@ def __init__(self, config: ViTHybridConfig): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.config = config - def forward( - self, - pixel_values: torch.Tensor, - ) -> torch.Tensor: + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: embeddings = self.patch_embeddings(pixel_values) # add the [CLS] token to the embedded patch tokens @@ -132,10 +129,7 @@ def __init__(self, config, feature_size=None): feature_size = ( feature_size if isinstance(feature_size, collections.abc.Iterable) else (feature_size, feature_size) ) - if hasattr(self.backbone, "feature_info"): - feature_dim = self.backbone.feature_info.channels()[-1] - else: - feature_dim = self.backbone.num_features + feature_dim = self.backbone.channels[-1] assert feature_size[0] % patch_size[0] == 0 and feature_size[1] % patch_size[1] == 0 self.grid_size = (feature_size[0] // patch_size[0], feature_size[1] // patch_size[1]) @@ -147,16 +141,14 @@ def __init__(self, config, feature_size=None): self.projection = nn.Conv2d(feature_dim, hidden_size, kernel_size=patch_size, stride=patch_size) def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: - batch_size, num_channels, height, width = pixel_values.shape - features = self.backbone(pixel_values).feature_maps[-1] + num_channels = pixel_values.shape[1] if num_channels != self.num_channels: raise ValueError( "Make sure that the channel dimension of the pixel values match with the one set in the configuration." ) - embeddings = self.projection(features).flatten(2).transpose(1, 2) - print("Shape of embeddings:", embeddings.shape) - print("First values of embeddings:", embeddings[0, :3, :3]) + features = self.backbone(pixel_values).feature_maps[-1] + embeddings = self.projection(features).flatten(2).transpose(1, 2) return embeddings diff --git a/tests/models/bit/test_modeling_bit.py b/tests/models/bit/test_modeling_bit.py index 35d2871422d8..71a7123d5fec 100644 --- a/tests/models/bit/test_modeling_bit.py +++ b/tests/models/bit/test_modeling_bit.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Bit model. """ +""" Testing suite for the PyTorch BiT model. 
""" import inspect @@ -52,7 +52,6 @@ def __init__( depths=[1, 1, 2, 1], is_training=True, use_labels=True, - hidden_act="relu", num_labels=3, scope=None, out_features=["stage2", "stage3", "stage4"], @@ -67,7 +66,6 @@ def __init__( self.depths = depths self.is_training = is_training self.use_labels = use_labels - self.hidden_act = hidden_act self.num_labels = num_labels self.scope = scope self.num_stages = len(hidden_sizes) @@ -91,7 +89,6 @@ def get_config(self): embeddings_size=self.embeddings_size, hidden_sizes=self.hidden_sizes, depths=self.depths, - hidden_act=self.hidden_act, num_labels=self.num_labels, out_features=self.out_features, num_groups=self.num_groups, @@ -102,11 +99,7 @@ def create_and_check_model(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - # expected last hidden states: B, C, H // 32, W // 32 - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32), - ) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.hidden_sizes[-1], 4, 4)) def create_and_check_for_image_classification(self, config, pixel_values, labels): config.num_labels = self.num_labels @@ -124,11 +117,11 @@ def create_and_check_backbone(self, config, pixel_values, labels): # verify hidden states self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) - self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.hidden_sizes[1], 4, 4]) + self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.hidden_sizes[0], 8, 8]) # verify channels self.parent.assertEqual(len(model.channels), len(config.out_features)) - self.parent.assertListEqual(model.channels, config.hidden_sizes[1:]) + self.parent.assertListEqual(model.channels, config.hidden_sizes) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() From 6002b4718b1c26d1bbffa532941d8adc509678d2 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 15:01:53 +0100 Subject: [PATCH 18/88] Add docs and tests --- README.md | 3 +- README_es.md | 3 +- README_ja.md | 3 +- README_ko.md | 3 +- README_zh-hans.md | 3 +- README_zh-hant.md | 3 +- docs/source/en/index.mdx | 4 +- docs/source/en/model_doc/bit.mdx | 16 +- docs/source/en/model_doc/vit_hybrid.mdx | 57 ++++ src/transformers/__init__.py | 2 +- .../models/auto/configuration_auto.py | 3 + src/transformers/models/auto/modeling_auto.py | 2 + .../vit_hybrid/configuration_vit_hybrid.py | 29 ++- .../models/vit_hybrid/modeling_vit_hybrid.py | 8 +- src/transformers/utils/dummy_pt_objects.py | 4 +- tests/models/vit_hybrid/__init__.py | 0 .../vit_hybrid/test_modeling_vit_hybrid.py | 246 ++++++++++++++++++ utils/check_repo.py | 1 + 18 files changed, 357 insertions(+), 33 deletions(-) create mode 100644 docs/source/en/model_doc/vit_hybrid.mdx create mode 100644 tests/models/vit_hybrid/__init__.py create mode 100644 tests/models/vit_hybrid/test_modeling_vit_hybrid.py diff --git a/README.md b/README.md index eddb41abe852..354437acc129 100644 --- a/README.md +++ b/README.md @@ -272,7 +272,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. 
**[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from ) released with the paper []() by . +1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). @@ -404,6 +404,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. +1. 
**[ViT hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_es.md b/README_es.md index c5a40704e6e3..7166faaff3e0 100644 --- a/README_es.md +++ b/README_es.md @@ -272,7 +272,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from ) released with the paper []() by . +1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. 
**[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). @@ -404,6 +404,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. +1. **[ViT hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_ja.md b/README_ja.md index 3af7a69af663..c3fae80fecf8 100644 --- a/README_ja.md +++ b/README_ja.md @@ -307,7 +307,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. 
**[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from ) released with the paper []() by . +1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). @@ -439,6 +439,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. +1. 
**[ViT hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_ko.md b/README_ko.md index 3050b14e95ed..99ed10526c34 100644 --- a/README_ko.md +++ b/README_ko.md @@ -222,7 +222,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from ) released with the paper []() by . +1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. 
**[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). @@ -354,6 +354,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. +1. **[ViT hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_zh-hans.md b/README_zh-hans.md index dbbbf27ca2a9..1a82ddc363c5 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -246,7 +246,7 @@ conda install -c huggingface transformers 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (来自 VinAI Research) 伴随论文 [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) 由 Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen 发布。 1. 
**[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。 -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from ) released with the paper []() by . +1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (来自 Google AI) 伴随论文 [Big Transfer (BiT) 由 Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby 发布。 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). @@ -378,6 +378,7 @@ conda install -c huggingface transformers 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (来自 NAVER AI Lab/Kakao Enterprise/Kakao Brain) 伴随论文 [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) 由 Wonjae Kim, Bokyung Son, Ildoo Kim 发布。 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (来自 UCLA NLP) 伴随论文 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 由 Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 发布。 +1. **[ViT hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。 1. 
**[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (来自 Meta AI) 伴随论文 [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) 由 Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick 发布。 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (来自 Meta AI) 伴随论文 [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas 发布. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (来自 Facebook AI) 伴随论文 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 由 Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index cd5c1f920633..b58f83d6a05d 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -258,7 +258,7 @@ conda install -c huggingface transformers 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from ) released with the paper []() by . +1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). @@ -390,6 +390,7 @@ conda install -c huggingface transformers 1. 
**[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. +1. **[ViT hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 38444d5deaa0..5bc76bd71c10 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -60,7 +60,7 @@ The documentation is organized into five sections: 1. **[BERTweet](model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. 
**[BigBird-RoBERTa](model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BiT](model_doc/bit)** (from ) released with the paper []() by . +1. **[BiT](model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BLOOM](model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). @@ -192,6 +192,7 @@ The documentation is organized into five sections: 1. **[ViLT](model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. +1. **[ViT hybrid](model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[ViTMAE](model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMSN](model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. 1. 
**[Wav2Vec2](model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. @@ -355,6 +356,7 @@ Flax), PyTorch, and/or TensorFlow. | VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | | VisualBERT | ❌ | ❌ | ✅ | ❌ | ❌ | | ViT | ❌ | ❌ | ✅ | ✅ | ✅ | +| ViT hybrid | ❌ | ❌ | ✅ | ❌ | ❌ | | ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ | | ViTMSN | ❌ | ❌ | ✅ | ❌ | ❌ | | Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | diff --git a/docs/source/en/model_doc/bit.mdx b/docs/source/en/model_doc/bit.mdx index 325c30669f16..cbc34c9f8878 100644 --- a/docs/source/en/model_doc/bit.mdx +++ b/docs/source/en/model_doc/bit.mdx @@ -10,23 +10,25 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# BiT +# Big Transfer (BiT) ## Overview -The BiT model was proposed in []() by . - +The BiT model was proposed in [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. +BiT is a simple recipe for scaling up pre-training of [ResNet](resnet)-like architectures (specifically, ResNetv2). The method results in significant improvements for transfer learning. The abstract from the paper is the following: -** +*Transfer of pre-trained representations improves sample efficiency and simplifies hyperparameter tuning when training deep neural networks for vision. We revisit the paradigm of pre-training on large supervised datasets and fine-tuning the model on a target task. We scale up pre-training, and propose a simple recipe that we call Big Transfer (BiT). By combining a few carefully selected components, and transferring using a simple heuristic, we achieve strong performance on over 20 datasets. BiT performs well across a surprisingly wide range of data regimes -- from 1 example per class to 1M total examples. BiT achieves 87.5% top-1 accuracy on ILSVRC-2012, 99.4% on CIFAR-10, and 76.3% on the 19 task Visual Task Adaptation Benchmark (VTAB). On small datasets, BiT attains 76.8% on ILSVRC-2012 with 10 examples per class, and 97.0% on CIFAR-10 with 10 examples per class. We conduct detailed analysis of the main components that lead to high transfer performance.* Tips: - +- BiT models are equivalent to ResNetv2 in terms of architecture, except that: 1) all batch normalization layers are replaced by [group normalization](https://arxiv.org/abs/1803.08494), +2) [weight standardization](https://arxiv.org/abs/1903.10520) is used for convolutional layers. The authors show that the combination of both is useful for training with large batch sizes, and has a significant +impact on transfer learning. -This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). -The original code can be found [here](). +This model was contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/google-research/big_transfer). 
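Editor's note: to make the new `bit.mdx` overview above more concrete, here is a minimal usage sketch that is *not* part of the patch. It assumes the classes this patch introduces (`BitConfig`, `BitModel`) are importable from `transformers` and runs a randomly initialized backbone, so it does not depend on any pretrained checkpoint (checkpoint names elsewhere in this draft are still placeholders).

```python
import torch
from transformers import BitConfig, BitModel

# Randomly initialized BiT backbone with the default configuration
# (group normalization + weight-standardized convolutions, as described in the tips above).
config = BitConfig()
model = BitModel(config)
model.eval()

# Dummy batch of one 224x224 RGB image.
pixel_values = torch.randn(1, 3, 224, 224)

with torch.no_grad():
    outputs = model(pixel_values)

# The backbone returns a 2D feature map (batch, channels, height, width)
# plus a pooled output, rather than a token sequence.
print(outputs.last_hidden_state.shape)
print(outputs.pooler_output.shape)
```

Because the output is a spatial feature map rather than a sequence of tokens, it can be reused as the source of "patch" embeddings by the hybrid Vision Transformer documented next.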
## BitConfig diff --git a/docs/source/en/model_doc/vit_hybrid.mdx b/docs/source/en/model_doc/vit_hybrid.mdx new file mode 100644 index 000000000000..32c0964f7097 --- /dev/null +++ b/docs/source/en/model_doc/vit_hybrid.mdx @@ -0,0 +1,57 @@ + + +# Hybrid Vision Transformer (ViT Hybrid) + +## Overview + +The hybrid Vision Transformer (ViT) model was proposed in [An Image is Worth 16x16 Words: Transformers for Image Recognition +at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk +Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob +Uszkoreit, Neil Houlsby. It's the first paper that successfully trains a Transformer encoder on ImageNet, attaining +very good results compared to familiar convolutional architectures. ViT hybrid is a slight variant of the [plain Vision Transformer](vit), +by leveraging a convolutional backbone (specifically, [BiT](bit)) whose features are used as initial "tokens" for the Transformer. + + +The abstract from the paper is the following: + +*While the Transformer architecture has become the de-facto standard for natural language processing tasks, its +applications to computer vision remain limited. In vision, attention is either applied in conjunction with +convolutional networks, or used to replace certain components of convolutional networks while keeping their overall +structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to +sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of +data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), +Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring +substantially fewer computational resources to train.* + + + + ViT architecture. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be +found [here](https://github.com/google-research/vision_transformer). 
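Editor's note: as a complement to the new `vit_hybrid.mdx` page above, the following is a small illustrative sketch (not part of the patch) of how the hybrid model is expected to be used once these classes land. It relies on the default `ViTHybridConfig`, which wires a BiT backbone in front of the Transformer encoder, and uses random weights; `num_labels=3` is an arbitrary value chosen for the example.

```python
import torch
from transformers import ViTHybridConfig, ViTHybridForImageClassification

# Default config: a BiT (ResNetv2-style) backbone feeds its "stage3" feature map
# into the Transformer encoder, with patch_size=1 applied on top of those features.
config = ViTHybridConfig(num_labels=3)
model = ViTHybridForImageClassification(config)
model.eval()

pixel_values = torch.randn(1, 3, config.image_size, config.image_size)
with torch.no_grad():
    logits = model(pixel_values).logits

print(logits.shape)  # (1, 3)
```

With `patch_size=1`, the "patches" are individual spatial positions of the backbone's stage-3 feature map, so the Transformer's sequence length is set by the backbone's output resolution rather than by slicing the raw image into 16x16 pixel patches.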
+ + +## ViTHybridConfig + +[[autodoc]] ViTHybridConfig + +## ViTHybridModel + +[[autodoc]] ViTHybridModel + - forward + +## ViTHybridForImageClassification + +[[autodoc]] ViTHybridForImageClassification + - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 9e7cc0e65c63..ae4853b645fd 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -5138,8 +5138,8 @@ ) from .models.vit_hybrid import ( VIT_HYBRID_PRETRAINED_MODEL_ARCHIVE_LIST, - ViTHybirdModel, ViTHybridForImageClassification, + ViTHybridModel, ViTHybridPreTrainedModel, ) from .models.vit_mae import ( diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 307af71c3763..bed97bcd5fae 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -162,6 +162,7 @@ ("vision-text-dual-encoder", "VisionTextDualEncoderConfig"), ("visual_bert", "VisualBertConfig"), ("vit", "ViTConfig"), + ("vit_hybrid", "ViTHybridConfig"), ("vit_mae", "ViTMAEConfig"), ("vit_msn", "ViTMSNConfig"), ("wav2vec2", "Wav2Vec2Config"), @@ -302,6 +303,7 @@ ("vilt", "VILT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("visual_bert", "VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("vit", "VIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("vit_hybrid", "VIT_HYBRID_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("vit_mae", "VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("vit_msn", "VIT_MSN_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("wav2vec2", "WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -474,6 +476,7 @@ ("vision-text-dual-encoder", "VisionTextDualEncoder"), ("visual_bert", "VisualBERT"), ("vit", "ViT"), + ("vit_hybrid", "ViT hybrid"), ("vit_mae", "ViTMAE"), ("vit_msn", "ViTMSN"), ("wav2vec2", "Wav2Vec2"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index a0af6df9ab6c..c5341feed5c9 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -156,6 +156,7 @@ ("vision-text-dual-encoder", "VisionTextDualEncoderModel"), ("visual_bert", "VisualBertModel"), ("vit", "ViTModel"), + ("vit_hybrid", "ViTHybridModel"), ("vit_mae", "ViTMAEModel"), ("vit_msn", "ViTMSNModel"), ("wav2vec2", "Wav2Vec2Model"), @@ -399,6 +400,7 @@ ("swinv2", "Swinv2ForImageClassification"), ("van", "VanForImageClassification"), ("vit", "ViTForImageClassification"), + ("vit_hybrid", "ViTHybridForImageClassification"), ("vit_msn", "ViTMSNForImageClassification"), ] ) diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index 3b9d59fe1240..8dabdb18879b 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -16,13 +16,14 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ..bit import BitConfig logger = logging.get_logger(__name__) VIT_HYBRID_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "google/vit-base-patch16-224": "https://huggingface.co/vit-base-patch16-224/resolve/main/config.json", - # See all ViT models at https://huggingface.co/models?filter=vit + "google/vit-base-r50-s16-384": "https://huggingface.co/vit-base-r50-s16-384/resolve/main/config.json", + # See all ViT hybrid models at https://huggingface.co/models?filter=vit } @@ -31,7 +32,7 @@ class ViTHybridConfig(PretrainedConfig): This is the configuration class to store the configuration of a 
[`ViTModel`]. It is used to instantiate an ViT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the ViT - [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) architecture. + [google/vit-base-r50-s16-384](https://huggingface.co/google/vit-base-r50-s16-384) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -65,19 +66,17 @@ class ViTHybridConfig(PretrainedConfig): The number of input channels. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. - encoder_stride (`int`, `optional`, defaults to 16): - Factor to increase the spatial resolution by in the decoder head for masked image modeling. Example: ```python - >>> from transformers import ViTHybridConfig, ViTModel + >>> from transformers import ViTHybridConfig, ViTHybridModel - >>> # Initializing a ViT Hybrid vit-base-patch16-224 style configuration + >>> # Initializing a ViT Hybrid vit-base-r50-s16-384 style configuration >>> configuration = ViTHybridConfig() - >>> # Initializing a model (with random weights) from the vit-base-patch16-224 style configuration - >>> model = ViTModel(configuration) + >>> # Initializing a model (with random weights) from the vit-base-r50-s16-384 style configuration + >>> model = ViTHybridModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config @@ -96,16 +95,23 @@ def __init__( attention_probs_dropout_prob=0.0, initializer_range=0.02, layer_norm_eps=1e-12, - is_encoder_decoder=False, image_size=224, patch_size=1, num_channels=3, qkv_bias=True, - encoder_stride=16, **kwargs ): super().__init__(**kwargs) + if backbone_config is None: + backbone_config = BitConfig( + stem_type="same", + conv_layer="std_conv_same", + layer_type="bottleneck", + depths=(3, 4, 9), + out_features=["stage3"], + ) + self.backbone_config = backbone_config self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers @@ -120,4 +126,3 @@ def __init__( self.patch_size = patch_size self.num_channels = num_channels self.qkv_bias = qkv_bias - self.encoder_stride = encoder_stride diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py index 8d91faa61211..11bbc406699a 100644 --- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -44,13 +44,13 @@ _EXPECTED_OUTPUT_SHAPE = [1, 197, 768] # Image classification docstring -_IMAGE_CLASS_CHECKPOINT = "google/vit-base-patch16-224" +_IMAGE_CLASS_CHECKPOINT = "google/vit-base-r50-s16-384" _IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat" -VIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "google/vit-base-patch16-224", - # See all ViTHybrid models at https://huggingface.co/models?filter=vit +VIT_HYBRID_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "google/vit-base-r50-s16-384", + # See all ViT hybrid models at https://huggingface.co/models?filter=vit ] diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 785e2fd08f5d..5638b3ae4064 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -5821,14 +5821,14 @@ def __init__(self, *args, **kwargs): VIT_HYBRID_PRETRAINED_MODEL_ARCHIVE_LIST = None 
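Editor's note: the `backbone_config` default added in the `ViTHybridConfig.__init__` diff above can also be overridden by the caller. The sketch below is illustrative only and reuses the exact keyword arguments from the draft default; the shallower `depths=(2, 2, 2)` variant is a made-up example, not a configuration used by any released checkpoint.

```python
from transformers import BitConfig, ViTHybridConfig

# Leaving backbone_config unset falls back to the BiT backbone defined in the diff above.
default_config = ViTHybridConfig()
print(type(default_config.backbone_config).__name__)  # BitConfig

# A custom (here: shallower) BiT backbone can be passed in explicitly.
backbone = BitConfig(
    stem_type="same",
    conv_layer="std_conv_same",
    layer_type="bottleneck",
    depths=(2, 2, 2),
    out_features=["stage3"],
)
custom_config = ViTHybridConfig(backbone_config=backbone)
print(custom_config.backbone_config.depths)
```

Serializing the composite config is what the `to_dict` override added in a later patch of this series takes care of, by expanding the nested backbone configuration into a plain dictionary.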
-class ViTHybirdModel(metaclass=DummyObject): +class ViTHybridForImageClassification(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ViTHybridForImageClassification(metaclass=DummyObject): +class ViTHybridModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/tests/models/vit_hybrid/__init__.py b/tests/models/vit_hybrid/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py new file mode 100644 index 000000000000..ca8b2dae1814 --- /dev/null +++ b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py @@ -0,0 +1,246 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch ViT Hybrid model. """ + + +import inspect +import unittest + +from transformers import ViTHybridConfig +from transformers.testing_utils import ( + require_torch, + require_vision, + slow, + torch_device, +) +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ViTHybridForImageClassification, ViTHybridModel + from transformers.models.vit_hybrid.modeling_vit_hybrid import VIT_HYBRID_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import ViTFeatureExtractor + + +class ViTHybridModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.scope = scope + + # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = 
floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return ViTHybridConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = ViTHybridModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = ViTHybridForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + # test greyscale images + config.num_channels = 1 + model = ViTHybridForImageClassification(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + pixel_values, + labels, + ) = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class ViTHybridModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as ViT does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = (ViTHybridModel, ViTHybridForImageClassification) if is_torch_available() else () + fx_compatible = True + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = ViTHybridModelTester(self) + self.config_tester = ConfigTester(self, config_class=ViTHybridConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="ViT does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in VIT_HYBRID_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = ViTHybridModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class ViTModelIntegrationTest(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224") if is_vision_available() else None + + @slow + def test_inference_image_classification_head(self): + model = ViTHybridForImageClassification.from_pretrained("google/vit-base-patch16-224").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([-0.2744, 0.8215, -0.0836]).to(torch_device) + + self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) diff --git a/utils/check_repo.py b/utils/check_repo.py index 93fafb5fc9c3..0f61fafd63da 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -672,6 +672,7 @@ def find_all_documented_objects(): "PyTorchBenchmarkArguments", "TensorFlowBenchmark", "TensorFlowBenchmarkArguments", + "BitBackbone", "MaskFormerSwinBackbone", "ResNetBackbone", "AutoBackbone", From 63a5aa855d4407393a04e55629b0690e4a225e51 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 15:13:18 +0100 Subject: [PATCH 
19/88] Make more tests pass --- .../vit_hybrid/configuration_vit_hybrid.py | 17 ++++++++++++++++- .../models/vit_hybrid/modeling_vit_hybrid.py | 2 +- .../vit_hybrid/test_modeling_vit_hybrid.py | 2 -- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index 8dabdb18879b..7badc4a600cd 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2021 Google AI and The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,6 +14,9 @@ # limitations under the License. """ ViT Hybrid model configuration""" +import copy +from typing import Dict + from ...configuration_utils import PretrainedConfig from ...utils import logging from ..bit import BitConfig @@ -104,6 +107,7 @@ def __init__( super().__init__(**kwargs) if backbone_config is None: + # default to BiT backbone backbone_config = BitConfig( stem_type="same", conv_layer="std_conv_same", @@ -126,3 +130,14 @@ def __init__( self.patch_size = patch_size self.num_channels = num_channels self.qkv_bias = qkv_bias + + def to_dict(self) -> Dict[str, any]: + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["backbone_config"] = self.backbone_config.to_dict() + output["model_type"] = self.__class__.model_type + return output \ No newline at end of file diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py index 11bbc406699a..dba51d933419 100644 --- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -115,7 +115,7 @@ def __init__(self, config, feature_size=None): if feature_size is None: with torch.no_grad(): - # NOTE Most reliable way of determining output dims is to run forward pass + # NOTE Most reliable way of determining spatial output dimensions is to run forward pass training = self.backbone.training if training: self.backbone.eval() diff --git a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py index ca8b2dae1814..445f82d55c53 100644 --- a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py +++ b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py @@ -159,8 +159,6 @@ class ViTHybridModelTest(ModelTesterMixin, unittest.TestCase): """ all_model_classes = (ViTHybridModel, ViTHybridForImageClassification) if is_torch_available() else () - fx_compatible = True - test_pruning = False test_resize_embeddings = False test_head_masking = False From 47af19ed2001722ce39752c6b04a4bfcf1ec5324 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 15:22:49 +0100 Subject: [PATCH 20/88] Improve default backbone config --- .../vit_hybrid/configuration_vit_hybrid.py | 32 ++++++++++++------- .../vit_hybrid/test_modeling_vit_hybrid.py | 17 +--------- 2 files changed, 22 insertions(+), 27 deletions(-) diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py 
b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index 7badc4a600cd..fde518ca5ff2 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -19,6 +19,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ..auto.configuration_auto import CONFIG_MAPPING from ..bit import BitConfig @@ -107,14 +108,24 @@ def __init__( super().__init__(**kwargs) if backbone_config is None: - # default to BiT backbone - backbone_config = BitConfig( - stem_type="same", - conv_layer="std_conv_same", - layer_type="bottleneck", - depths=(3, 4, 9), - out_features=["stage3"], - ) + logger.info("`backbone_config` is `None`. Initializing the config with a `BiT` backbone.") + backbone_config = { + "stem_type": "same", + "conv_layer": "std_conv_same", + "layer_type": "bottleneck", + "depths": (3, 4, 9), + "out_features": ["stage3"], + } + + if isinstance(backbone_config, dict): + if "model_type" in backbone_config: + backbone_config_class = CONFIG_MAPPING[backbone_config["model_type"]] + else: + logger.info( + "`model_type` is not found in `backbone_config`. Use `ResNet` as the backbone configuration class." + ) + backbone_config_class = BitConfig + backbone_config = backbone_config_class(**backbone_config) self.backbone_config = backbone_config self.hidden_size = hidden_size @@ -133,11 +144,10 @@ def __init__( def to_dict(self) -> Dict[str, any]: """ - Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. - Returns: + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. Returns: `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, """ output = copy.deepcopy(self.__dict__) output["backbone_config"] = self.backbone_config.to_dict() output["model_type"] = self.__class__.model_type - return output \ No newline at end of file + return output diff --git a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py index 445f82d55c53..d59546a1fe74 100644 --- a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py +++ b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py @@ -19,12 +19,7 @@ import unittest from transformers import ViTHybridConfig -from transformers.testing_utils import ( - require_torch, - require_vision, - slow, - torch_device, -) +from transformers.testing_utils import require_torch, require_vision, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester @@ -130,16 +125,6 @@ def create_and_check_for_image_classification(self, config, pixel_values, labels result = model(pixel_values, labels=labels) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - # test greyscale images - config.num_channels = 1 - model = ViTHybridForImageClassification(config) - model.to(torch_device) - model.eval() - - pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) - result = model(pixel_values) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() ( From 8c6194fdff8ab837eb8af229b9863e6b6f3b0260 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 15:28:36 +0100 Subject: [PATCH 
21/88] Update model_type --- .../models/bit/configuration_bit.py | 2 +- .../models/bit/convert_bit_to_pytorch.py | 12 +++---- src/transformers/models/bit/modeling_bit.py | 14 ++++---- .../convert_vit_hybrid_timm_to_pytorch.py | 32 +++++++++---------- 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index fd69e5d8ad11..1abcb097d68d 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -74,7 +74,7 @@ class BitConfig(PretrainedConfig): >>> configuration = model.config ``` """ - model_type = "resnetv2" + model_type = "bit" layer_types = ["preactivation", "bottleneck"] def __init__( diff --git a/src/transformers/models/bit/convert_bit_to_pytorch.py b/src/transformers/models/bit/convert_bit_to_pytorch.py index 8844f2ff33ae..1da473edb1d3 100644 --- a/src/transformers/models/bit/convert_bit_to_pytorch.py +++ b/src/transformers/models/bit/convert_bit_to_pytorch.py @@ -59,15 +59,15 @@ def get_config(model_name): def rename_key(name): if "stem.conv" in name: - name = name.replace("stem.conv", "resnetv2.embedder.convolution") + name = name.replace("stem.conv", "bit.embedder.convolution") if "blocks" in name: name = name.replace("blocks", "layers") if "head.fc" in name: name = name.replace("head.fc", "classifier.1") if name.startswith("norm"): - name = "resnetv2." + name - if "resnetv2" not in name and "classifier" not in name: - name = "resnetv2.encoder." + name + name = "bit." + name + if "bit" not in name and "classifier" not in name: + name = "bit.encoder." + name return name @@ -80,7 +80,7 @@ def prepare_img(): @torch.no_grad() -def convert_resnetv2_checkpoint(model_name, pytorch_dump_folder_path): +def convert_bit_checkpoint(model_name, pytorch_dump_folder_path): """ Copy/paste/tweak model's weights to our BiT structure. 
""" @@ -141,4 +141,4 @@ def convert_resnetv2_checkpoint(model_name, pytorch_dump_folder_path): ) args = parser.parse_args() - convert_resnetv2_checkpoint(args.model_name, args.pytorch_dump_folder_path) + convert_bit_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index b8542fcd6bcc..425f3e1a67c2 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -58,7 +58,7 @@ BIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "google/resnetnv2-50", - # See all BiT models at https://huggingface.co/models?filter=resnetv2 + # See all BiT models at https://huggingface.co/models?filter=bit ] @@ -705,7 +705,7 @@ def forward( ) -# Copied from transformers.models.resnet.modeling_resnet.ResNetPreTrainedModel with ResNet->Bit,resnet->resnetv2 +# Copied from transformers.models.resnet.modeling_resnet.ResNetPreTrainedModel with ResNet->Bit,resnet->bit class BitPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -713,7 +713,7 @@ class BitPreTrainedModel(PreTrainedModel): """ config_class = BitConfig - base_model_prefix = "resnetv2" + base_model_prefix = "bit" main_input_name = "pixel_values" supports_gradient_checkpointing = True @@ -826,7 +826,7 @@ class BitForImageClassification(BitPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.resnetv2 = BitModel(config) + self.bit = BitModel(config) # classification head self.classifier = nn.Sequential( nn.Flatten(), @@ -857,7 +857,7 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.resnetv2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + outputs = self.bit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) pooled_output = outputs.pooler_output if return_dict else outputs[1] @@ -904,7 +904,7 @@ def __init__(self, config): super().__init__(config) self.stage_names = config.stage_names - self.resnetv2 = BitModel(config) + self.bit = BitModel(config) self.out_features = config.out_features @@ -952,7 +952,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - outputs = self.resnetv2(pixel_values, output_hidden_states=True, return_dict=True) + outputs = self.bit(pixel_values, output_hidden_states=True, return_dict=True) hidden_states = outputs.hidden_states diff --git a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py index 9ff005521aa1..d346c295be62 100644 --- a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py +++ b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py @@ -46,25 +46,25 @@ def create_rename_keys(config, base_model=False): rename_keys.append(("patch_embed.proj.bias", "vit.embeddings.patch_embeddings.projection.bias")) # backbone - rename_keys.append(("patch_embed.backbone.stem.conv.weight", "vit.embeddings.patch_embeddings.backbone.resnetv2.embedder.convolution.weight")) - rename_keys.append(("patch_embed.backbone.stem.norm.weight", "vit.embeddings.patch_embeddings.backbone.resnetv2.embedder.norm.weight")) - rename_keys.append(("patch_embed.backbone.stem.norm.bias", 
"vit.embeddings.patch_embeddings.backbone.resnetv2.embedder.norm.bias")) + rename_keys.append(("patch_embed.backbone.stem.conv.weight", "vit.embeddings.patch_embeddings.backbone.bit.embedder.convolution.weight")) + rename_keys.append(("patch_embed.backbone.stem.norm.weight", "vit.embeddings.patch_embeddings.backbone.bit.embedder.norm.weight")) + rename_keys.append(("patch_embed.backbone.stem.norm.bias", "vit.embeddings.patch_embeddings.backbone.bit.embedder.norm.bias")) for stage_idx in range(len(config.backbone_config.depths)): for layer_idx in range(config.backbone_config.depths[stage_idx]): - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv1.weight", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.conv1.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm1.weight", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.norm1.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm1.bias", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.norm1.bias")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv2.weight", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.conv2.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm2.weight", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.norm2.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm2.bias", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.norm2.bias")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv3.weight", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.conv3.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm3.weight", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.norm3.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm3.bias", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.{layer_idx}.norm3.bias")) - - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.conv.weight", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.0.downsample.conv.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.norm.weight", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.0.downsample.norm.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.norm.bias", f"vit.embeddings.patch_embeddings.backbone.resnetv2.encoder.stages.{stage_idx}.layers.0.downsample.norm.bias")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv1.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.conv1.weight")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm1.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm1.weight")) + 
rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm1.bias", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm1.bias")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv2.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.conv2.weight")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm2.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm2.weight")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm2.bias", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm2.bias")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv3.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.conv3.weight")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm3.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm3.weight")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm3.bias", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm3.bias")) + + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.conv.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.0.downsample.conv.weight")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.norm.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.0.downsample.norm.weight")) + rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.norm.bias", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.0.downsample.norm.bias")) # transformer encoder for i in range(config.num_hidden_layers): From b10e4172b916bd45e25eb5edad64bfaae333efcf Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 15:41:36 +0100 Subject: [PATCH 22/88] Fix more tests --- .../models/vit_hybrid/modeling_vit_hybrid.py | 1 - .../models/vit_hybrid/test_modeling_vit_hybrid.py | 14 ++++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py index dba51d933419..2d4255914a2f 100644 --- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -131,7 +131,6 @@ def __init__(self, config, feature_size=None): ) feature_dim = self.backbone.channels[-1] - assert feature_size[0] % patch_size[0] == 0 and feature_size[1] % patch_size[1] == 0 self.grid_size = (feature_size[0] // patch_size[0], feature_size[1] // patch_size[1]) self.num_patches = self.grid_size[0] * self.grid_size[1] self.image_size = image_size diff --git a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py index d59546a1fe74..81c373a2c704 100644 --- a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py +++ b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py @@ -45,7 +45,7 @@ def __init__( self, parent, batch_size=13, - image_size=30, + image_size=64, patch_size=2, num_channels=3, is_training=True, @@ -79,8 
+79,10 @@ def __init__( self.initializer_range = initializer_range self.scope = scope - # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 + # in ViT hybrid, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + # the number of patches is based on the feature map of the backbone, which by default uses an output stride + # of 32, which means that the feature map has a spatial resolution of 1/32 of the input image size + num_patches = (self.image_size // 32) ** 2 self.seq_length = num_patches + 1 def prepare_config_and_inputs(self): @@ -127,11 +129,7 @@ def create_and_check_for_image_classification(self, config, pixel_values, labels def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - ( - config, - pixel_values, - labels, - ) = config_and_inputs + config, pixel_values, labels = config_and_inputs inputs_dict = {"pixel_values": pixel_values} return config, inputs_dict From b3ff469a1e8763a64fd49d17932a96287faf26d5 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 15:45:24 +0100 Subject: [PATCH 23/88] Add more copied from statements --- .../models/vit_hybrid/modeling_vit_hybrid.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py index 2d4255914a2f..6a4602f0d313 100644 --- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -152,6 +152,7 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->ViTHybrid class ViTHybridSelfAttention(nn.Module): def __init__(self, config: ViTHybridConfig) -> None: super().__init__() @@ -212,6 +213,7 @@ def forward( return outputs +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->ViTHybrid class ViTHybridSelfOutput(nn.Module): """ The residual connection is defined in ViTHybridLayer instead of here (as is the case with other models), due to the @@ -231,6 +233,7 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->ViTHybrid class ViTHybridAttention(nn.Module): def __init__(self, config: ViTHybridConfig) -> None: super().__init__() @@ -270,6 +273,7 @@ def forward( return outputs +# Copied from transformers.models.vit.modeling_vit.ViTIntermediate with ViT->ViTHybrid class ViTHybridIntermediate(nn.Module): def __init__(self, config: ViTHybridConfig) -> None: super().__init__() @@ -287,6 +291,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states +# Copied from transformers.models.vit.modeling_vit.ViTOutput with ViT->ViTHybrid class ViTHybridOutput(nn.Module): def __init__(self, config: ViTHybridConfig) -> None: super().__init__() @@ -302,6 +307,7 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states +# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->ViTHybrid class ViTHybridLayer(nn.Module): """This corresponds to the Block class in the timm implementation.""" @@ -344,6 +350,7 @@ def forward( return outputs +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->ViTHybrid class ViTHybridEncoder(nn.Module): def 
__init__(self, config: ViTHybridConfig) -> None: super().__init__() @@ -401,6 +408,7 @@ def custom_forward(*inputs): ) +# Copied from transformers.models.vit.modeling_vit.ViTPreTrainedModel with ViT->ViTHybrid class ViTHybridPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -558,6 +566,7 @@ def forward( ) +# Copied from transformers.models.vit.modeling_vit.ViTPooler with ViT->ViTHybrid class ViTHybridPooler(nn.Module): def __init__(self, config: ViTHybridConfig): super().__init__() From 0b643a14becbdda60e0e537d1bbc79cc4d62fb38 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 15:57:38 +0100 Subject: [PATCH 24/88] More improvements --- .../models/vit_hybrid/modeling_vit_hybrid.py | 86 ++++++++++++++++--- 1 file changed, 75 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py index 6a4602f0d313..ea72b117b583 100644 --- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -54,12 +54,13 @@ ] +# Copied from transformers.models.vit.modeling_vit.ViTEmbeddings with ViT->ViTHybrid class ViTHybridEmbeddings(nn.Module): """ - Construct the CLS token, position and patch embeddings. + Construct the CLS token, position and patch embeddings. Optionally, also the mask token. """ - def __init__(self, config: ViTHybridConfig): + def __init__(self, config: ViTHybridConfig, use_mask_token: bool = False) -> None: super().__init__() self.cls_token = nn.Parameter( @@ -67,6 +68,7 @@ def __init__(self, config: ViTHybridConfig): torch.zeros(1, 1, config.hidden_size, dtype=torch.float32), mean=0.0, std=config.initializer_range ) ) + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None self.patch_embeddings = ViTHybridPatchEmbeddings(config) num_patches = self.patch_embeddings.num_patches self.position_embeddings = nn.Parameter( @@ -79,16 +81,64 @@ def __init__(self, config: ViTHybridConfig): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.config = config - def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: - embeddings = self.patch_embeddings(pixel_values) + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. 
+ + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + if num_patches == num_positions and height == width: + return self.position_embeddings + class_pos_embed = self.position_embeddings[:, 0] + patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] + h0 = height // self.config.patch_size + w0 = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + h0, w0 = h0 + 0.1, w0 + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(h0 / math.sqrt(num_positions), w0 / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, + ) + assert int(h0) == patch_pos_embed.shape[-2] and int(w0) == patch_pos_embed.shape[-1] + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + interpolate_pos_encoding: bool = False, + ) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + + if bool_masked_pos is not None: + seq_length = embeddings.shape[1] + mask_tokens = self.mask_token.expand(batch_size, seq_length, -1) + # replace the masked visual tokens by mask_tokens + mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1.0 - mask) + mask_tokens * mask # add the [CLS] token to the embedded patch tokens - batch_size = embeddings.shape[0] cls_tokens = self.cls_token.expand(batch_size, -1, -1) embeddings = torch.cat((cls_tokens, embeddings), dim=1) # add positional encoding to each token - embeddings = embeddings + self.position_embeddings + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embeddings embeddings = self.dropout(embeddings) @@ -139,12 +189,18 @@ def __init__(self, config, feature_size=None): self.projection = nn.Conv2d(feature_dim, hidden_size, kernel_size=patch_size, stride=patch_size) - def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: - num_channels = pixel_values.shape[1] + def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor: + _, num_channels, height, width = pixel_values.shape if num_channels != self.num_channels: raise ValueError( "Make sure that the channel dimension of the pixel values match with the one set in the configuration." ) + if not interpolate_pos_encoding: + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" + f" ({self.image_size[0]}*{self.image_size[1]})." 
+ ) features = self.backbone(pixel_values).feature_maps[-1] embeddings = self.projection(features).flatten(2).transpose(1, 2) @@ -478,12 +534,13 @@ def _set_gradient_checkpointing(self, module: ViTHybridEncoder, value: bool = Fa "The bare ViT Hybrid Model transformer outputting raw hidden-states without any specific head on top.", VIT_START_DOCSTRING, ) +# Copied from transformers.models.vit.modeling_vit.ViTModel with ViT->ViTHybrid class ViTHybridModel(ViTHybridPreTrainedModel): - def __init__(self, config: ViTHybridConfig, add_pooling_layer: bool = True): + def __init__(self, config: ViTHybridConfig, add_pooling_layer: bool = True, use_mask_token: bool = False): super().__init__(config) self.config = config - self.embeddings = ViTHybridEmbeddings(config) + self.embeddings = ViTHybridEmbeddings(config, use_mask_token=use_mask_token) self.encoder = ViTHybridEncoder(config) self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -515,9 +572,11 @@ class PreTrainedModel def forward( self, pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.BoolTensor] = None, head_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -541,7 +600,9 @@ def forward( if pixel_values.dtype != expected_dtype: pixel_values = pixel_values.to(expected_dtype) - embedding_output = self.embeddings(pixel_values) + embedding_output = self.embeddings( + pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding + ) encoder_outputs = self.encoder( embedding_output, @@ -589,6 +650,7 @@ def forward(self, hidden_states): """, VIT_START_DOCSTRING, ) +# Copied from transformers.models.vit.modeling_vit.ViTForImageClassification with ViT->ViTHybrid class ViTHybridForImageClassification(ViTHybridPreTrainedModel): def __init__(self, config: ViTHybridConfig) -> None: super().__init__(config) @@ -617,6 +679,7 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[tuple, ImageClassifierOutput]: r""" @@ -632,6 +695,7 @@ def forward( head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) From be7986d2f77e21f8b3add8fac977172bc67b18c0 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 30 Nov 2022 16:13:15 +0100 Subject: [PATCH 25/88] Add push to hub to conversion scripts --- .../models/bit/convert_bit_to_pytorch.py | 14 ++++++++++++-- .../convert_vit_hybrid_timm_to_pytorch.py | 14 +++++++++++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/bit/convert_bit_to_pytorch.py b/src/transformers/models/bit/convert_bit_to_pytorch.py index 1da473edb1d3..a38aa242bf6e 100644 --- a/src/transformers/models/bit/convert_bit_to_pytorch.py +++ b/src/transformers/models/bit/convert_bit_to_pytorch.py @@ -80,7 +80,7 @@ def prepare_img(): @torch.no_grad() -def convert_bit_checkpoint(model_name, pytorch_dump_folder_path): +def convert_bit_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): """ 
Copy/paste/tweak model's weights to our BiT structure. """ @@ -126,6 +126,11 @@ def convert_bit_checkpoint(model_name, pytorch_dump_folder_path): # print(f"Saving feature extractor to {pytorch_dump_folder_path}") # feature_extractor.save_pretrained(pytorch_dump_folder_path) + if push_to_hub: + print(f"Pushing model and feature extractor to the hub {model_name}") + model.push_to_hub(f"nielsr/{model_name}") + # feature_extractor.push_to_hub(f"nielsr/{model_name}") + if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -139,6 +144,11 @@ def convert_bit_checkpoint(model_name, pytorch_dump_folder_path): parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." ) + parser.add_argument( + "--push_to_hub", + action="store_true", + help="Whether to push the model to the hub.", + ) args = parser.parse_args() - convert_bit_checkpoint(args.model_name, args.pytorch_dump_folder_path) + convert_bit_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py index d346c295be62..3673acd575eb 100644 --- a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py +++ b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py @@ -154,7 +154,7 @@ def prepare_img(): @torch.no_grad() -def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): +def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, push_to_hub=False): """ Copy/paste/tweak model's weights to our ViT structure. """ @@ -220,6 +220,11 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): # print(f"Saving feature extractor to {pytorch_dump_folder_path}") # feature_extractor.save_pretrained(pytorch_dump_folder_path) + if push_to_hub: + print(f"Pushing model and feature extractor to the hub {vit_name}") + model.push_to_hub(f"nielsr/{vit_name}") + # feature_extractor.push_to_hub(f"nielsr/{vit_name}") + if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -228,11 +233,14 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): "--vit_name", default="vit_base_r50_s16_384", type=str, - help="Name of the ViT timm model you'd like to convert.", + help="Name of the hybrid ViT timm model you'd like to convert.", ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether to upload the model to the HuggingFace hub." 
+ ) args = parser.parse_args() - convert_vit_checkpoint(args.vit_name, args.pytorch_dump_folder_path) + convert_vit_checkpoint(args.vit_name, args.pytorch_dump_folder_path, args.push_to_hub) From 2725b8ac86c7094b93b6c28c1f9b81b82a199dc9 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 30 Nov 2022 15:35:48 +0000 Subject: [PATCH 26/88] clean --- src/transformers/models/bit/modeling_bit.py | 14 +++++++------- tests/models/bit/test_modeling_bit.py | 15 +++++++++++---- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 425f3e1a67c2..b8542fcd6bcc 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -58,7 +58,7 @@ BIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "google/resnetnv2-50", - # See all BiT models at https://huggingface.co/models?filter=bit + # See all BiT models at https://huggingface.co/models?filter=resnetv2 ] @@ -705,7 +705,7 @@ def forward( ) -# Copied from transformers.models.resnet.modeling_resnet.ResNetPreTrainedModel with ResNet->Bit,resnet->bit +# Copied from transformers.models.resnet.modeling_resnet.ResNetPreTrainedModel with ResNet->Bit,resnet->resnetv2 class BitPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -713,7 +713,7 @@ class BitPreTrainedModel(PreTrainedModel): """ config_class = BitConfig - base_model_prefix = "bit" + base_model_prefix = "resnetv2" main_input_name = "pixel_values" supports_gradient_checkpointing = True @@ -826,7 +826,7 @@ class BitForImageClassification(BitPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.bit = BitModel(config) + self.resnetv2 = BitModel(config) # classification head self.classifier = nn.Sequential( nn.Flatten(), @@ -857,7 +857,7 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.bit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + outputs = self.resnetv2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) pooled_output = outputs.pooler_output if return_dict else outputs[1] @@ -904,7 +904,7 @@ def __init__(self, config): super().__init__(config) self.stage_names = config.stage_names - self.bit = BitModel(config) + self.resnetv2 = BitModel(config) self.out_features = config.out_features @@ -952,7 +952,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - outputs = self.bit(pixel_values, output_hidden_states=True, return_dict=True) + outputs = self.resnetv2(pixel_values, output_hidden_states=True, return_dict=True) hidden_states = outputs.hidden_states diff --git a/tests/models/bit/test_modeling_bit.py b/tests/models/bit/test_modeling_bit.py index 71a7123d5fec..35d2871422d8 100644 --- a/tests/models/bit/test_modeling_bit.py +++ b/tests/models/bit/test_modeling_bit.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch BiT model. """ +""" Testing suite for the PyTorch Bit model. 
""" import inspect @@ -52,6 +52,7 @@ def __init__( depths=[1, 1, 2, 1], is_training=True, use_labels=True, + hidden_act="relu", num_labels=3, scope=None, out_features=["stage2", "stage3", "stage4"], @@ -66,6 +67,7 @@ def __init__( self.depths = depths self.is_training = is_training self.use_labels = use_labels + self.hidden_act = hidden_act self.num_labels = num_labels self.scope = scope self.num_stages = len(hidden_sizes) @@ -89,6 +91,7 @@ def get_config(self): embeddings_size=self.embeddings_size, hidden_sizes=self.hidden_sizes, depths=self.depths, + hidden_act=self.hidden_act, num_labels=self.num_labels, out_features=self.out_features, num_groups=self.num_groups, @@ -99,7 +102,11 @@ def create_and_check_model(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.hidden_sizes[-1], 4, 4)) + # expected last hidden states: B, C, H // 32, W // 32 + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32), + ) def create_and_check_for_image_classification(self, config, pixel_values, labels): config.num_labels = self.num_labels @@ -117,11 +124,11 @@ def create_and_check_backbone(self, config, pixel_values, labels): # verify hidden states self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) - self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.hidden_sizes[0], 8, 8]) + self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.hidden_sizes[1], 4, 4]) # verify channels self.parent.assertEqual(len(model.channels), len(config.out_features)) - self.parent.assertListEqual(model.channels, config.hidden_sizes) + self.parent.assertListEqual(model.channels, config.hidden_sizes[1:]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() From 0ef4ee4920848593ed4be1dbf138d6b0a0a332c5 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 30 Nov 2022 17:02:02 +0000 Subject: [PATCH 27/88] more cleanup --- src/transformers/models/bit/configuration_bit.py | 3 --- src/transformers/models/bit/modeling_bit.py | 14 ++++++-------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index 1abcb097d68d..888cf6088971 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -51,8 +51,6 @@ class BitConfig(PretrainedConfig): are supported. num_groups (`int`, *optional*, defaults to `32`): Number of groups used for the `BitGroupNormActivation` layers - downsample_in_first_stage (`bool`, *optional*, defaults to `False`): - If `True`, the first stage will downsample the inputs using a `stride` of 2. drop_path_rate (`float`, *optional*, defaults to 0.0): The drop path rate for the stochastic depth. 
output_stride (`int`, *optional*, defaults to 32): @@ -105,7 +103,6 @@ def __init__( self.layer_type = layer_type self.hidden_act = hidden_act self.num_groups = num_groups - self.downsample_in_first_stage = downsample_in_first_stage self.drop_path_rate = drop_path_rate self.output_stride = output_stride self.width_factor = width_factor diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index b8542fcd6bcc..918cc66aed1f 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -159,15 +159,15 @@ def __init__( self, config, num_channels, - num_groups=32, eps=1e-5, affine=True, group_size=None, apply_act=True, drop_layer=None, ): + super(BitGroupNormActivation, self).__init__( - _num_groups(num_channels, num_groups, group_size), num_channels, eps=eps, affine=affine + _num_groups(num_channels, config.num_groups, group_size), num_channels, eps=eps, affine=affine ) self.drop = drop_layer() if drop_layer is not None else nn.Identity() if apply_act: @@ -304,9 +304,7 @@ def __init__(self, config: BitConfig): self.pooler = nn.MaxPool2d(kernel_size=3, stride=2) if not config.layer_type == "preactivation": - self.norm = partial(BitGroupNormActivation, config=config, num_groups=config.num_groups)( - num_channels=config.embedding_size - ) + self.norm = partial(BitGroupNormActivation, config=config)(num_channels=config.embedding_size) else: self.norm = nn.Identity() @@ -413,7 +411,7 @@ def __init__( " [`'std_conv'`, `'std_conv_same`]" ) - norm_layer = partial(BitGroupNormActivation, config=config, num_groups=config.num_groups) + norm_layer = partial(BitGroupNormActivation, config=config) out_channels = out_channels or in_channels mid_channels = make_div(out_channels * bottle_ratio) @@ -485,7 +483,7 @@ def __init__( " [`'std_conv'`, `'std_conv_same`]" ) - norm_layer = partial(BitGroupNormActivation, config=config, num_groups=config.num_groups) + norm_layer = partial(BitGroupNormActivation, config=config) out_channels = out_channels or in_channels mid_chs = make_div(out_channels * bottle_ratio) @@ -765,7 +763,7 @@ def __init__(self, config): self.embedder = BitEmbeddings(config) self.encoder = BitEncoder(config) - norm_layer = partial(BitGroupNormActivation, num_groups=config.num_groups) + norm_layer = BitGroupNormActivation self.norm = ( norm_layer(config, num_channels=config.hidden_sizes[-1]) if config.layer_type == "preactivation" From d29cc43415ba1a6cc64f7b76532f879c9c9bec8e Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 1 Dec 2022 08:04:34 +0000 Subject: [PATCH 28/88] clean --- src/transformers/models/bit/modeling_bit.py | 5 +++++ tests/models/bit/test_modeling_bit.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 918cc66aed1f..50d7ce893f77 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -293,6 +293,11 @@ def __init__(self, config: BitConfig): conv_layer = partial(StdConv2d, eps=1e-8) elif config.conv_layer == "std_conv_same": conv_layer = partial(StdConv2dSame, eps=1e-8) + else: + raise ValueError( + f"Conv type {config.conv_layer} not supported, please use one of the following: [`'std_conv'`," + " `'std_conv_same'`]" + ) self.convolution = conv_layer(config.num_channels, config.embedding_size, kernel_size=7, stride=2) diff --git a/tests/models/bit/test_modeling_bit.py b/tests/models/bit/test_modeling_bit.py index 
35d2871422d8..4dab66c08143 100644 --- a/tests/models/bit/test_modeling_bit.py +++ b/tests/models/bit/test_modeling_bit.py @@ -306,6 +306,6 @@ def test_inference_image_classification_head(self): expected_shape = torch.Size((1, 1000)) self.assertEqual(outputs.logits.shape, expected_shape) - expected_slice = torch.tensor([-11.1069, -9.7877, -8.3777]).to(torch_device) + expected_slice = torch.tensor([[-0.6526, -0.5263, -1.4398]]).to(torch_device) self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) From c28a5ba8b58d0c242c18bac9f51dcb3b6d4df3e8 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 1 Dec 2022 08:06:31 +0000 Subject: [PATCH 29/88] replace to --- src/transformers/models/bit/modeling_bit.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 50d7ce893f77..b8d3fa73b0e6 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -829,7 +829,7 @@ class BitForImageClassification(BitPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.resnetv2 = BitModel(config) + self.bit = BitModel(config) # classification head self.classifier = nn.Sequential( nn.Flatten(), @@ -860,7 +860,7 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.resnetv2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + outputs = self.bit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) pooled_output = outputs.pooler_output if return_dict else outputs[1] @@ -907,7 +907,7 @@ def __init__(self, config): super().__init__(config) self.stage_names = config.stage_names - self.resnetv2 = BitModel(config) + self.bit = BitModel(config) self.out_features = config.out_features @@ -955,7 +955,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - outputs = self.resnetv2(pixel_values, output_hidden_states=True, return_dict=True) + outputs = self.bit(pixel_values, output_hidden_states=True, return_dict=True) hidden_states = outputs.hidden_states From de9aeba6fab3da121727bfbac48660add4ecbebe Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 1 Dec 2022 08:17:37 +0000 Subject: [PATCH 30/88] fix --- src/transformers/models/bit/modeling_bit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index b8d3fa73b0e6..902e86b5dcdb 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -58,7 +58,7 @@ BIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "google/resnetnv2-50", - # See all BiT models at https://huggingface.co/models?filter=resnetv2 + # See all BiT models at https://huggingface.co/models?filter=bit ] From 60c6d2332d2b79254541904fe2474875c57162fa Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 1 Dec 2022 09:17:09 +0100 Subject: [PATCH 31/88] Update src/transformers/models/bit/configuration_bit.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- src/transformers/models/bit/configuration_bit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/bit/configuration_bit.py 
b/src/transformers/models/bit/configuration_bit.py index 888cf6088971..0cab0f777d0e 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -50,7 +50,7 @@ class BitConfig(PretrainedConfig): The non-linear activation function in each block. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. num_groups (`int`, *optional*, defaults to `32`): - Number of groups used for the `BitGroupNormActivation` layers + Number of groups used for the `BitGroupNormActivation` layers. drop_path_rate (`float`, *optional*, defaults to 0.0): The drop path rate for the stochastic depth. output_stride (`int`, *optional*, defaults to 32): From 77ccf05de822577523b0e793760146d17ffab2a4 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 1 Dec 2022 08:18:48 +0000 Subject: [PATCH 32/88] fix base model prefix --- src/transformers/models/bit/modeling_bit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 902e86b5dcdb..137b92064ad8 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -716,7 +716,7 @@ class BitPreTrainedModel(PreTrainedModel): """ config_class = BitConfig - base_model_prefix = "resnetv2" + base_model_prefix = "bit" main_input_name = "pixel_values" supports_gradient_checkpointing = True From f1354d856a9487659ba210bbb3ad73b538c49ccf Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 1 Dec 2022 09:04:35 +0000 Subject: [PATCH 33/88] more cleaning --- .../models/bit/configuration_bit.py | 8 +- src/transformers/models/bit/modeling_bit.py | 255 +++++++----------- 2 files changed, 106 insertions(+), 157 deletions(-) diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index 0cab0f777d0e..ccf0d278e85a 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -49,6 +49,8 @@ class BitConfig(PretrainedConfig): hidden_act (`str`, *optional*, defaults to `"relu"`): The non-linear activation function in each block. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. + convolutional_padding (`str`, *optional*, defaults to `"valid"`): + padding strategy to use for `StdConv2d` layers, it can be either `"valid"`, or `"same"`. num_groups (`int`, *optional*, defaults to `32`): Number of groups used for the `BitGroupNormActivation` layers. 
drop_path_rate (`float`, *optional*, defaults to 0.0): @@ -87,7 +89,7 @@ def __init__( drop_path_rate=0.0, output_stride=32, width_factor=1, - conv_layer="std_conv", + convolutional_padding=None, num_groups=32, out_features=None, **kwargs @@ -106,7 +108,9 @@ def __init__( self.drop_path_rate = drop_path_rate self.output_stride = output_stride self.width_factor = width_factor - self.conv_layer = conv_layer + self.convolutional_padding = ( + convolutional_padding if convolutional_padding is None else convolutional_padding.upper() + ) self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] if out_features is not None: if not isinstance(out_features, list): diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 137b92064ad8..5aadaa846329 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -99,7 +99,7 @@ def get_padding_value(padding=None, kernel_size=7, stride=1, dilation=1) -> Tupl return padding, dynamic -class StdConv2dSame(nn.Conv2d): +class WeightStandardizedConv2d(nn.Conv2d): """Conv2d with Weight Standardization. TF compatible SAME padding. Used for ViT Hybrid model. Paper: [Micro-Batch Training with Batch-Channel Normalization and Weight @@ -182,38 +182,6 @@ def forward(self, x): return x -class StdConv2d(nn.Conv2d): - """Conv2d with Weight Standardization. Used for BiT ResNet-V2 models. - - Paper: `Micro-Batch Training with Batch-Channel Normalization and Weight Standardization` - - https://arxiv.org/abs/1903.10520v2 - """ - - def __init__( - self, in_channel, out_channels, kernel_size, stride=1, padding=None, dilation=1, groups=1, bias=False, eps=1e-6 - ): - if padding is None: - padding, _ = get_padding_value(padding, kernel_size, stride, dilation) - super().__init__( - in_channel, - out_channels, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - bias=bias, - ) - self.eps = eps - - def forward(self, x): - weight = nn.functional.batch_norm( - self.weight.reshape(1, self.out_channels, -1), None, None, training=True, momentum=0.0, eps=self.eps - ).reshape_as(self.weight) - x = nn.functional.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups) - return x - - class DynamicPad2d(nn.Module): r""" A module that wraps dynamic padding of any input, given the parameters of the convolutional layer and the input @@ -221,7 +189,7 @@ class DynamicPad2d(nn.Module): """ def __init__(self, kernel_size, stride, dilation, value=-float("inf")): - super().__init__(self) + super().__init__() # Safety checkers if isinstance(kernel_size, int): kernel_size = (kernel_size, kernel_size) @@ -252,7 +220,7 @@ def __call__(self, input): # apply pad if padding_height > 0 or padding_width > 0: - x = nn.functional.pad( + input = nn.functional.pad( input, [ padding_width // 2, @@ -262,7 +230,7 @@ def __call__(self, input): ], value=self.value, ) - return x + return input class MaxPool2dSame(nn.MaxPool2d): @@ -287,17 +255,8 @@ class BitEmbeddings(nn.Module): BiT Embeddings (stem) composed of a single aggressive convolution. 
""" - def __init__(self, config: BitConfig): + def __init__(self, config: BitConfig, conv_layer: nn.Module): super().__init__() - if config.conv_layer == "std_conv": - conv_layer = partial(StdConv2d, eps=1e-8) - elif config.conv_layer == "std_conv_same": - conv_layer = partial(StdConv2dSame, eps=1e-8) - else: - raise ValueError( - f"Conv type {config.conv_layer} not supported, please use one of the following: [`'std_conv'`," - " `'std_conv_same'`]" - ) self.convolution = conv_layer(config.num_channels, config.embedding_size, kernel_size=7, stride=2) @@ -305,8 +264,8 @@ def __init__(self, config: BitConfig): self.pooler = MaxPool2dSame(kernel_size=3, stride=2) self.pad = nn.Identity() else: - self.pad = nn.ConstantPad2d(padding=(1, 1, 1, 1), value=0.0) self.pooler = nn.MaxPool2d(kernel_size=3, stride=2) + self.pad = nn.ConstantPad2d(padding=(1, 1, 1, 1), value=0.0) if not config.layer_type == "preactivation": self.norm = partial(BitGroupNormActivation, config=config)(num_channels=config.embedding_size) @@ -381,7 +340,7 @@ def make_div(value, divisor=8): return new_value -class BitPreActivationBottleneckLayer(nn.Module): +class BitBottleneckLayer(nn.Module): """Pre-activation (v2) bottleneck block. Follows the implementation of "Identity Mappings in Deep Residual Networks": https://github.com/KaimingHe/resnet-1k-layers/blob/master/resnet-pre-act.lua @@ -401,20 +360,13 @@ def __init__( groups=1, drop_path_rate=0.0, is_first_layer=False, + use_activation=False, ): super().__init__() first_dilation = first_dilation or dilation - if config.conv_layer == "std_conv": - conv_layer = partial(StdConv2d, eps=1e-8) - elif config.conv_layer == "std_conv_same": - conv_layer = partial(StdConv2dSame, eps=1e-8) - else: - raise ValueError( - f"Convolutional layer {config.conv_layer} not supported! Please use one of the following:" - " [`'std_conv'`, `'std_conv_same`]" - ) + conv_layer = partial(WeightStandardizedConv2d, eps=1e-8, padding=config.convolutional_padding) norm_layer = partial(BitGroupNormActivation, config=config) @@ -435,12 +387,20 @@ def __init__( self.norm1 = norm_layer(num_channels=in_channels) self.conv1 = conv_layer(in_channels, mid_channels, 1) + self.norm2 = norm_layer(num_channels=mid_channels) self.conv2 = conv_layer(mid_channels, mid_channels, 3, stride=stride, dilation=first_dilation, groups=groups) + self.norm3 = norm_layer(num_channels=mid_channels) self.conv3 = conv_layer(mid_channels, out_channels, 1) + self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() + if use_activation: + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = nn.Identity() + def forward(self, x): x_preact = self.norm1(x) @@ -454,91 +414,81 @@ def forward(self, x): x = self.conv2(self.norm2(x)) x = self.conv3(self.norm3(x)) x = self.drop_path(x) - return x + shortcut - - -class BitBottleneckLayer(nn.Module): - """Non Pre-activation bottleneck block, equivalent to V1.5/V1b bottleneck. 
Used for ViT.""" - - def __init__( - self, - config, - in_channels, - out_channels=None, - bottle_ratio=0.25, - stride=1, - dilation=1, - first_dilation=None, - groups=1, - conv_layer=None, - drop_path_rate=0.0, - is_first_layer=False, - ): - super().__init__() - first_dilation = first_dilation or dilation - - # Getting the convolution type - if config.conv_layer == "std_conv": - conv_layer = partial(StdConv2d, eps=1e-8) - elif config.conv_layer == "std_conv_same": - conv_layer = partial(StdConv2dSame, eps=1e-8) - else: - raise ValueError( - f"Convolutional layer {config.conv_layer} not supported! Please use one of the following:" - " [`'std_conv'`, `'std_conv_same`]" - ) - - norm_layer = partial(BitGroupNormActivation, config=config) - - out_channels = out_channels or in_channels - mid_chs = make_div(out_channels * bottle_ratio) - - if is_first_layer: - self.downsample = BitDownsampleConv( - in_channels, - out_channels, - stride=stride, - preact=False, - conv_layer=conv_layer, - norm_layer=norm_layer, - ) - else: - self.downsample = None - - self.conv1 = conv_layer(in_channels, mid_chs, 1) - self.norm1 = norm_layer(num_channels=mid_chs) - - self.conv2 = conv_layer(mid_chs, mid_chs, 3, stride=stride, dilation=first_dilation, groups=groups) - self.norm2 = norm_layer(num_channels=mid_chs) - - self.conv3 = conv_layer(mid_chs, out_channels, 1) - self.norm3 = norm_layer(num_channels=out_channels, apply_act=False) - - self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() - self.activation = ACT2FN[config.hidden_act] - - def forward(self, x): - # shortcut branch - shortcut = x - if self.downsample is not None: - shortcut = self.downsample(x) - - # residual - x = self.conv1(x) - x = self.norm1(x) - - # second step - x = self.conv2(x) - x = self.norm2(x) - - # third step - x = self.conv3(x) - x = self.norm3(x) - - # final step - x = self.drop_path(x) - x = self.activation(x + shortcut) - return x + return self.activation(x + shortcut) + + +# class BitBottleneckLayer(nn.Module): +# """Non Pre-activation bottleneck block, equivalent to V1.5/V1b bottleneck. 
Used for ViT.""" + +# def __init__( +# self, +# config, +# in_channels, +# out_channels=None, +# bottle_ratio=0.25, +# stride=1, +# dilation=1, +# first_dilation=None, +# groups=1, +# drop_path_rate=0.0, +# is_first_layer=False, +# ): +# super().__init__() +# first_dilation = first_dilation or dilation + +# conv_layer = partial(WeightStandardizedConv2d, eps=1e-8, padding=config.convolutional_padding) + +# norm_layer = partial(BitGroupNormActivation, config=config) + +# out_channels = out_channels or in_channels +# mid_chs = make_div(out_channels * bottle_ratio) + +# if is_first_layer: +# self.downsample = BitDownsampleConv( +# in_channels, +# out_channels, +# stride=stride, +# preact=False, +# conv_layer=conv_layer, +# norm_layer=norm_layer, +# ) +# else: +# self.downsample = None + +# self.conv1 = conv_layer(in_channels, mid_chs, 1) +# self.norm1 = norm_layer(num_channels=mid_chs) + +# self.conv2 = conv_layer(mid_chs, mid_chs, 3, stride=stride, dilation=first_dilation, groups=groups) +# self.norm2 = norm_layer(num_channels=mid_chs) + +# self.conv3 = conv_layer(mid_chs, out_channels, 1) +# self.norm3 = norm_layer(num_channels=out_channels, apply_act=False) + +# self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() +# self.activation = ACT2FN[config.hidden_act] + +# def forward(self, x): +# # shortcut branch +# shortcut = x +# if self.downsample is not None: +# shortcut = self.downsample(x) + +# # residual +# x = self.conv1(x) +# x = self.norm1(x) + +# # second step +# x = self.conv2(x) +# x = self.norm2(x) + +# # third step +# x = self.conv3(x) +# x = self.norm3(x) + +# # final step +# x = self.drop_path(x) +# x = self.activation(x + shortcut) +# return x class BitDownsampleConv(nn.Module): @@ -580,16 +530,8 @@ def __init__( first_dilation = 1 if dilation in (1, 2) else 2 - # Step 1: Get the layer type - if config.layer_type == "bottleneck": - layer_fn = BitBottleneckLayer - elif config.layer_type == "preactivation": - layer_fn = BitPreActivationBottleneckLayer - else: - raise ValueError( - f"Unknown layer type: {config.layer_type}. 
Please use one of the following: [`'bottleneck'`," - " `'preactivation`]" - ) + # Get the layer type + layer_fn = partial(BitBottleneckLayer, use_activation=config.layer_type == "bottleneck") prev_chs = in_channels self.layers = nn.Sequential() @@ -640,7 +582,7 @@ def forward(self, input: Tensor) -> Tensor: class BitEncoder(nn.Module): - def __init__(self, config: BitConfig): + def __init__(self, config: BitConfig, conv_layer: nn.Module): super().__init__() self.stages = nn.ModuleList([]) @@ -765,9 +707,12 @@ class BitModel(BitPreTrainedModel): def __init__(self, config): super().__init__(config) self.config = config - self.embedder = BitEmbeddings(config) - self.encoder = BitEncoder(config) + conv_layer = partial(WeightStandardizedConv2d, eps=1e-8, padding=config.convolutional_padding) + + self.embedder = BitEmbeddings(config, conv_layer) + + self.encoder = BitEncoder(config, conv_layer) norm_layer = BitGroupNormActivation self.norm = ( norm_layer(config, num_channels=config.hidden_sizes[-1]) From 379d9b816d51910b3bba07041108b7cff227cb6d Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 1 Dec 2022 09:26:15 +0000 Subject: [PATCH 34/88] get rid of stem --- .../models/bit/configuration_bit.py | 10 +++++--- src/transformers/models/bit/modeling_bit.py | 25 +++++++++++++------ 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index ccf0d278e85a..27e5209e0134 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -49,12 +49,14 @@ class BitConfig(PretrainedConfig): hidden_act (`str`, *optional*, defaults to `"relu"`): The non-linear activation function in each block. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. - convolutional_padding (`str`, *optional*, defaults to `"valid"`): - padding strategy to use for `StdConv2d` layers, it can be either `"valid"`, or `"same"`. + convolutional_padding (`str`, *optional*, defaults to `None`): + padding strategy to use for `StdConv2d` layers, it can be either `"valid"`, `"same"`, or `None`. num_groups (`int`, *optional*, defaults to `32`): Number of groups used for the `BitGroupNormActivation` layers. drop_path_rate (`float`, *optional*, defaults to 0.0): The drop path rate for the stochastic depth. + embedding_dynamic_padding (`bool`, *optional*, defaults to `False`): + Make use of dynamic padding for the embedding layer output_stride (`int`, *optional*, defaults to 32): The output stride of the model. 
width_factor (`int`, *optional*, defaults to 1): @@ -83,7 +85,6 @@ def __init__( embedding_size=64, hidden_sizes=[256, 512, 1024, 2048], depths=[3, 4, 6, 3], - stem_type="", layer_type="preactivation", hidden_act="relu", drop_path_rate=0.0, @@ -92,6 +93,7 @@ def __init__( convolutional_padding=None, num_groups=32, out_features=None, + embedding_dynamic_padding=False, **kwargs ): super().__init__(**kwargs) @@ -101,13 +103,13 @@ def __init__( self.embedding_size = embedding_size self.hidden_sizes = hidden_sizes self.depths = depths - self.stem_type = stem_type self.layer_type = layer_type self.hidden_act = hidden_act self.num_groups = num_groups self.drop_path_rate = drop_path_rate self.output_stride = output_stride self.width_factor = width_factor + self.embedding_dynamic_padding = embedding_dynamic_padding self.convolutional_padding = ( convolutional_padding if convolutional_padding is None else convolutional_padding.upper() ) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 5aadaa846329..8f936ba4b23c 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -233,17 +233,27 @@ def __call__(self, input): return input -class MaxPool2dSame(nn.MaxPool2d): +class BitMaxPool2d(nn.MaxPool2d): """Tensorflow like 'SAME' wrapper for 2D max pooling""" def __init__( - self, kernel_size: int, stride=None, dilation=1, ceil_mode=False, padding=(0, 0), padding_value=-float("inf") + self, + kernel_size: int, + stride=None, + dilation=1, + ceil_mode=False, + padding=(0, 0), + padding_value=-float("inf"), + use_dynamic_padding=True, ): kernel_size = kernel_size if isinstance(kernel_size, collections.abc.Iterable) else (kernel_size, kernel_size) stride = stride if isinstance(stride, collections.abc.Iterable) else (stride, stride) dilation = dilation if isinstance(dilation, collections.abc.Iterable) else (dilation, dilation) - super(MaxPool2dSame, self).__init__(kernel_size, stride, padding, dilation, ceil_mode) - self.pad = DynamicPad2d(kernel_size, stride, dilation, padding_value) + super(BitMaxPool2d, self).__init__(kernel_size, stride, padding, dilation, ceil_mode) + if use_dynamic_padding: + self.pad = DynamicPad2d(kernel_size, stride, dilation, padding_value) + else: + self.pad = nn.Identity() def forward(self, x): x = self.pad(x) @@ -260,11 +270,12 @@ def __init__(self, config: BitConfig, conv_layer: nn.Module): self.convolution = conv_layer(config.num_channels, config.embedding_size, kernel_size=7, stride=2) - if config.stem_type == "same": - self.pooler = MaxPool2dSame(kernel_size=3, stride=2) + self.pooler = BitMaxPool2d(kernel_size=3, stride=2, use_dynamic_padding=config.embedding_dynamic_padding) + + # Use the same padding strategy as convolutional layers + if config.convolutional_padding == "SAME": self.pad = nn.Identity() else: - self.pooler = nn.MaxPool2d(kernel_size=3, stride=2) self.pad = nn.ConstantPad2d(padding=(1, 1, 1, 1), value=0.0) if not config.layer_type == "preactivation": From 0b678137bcbdd312719d115fe3709bfd14b66592 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 1 Dec 2022 09:30:56 +0000 Subject: [PATCH 35/88] clean --- src/transformers/models/bit/modeling_bit.py | 74 --------------------- 1 file changed, 74 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 8f936ba4b23c..70be0823a498 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ 
-428,80 +428,6 @@ def forward(self, x): return self.activation(x + shortcut) -# class BitBottleneckLayer(nn.Module): -# """Non Pre-activation bottleneck block, equivalent to V1.5/V1b bottleneck. Used for ViT.""" - -# def __init__( -# self, -# config, -# in_channels, -# out_channels=None, -# bottle_ratio=0.25, -# stride=1, -# dilation=1, -# first_dilation=None, -# groups=1, -# drop_path_rate=0.0, -# is_first_layer=False, -# ): -# super().__init__() -# first_dilation = first_dilation or dilation - -# conv_layer = partial(WeightStandardizedConv2d, eps=1e-8, padding=config.convolutional_padding) - -# norm_layer = partial(BitGroupNormActivation, config=config) - -# out_channels = out_channels or in_channels -# mid_chs = make_div(out_channels * bottle_ratio) - -# if is_first_layer: -# self.downsample = BitDownsampleConv( -# in_channels, -# out_channels, -# stride=stride, -# preact=False, -# conv_layer=conv_layer, -# norm_layer=norm_layer, -# ) -# else: -# self.downsample = None - -# self.conv1 = conv_layer(in_channels, mid_chs, 1) -# self.norm1 = norm_layer(num_channels=mid_chs) - -# self.conv2 = conv_layer(mid_chs, mid_chs, 3, stride=stride, dilation=first_dilation, groups=groups) -# self.norm2 = norm_layer(num_channels=mid_chs) - -# self.conv3 = conv_layer(mid_chs, out_channels, 1) -# self.norm3 = norm_layer(num_channels=out_channels, apply_act=False) - -# self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() -# self.activation = ACT2FN[config.hidden_act] - -# def forward(self, x): -# # shortcut branch -# shortcut = x -# if self.downsample is not None: -# shortcut = self.downsample(x) - -# # residual -# x = self.conv1(x) -# x = self.norm1(x) - -# # second step -# x = self.conv2(x) -# x = self.norm2(x) - -# # third step -# x = self.conv3(x) -# x = self.norm3(x) - -# # final step -# x = self.drop_path(x) -# x = self.activation(x + shortcut) -# return x - - class BitDownsampleConv(nn.Module): def __init__( self, From a06a70ee8c220bd5f8c9e13b7ad431f3f1f55bc8 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 1 Dec 2022 09:34:59 +0000 Subject: [PATCH 36/88] replace flag --- src/transformers/models/bit/configuration_bit.py | 10 ++++------ src/transformers/models/bit/modeling_bit.py | 6 +++--- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index 27e5209e0134..c621eb07da0d 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -49,8 +49,8 @@ class BitConfig(PretrainedConfig): hidden_act (`str`, *optional*, defaults to `"relu"`): The non-linear activation function in each block. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. - convolutional_padding (`str`, *optional*, defaults to `None`): - padding strategy to use for `StdConv2d` layers, it can be either `"valid"`, `"same"`, or `None`. + global_padding (`str`, *optional*, defaults to `None`): + padding strategy to use for most of the layers, it can be either `"valid"`, `"same"`, or `None`. num_groups (`int`, *optional*, defaults to `32`): Number of groups used for the `BitGroupNormActivation` layers. 
drop_path_rate (`float`, *optional*, defaults to 0.0): @@ -90,7 +90,7 @@ def __init__( drop_path_rate=0.0, output_stride=32, width_factor=1, - convolutional_padding=None, + global_padding=None, num_groups=32, out_features=None, embedding_dynamic_padding=False, @@ -110,9 +110,7 @@ def __init__( self.output_stride = output_stride self.width_factor = width_factor self.embedding_dynamic_padding = embedding_dynamic_padding - self.convolutional_padding = ( - convolutional_padding if convolutional_padding is None else convolutional_padding.upper() - ) + self.global_padding = global_padding if global_padding is None else global_padding.upper() self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] if out_features is not None: if not isinstance(out_features, list): diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 70be0823a498..5c050b183aef 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -273,7 +273,7 @@ def __init__(self, config: BitConfig, conv_layer: nn.Module): self.pooler = BitMaxPool2d(kernel_size=3, stride=2, use_dynamic_padding=config.embedding_dynamic_padding) # Use the same padding strategy as convolutional layers - if config.convolutional_padding == "SAME": + if config.global_padding == "SAME": self.pad = nn.Identity() else: self.pad = nn.ConstantPad2d(padding=(1, 1, 1, 1), value=0.0) @@ -377,7 +377,7 @@ def __init__( first_dilation = first_dilation or dilation - conv_layer = partial(WeightStandardizedConv2d, eps=1e-8, padding=config.convolutional_padding) + conv_layer = partial(WeightStandardizedConv2d, eps=1e-8, padding=config.global_padding) norm_layer = partial(BitGroupNormActivation, config=config) @@ -645,7 +645,7 @@ def __init__(self, config): super().__init__(config) self.config = config - conv_layer = partial(WeightStandardizedConv2d, eps=1e-8, padding=config.convolutional_padding) + conv_layer = partial(WeightStandardizedConv2d, eps=1e-8, padding=config.global_padding) self.embedder = BitEmbeddings(config, conv_layer) From 8c8d3d1ae34ccd3e2127f3d6b79df94a48453cae Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 1 Dec 2022 11:14:31 +0100 Subject: [PATCH 37/88] Update src/transformers/models/bit/configuration_bit.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- src/transformers/models/bit/configuration_bit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index c621eb07da0d..e2c6f9d4d504 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -56,7 +56,7 @@ class BitConfig(PretrainedConfig): drop_path_rate (`float`, *optional*, defaults to 0.0): The drop path rate for the stochastic depth. embedding_dynamic_padding (`bool`, *optional*, defaults to `False`): - Make use of dynamic padding for the embedding layer + Whether or not to make use of dynamic padding for the embedding layer. output_stride (`int`, *optional*, defaults to 32): The output stride of the model. 
width_factor (`int`, *optional*, defaults to 1): From 01024fadbd5bdd6ec92353c392bcb27ba0cc74dc Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 1 Dec 2022 11:22:42 +0100 Subject: [PATCH 38/88] Update src/transformers/models/bit/configuration_bit.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- src/transformers/models/bit/configuration_bit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index e2c6f9d4d504..0bbdaac0a832 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -49,8 +49,8 @@ class BitConfig(PretrainedConfig): hidden_act (`str`, *optional*, defaults to `"relu"`): The non-linear activation function in each block. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. - global_padding (`str`, *optional*, defaults to `None`): - padding strategy to use for most of the layers, it can be either `"valid"`, `"same"`, or `None`. + global_padding (`str`, *optional*): + Padding strategy to use for the convolutional layers. Can be either `"valid"`, `"same"`, or `None`. num_groups (`int`, *optional*, defaults to `32`): Number of groups used for the `BitGroupNormActivation` layers. drop_path_rate (`float`, *optional*, defaults to 0.0): From 04d3c870eff74bc79c71dd96cb8eb9db33a1d389 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 1 Dec 2022 10:36:34 +0000 Subject: [PATCH 39/88] add check --- src/transformers/models/bit/configuration_bit.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index 0bbdaac0a832..c496467ed415 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -78,6 +78,7 @@ class BitConfig(PretrainedConfig): """ model_type = "bit" layer_types = ["preactivation", "bottleneck"] + supported_padding = ["SAME", "VALID"] def __init__( self, @@ -99,6 +100,8 @@ def __init__( super().__init__(**kwargs) if layer_type not in self.layer_types: raise ValueError(f"layer_type={layer_type} is not one of {','.join(self.layer_types)}") + if global_padding is not None and global_padding.upper() in self.supported_padding: + global_padding = global_padding.upper() self.num_channels = num_channels self.embedding_size = embedding_size self.hidden_sizes = hidden_sizes @@ -110,7 +113,8 @@ def __init__( self.output_stride = output_stride self.width_factor = width_factor self.embedding_dynamic_padding = embedding_dynamic_padding - self.global_padding = global_padding if global_padding is None else global_padding.upper() + + self.global_padding = global_padding self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] if out_features is not None: if not isinstance(out_features, list): From d960869c833d78b4266b9c14e15065ada71f51fe Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 1 Dec 2022 10:39:00 +0000 Subject: [PATCH 40/88] another check --- src/transformers/models/bit/configuration_bit.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index c496467ed415..37ab6dbba80e 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ 
b/src/transformers/models/bit/configuration_bit.py @@ -100,8 +100,11 @@ def __init__( super().__init__(**kwargs) if layer_type not in self.layer_types: raise ValueError(f"layer_type={layer_type} is not one of {','.join(self.layer_types)}") - if global_padding is not None and global_padding.upper() in self.supported_padding: - global_padding = global_padding.upper() + if global_padding is not None: + if global_padding.upper() in self.supported_padding: + global_padding = global_padding.upper() + else: + raise ValueError(f"Padding strategy {global_padding} not supported") self.num_channels = num_channels self.embedding_size = embedding_size self.hidden_sizes = hidden_sizes From a2e2c705384efab7426df0e5893d357bc806235c Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 1 Dec 2022 14:39:39 +0000 Subject: [PATCH 41/88] fix for hybrid vit --- src/transformers/models/bit/modeling_bit.py | 78 ++++++++++++++++++++- 1 file changed, 76 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 5c050b183aef..56c4377d0d70 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -351,7 +351,7 @@ def make_div(value, divisor=8): return new_value -class BitBottleneckLayer(nn.Module): +class BitPreActBottleneckLayer(nn.Module): """Pre-activation (v2) bottleneck block. Follows the implementation of "Identity Mappings in Deep Residual Networks": https://github.com/KaimingHe/resnet-1k-layers/blob/master/resnet-pre-act.lua @@ -428,6 +428,77 @@ def forward(self, x): return self.activation(x + shortcut) +class BitBottleneckLayer(nn.Module): + """Non Pre-activation bottleneck block, equivalent to V1.5/V1b bottleneck. Used for ViT.""" + + def __init__( + self, + config, + in_channels, + out_channels=None, + bottle_ratio=0.25, + stride=1, + dilation=1, + first_dilation=None, + groups=1, + drop_path_rate=0.0, + is_first_layer=False, + use_activation=False, + ): + super().__init__() + first_dilation = first_dilation or dilation + conv_layer = partial(WeightStandardizedConv2d, eps=1e-8, padding=config.global_padding) + + norm_layer = partial(BitGroupNormActivation, config=config) + out_channels = out_channels or in_channels + mid_chs = make_div(out_channels * bottle_ratio) + + if is_first_layer and config.downsample_in_first_stage: + self.downsample = BitDownsampleConv( + in_channels, + out_channels, + stride=stride, + preact=False, + conv_layer=conv_layer, + norm_layer=norm_layer, + ) + else: + self.downsample = None + + self.conv1 = conv_layer(in_channels, mid_chs, 1) + self.norm1 = norm_layer(num_channels=mid_chs) + self.conv2 = conv_layer(mid_chs, mid_chs, 3, stride=stride, dilation=first_dilation, groups=groups) + self.norm2 = norm_layer(num_channels=mid_chs) + self.conv3 = conv_layer(mid_chs, out_channels, 1) + self.norm3 = norm_layer(num_channels=out_channels, apply_act=False) + self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() + + if use_activation: + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = nn.Identity() + + def forward(self, x): + # shortcut branch + shortcut = x + if self.downsample is not None: + shortcut = self.downsample(x) + + # residual + x = self.conv1(x) + x = self.norm1(x) + + x = self.conv2(x) + x = self.norm2(x) + + x = self.conv3(x) + x = self.norm3(x) + + x = self.drop_path(x) + x = self.activation(x + shortcut) + return x + + class BitDownsampleConv(nn.Module): def __init__( self, @@ -468,7 
+539,10 @@ def __init__( first_dilation = 1 if dilation in (1, 2) else 2 # Get the layer type - layer_fn = partial(BitBottleneckLayer, use_activation=config.layer_type == "bottleneck") + if config.layer_type == "bottleneck": + layer_fn = partial(BitBottleneckLayer, use_activation=True) + else: + layer_fn = partial(BitPreActBottleneckLayer, use_activation=False) prev_chs = in_channels self.layers = nn.Sequential() From bae635359d66b7af0f1547de5f09562037bfb895 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 1 Dec 2022 15:49:01 +0000 Subject: [PATCH 42/88] final fix --- .../models/bit/configuration_bit.py | 7 +++++++ src/transformers/models/bit/modeling_bit.py | 21 ++++++++----------- .../convert_vit_hybrid_timm_to_pytorch.py | 4 ++-- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index 37ab6dbba80e..cfbe12d36ff0 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -57,6 +57,8 @@ class BitConfig(PretrainedConfig): The drop path rate for the stochastic depth. embedding_dynamic_padding (`bool`, *optional*, defaults to `False`): Whether or not to make use of dynamic padding for the embedding layer. + downsample_in_first_stage (`bool`, *optional*, defaults to `True`): + Whether to add downsampling layer on the first stage output_stride (`int`, *optional*, defaults to 32): The output stride of the model. width_factor (`int`, *optional*, defaults to 1): @@ -95,6 +97,8 @@ def __init__( num_groups=32, out_features=None, embedding_dynamic_padding=False, + downsample_in_first_stage=True, + stem_type="same", **kwargs ): super().__init__(**kwargs) @@ -116,6 +120,9 @@ def __init__( self.output_stride = output_stride self.width_factor = width_factor self.embedding_dynamic_padding = embedding_dynamic_padding + self.downsample_in_first_stage = downsample_in_first_stage + + self.stem_type = stem_type self.global_padding = global_padding self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 56c4377d0d70..19aa85c12f07 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -188,7 +188,7 @@ class DynamicPad2d(nn.Module): hidden states. 
""" - def __init__(self, kernel_size, stride, dilation, value=-float("inf")): + def __init__(self, kernel_size, stride, dilation, value=0): super().__init__() # Safety checkers if isinstance(kernel_size, int): @@ -243,7 +243,7 @@ def __init__( dilation=1, ceil_mode=False, padding=(0, 0), - padding_value=-float("inf"), + padding_value=0, use_dynamic_padding=True, ): kernel_size = kernel_size if isinstance(kernel_size, collections.abc.Iterable) else (kernel_size, kernel_size) @@ -273,7 +273,7 @@ def __init__(self, config: BitConfig, conv_layer: nn.Module): self.pooler = BitMaxPool2d(kernel_size=3, stride=2, use_dynamic_padding=config.embedding_dynamic_padding) # Use the same padding strategy as convolutional layers - if config.global_padding == "SAME": + if config.global_padding is not None and config.global_padding.upper() == "SAME": self.pad = nn.Identity() else: self.pad = nn.ConstantPad2d(padding=(1, 1, 1, 1), value=0.0) @@ -443,17 +443,17 @@ def __init__( groups=1, drop_path_rate=0.0, is_first_layer=False, - use_activation=False, ): super().__init__() first_dilation = first_dilation or dilation + conv_layer = partial(WeightStandardizedConv2d, eps=1e-8, padding=config.global_padding) norm_layer = partial(BitGroupNormActivation, config=config) out_channels = out_channels or in_channels mid_chs = make_div(out_channels * bottle_ratio) - if is_first_layer and config.downsample_in_first_stage: + if is_first_layer: self.downsample = BitDownsampleConv( in_channels, out_channels, @@ -473,10 +473,7 @@ def __init__( self.norm3 = norm_layer(num_channels=out_channels, apply_act=False) self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() - if use_activation: - self.activation = ACT2FN[config.hidden_act] - else: - self.activation = nn.Identity() + self.activation = ACT2FN[config.hidden_act] def forward(self, x): # shortcut branch @@ -490,10 +487,10 @@ def forward(self, x): x = self.conv2(x) x = self.norm2(x) - + x = self.conv3(x) x = self.norm3(x) - + x = self.drop_path(x) x = self.activation(x + shortcut) return x @@ -540,7 +537,7 @@ def __init__( # Get the layer type if config.layer_type == "bottleneck": - layer_fn = partial(BitBottleneckLayer, use_activation=True) + layer_fn = partial(BitBottleneckLayer) else: layer_fn = partial(BitPreActBottleneckLayer, use_activation=False) diff --git a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py index 3673acd575eb..3503533055eb 100644 --- a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py +++ b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py @@ -161,11 +161,11 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, push_to_hub=False # define default ViT hybrid configuration backbone_config = BitConfig( - stem_type="same", - conv_layer="std_conv_same", + global_padding="same", layer_type="bottleneck", depths=(3, 4, 9), out_features=["stage3"], + embedding_dynamic_padding=True, ) config = ViTHybridConfig(backbone_config=backbone_config, image_size=384, num_labels=1000) base_model = False From 4f730af8e319efe08d8ddec84f75a2157e3f4278 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 1 Dec 2022 15:51:51 +0000 Subject: [PATCH 43/88] update config --- src/transformers/models/bit/configuration_bit.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index 
cfbe12d36ff0..37ab6dbba80e 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -57,8 +57,6 @@ class BitConfig(PretrainedConfig): The drop path rate for the stochastic depth. embedding_dynamic_padding (`bool`, *optional*, defaults to `False`): Whether or not to make use of dynamic padding for the embedding layer. - downsample_in_first_stage (`bool`, *optional*, defaults to `True`): - Whether to add downsampling layer on the first stage output_stride (`int`, *optional*, defaults to 32): The output stride of the model. width_factor (`int`, *optional*, defaults to 1): @@ -97,8 +95,6 @@ def __init__( num_groups=32, out_features=None, embedding_dynamic_padding=False, - downsample_in_first_stage=True, - stem_type="same", **kwargs ): super().__init__(**kwargs) @@ -120,9 +116,6 @@ def __init__( self.output_stride = output_stride self.width_factor = width_factor self.embedding_dynamic_padding = embedding_dynamic_padding - self.downsample_in_first_stage = downsample_in_first_stage - - self.stem_type = stem_type self.global_padding = global_padding self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] From dc8ba59834c3aad8fa6116eea0a183e3545ed2ad Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 2 Dec 2022 09:41:33 +0100 Subject: [PATCH 44/88] fix class name --- src/transformers/models/bit/modeling_bit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 19aa85c12f07..48b5a95aa2c2 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -351,7 +351,7 @@ def make_div(value, divisor=8): return new_value -class BitPreActBottleneckLayer(nn.Module): +class BitPreActivationBottleneckLayer(nn.Module): """Pre-activation (v2) bottleneck block. 
Follows the implementation of "Identity Mappings in Deep Residual Networks": https://github.com/KaimingHe/resnet-1k-layers/blob/master/resnet-pre-act.lua @@ -539,7 +539,7 @@ def __init__( if config.layer_type == "bottleneck": layer_fn = partial(BitBottleneckLayer) else: - layer_fn = partial(BitPreActBottleneckLayer, use_activation=False) + layer_fn = partial(BitPreActivationBottleneckLayer, use_activation=False) prev_chs = in_channels self.layers = nn.Sequential() From b53a44ba340448ef91be839cb7b4a278276fac96 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 2 Dec 2022 09:42:54 +0100 Subject: [PATCH 45/88] fix `make fix-copies` --- src/transformers/models/bit/modeling_bit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 48b5a95aa2c2..483e8f3856e5 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -658,7 +658,7 @@ def forward( ) -# Copied from transformers.models.resnet.modeling_resnet.ResNetPreTrainedModel with ResNet->Bit,resnet->resnetv2 +# Copied from transformers.models.resnet.modeling_resnet.ResNetPreTrainedModel with ResNet->Bit,resnet->bit class BitPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained From a21e03c5ce21a810aa804797dd9c92540d54d8a4 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 2 Dec 2022 09:45:52 +0100 Subject: [PATCH 46/88] remove `use_activation` --- src/transformers/models/bit/modeling_bit.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 483e8f3856e5..ba30beac9ed4 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -371,7 +371,6 @@ def __init__( groups=1, drop_path_rate=0.0, is_first_layer=False, - use_activation=False, ): super().__init__() @@ -407,11 +406,6 @@ def __init__( self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() - if use_activation: - self.activation = ACT2FN[config.hidden_act] - else: - self.activation = nn.Identity() - def forward(self, x): x_preact = self.norm1(x) @@ -425,7 +419,7 @@ def forward(self, x): x = self.conv2(self.norm2(x)) x = self.conv3(self.norm3(x)) x = self.drop_path(x) - return self.activation(x + shortcut) + return x + shortcut class BitBottleneckLayer(nn.Module): @@ -537,9 +531,9 @@ def __init__( # Get the layer type if config.layer_type == "bottleneck": - layer_fn = partial(BitBottleneckLayer) + layer_fn = BitBottleneckLayer else: - layer_fn = partial(BitPreActivationBottleneckLayer, use_activation=False) + layer_fn = BitPreActivationBottleneckLayer prev_chs = in_channels self.layers = nn.Sequential() From 218d9d53c510e705434705ffe0ebb5299b0f5089 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Fri, 2 Dec 2022 11:07:11 +0100 Subject: [PATCH 47/88] Update src/transformers/models/bit/configuration_bit.py --- src/transformers/models/bit/configuration_bit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index 37ab6dbba80e..65da2e638189 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -20,7 
+20,7 @@ logger = logging.get_logger(__name__) -RESNETV2_PRETRAINED_CONFIG_ARCHIVE_MAP = { +BIT_PRETRAINED_CONFIG_ARCHIVE_MAP = { "google/resnetv2-50": "https://huggingface.co/google/resnetv2-50/resolve/main/config.json", } From d374194c6d628dba63ee428ed3e13970ecd87129 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 2 Dec 2022 11:08:20 +0100 Subject: [PATCH 48/88] rm unneeded file --- src/transformers/models/bit/test.py | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 src/transformers/models/bit/test.py diff --git a/src/transformers/models/bit/test.py b/src/transformers/models/bit/test.py deleted file mode 100644 index a6596073103f..000000000000 --- a/src/transformers/models/bit/test.py +++ /dev/null @@ -1,11 +0,0 @@ -import torch - -from transformers import BitBackbone, BitConfig - - -backbone_config = BitConfig(stem_type="same", layer_type="bottleneck", depths=(3, 4, 9), out_features=["stage3"]) - -model = BitBackbone(backbone_config) - -outputs = model(torch.rand(1, 3, 224, 224)) -print(outputs.feature_maps[-1].shape) From 22ed211e83f9284ba6457a332536dd0d7d618551 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 2 Dec 2022 11:09:15 +0100 Subject: [PATCH 49/88] Add BiT image processor --- docs/source/en/model_doc/bit.mdx | 6 + src/transformers/__init__.py | 2 + src/transformers/models/bit/__init__.py | 20 +- .../models/bit/image_processing_bit.py | 345 ++++++++++++++++++ .../utils/dummy_vision_objects.py | 7 + 5 files changed, 379 insertions(+), 1 deletion(-) create mode 100644 src/transformers/models/bit/image_processing_bit.py diff --git a/docs/source/en/model_doc/bit.mdx b/docs/source/en/model_doc/bit.mdx index cbc34c9f8878..d499f21632a7 100644 --- a/docs/source/en/model_doc/bit.mdx +++ b/docs/source/en/model_doc/bit.mdx @@ -36,6 +36,12 @@ The original code can be found [here](https://github.com/google-research/big_tra [[autodoc]] BitConfig +## BitImageProcessor + +[[autodoc]] BitImageProcessor + - preprocess + + ## BitModel [[autodoc]] BitModel diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index ae4853b645fd..027760542e14 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -746,6 +746,7 @@ _import_structure["image_transforms"] = ["rescale", "resize", "to_pil_image"] _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] _import_structure["models.beit"].extend(["BeitFeatureExtractor", "BeitImageProcessor"]) + _import_structure["models.bit"].extend(["BitImageProcessor"]) _import_structure["models.chinese_clip"].extend(["ChineseCLIPFeatureExtractor", "ChineseCLIPImageProcessor"]) _import_structure["models.clip"].extend(["CLIPFeatureExtractor", "CLIPImageProcessor"]) _import_structure["models.conditional_detr"].extend( @@ -3929,6 +3930,7 @@ from .image_transforms import rescale, resize, to_pil_image from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor, BeitImageProcessor + from .models.bit import BitImageProcessor from .models.chinese_clip import ChineseCLIPFeatureExtractor, ChineseCLIPImageProcessor from .models.clip import CLIPFeatureExtractor, CLIPImageProcessor from .models.conditional_detr import ConditionalDetrFeatureExtractor, ConditionalDetrImageProcessor diff --git a/src/transformers/models/bit/__init__.py b/src/transformers/models/bit/__init__.py index ae7763201aaa..53118146ad14 100644 --- a/src/transformers/models/bit/__init__.py +++ b/src/transformers/models/bit/__init__.py @@ -18,7 +18,7 @@ from typing import TYPE_CHECKING # rely on 
isort to merge the imports -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available _import_structure = {"configuration_bit": ["BIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BitConfig", "BitOnnxConfig"]} @@ -37,6 +37,16 @@ "BitBackbone", ] + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["image_processing_bit"] = ["BitImageProcessor"] + + if TYPE_CHECKING: from .configuration_bit import BIT_PRETRAINED_CONFIG_ARCHIVE_MAP, BitConfig, BitOnnxConfig @@ -54,6 +64,14 @@ BitPreTrainedModel, ) + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .image_processing_bit import BitImageProcessor + else: import sys diff --git a/src/transformers/models/bit/image_processing_bit.py b/src/transformers/models/bit/image_processing_bit.py new file mode 100644 index 000000000000..8bd451791ef5 --- /dev/null +++ b/src/transformers/models/bit/image_processing_bit.py @@ -0,0 +1,345 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for BiT.""" + +from typing import Any, Dict, List, Optional, Union + +import numpy as np + +from transformers.utils.generic import TensorType + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import ( + center_crop, + get_resize_output_image_size, + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from ...image_utils import ChannelDimension, ImageInput, PILImageResampling, is_batched, to_numpy_array, valid_images +from ...utils import logging +from ...utils.import_utils import is_vision_available + + +logger = logging.get_logger(__name__) + + +if is_vision_available(): + import PIL + + +def convert_to_rgb(image: Union[Any, PIL.Image.Image]) -> Union[Any, PIL.Image.Image]: + """ + Converts `PIL.Image.Image` to RGB format. Images in other formats are returned as is. + + Args: + image (`PIL.Image.Image`): + The image to convert. + """ + if not isinstance(image, PIL.Image.Image): + return image + + return image.convert("RGB") + + +class BitImageProcessor(BaseImageProcessor): + r""" + Constructs a BiT image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by + `do_resize` in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`): + Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` + method. 
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the + `preprocess` method. + crop_size (`Dict[str, int]` *optional*, defaults to 224): + Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` + method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize: + Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Image standard deviation. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = True, + crop_size: Dict[str, int] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 224} + size = get_size_dict(size, default_to_square=False) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else [0.48145466, 0.4578275, 0.40821073] + self.image_std = image_std if image_std is not None else [0.26862954, 0.26130258, 0.27577711] + self.do_convert_rgb = do_convert_rgb + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. 
+ size (`Dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size, default_to_square=False) + if "shortest_edge" not in size: + raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}") + output_size = get_resize_output_image_size(image, size=size["shortest_edge"], default_to_square=False) + return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs) + + def center_crop( + self, + image: np.ndarray, + size: Dict[str, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Center crop an image. If the image is too small to be cropped to the size given, it will be padded (so the + returned result will always be of size `size`). + + Args: + image (`np.ndarray`): + Image to center crop. + size (`Dict[str, int]`): + Size of the output image in the form of a dictionary with keys `height` and `width`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` parameter must contain the keys (height, width). Got {size.keys()}") + return center_crop(image, size=(size["height"], size["width"]), data_format=data_format, **kwargs) + + def rescale( + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ): + """ + Rescale an image by a scale factor. image = image * scale. + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`int` or `float`): + Scale to apply to the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + image_mean (`float` or `List[float]`): + Image mean. + image_std (`float` or `List[float]`): + Image standard deviation. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. 
+ """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def preprocess( + self, + images: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: int = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + **kwargs + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: defaults to the channel dimension format of the input image. 
+ """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, param_name="size", default_to_square=False) + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + if not is_batched(images): + images = [images] + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + if do_resize and size is None: + raise ValueError("Size must be specified if do_resize is True.") + + if do_center_crop and crop_size is None: + raise ValueError("Crop size must be specified if do_center_crop is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. 
+ images = [to_numpy_array(image) for image in images] + + if do_resize: + images = [self.resize(image=image, size=size, resample=resample) for image in images] + + if do_center_crop: + images = [self.center_crop(image=image, size=crop_size) for image in images] + + if do_rescale: + images = [self.rescale(image=image, scale=rescale_factor) for image in images] + + if do_normalize: + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) \ No newline at end of file diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index b3e13ce7225b..b0fd8b6d5abe 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -43,6 +43,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class BitImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class ChineseCLIPFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] From 0e2207b86833de454cafaf152694461d0b649274 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 2 Dec 2022 11:09:45 +0100 Subject: [PATCH 50/88] rm unneeded file --- src/transformers/models/vit_hybrid/test.py | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 src/transformers/models/vit_hybrid/test.py diff --git a/src/transformers/models/vit_hybrid/test.py b/src/transformers/models/vit_hybrid/test.py deleted file mode 100644 index 861ce31c82f9..000000000000 --- a/src/transformers/models/vit_hybrid/test.py +++ /dev/null @@ -1,10 +0,0 @@ -from transformers import BitConfig, ViTHybridConfig, ViTHybridForImageClassification - - -backbone_config = BitConfig(stem_type="same", layer_type="bottleneck", depths=(3, 4, 9), out_features=["stage3"]) -config = ViTHybridConfig(backbone_config=backbone_config, image_size=384) - -model = ViTHybridForImageClassification(config) - -for name, param in model.named_parameters(): - print(name, param.shape) From 665179e3796968ebd7a45f21a27dc32be1bcd64d Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 2 Dec 2022 11:17:40 +0100 Subject: [PATCH 51/88] add doc --- docs/source/en/_toctree.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 62cbe467d81b..1196af976eb7 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -209,6 +209,8 @@ title: BigBird - local: model_doc/bigbird_pegasus title: BigBirdPegasus + - local: model_doc/bit + title: BiT - local: model_doc/blenderbot title: Blenderbot - local: model_doc/blenderbot-small @@ -440,6 +442,8 @@ title: VideoMAE - local: model_doc/vit title: Vision Transformer (ViT) + - local: model_doc/vit_hybrid + title: ViT-Hybrid - local: model_doc/vit_mae title: ViTMAE - local: model_doc/vit_msn From 76196d4f196b69be063da0b234c046838faffb21 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 2 Dec 2022 11:30:09 +0100 Subject: [PATCH 52/88] Add image processor to conversion script --- .../models/bit/convert_bit_to_pytorch.py | 31 +++++++++++++++++-- .../models/bit/image_processing_bit.py | 2 +- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/bit/convert_bit_to_pytorch.py b/src/transformers/models/bit/convert_bit_to_pytorch.py 
index a38aa242bf6e..c30f0e3ba612 100644 --- a/src/transformers/models/bit/convert_bit_to_pytorch.py +++ b/src/transformers/models/bit/convert_bit_to_pytorch.py @@ -27,7 +27,8 @@ from timm import create_model from timm.data import resolve_data_config from timm.data.transforms_factory import create_transform -from transformers import BitConfig, BitForImageClassification +from transformers import BitConfig, BitForImageClassification, BitImageProcessor +from transformers.image_utils import PILImageResampling from transformers.utils import logging @@ -103,11 +104,35 @@ def convert_bit_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=Fal model.eval() model.load_state_dict(state_dict) - # verify logits + # create image processor transform = create_transform(**resolve_data_config({}, model=timm_model)) + timm_transforms = transform.transforms + + pillow_resamplings = { + "bilinear": PILImageResampling.BILINEAR, + "bicubic": PILImageResampling.BICUBIC, + "nearest": PILImageResampling.NEAREST, + } + + processor = BitImageProcessor( + do_resize=True, + size={"shortest_edge": timm_transforms[0].size}, + resample=pillow_resamplings[timm_transforms[0].interpolation.value], + do_center_crop=True, + crop_size={"height": timm_transforms[1].size[0], "width": timm_transforms[1].size[1]}, + do_normalize=True, + image_mean=timm_transforms[-1].mean, + image_std=timm_transforms[-1].std, + ) + image = prepare_img() - pixel_values = transform(image).unsqueeze(0) + timm_pixel_values = transform(image).unsqueeze(0) + pixel_values = processor(image, return_tensors="pt").pixel_values + # verify pixel values + assert torch.allclose(timm_pixel_values, pixel_values) + + # verify logits with torch.no_grad(): outputs = model(pixel_values) logits = outputs.logits diff --git a/src/transformers/models/bit/image_processing_bit.py b/src/transformers/models/bit/image_processing_bit.py index 8bd451791ef5..8f2fa9524c39 100644 --- a/src/transformers/models/bit/image_processing_bit.py +++ b/src/transformers/models/bit/image_processing_bit.py @@ -342,4 +342,4 @@ def preprocess( images = [to_channel_dimension_format(image, data_format) for image in images] data = {"pixel_values": images} - return BatchFeature(data=data, tensor_type=return_tensors) \ No newline at end of file + return BatchFeature(data=data, tensor_type=return_tensors) From 04aa0c2b7e92443b7f31d95cabe94c55828589fd Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 2 Dec 2022 11:46:10 +0100 Subject: [PATCH 53/88] Add ViTHybrid image processor --- docs/source/en/model_doc/vit_hybrid.mdx | 5 + src/transformers/__init__.py | 2 + .../models/vit_hybrid/__init__.py | 18 +- .../convert_vit_hybrid_timm_to_pytorch.py | 40 +- .../vit_hybrid/image_processing_vit_hybrid.py | 345 ++++++++++++++++++ .../utils/dummy_vision_objects.py | 7 + 6 files changed, 410 insertions(+), 7 deletions(-) create mode 100644 src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py diff --git a/docs/source/en/model_doc/vit_hybrid.mdx b/docs/source/en/model_doc/vit_hybrid.mdx index 32c0964f7097..3404105fa3b1 100644 --- a/docs/source/en/model_doc/vit_hybrid.mdx +++ b/docs/source/en/model_doc/vit_hybrid.mdx @@ -46,6 +46,11 @@ found [here](https://github.com/google-research/vision_transformer). 
[[autodoc]] ViTHybridConfig +## ViTHybridImageProcessor + +[[autodoc]] ViTHybridImageProcessor + - preprocess + ## ViTHybridModel [[autodoc]] ViTHybridModel diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 027760542e14..b7ba7916d15a 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -777,6 +777,7 @@ _import_structure["models.videomae"].extend(["VideoMAEFeatureExtractor", "VideoMAEImageProcessor"]) _import_structure["models.vilt"].extend(["ViltFeatureExtractor", "ViltImageProcessor", "ViltProcessor"]) _import_structure["models.vit"].extend(["ViTFeatureExtractor", "ViTImageProcessor"]) + _import_structure["models.vit_hybrid"].extend(["ViTHybridImageProcessor"]) _import_structure["models.yolos"].extend(["YolosFeatureExtractor", "YolosImageProcessor"]) # Timm-backed objects @@ -3957,6 +3958,7 @@ from .models.videomae import VideoMAEFeatureExtractor, VideoMAEImageProcessor from .models.vilt import ViltFeatureExtractor, ViltImageProcessor, ViltProcessor from .models.vit import ViTFeatureExtractor, ViTImageProcessor + from .models.vit_hybrid import ViTHybridImageProcessor from .models.yolos import YolosFeatureExtractor, YolosImageProcessor # Modeling diff --git a/src/transformers/models/vit_hybrid/__init__.py b/src/transformers/models/vit_hybrid/__init__.py index 5b86bef38cb7..b50378682a83 100644 --- a/src/transformers/models/vit_hybrid/__init__.py +++ b/src/transformers/models/vit_hybrid/__init__.py @@ -17,7 +17,7 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available _import_structure = {"configuration_vit_hybrid": ["VIT_HYBRID_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTHybridConfig"]} @@ -35,6 +35,14 @@ "ViTHybridPreTrainedModel", ] +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["image_processing_vit_hybrid"] = ["ViTHybridImageProcessor"] + if TYPE_CHECKING: from .configuration_vit_hybrid import VIT_HYBRID_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTHybridConfig @@ -52,6 +60,14 @@ ViTHybridPreTrainedModel, ) + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .image_processing_vit_hybrid import ViTHybridImageProcessor + else: import sys diff --git a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py index 3503533055eb..e6e54c4a333b 100644 --- a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py +++ b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py @@ -25,7 +25,14 @@ import timm from timm.data import resolve_data_config from timm.data.transforms_factory import create_transform -from transformers import BitConfig, ViTHybridConfig, ViTHybridForImageClassification, ViTHybridModel +from transformers import ( + BitConfig, + ViTHybridConfig, + ViTHybridForImageClassification, + ViTHybridImageProcessor, + ViTHybridModel, +) +from transformers.image_utils import PILImageResampling from transformers.utils import logging @@ -190,14 +197,35 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, push_to_hub=False model = ViTHybridForImageClassification(config).eval() model.load_state_dict(state_dict) - # Check 
outputs on an image + # create image processor transform = create_transform(**resolve_data_config({}, model=timm_model)) + timm_transforms = transform.transforms + + pillow_resamplings = { + "bilinear": PILImageResampling.BILINEAR, + "bicubic": PILImageResampling.BICUBIC, + "nearest": PILImageResampling.NEAREST, + } + + processor = ViTHybridImageProcessor( + do_resize=True, + size={"shortest_edge": timm_transforms[0].size}, + resample=pillow_resamplings[timm_transforms[0].interpolation.value], + do_center_crop=True, + crop_size={"height": timm_transforms[1].size[0], "width": timm_transforms[1].size[1]}, + do_normalize=True, + image_mean=timm_transforms[-1].mean, + image_std=timm_transforms[-1].std, + ) - # load image - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - pixel_values = transform(image).unsqueeze(0) + image = prepare_img() + timm_pixel_values = transform(image).unsqueeze(0) + pixel_values = processor(image, return_tensors="pt").pixel_values + + # verify pixel values + assert torch.allclose(timm_pixel_values, pixel_values) + # verify logits with torch.no_grad(): outputs = model(pixel_values) logits = outputs.logits diff --git a/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py b/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py new file mode 100644 index 000000000000..bd797ffcb68f --- /dev/null +++ b/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py @@ -0,0 +1,345 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for ViT hybrid.""" + +from typing import Any, Dict, List, Optional, Union + +import numpy as np + +from transformers.utils.generic import TensorType + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import ( + center_crop, + get_resize_output_image_size, + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from ...image_utils import ChannelDimension, ImageInput, PILImageResampling, is_batched, to_numpy_array, valid_images +from ...utils import logging +from ...utils.import_utils import is_vision_available + + +logger = logging.get_logger(__name__) + + +if is_vision_available(): + import PIL + + +def convert_to_rgb(image: Union[Any, PIL.Image.Image]) -> Union[Any, PIL.Image.Image]: + """ + Converts `PIL.Image.Image` to RGB format. Images in other formats are returned as is. + + Args: + image (`PIL.Image.Image`): + The image to convert. + """ + if not isinstance(image, PIL.Image.Image): + return image + + return image.convert("RGB") + + +class ViTHybridImageProcessor(BaseImageProcessor): + r""" + Constructs a ViT Hybrid image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by + `do_resize` in the `preprocess` method. 
+        size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 224}`):
+            Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
+            the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
+            method.
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
+            `preprocess` method.
+        crop_size (`Dict[str, int]`, *optional*, defaults to 224):
+            Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
+            method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
+            the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
+            method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
+        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = True, + crop_size: Dict[str, int] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 224} + size = get_size_dict(size, default_to_square=False) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else [0.48145466, 0.4578275, 0.40821073] + self.image_std = image_std if image_std is not None else [0.26862954, 0.26130258, 0.27577711] + self.do_convert_rgb = do_convert_rgb + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size, default_to_square=False) + if "shortest_edge" not in size: + raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}") + output_size = get_resize_output_image_size(image, size=size["shortest_edge"], default_to_square=False) + return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs) + + def center_crop( + self, + image: np.ndarray, + size: Dict[str, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Center crop an image. If the image is too small to be cropped to the size given, it will be padded (so the + returned result will always be of size `size`). + + Args: + image (`np.ndarray`): + Image to center crop. + size (`Dict[str, int]`): + Size of the output image in the form of a dictionary with keys `height` and `width`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` parameter must contain the keys (height, width). 
Got {size.keys()}") + return center_crop(image, size=(size["height"], size["width"]), data_format=data_format, **kwargs) + + def rescale( + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ): + """ + Rescale an image by a scale factor. image = image * scale. + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`int` or `float`): + Scale to apply to the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + image_mean (`float` or `List[float]`): + Image mean. + image_std (`float` or `List[float]`): + Image standard deviation. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def preprocess( + self, + images: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: int = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + **kwargs + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. 
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: defaults to the channel dimension format of the input image. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, param_name="size", default_to_square=False) + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + if not is_batched(images): + images = [images] + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + if do_resize and size is None: + raise ValueError("Size must be specified if do_resize is True.") + + if do_center_crop and crop_size is None: + raise ValueError("Crop size must be specified if do_center_crop is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. 
+ images = [to_numpy_array(image) for image in images] + + if do_resize: + images = [self.resize(image=image, size=size, resample=resample) for image in images] + + if do_center_crop: + images = [self.center_crop(image=image, size=crop_size) for image in images] + + if do_rescale: + images = [self.rescale(image=image, scale=rescale_factor) for image in images] + + if do_normalize: + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index b0fd8b6d5abe..1aa5e29a7d3a 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -428,6 +428,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class ViTHybridImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class YolosFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] From 74796c575151ae6b4f3b617669975b54ad441b87 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 2 Dec 2022 12:07:50 +0100 Subject: [PATCH 54/88] Add resources --- docs/source/en/model_doc/bit.mdx | 10 ++++++++++ docs/source/en/model_doc/vit_hybrid.mdx | 16 +++++++++++----- .../models/bit/convert_bit_to_pytorch.py | 9 ++++----- .../convert_vit_hybrid_timm_to_pytorch.py | 8 ++++---- 4 files changed, 29 insertions(+), 14 deletions(-) diff --git a/docs/source/en/model_doc/bit.mdx b/docs/source/en/model_doc/bit.mdx index d499f21632a7..7190db9c7859 100644 --- a/docs/source/en/model_doc/bit.mdx +++ b/docs/source/en/model_doc/bit.mdx @@ -31,6 +31,16 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/google-research/big_transfer). +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BiT. + + + +- [`BitForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + ## BitConfig [[autodoc]] BitConfig diff --git a/docs/source/en/model_doc/vit_hybrid.mdx b/docs/source/en/model_doc/vit_hybrid.mdx index 3404105fa3b1..8885af0dfe0f 100644 --- a/docs/source/en/model_doc/vit_hybrid.mdx +++ b/docs/source/en/model_doc/vit_hybrid.mdx @@ -33,15 +33,21 @@ data and transferred to multiple mid-sized or small image recognition benchmarks Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring substantially fewer computational resources to train.* - - - ViT architecture. Taken from the original paper. - This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be found [here](https://github.com/google-research/vision_transformer). 
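
The Resources sections added in this commit point to the generic image-classification example script and notebook. As a quick end-to-end sanity check of the new processors, here is a minimal inference sketch using the auto classes; the checkpoint identifiers are assumptions and may differ from the ones eventually pushed to the Hub.

```python
# Sketch: classify an image with the new BiT / ViT Hybrid checkpoints via the
# auto classes. Checkpoint names below are assumptions, not taken from the patch.
import requests
import torch
from PIL import Image

from transformers import AutoImageProcessor, AutoModelForImageClassification

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# AutoImageProcessor resolves "bit" -> BitImageProcessor and "vit_hybrid" ->
# ViTHybridImageProcessor through the auto mappings fixed later in this series.
for checkpoint in ["google/bit-50", "google/vit-hybrid-base-bit-384"]:
    processor = AutoImageProcessor.from_pretrained(checkpoint)
    model = AutoModelForImageClassification.from_pretrained(checkpoint)

    inputs = processor(image, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class = logits.argmax(-1).item()
    print(checkpoint, model.config.id2label[predicted_class])
```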
+## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT Hybrid. + + + +- [`ViTHybridForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + + ## ViTHybridConfig [[autodoc]] ViTHybridConfig diff --git a/src/transformers/models/bit/convert_bit_to_pytorch.py b/src/transformers/models/bit/convert_bit_to_pytorch.py index c30f0e3ba612..6b55bfa66ac8 100644 --- a/src/transformers/models/bit/convert_bit_to_pytorch.py +++ b/src/transformers/models/bit/convert_bit_to_pytorch.py @@ -146,15 +146,14 @@ def convert_bit_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=Fal if pytorch_dump_folder_path is not None: Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + print(f"Saving model {model_name} and processor to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) - # print(f"Saving feature extractor to {pytorch_dump_folder_path}") - # feature_extractor.save_pretrained(pytorch_dump_folder_path) + processor.save_pretrained(pytorch_dump_folder_path) if push_to_hub: - print(f"Pushing model and feature extractor to the hub {model_name}") + print(f"Pushing model {model_name} and processor to the hub") model.push_to_hub(f"nielsr/{model_name}") - # feature_extractor.push_to_hub(f"nielsr/{model_name}") + processor.push_to_hub(f"nielsr/{model_name}") if __name__ == "__main__": diff --git a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py index e6e54c4a333b..67fcad58e3a6 100644 --- a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py +++ b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py @@ -245,13 +245,13 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, push_to_hub=False Path(pytorch_dump_folder_path).mkdir(exist_ok=True) print(f"Saving model {vit_name} to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) - # print(f"Saving feature extractor to {pytorch_dump_folder_path}") - # feature_extractor.save_pretrained(pytorch_dump_folder_path) + print(f"Saving processor to {pytorch_dump_folder_path}") + processor.save_pretrained(pytorch_dump_folder_path) if push_to_hub: - print(f"Pushing model and feature extractor to the hub {vit_name}") + print(f"Pushing model and processor to the hub {vit_name}") model.push_to_hub(f"nielsr/{vit_name}") - # feature_extractor.push_to_hub(f"nielsr/{vit_name}") + processor.push_to_hub(f"nielsr/{vit_name}") if __name__ == "__main__": From 1ea0c00fbdea23af2868e68d7b2cde2162ee0e5f Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 2 Dec 2022 12:10:20 +0100 Subject: [PATCH 55/88] Move bit to correct position --- docs/source/en/_toctree.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 1196af976eb7..4f2909d26c29 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -209,8 
+209,6 @@ title: BigBird - local: model_doc/bigbird_pegasus title: BigBirdPegasus - - local: model_doc/bit - title: BiT - local: model_doc/blenderbot title: Blenderbot - local: model_doc/blenderbot-small @@ -386,6 +384,8 @@ sections: - local: model_doc/beit title: BEiT + - local: model_doc/bit + title: BiT - local: model_doc/conditional_detr title: Conditional DETR - local: model_doc/convnext From aad36ed1f4e6a331ef61ef2dd8a7f87a47405339 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 2 Dec 2022 12:17:04 +0100 Subject: [PATCH 56/88] Fix auto mapping --- src/transformers/models/auto/feature_extraction_auto.py | 1 - src/transformers/models/auto/image_processing_auto.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index c2b98de13999..a5c25a7023f6 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,7 +39,6 @@ [ ("audio-spectrogram-transformer", "ASTFeatureExtractor"), ("beit", "BeitFeatureExtractor"), - ("bit", "ConvNextFeatureExtractor"), ("chinese_clip", "ChineseCLIPFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), ("clipseg", "ViTFeatureExtractor"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 58393ce0b824..ea08c0fe8dc5 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -38,7 +38,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict( [ ("beit", "BeitImageProcessor"), - ("bit", "ConvNextImageProcessor"), + ("bit", "BitImageProcessor"), ("chinese_clip", "ChineseCLIPImageProcessor"), ("clip", "CLIPImageProcessor"), ("clipseg", "ViTImageProcessor"), @@ -80,6 +80,7 @@ ("videomae", "VideoMAEImageProcessor"), ("vilt", "ViltImageProcessor"), ("vit", "ViTImageProcessor"), + ("vit_hybrid", "ViTHybridImageProcessor"), ("vit_mae", "ViTImageProcessor"), ("vit_msn", "ViTImageProcessor"), ("xclip", "CLIPImageProcessor"), From 34ea003745053c5a8b652b45106de978da263b49 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 2 Dec 2022 12:59:38 +0100 Subject: [PATCH 57/88] Rename hybrid to Hybrid --- README.md | 1 + README_es.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.mdx | 3 ++- src/transformers/models/auto/configuration_auto.py | 2 +- 8 files changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 354437acc129..9dc03b4d3800 100644 --- a/README.md +++ b/README.md @@ -405,6 +405,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. 1. 
**[ViT hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from ) released with the paper []() by . 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_es.md b/README_es.md index 7166faaff3e0..f3fe0106fa51 100644 --- a/README_es.md +++ b/README_es.md @@ -405,6 +405,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. 1. **[ViT hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from ) released with the paper []() by . 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. 
**[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_ja.md b/README_ja.md index c3fae80fecf8..5de33c09a27f 100644 --- a/README_ja.md +++ b/README_ja.md @@ -440,6 +440,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. 1. **[ViT hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from ) released with the paper []() by . 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_ko.md b/README_ko.md index 99ed10526c34..04556a1903a7 100644 --- a/README_ko.md +++ b/README_ko.md @@ -355,6 +355,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. 
**[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. 1. **[ViT hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from ) released with the paper []() by . 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_zh-hans.md b/README_zh-hans.md index 1a82ddc363c5..72d522d2993a 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -379,6 +379,7 @@ conda install -c huggingface transformers 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (来自 UCLA NLP) 伴随论文 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 由 Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 发布。 1. 
**[ViT hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。 +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from ) released with the paper []() by . 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (来自 Meta AI) 伴随论文 [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) 由 Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick 发布。 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (来自 Meta AI) 伴随论文 [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas 发布. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (来自 Facebook AI) 伴随论文 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 由 Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index b58f83d6a05d..6a19ed11f6f4 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -391,6 +391,7 @@ conda install -c huggingface transformers 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. 1. **[ViT hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from ) released with the paper []() by . 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. 
**[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 5bc76bd71c10..ec7125d5829a 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -193,6 +193,7 @@ The documentation is organized into five sections: 1. **[Vision Transformer (ViT)](model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. 1. **[ViT hybrid](model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[ViT Hybrid](model_doc/vit_hybrid)** (from ) released with the paper []() by . 1. **[ViTMAE](model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMSN](model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. 1. **[Wav2Vec2](model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. @@ -356,7 +357,7 @@ Flax), PyTorch, and/or TensorFlow. 
| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | | VisualBERT | ❌ | ❌ | ✅ | ❌ | ❌ | | ViT | ❌ | ❌ | ✅ | ✅ | ✅ | -| ViT hybrid | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViT Hybrid | ❌ | ❌ | ✅ | ❌ | ❌ | | ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ | | ViTMSN | ❌ | ❌ | ✅ | ❌ | ❌ | | Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index bed97bcd5fae..ee5bfde08978 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -476,7 +476,7 @@ ("vision-text-dual-encoder", "VisionTextDualEncoder"), ("visual_bert", "VisualBERT"), ("vit", "ViT"), - ("vit_hybrid", "ViT hybrid"), + ("vit_hybrid", "ViT Hybrid"), ("vit_mae", "ViTMAE"), ("vit_msn", "ViTMSN"), ("wav2vec2", "Wav2Vec2"), From f3d8c4b6d6b3d1b398d6c1e8bdb6a02298c9a041 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 2 Dec 2022 13:00:20 +0100 Subject: [PATCH 58/88] Fix name in toctree --- docs/source/en/_toctree.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 4f2909d26c29..d5bb0ad854fc 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -443,7 +443,7 @@ - local: model_doc/vit title: Vision Transformer (ViT) - local: model_doc/vit_hybrid - title: ViT-Hybrid + title: ViT Hybrid - local: model_doc/vit_mae title: ViTMAE - local: model_doc/vit_msn From a8e90d3c36f88313efd7de85c056665c1802a00d Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 2 Dec 2022 13:01:39 +0100 Subject: [PATCH 59/88] Fix READMEs' --- README.md | 3 +-- README_es.md | 3 +-- README_ja.md | 3 +-- README_ko.md | 3 +-- README_zh-hans.md | 3 +-- README_zh-hant.md | 3 +-- docs/source/en/index.mdx | 3 +-- 7 files changed, 7 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 9dc03b4d3800..72de939d6fae 100644 --- a/README.md +++ b/README.md @@ -404,8 +404,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. **[ViT hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -1. 
**[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from ) released with the paper []() by . +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_es.md b/README_es.md index f3fe0106fa51..8541fe9defd9 100644 --- a/README_es.md +++ b/README_es.md @@ -404,8 +404,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. **[ViT hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from ) released with the paper []() by . +1. 
**[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_ja.md b/README_ja.md index 5de33c09a27f..ac8da6b22d29 100644 --- a/README_ja.md +++ b/README_ja.md @@ -439,8 +439,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. **[ViT hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from ) released with the paper []() by . +1. 
**[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_ko.md b/README_ko.md index 04556a1903a7..c3ea75584175 100644 --- a/README_ko.md +++ b/README_ko.md @@ -354,8 +354,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. **[ViT hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from ) released with the paper []() by . +1. 
**[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/README_zh-hans.md b/README_zh-hans.md index 72d522d2993a..61fe85f2b412 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -378,8 +378,7 @@ conda install -c huggingface transformers 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (来自 NAVER AI Lab/Kakao Enterprise/Kakao Brain) 伴随论文 [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) 由 Wonjae Kim, Bokyung Son, Ildoo Kim 发布。 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (来自 UCLA NLP) 伴随论文 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 由 Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 发布。 -1. **[ViT hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。 -1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from ) released with the paper []() by . +1. 
**[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (来自 Meta AI) 伴随论文 [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) 由 Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick 发布。 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (来自 Meta AI) 伴随论文 [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas 发布. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (来自 Facebook AI) 伴随论文 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 由 Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 6a19ed11f6f4..f1079bfdccda 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -390,8 +390,7 @@ conda install -c huggingface transformers 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. **[ViT hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from ) released with the paper []() by . +1. 
**[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index ec7125d5829a..49de09b24aa9 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -192,8 +192,7 @@ The documentation is organized into five sections: 1. **[ViLT](model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. **[ViT hybrid](model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -1. **[ViT Hybrid](model_doc/vit_hybrid)** (from ) released with the paper []() by . +1. **[ViT Hybrid](model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. 
**[ViTMAE](model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. 1. **[ViTMSN](model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. 1. **[Wav2Vec2](model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. From 89b4885c0d4bfbda2bcfaeda881d0749b311087d Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 2 Dec 2022 13:30:33 +0100 Subject: [PATCH 60/88] Improve config --- src/transformers/models/bit/configuration_bit.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index 65da2e638189..3857d4b46bc3 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -61,6 +61,9 @@ class BitConfig(PretrainedConfig): The output stride of the model. width_factor (`int`, *optional*, defaults to 1): The width factor for the model. + out_features (`List[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). Example: ```python @@ -88,13 +91,13 @@ def __init__( depths=[3, 4, 6, 3], layer_type="preactivation", hidden_act="relu", + global_padding=None, + num_groups=32, drop_path_rate=0.0, + embedding_dynamic_padding=False, output_stride=32, width_factor=1, - global_padding=None, - num_groups=32, out_features=None, - embedding_dynamic_padding=False, **kwargs ): super().__init__(**kwargs) @@ -111,13 +114,13 @@ def __init__( self.depths = depths self.layer_type = layer_type self.hidden_act = hidden_act + self.global_padding = global_padding self.num_groups = num_groups self.drop_path_rate = drop_path_rate + self.embedding_dynamic_padding = embedding_dynamic_padding self.output_stride = output_stride self.width_factor = width_factor - self.embedding_dynamic_padding = embedding_dynamic_padding - self.global_padding = global_padding self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] if out_features is not None: if not isinstance(out_features, list): From 209f7d21ac84001b119428e27205cf2ff22a3eb9 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 2 Dec 2022 14:11:52 +0100 Subject: [PATCH 61/88] Simplify GroupNormActivation layer --- src/transformers/models/bit/modeling_bit.py | 69 +++++++++------------ 1 file changed, 28 insertions(+), 41 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index ba30beac9ed4..a12106a48d3e 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -64,14 +64,18 @@ def get_padding_value(padding=None, kernel_size=7, stride=1, dilation=1) -> Tuple[Tuple, bool]: r""" - Utility function to get the tuple padding value given the kernel_size and padding + Utility function to get the tuple padding value given the kernel_size and padding. 
Args: - padding, Union[`str`, `int`]: - padding value, can be either `"same"`, `"valid"`. If a different value is provided the default padding from + padding (Union[`str`, `int`], *optional*): + Padding value, can be either `"same"`, `"valid"`. If a different value is provided the default padding from PyTorch is used. - kernel_size, `int`: + kernel_size (`int`, *optional*, defaults to 7): Kernel size of the convolution layers. + stride (`int`, *optional*, defaults to 1): + Stride value of the convolution layers. + dilation (`int`, *optional*, defaults to 1): + Dilation value of the convolution layers. """ dynamic = False if padding is None: @@ -100,7 +104,7 @@ def get_padding_value(padding=None, kernel_size=7, stride=1, dilation=1) -> Tupl class WeightStandardizedConv2d(nn.Conv2d): - """Conv2d with Weight Standardization. TF compatible SAME padding. Used for ViT Hybrid model. + """Conv2d with Weight Standardization. Includes TensorFlow compatible SAME padding. Used for ViT Hybrid model. Paper: [Micro-Batch Training with Batch-Channel Normalization and Weight Standardization](https://arxiv.org/abs/1903.10520v2) @@ -135,51 +139,34 @@ def __init__( self.pad = None self.eps = eps - def forward(self, x): + def forward(self, hidden_state): if self.pad is not None: - x = self.pad(x) + hidden_state = self.pad(hidden_state) weight = nn.functional.batch_norm( self.weight.reshape(1, self.out_channels, -1), None, None, training=True, momentum=0.0, eps=self.eps ).reshape_as(self.weight) - x = nn.functional.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups) - return x - - -def _num_groups(num_channels, num_groups, group_size): - if group_size: - if num_channels % group_size == 0: - raise ValueError("num_channels must divide group_size") - return num_channels // group_size - return num_groups + hidden_state = nn.functional.conv2d( + hidden_state, weight, self.bias, self.stride, self.padding, self.dilation, self.groups + ) + return hidden_state class BitGroupNormActivation(nn.GroupNorm): - # NOTE num_channel and num_groups order flipped for easier layer swaps / binding of fixed args - def __init__( - self, - config, - num_channels, - eps=1e-5, - affine=True, - group_size=None, - apply_act=True, - drop_layer=None, - ): + r""" + A module that combines group normalization with an activation function. + """ - super(BitGroupNormActivation, self).__init__( - _num_groups(num_channels, config.num_groups, group_size), num_channels, eps=eps, affine=affine - ) - self.drop = drop_layer() if drop_layer is not None else nn.Identity() - if apply_act: + def __init__(self, config, num_channels, eps=1e-5, affine=True, apply_activation=True): + super(BitGroupNormActivation, self).__init__(config.num_groups, num_channels, eps=eps, affine=affine) + if apply_activation: self.activation = ACT2FN[config.hidden_act] else: self.activation = nn.Identity() - def forward(self, x): - x = nn.functional.group_norm(x, self.num_groups, self.weight, self.bias, self.eps) - x = self.drop(x) - x = self.activation(x) - return x + def forward(self, hidden_state): + hidden_state = nn.functional.group_norm(hidden_state, self.num_groups, self.weight, self.bias, self.eps) + hidden_state = self.activation(hidden_state) + return hidden_state class DynamicPad2d(nn.Module): @@ -423,7 +410,7 @@ def forward(self, x): class BitBottleneckLayer(nn.Module): - """Non Pre-activation bottleneck block, equivalent to V1.5/V1b bottleneck. Used for ViT.""" + """Non Pre-activation bottleneck block, equivalent to V1.5/V1b bottleneck. 
Used for ViT Hybrid.""" def __init__( self, @@ -464,7 +451,7 @@ def __init__( self.conv2 = conv_layer(mid_chs, mid_chs, 3, stride=stride, dilation=first_dilation, groups=groups) self.norm2 = norm_layer(num_channels=mid_chs) self.conv3 = conv_layer(mid_chs, out_channels, 1) - self.norm3 = norm_layer(num_channels=out_channels, apply_act=False) + self.norm3 = norm_layer(num_channels=out_channels, apply_activation=False) self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() self.activation = ACT2FN[config.hidden_act] @@ -503,7 +490,7 @@ def __init__( super(BitDownsampleConv, self).__init__() self.conv_layer = conv_layer self.conv = conv_layer(in_channels, out_channels, 1, stride=stride) - self.norm = nn.Identity() if preact else norm_layer(num_channels=out_channels, apply_act=False) + self.norm = nn.Identity() if preact else norm_layer(num_channels=out_channels, apply_activation=False) def forward(self, x): return self.norm(self.conv(x)) From a7a2ffcc1dc3865e67f6964b6cabe47d40809946 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 2 Dec 2022 17:23:50 +0100 Subject: [PATCH 62/88] fix test + make style --- src/transformers/models/bit/image_processing_bit.py | 2 +- .../models/vit_hybrid/configuration_vit_hybrid.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/bit/image_processing_bit.py b/src/transformers/models/bit/image_processing_bit.py index 8bd451791ef5..8f2fa9524c39 100644 --- a/src/transformers/models/bit/image_processing_bit.py +++ b/src/transformers/models/bit/image_processing_bit.py @@ -342,4 +342,4 @@ def preprocess( images = [to_channel_dimension_format(image, data_format) for image in images] data = {"pixel_values": images} - return BatchFeature(data=data, tensor_type=return_tensors) \ No newline at end of file + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index fde518ca5ff2..5ddc367d1b87 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -110,11 +110,11 @@ def __init__( if backbone_config is None: logger.info("`backbone_config` is `None`. Initializing the config with a `BiT` backbone.") backbone_config = { - "stem_type": "same", - "conv_layer": "std_conv_same", + "global_padding": "same", "layer_type": "bottleneck", - "depths": (3, 4, 9), + "depths": [3, 4, 9], "out_features": ["stage3"], + "embedding_dynamic_padding": True, } if isinstance(backbone_config, dict): From e5e5145a6d3019cb50fddfb8034ac6518a6a410a Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 4 Dec 2022 18:01:09 +0100 Subject: [PATCH 63/88] Improve config --- .../models/vit_hybrid/configuration_vit_hybrid.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index fde518ca5ff2..3636005c62b6 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -33,15 +33,14 @@ class ViTHybridConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`ViTModel`]. It is used to instantiate an ViT - model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the - defaults will yield a similar configuration to that of the ViT + This is the configuration class to store the configuration of a [`ViTHybridModel`]. It is used to instantiate a ViT + Hybrid model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the ViT Hybrid [google/vit-base-r50-s16-384](https://huggingface.co/google/vit-base-r50-s16-384) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. - Args: hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. From f784cb78ca5c4738596ce15a1cd55917b0212871 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Mon, 5 Dec 2022 11:36:03 +0100 Subject: [PATCH 64/88] Apply suggestions from code review Co-authored-by: Patrick von Platen --- src/transformers/models/bit/modeling_bit.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index a12106a48d3e..f6b51972e895 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -242,9 +242,9 @@ def __init__( else: self.pad = nn.Identity() - def forward(self, x): - x = self.pad(x) - return nn.functional.max_pool2d(x, self.kernel_size, self.stride, self.padding, self.dilation, self.ceil_mode) + def forward(self, hidden_states): + hidden_states = self.pad(hidden_states) + return nn.functional.max_pool2d(hidden_states, self.kernel_size, self.stride, self.padding, self.dilation, self.ceil_mode) class BitEmbeddings(nn.Module): From 4ac832497ba307d3c9769710751fc476fb921ed9 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 5 Dec 2022 11:37:49 +0100 Subject: [PATCH 65/88] remove comment --- src/transformers/models/bit/modeling_bit.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index f6b51972e895..2216f6d0e7e6 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -279,7 +279,6 @@ def forward(self, pixel_values: Tensor) -> Tensor: "Make sure that the channel dimension of the pixel values match with the one set in the configuration." 
) - # Conv embedding = self.convolution(pixel_values) # Eventually pad From abcbe409cefa97274aeaa1f23799c4d4fde5db5e Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 5 Dec 2022 11:38:36 +0100 Subject: [PATCH 66/88] remove comment --- src/transformers/models/bit/modeling_bit.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 2216f6d0e7e6..3c2646de3de6 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -281,13 +281,10 @@ def forward(self, pixel_values: Tensor) -> Tensor: embedding = self.convolution(pixel_values) - # Eventually pad embedding = self.pad(embedding) - # Eventually use BitGroupNorm embedding = self.norm(embedding) - # and pool embedding = self.pooler(embedding) return embedding From 98b9f3bd4494a8177001cceddb328129d03ca718 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 5 Dec 2022 11:41:42 +0100 Subject: [PATCH 67/88] replace --- src/transformers/models/bit/modeling_bit.py | 22 ++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 3c2646de3de6..626514fc3517 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -319,8 +319,8 @@ def __init__(self, drop_prob: Optional[float] = None) -> None: super().__init__() self.drop_prob = drop_prob - def forward(self, x: torch.Tensor) -> torch.Tensor: - return drop_path(x, self.drop_prob, self.training) + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) def extra_repr(self) -> str: return "p={}".format(self.drop_prob) @@ -389,20 +389,20 @@ def __init__( self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() - def forward(self, x): - x_preact = self.norm1(x) + def forward(self, hidden_states): + hidden_states_preact = self.norm1(hidden_states) # shortcut branch - shortcut = x + shortcut = hidden_states if self.downsample is not None: - shortcut = self.downsample(x_preact) + shortcut = self.downsample(hidden_states_preact) # residual branch - x = self.conv1(x_preact) - x = self.conv2(self.norm2(x)) - x = self.conv3(self.norm3(x)) - x = self.drop_path(x) - return x + shortcut + hidden_states = self.conv1(hidden_states_preact) + hidden_states = self.conv2(self.norm2(hidden_states)) + hidden_states = self.conv3(self.norm3(hidden_states)) + hidden_states = self.drop_path(hidden_states) + return hidden_states + shortcut class BitBottleneckLayer(nn.Module): From 53c646ad086644e866d3d66f02e2a9f498a22dbd Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 5 Dec 2022 11:44:17 +0100 Subject: [PATCH 68/88] replace --- src/transformers/models/bit/modeling_bit.py | 24 ++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 626514fc3517..94faa167e2ba 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -452,25 +452,25 @@ def __init__( self.activation = ACT2FN[config.hidden_act] - def forward(self, x): + def forward(self, hidden_states): # shortcut branch - shortcut = x + shortcut = hidden_states if self.downsample is not None: - shortcut = self.downsample(x) + shortcut = self.downsample(hidden_states) # residual - x = 
self.conv1(x) - x = self.norm1(x) + hidden_states = self.conv1(hidden_states) + hidden_states = self.norm1(hidden_states) - x = self.conv2(x) - x = self.norm2(x) + hidden_states = self.conv2(hidden_states) + hidden_states = self.norm2(hidden_states) - x = self.conv3(x) - x = self.norm3(x) + hidden_states = self.conv3(hidden_states) + hidden_states = self.norm3(hidden_states) - x = self.drop_path(x) - x = self.activation(x + shortcut) - return x + hidden_states = self.drop_path(hidden_states) + hidden_states = self.activation(hidden_states + shortcut) + return hidden_states class BitDownsampleConv(nn.Module): From 992d8ebc6f24b4f58b4c7d3df2e8a0cf4145d845 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 5 Dec 2022 11:57:19 +0100 Subject: [PATCH 69/88] remove all conv_layer --- src/transformers/models/bit/modeling_bit.py | 59 +++++++++++++-------- 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 94faa167e2ba..d216f8ab9e30 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -244,7 +244,9 @@ def __init__( def forward(self, hidden_states): hidden_states = self.pad(hidden_states) - return nn.functional.max_pool2d(hidden_states, self.kernel_size, self.stride, self.padding, self.dilation, self.ceil_mode) + return nn.functional.max_pool2d( + hidden_states, self.kernel_size, self.stride, self.padding, self.dilation, self.ceil_mode + ) class BitEmbeddings(nn.Module): @@ -252,10 +254,17 @@ class BitEmbeddings(nn.Module): BiT Embeddings (stem) composed of a single aggressive convolution. """ - def __init__(self, config: BitConfig, conv_layer: nn.Module): + def __init__(self, config: BitConfig): super().__init__() - self.convolution = conv_layer(config.num_channels, config.embedding_size, kernel_size=7, stride=2) + self.convolution = WeightStandardizedConv2d( + config.num_channels, + config.embedding_size, + kernel_size=7, + stride=2, + eps=1e-8, + padding=config.global_padding, + ) self.pooler = BitMaxPool2d(kernel_size=3, stride=2, use_dynamic_padding=config.embedding_dynamic_padding) @@ -359,8 +368,6 @@ def __init__( first_dilation = first_dilation or dilation - conv_layer = partial(WeightStandardizedConv2d, eps=1e-8, padding=config.global_padding) - norm_layer = partial(BitGroupNormActivation, config=config) out_channels = out_channels or in_channels @@ -372,20 +379,22 @@ def __init__( out_channels, stride=stride, preact=True, - conv_layer=conv_layer, + padding=config.global_padding, norm_layer=norm_layer, ) else: self.downsample = None self.norm1 = norm_layer(num_channels=in_channels) - self.conv1 = conv_layer(in_channels, mid_channels, 1) + self.conv1 = WeightStandardizedConv2d(in_channels, mid_channels, 1, eps=1e-8, padding=config.global_padding) self.norm2 = norm_layer(num_channels=mid_channels) - self.conv2 = conv_layer(mid_channels, mid_channels, 3, stride=stride, dilation=first_dilation, groups=groups) + self.conv2 = WeightStandardizedConv2d( + mid_channels, mid_channels, 3, stride=stride, groups=groups, eps=1e-8, padding=config.global_padding + ) self.norm3 = norm_layer(num_channels=mid_channels) - self.conv3 = conv_layer(mid_channels, out_channels, 1) + self.conv3 = WeightStandardizedConv2d(mid_channels, out_channels, 1, eps=1e-8, padding=config.global_padding) self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() @@ -424,8 +433,6 @@ def __init__( super().__init__() first_dilation = 
first_dilation or dilation - conv_layer = partial(WeightStandardizedConv2d, eps=1e-8, padding=config.global_padding) - norm_layer = partial(BitGroupNormActivation, config=config) out_channels = out_channels or in_channels mid_chs = make_div(out_channels * bottle_ratio) @@ -436,17 +443,26 @@ def __init__( out_channels, stride=stride, preact=False, - conv_layer=conv_layer, + padding=config.global_padding, norm_layer=norm_layer, ) else: self.downsample = None - self.conv1 = conv_layer(in_channels, mid_chs, 1) + self.conv1 = WeightStandardizedConv2d(in_channels, mid_chs, 1, eps=1e-8, padding=config.global_padding) self.norm1 = norm_layer(num_channels=mid_chs) - self.conv2 = conv_layer(mid_chs, mid_chs, 3, stride=stride, dilation=first_dilation, groups=groups) + self.conv2 = WeightStandardizedConv2d( + mid_chs, + mid_chs, + 3, + stride=stride, + dilation=first_dilation, + groups=groups, + eps=1e-8, + padding=config.global_padding, + ) self.norm2 = norm_layer(num_channels=mid_chs) - self.conv3 = conv_layer(mid_chs, out_channels, 1) + self.conv3 = WeightStandardizedConv2d(mid_chs, out_channels, 1, eps=1e-8, padding=config.global_padding) self.norm3 = norm_layer(num_channels=out_channels, apply_activation=False) self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() @@ -480,12 +496,11 @@ def __init__( out_channels, stride=1, preact=True, - conv_layer=None, + padding=None, norm_layer=None, ): super(BitDownsampleConv, self).__init__() - self.conv_layer = conv_layer - self.conv = conv_layer(in_channels, out_channels, 1, stride=stride) + self.conv = WeightStandardizedConv2d(in_channels, out_channels, 1, stride=stride, eps=1e-8, padding=padding) self.norm = nn.Identity() if preact else norm_layer(num_channels=out_channels, apply_activation=False) def forward(self, x): @@ -567,7 +582,7 @@ def forward(self, input: Tensor) -> Tensor: class BitEncoder(nn.Module): - def __init__(self, config: BitConfig, conv_layer: nn.Module): + def __init__(self, config: BitConfig): super().__init__() self.stages = nn.ModuleList([]) @@ -693,11 +708,9 @@ def __init__(self, config): super().__init__(config) self.config = config - conv_layer = partial(WeightStandardizedConv2d, eps=1e-8, padding=config.global_padding) - - self.embedder = BitEmbeddings(config, conv_layer) + self.embedder = BitEmbeddings(config) - self.encoder = BitEncoder(config, conv_layer) + self.encoder = BitEncoder(config) norm_layer = BitGroupNormActivation self.norm = ( norm_layer(config, num_channels=config.hidden_sizes[-1]) From a427497f2db3356d7ba0ca1100a3f8e1748ba6da Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 5 Dec 2022 12:08:09 +0100 Subject: [PATCH 70/88] refactor norm_layer --- src/transformers/models/bit/modeling_bit.py | 39 ++++++++++----------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index d216f8ab9e30..be370a7d1386 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -275,7 +275,7 @@ def __init__(self, config: BitConfig): self.pad = nn.ConstantPad2d(padding=(1, 1, 1, 1), value=0.0) if not config.layer_type == "preactivation": - self.norm = partial(BitGroupNormActivation, config=config)(num_channels=config.embedding_size) + self.norm = BitGroupNormActivation(config, num_channels=config.embedding_size) else: self.norm = nn.Identity() @@ -368,32 +368,29 @@ def __init__( first_dilation = first_dilation or dilation - norm_layer = 
partial(BitGroupNormActivation, config=config) - out_channels = out_channels or in_channels mid_channels = make_div(out_channels * bottle_ratio) if is_first_layer: self.downsample = BitDownsampleConv( + config, in_channels, out_channels, stride=stride, preact=True, - padding=config.global_padding, - norm_layer=norm_layer, ) else: self.downsample = None - self.norm1 = norm_layer(num_channels=in_channels) + self.norm1 = BitGroupNormActivation(config, in_channels) self.conv1 = WeightStandardizedConv2d(in_channels, mid_channels, 1, eps=1e-8, padding=config.global_padding) - self.norm2 = norm_layer(num_channels=mid_channels) + self.norm2 = BitGroupNormActivation(config, num_channels=mid_channels) self.conv2 = WeightStandardizedConv2d( mid_channels, mid_channels, 3, stride=stride, groups=groups, eps=1e-8, padding=config.global_padding ) - self.norm3 = norm_layer(num_channels=mid_channels) + self.norm3 = BitGroupNormActivation(config, mid_channels) self.conv3 = WeightStandardizedConv2d(mid_channels, out_channels, 1, eps=1e-8, padding=config.global_padding) self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() @@ -433,24 +430,22 @@ def __init__( super().__init__() first_dilation = first_dilation or dilation - norm_layer = partial(BitGroupNormActivation, config=config) out_channels = out_channels or in_channels mid_chs = make_div(out_channels * bottle_ratio) if is_first_layer: self.downsample = BitDownsampleConv( + config, in_channels, out_channels, stride=stride, preact=False, - padding=config.global_padding, - norm_layer=norm_layer, ) else: self.downsample = None self.conv1 = WeightStandardizedConv2d(in_channels, mid_chs, 1, eps=1e-8, padding=config.global_padding) - self.norm1 = norm_layer(num_channels=mid_chs) + self.norm1 = BitGroupNormActivation(config, num_channels=mid_chs) self.conv2 = WeightStandardizedConv2d( mid_chs, mid_chs, @@ -461,9 +456,9 @@ def __init__( eps=1e-8, padding=config.global_padding, ) - self.norm2 = norm_layer(num_channels=mid_chs) + self.norm2 = BitGroupNormActivation(config, num_channels=mid_chs) self.conv3 = WeightStandardizedConv2d(mid_chs, out_channels, 1, eps=1e-8, padding=config.global_padding) - self.norm3 = norm_layer(num_channels=out_channels, apply_activation=False) + self.norm3 = BitGroupNormActivation(config, num_channels=out_channels, apply_activation=False) self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() self.activation = ACT2FN[config.hidden_act] @@ -492,16 +487,21 @@ def forward(self, hidden_states): class BitDownsampleConv(nn.Module): def __init__( self, + config, in_channels, out_channels, stride=1, preact=True, - padding=None, - norm_layer=None, ): super(BitDownsampleConv, self).__init__() - self.conv = WeightStandardizedConv2d(in_channels, out_channels, 1, stride=stride, eps=1e-8, padding=padding) - self.norm = nn.Identity() if preact else norm_layer(num_channels=out_channels, apply_activation=False) + self.conv = WeightStandardizedConv2d( + in_channels, out_channels, 1, stride=stride, eps=1e-8, padding=config.global_padding + ) + self.norm = ( + nn.Identity() + if preact + else BitGroupNormActivation(config, num_channels=out_channels, apply_activation=False) + ) def forward(self, x): return self.norm(self.conv(x)) @@ -711,9 +711,8 @@ def __init__(self, config): self.embedder = BitEmbeddings(config) self.encoder = BitEncoder(config) - norm_layer = BitGroupNormActivation self.norm = ( - norm_layer(config, num_channels=config.hidden_sizes[-1]) + BitGroupNormActivation(config, 
num_channels=config.hidden_sizes[-1]) if config.layer_type == "preactivation" else nn.Identity() ) From f1e68c4a799769100ee1a8b5604fd7e01f5b58fd Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 5 Dec 2022 12:12:16 +0100 Subject: [PATCH 71/88] revert x --- src/transformers/models/bit/modeling_bit.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index be370a7d1386..7ea03bf9aabc 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -320,7 +320,7 @@ def drop_path(input, drop_prob: float = 0.0, training: bool = False): return output -# Copied from transformers.models.convnext.modeling_convnext.ConvNextDropPath +# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Bit class BitDropPath(nn.Module): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" @@ -328,8 +328,8 @@ def __init__(self, drop_prob: Optional[float] = None) -> None: super().__init__() self.drop_prob = drop_prob - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - return drop_path(hidden_states, self.drop_prob, self.training) + def forward(self, x: torch.Tensor) -> torch.Tensor: + return drop_path(x, self.drop_prob, self.training) def extra_repr(self) -> str: return "p={}".format(self.drop_prob) From 7efad37d42448a0101a48e723b3df8bf7dfe7180 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 5 Dec 2022 12:15:46 +0100 Subject: [PATCH 72/88] add copied from --- .../models/vit_hybrid/image_processing_vit_hybrid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py b/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py index bd797ffcb68f..53bda183985e 100644 --- a/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py @@ -40,7 +40,7 @@ if is_vision_available(): import PIL - +# Copied from transformers.models.bit.image_processing_bit.convert_to_rgb def convert_to_rgb(image: Union[Any, PIL.Image.Image]) -> Union[Any, PIL.Image.Image]: """ Converts `PIL.Image.Image` to RGB format. Images in other formats are returned as is. 
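To tie the BiT modeling refactors above to the integration-test and checkpoint-name patches that follow, here is a minimal usage sketch of the classes this series adds. It is an illustration only: the top-level exports (`BitConfig`, `BitModel`, `BitBackbone`), the default hyperparameters, and the expected output shape are taken from the diffs in this series and are assumed to still hold once the branch is merged.

```python
# Minimal sketch exercising the BiT classes added in this patch series.
# Assumes a transformers build that already includes these patches; names
# and defaults are taken from the diffs above and may change before merge.
import torch
from transformers import BitBackbone, BitConfig, BitModel

config = BitConfig(
    layer_type="preactivation",  # "bottleneck" is the variant used as the ViT Hybrid backbone
    depths=[3, 4, 6, 3],         # default depths from configuration_bit.py
    out_features=["stage4"],     # only consumed when the model is used as a backbone
)

pixel_values = torch.zeros(1, config.num_channels, 224, 224)

model = BitModel(config)
outputs = model(pixel_values)
# Final feature map is downsampled by the output stride of 32:
# (1, hidden_sizes[-1], 224 // 32, 224 // 32) == (1, 2048, 7, 7)
print(outputs.last_hidden_state.shape)

backbone = BitBackbone(config)
feature_maps = backbone(pixel_values).feature_maps
print([fm.shape for fm in feature_maps])  # one feature map per requested stage
```

The same `BitConfig`, with `layer_type="bottleneck"`, `global_padding="same"` and `embedding_dynamic_padding=True`, is what `ViTHybridConfig` builds by default for its backbone in the configuration patches above.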
From ebc4f1dd4f7ff28fc6315c1520131c9de8112b2b Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 5 Dec 2022 12:20:44 +0100 Subject: [PATCH 73/88] last changes + integration tests --- src/transformers/models/bit/modeling_bit.py | 1 - tests/models/vit_hybrid/test_modeling_vit_hybrid.py | 12 +++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 7ea03bf9aabc..0ec14db4d1d3 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -16,7 +16,6 @@ import collections import math -from functools import partial from typing import Optional, Tuple import torch diff --git a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py index 81c373a2c704..1606d9237f52 100644 --- a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py +++ b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py @@ -204,11 +204,17 @@ def prepare_img(): class ViTModelIntegrationTest(unittest.TestCase): @cached_property def default_feature_extractor(self): - return ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224") if is_vision_available() else None + return ( + ViTFeatureExtractor.from_pretrained(VIT_HYBRID_PRETRAINED_MODEL_ARCHIVE_LIST[0]) + if is_vision_available() + else None + ) @slow def test_inference_image_classification_head(self): - model = ViTHybridForImageClassification.from_pretrained("google/vit-base-patch16-224").to(torch_device) + model = ViTHybridForImageClassification.from_pretrained(VIT_HYBRID_PRETRAINED_MODEL_ARCHIVE_LIST[0]).to( + torch_device + ) feature_extractor = self.default_feature_extractor image = prepare_img() @@ -222,6 +228,6 @@ def test_inference_image_classification_head(self): expected_shape = torch.Size((1, 1000)) self.assertEqual(outputs.logits.shape, expected_shape) - expected_slice = torch.tensor([-0.2744, 0.8215, -0.0836]).to(torch_device) + expected_slice = torch.tensor([-1.9090, -0.4993, -0.2389]).to(torch_device) self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) From 503755b44a2ae401bf1141737b54b7bbc6f66eff Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 5 Dec 2022 12:32:39 +0100 Subject: [PATCH 74/88] make fixup --- .../models/vit_hybrid/image_processing_vit_hybrid.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py b/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py index 53bda183985e..296346a544f8 100644 --- a/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py @@ -40,6 +40,7 @@ if is_vision_available(): import PIL + # Copied from transformers.models.bit.image_processing_bit.convert_to_rgb def convert_to_rgb(image: Union[Any, PIL.Image.Image]) -> Union[Any, PIL.Image.Image]: """ From 0211676b11756c5d37f03d8fbcec2f0a76eebc99 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Mon, 5 Dec 2022 19:02:46 +0100 Subject: [PATCH 75/88] Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/bit/modeling_bit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 0ec14db4d1d3..8e0c1da2910a 100644 --- 
a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -235,7 +235,7 @@ def __init__( kernel_size = kernel_size if isinstance(kernel_size, collections.abc.Iterable) else (kernel_size, kernel_size) stride = stride if isinstance(stride, collections.abc.Iterable) else (stride, stride) dilation = dilation if isinstance(dilation, collections.abc.Iterable) else (dilation, dilation) - super(BitMaxPool2d, self).__init__(kernel_size, stride, padding, dilation, ceil_mode) + super().__init__(kernel_size, stride, padding, dilation, ceil_mode) if use_dynamic_padding: self.pad = DynamicPad2d(kernel_size, stride, dilation, padding_value) else: @@ -492,7 +492,7 @@ def __init__( stride=1, preact=True, ): - super(BitDownsampleConv, self).__init__() + super().__init__() self.conv = WeightStandardizedConv2d( in_channels, out_channels, 1, stride=stride, eps=1e-8, padding=config.global_padding ) From 36751e131c26442e1b4bbfdbc436c415611dee92 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 5 Dec 2022 19:05:15 +0100 Subject: [PATCH 76/88] fix name --- src/transformers/models/bit/modeling_bit.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 8e0c1da2910a..6467170710c6 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -528,9 +528,9 @@ def __init__( # Get the layer type if config.layer_type == "bottleneck": - layer_fn = BitBottleneckLayer + layer_cls = BitBottleneckLayer else: - layer_fn = BitPreActivationBottleneckLayer + layer_cls = BitPreActivationBottleneckLayer prev_chs = in_channels self.layers = nn.Sequential() @@ -542,7 +542,7 @@ def __init__( self.layers.add_module( str(layer_idx), - layer_fn( + layer_cls( config, prev_chs, out_channels, From 391827e4e7fb7345a57853775a812a93ad294a1d Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 5 Dec 2022 19:07:12 +0100 Subject: [PATCH 77/88] fix message --- src/transformers/models/vit_hybrid/configuration_vit_hybrid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index 0ca601c91c46..b828fff495ba 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -121,7 +121,7 @@ def __init__( backbone_config_class = CONFIG_MAPPING[backbone_config["model_type"]] else: logger.info( - "`model_type` is not found in `backbone_config`. Use `ResNet` as the backbone configuration class." + "`model_type` is not found in `backbone_config`. Use `Bit` as the backbone configuration class." 
) backbone_config_class = BitConfig backbone_config = backbone_config_class(**backbone_config) From 246a3c75e01f30a77a5f2ad670f61476ba965591 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 5 Dec 2022 19:11:20 +0100 Subject: [PATCH 78/88] remove assert and refactor --- .../models/vit_hybrid/modeling_vit_hybrid.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py index ea72b117b583..4393e1451b93 100644 --- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -97,20 +97,21 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: class_pos_embed = self.position_embeddings[:, 0] patch_pos_embed = self.position_embeddings[:, 1:] dim = embeddings.shape[-1] - h0 = height // self.config.patch_size - w0 = width // self.config.patch_size + height = height // self.config.patch_size + width = width // self.config.patch_size # we add a small number to avoid floating point error in the interpolation # see discussion at https://github.com/facebookresearch/dino/issues/8 - h0, w0 = h0 + 0.1, w0 + 0.1 + height, width = height + 0.1, width + 0.1 patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) patch_pos_embed = nn.functional.interpolate( patch_pos_embed, - scale_factor=(h0 / math.sqrt(num_positions), w0 / math.sqrt(num_positions)), + scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), mode="bicubic", align_corners=False, ) - assert int(h0) == patch_pos_embed.shape[-2] and int(w0) == patch_pos_embed.shape[-1] + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError(f"Invalid height or width: {height}, {width}") patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) From 2f2f623d82e86cefe0ce0ce635d38307d6fdb684 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 5 Dec 2022 19:16:21 +0100 Subject: [PATCH 79/88] refactor + make fixup --- .../models/vit_hybrid/modeling_vit_hybrid.py | 1 - tests/models/bit/test_modeling_bit.py | 11 +---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py index 4393e1451b93..743dbc48837f 100644 --- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -54,7 +54,6 @@ ] -# Copied from transformers.models.vit.modeling_vit.ViTEmbeddings with ViT->ViTHybrid class ViTHybridEmbeddings(nn.Module): """ Construct the CLS token, position and patch embeddings. Optionally, also the mask token. 
diff --git a/tests/models/bit/test_modeling_bit.py b/tests/models/bit/test_modeling_bit.py index 4dab66c08143..7e59d79d81ff 100644 --- a/tests/models/bit/test_modeling_bit.py +++ b/tests/models/bit/test_modeling_bit.py @@ -102,7 +102,6 @@ def create_and_check_model(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - # expected last hidden states: B, C, H // 32, W // 32 self.parent.assertEqual( result.last_hidden_state.shape, (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32), @@ -144,15 +143,7 @@ class BitModelTest(ModelTesterMixin, unittest.TestCase): attention_mask and seq_length. """ - all_model_classes = ( - ( - BitModel, - BitForImageClassification, - BitBackbone, - ) - if is_torch_available() - else () - ) + all_model_classes = (BitModel, BitForImageClassification, BitBackbone) if is_torch_available() else () fx_compatible = False test_pruning = False From e49308ba4fb7926dd06982434dbfa71e84bb4a52 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 5 Dec 2022 19:44:00 +0100 Subject: [PATCH 80/88] refactor - add + sfety checker --- src/transformers/models/bit/modeling_bit.py | 10 +++++++++- .../models/vit_hybrid/modeling_vit_hybrid.py | 17 ++++++----------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 6467170710c6..75e0b7f2ad27 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -649,7 +649,6 @@ def forward( ) -# Copied from transformers.models.resnet.modeling_resnet.ResNetPreTrainedModel with ResNet->Bit,resnet->bit class BitPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -672,6 +671,15 @@ def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, BitModel): module.gradient_checkpointing = value + @torch.no_grad() + def _get_feature_map_size(self, dummy_image): + training = self.training + if training: + self.eval() + feature_map = self(dummy_image).feature_maps[-1] + self.train(training) + return feature_map + BIT_START_DOCSTRING = r""" This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
Use it diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py index 743dbc48837f..651b151d2f1e 100644 --- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -161,20 +161,15 @@ def __init__(self, config, feature_size=None): patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) self.backbone = AutoBackbone.from_config(config.backbone_config) + if self.backbone.config.model_type != "bit": + raise ValueError(f"Backbone model type {self.backbone.model_type} is not supported.") feature_dim = self.backbone.channels[-1] if feature_size is None: - with torch.no_grad(): - # NOTE Most reliable way of determining spatial output dimensions is to run forward pass - training = self.backbone.training - if training: - self.backbone.eval() - feature_map = self.backbone(torch.zeros(1, num_channels, image_size[0], image_size[1])).feature_maps[ - -1 - ] - feature_size = feature_map.shape[-2:] - feature_dim = feature_map.shape[1] - self.backbone.train(training) + dummy_image = torch.zeros(1, num_channels, image_size[0], image_size[1]) + feature_map = self.backbone._get_feature_map_size(dummy_image) + feature_size = feature_map.shape[-2:] + feature_dim = feature_map.shape[1] else: feature_size = ( feature_size if isinstance(feature_size, collections.abc.Iterable) else (feature_size, feature_size) From 93669f7fb53bbdc56a73eb8b4c9cae553ad8cf94 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 5 Dec 2022 19:50:51 +0100 Subject: [PATCH 81/88] fix docstring + checkpoint names --- src/transformers/models/bit/configuration_bit.py | 2 +- src/transformers/models/bit/modeling_bit.py | 4 ++-- .../models/vit_hybrid/configuration_vit_hybrid.py | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index 3857d4b46bc3..d47486207a01 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -21,7 +21,7 @@ logger = logging.get_logger(__name__) BIT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "google/resnetv2-50": "https://huggingface.co/google/resnetv2-50/resolve/main/config.json", + "google/bit-50": "https://huggingface.co/google/resnetv2-50/resolve/main/config.json", } diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 75e0b7f2ad27..b5b361e7162c 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -52,11 +52,11 @@ _EXPECTED_OUTPUT_SHAPE = [1, 2048, 7, 7] # Image classification docstring -_IMAGE_CLASS_CHECKPOINT = "google/resnetnv2-50" +_IMAGE_CLASS_CHECKPOINT = "google/bit-50" _IMAGE_CLASS_EXPECTED_OUTPUT = "tiger cat" BIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "google/resnetnv2-50", + "google/bit-50", # See all BiT models at https://huggingface.co/models?filter=bit ] diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index b828fff495ba..24c2f72410c3 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -26,7 +26,7 @@ logger = logging.get_logger(__name__) VIT_HYBRID_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "google/vit-base-r50-s16-384": 
"https://huggingface.co/vit-base-r50-s16-384/resolve/main/config.json", + "google/vit-hybrid-base-bit-384": "https://huggingface.co/vit-hybrid-base-bit-384/resolve/main/config.json", # See all ViT hybrid models at https://huggingface.co/models?filter=vit } @@ -61,11 +61,11 @@ class ViTHybridConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - image_size (`int`, *optional*, defaults to `224`): + image_size (`int`, *optional*, defaults to 224): The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to `1`): + patch_size (`int`, *optional*, defaults to 1): The size (resolution) of each patch. - num_channels (`int`, *optional*, defaults to `3`): + num_channels (`int`, *optional*, defaults to 3): The number of input channels. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. From d43498beafc58c981e8e0b3251a6154f34f12054 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 5 Dec 2022 20:02:46 +0100 Subject: [PATCH 82/88] fix merge issues --- README.md | 2 +- README_es.md | 2 +- README_ja.md | 2 +- README_ko.md | 2 +- README_zh-hans.md | 2 +- README_zh-hant.md | 2 +- docs/source/en/index.mdx | 4 +-- src/transformers/__init__.py | 29 ++++++++++--------- src/transformers/models/__init__.py | 2 +- .../models/auto/configuration_auto.py | 6 ++-- src/transformers/models/auto/modeling_auto.py | 2 +- src/transformers/models/bit/modeling_bit.py | 4 +-- src/transformers/utils/dummy_pt_objects.py | 2 +- 13 files changed, 32 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 25d79785d00e..c35c4d2c51c4 100644 --- a/README.md +++ b/README.md @@ -272,8 +272,8 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. 
**[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). diff --git a/README_es.md b/README_es.md index 4593656e7657..4de43f747e03 100644 --- a/README_es.md +++ b/README_es.md @@ -272,8 +272,8 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. 
**[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). diff --git a/README_ja.md b/README_ja.md index c187434e178f..56718ad1ad69 100644 --- a/README_ja.md +++ b/README_ja.md @@ -307,8 +307,8 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. 
**[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). diff --git a/README_ko.md b/README_ko.md index ae36dac86b7a..7ac0e08a79f5 100644 --- a/README_ko.md +++ b/README_ko.md @@ -222,8 +222,8 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. 
**[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). diff --git a/README_zh-hans.md b/README_zh-hans.md index b9e3555b9b45..bf302a2cc295 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -246,8 +246,8 @@ conda install -c huggingface transformers 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (来自 VinAI Research) 伴随论文 [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) 由 Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen 发布。 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。 -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (来自 Google AI) 伴随论文 [Big Transfer (BiT) 由 Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby 发布。 1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (来自 Microsoft Research AI4Science) 伴随论文 [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) 由 Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu 发布。 +1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (来自 Google AI) 伴随论文 [Big Transfer (BiT) 由 Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby 发布。 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。 1. 
**[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). diff --git a/README_zh-hant.md b/README_zh-hant.md index 6d7ca2eba446..dfd8e0a06365 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -258,8 +258,8 @@ conda install -c huggingface transformers 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. 
**[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 5cffacb27767..544a74ac976a 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -60,8 +60,8 @@ The documentation is organized into five sections: 1. **[BERTweet](model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BiT](model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[BioGpt](model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BLOOM](model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). @@ -232,8 +232,8 @@ Flax), PyTorch, and/or TensorFlow. 
| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ | | BigBird | ✅ | ✅ | ✅ | ❌ | ✅ | | BigBird-Pegasus | ❌ | ❌ | ✅ | ❌ | ❌ | -| BiT | ❌ | ❌ | ✅ | ❌ | ❌ | | BioGpt | ✅ | ❌ | ✅ | ❌ | ❌ | +| BiT | ❌ | ❌ | ✅ | ❌ | ❌ | | Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ | | BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ | | BLOOM | ❌ | ✅ | ✅ | ❌ | ❌ | diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 8839d1e98166..a0bb4f9ddbe0 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -160,8 +160,8 @@ "BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP", "BigBirdPegasusConfig", ], - "models.bit": ["BIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BitConfig"], "models.biogpt": ["BIOGPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BioGptConfig", "BioGptTokenizer"], + "models.bit": ["BIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BitConfig"], "models.blenderbot": ["BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BlenderbotConfig", "BlenderbotTokenizer"], "models.blenderbot_small": [ "BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -1051,6 +1051,14 @@ "BigBirdPegasusPreTrainedModel", ] ) + _import_structure["models.biogpt"].extend( + [ + "BIOGPT_PRETRAINED_MODEL_ARCHIVE_LIST", + "BioGptForCausalLM", + "BioGptModel", + "BioGptPreTrainedModel", + ] + ) _import_structure["models.bit"].extend( [ "BIT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -1058,12 +1066,6 @@ "BitForImageClassification", "BitModel", "BitPreTrainedModel", - _import_structure["models.biogpt"].extend( - [ - "BIOGPT_PRETRAINED_MODEL_ARCHIVE_LIST", - "BioGptForCausalLM", - "BioGptModel", - "BioGptPreTrainedModel", ] ) _import_structure["models.blenderbot"].extend( @@ -3420,8 +3422,8 @@ from .models.bertweet import BertweetTokenizer from .models.big_bird import BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP, BigBirdConfig from .models.bigbird_pegasus import BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, BigBirdPegasusConfig - from .models.bit import BIT_PRETRAINED_CONFIG_ARCHIVE_MAP, BitConfig from .models.biogpt import BIOGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, BioGptConfig, BioGptTokenizer + from .models.bit import BIT_PRETRAINED_CONFIG_ARCHIVE_MAP, BitConfig from .models.blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig, BlenderbotTokenizer from .models.blenderbot_small import ( BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -4199,17 +4201,18 @@ BigBirdPegasusModel, BigBirdPegasusPreTrainedModel, ) + from .models.biogpt import ( + BIOGPT_PRETRAINED_MODEL_ARCHIVE_LIST, + BioGptForCausalLM, + BioGptModel, + BioGptPreTrainedModel, + ) from .models.bit import ( BIT_PRETRAINED_MODEL_ARCHIVE_LIST, BitBackbone, BitForImageClassification, BitModel, BitPreTrainedModel, - from .models.biogpt import ( - BIOGPT_PRETRAINED_MODEL_ARCHIVE_LIST, - BioGptForCausalLM, - BioGptModel, - BioGptPreTrainedModel, ) from .models.blenderbot import ( BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index eeab71508420..d81ca5ac828c 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -30,8 +30,8 @@ bertweet, big_bird, bigbird_pegasus, - bit, biogpt, + bit, blenderbot, blenderbot_small, bloom, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index a52b62b123ed..bcfc7bdde481 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -37,8 +37,8 @@ ("bert-generation", "BertGenerationConfig"), ("big_bird", "BigBirdConfig"), 
("bigbird_pegasus", "BigBirdPegasusConfig"), - ("bit", "BitConfig"), ("biogpt", "BioGptConfig"), + ("bit", "BitConfig"), ("blenderbot", "BlenderbotConfig"), ("blenderbot-small", "BlenderbotSmallConfig"), ("bloom", "BloomConfig"), @@ -192,8 +192,8 @@ ("bert", "BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("big_bird", "BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("bigbird_pegasus", "BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("bit", "BIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("biogpt", "BIOGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("bit", "BIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("blenderbot", "BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("blenderbot-small", "BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("bloom", "BLOOM_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -337,8 +337,8 @@ ("bertweet", "BERTweet"), ("big_bird", "BigBird"), ("bigbird_pegasus", "BigBird-Pegasus"), - ("bit", "BiT"), ("biogpt", "BioGpt"), + ("bit", "BiT"), ("blenderbot", "Blenderbot"), ("blenderbot-small", "BlenderbotSmall"), ("bloom", "BLOOM"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index e761fc706e7a..63453438c893 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -36,8 +36,8 @@ ("bert-generation", "BertGenerationEncoder"), ("big_bird", "BigBirdModel"), ("bigbird_pegasus", "BigBirdPegasusModel"), - ("bit", "BitModel"), ("biogpt", "BioGptModel"), + ("bit", "BitModel"), ("blenderbot", "BlenderbotModel"), ("blenderbot-small", "BlenderbotSmallModel"), ("bloom", "BloomModel"), diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index b5b361e7162c..a9acbc713e75 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -327,8 +327,8 @@ def __init__(self, drop_prob: Optional[float] = None) -> None: super().__init__() self.drop_prob = drop_prob - def forward(self, x: torch.Tensor) -> torch.Tensor: - return drop_path(x, self.drop_prob, self.training) + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) def extra_repr(self) -> str: return "p={}".format(self.drop_prob) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 3673b46994b0..af230ce82f39 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1004,7 +1004,7 @@ class BioGptForCausalLM(metaclass=DummyObject): def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) - + class BioGptModel(metaclass=DummyObject): _backends = ["torch"] From 2ddc9990ceee817b1add68095131230852104404 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 5 Dec 2022 20:05:20 +0100 Subject: [PATCH 83/88] fix function name --- src/transformers/models/bit/modeling_bit.py | 2 +- src/transformers/models/vit_hybrid/modeling_vit_hybrid.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index a9acbc713e75..0bde63eec7ce 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -672,7 +672,7 @@ def _set_gradient_checkpointing(self, module, value=False): module.gradient_checkpointing = value @torch.no_grad() - def _get_feature_map_size(self, dummy_image): + def _get_feature_map(self, dummy_image): training = self.training if training: 
self.eval() diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py index 651b151d2f1e..04ebe6fc1352 100644 --- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -167,7 +167,7 @@ def __init__(self, config, feature_size=None): if feature_size is None: dummy_image = torch.zeros(1, num_channels, image_size[0], image_size[1]) - feature_map = self.backbone._get_feature_map_size(dummy_image) + feature_map = self.backbone._get_feature_map(dummy_image) feature_size = feature_map.shape[-2:] feature_dim = feature_map.shape[1] else: From 0db6058ec5363e6cd51aba5df2abcf68d95aad2e Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 6 Dec 2022 10:09:59 +0100 Subject: [PATCH 84/88] add dpt_hybrid support --- src/transformers/modeling_outputs.py | 18 ++ .../models/dpt/configuration_dpt.py | 23 ++ src/transformers/models/dpt/modeling_dpt.py | 213 +++++++++++++++--- 3 files changed, 225 insertions(+), 29 deletions(-) diff --git a/src/transformers/modeling_outputs.py b/src/transformers/modeling_outputs.py index 57a01fa7c69c..0847fdfe21d4 100644 --- a/src/transformers/modeling_outputs.py +++ b/src/transformers/modeling_outputs.py @@ -65,6 +65,21 @@ class BaseModelOutputWithNoAttention(ModelOutput): hidden_states: Optional[Tuple[torch.FloatTensor]] = None +@dataclass +class BaseModelOutputWithIntermediateActivations(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + intermediate_activations (`tuple(torch.FloatTensor)`, *optional*): + Intermediate activations that can be used to compute hidden states of the model at various layers. + """ + last_hidden_states: torch.FloatTensor = None + intermediate_activations: Optional[Tuple[torch.FloatTensor]] = None + + @dataclass class BaseModelOutputWithPooling(ModelOutput): """ @@ -89,12 +104,15 @@ class BaseModelOutputWithPooling(ModelOutput): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + intermediate_activations (`tuple(torch.FloatTensor)`, *optional*): + Intermediate activations that can be used to compute hidden states of the model at various layers. """ last_hidden_state: torch.FloatTensor = None pooler_output: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None + intermediate_activations: Optional[Tuple[torch.FloatTensor]] = None @dataclass diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py index a255b0596b4d..c280ca3fa0f5 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -16,6 +16,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ..bit import BitConfig logger = logging.get_logger(__name__) @@ -76,6 +77,8 @@ class DPTConfig(PretrainedConfig): - "project" passes information to the other tokens by concatenating the readout to all other tokens before projecting the representation to the original feature dimension D using a linear layer followed by a GELU non-linearity. 
+ embedding_type (`str`, *optional*, defaults to `"patch_embedding"`): + The type of embedding to use. Can be one of [`"patch_embedding"`, `"vit_hybrid"`]. reassemble_factors (`List[int]`, *optional*, defaults to `[4, 2, 1, 0.5]`): The up/downsampling factors of the reassemble layers. neck_hidden_sizes (`List[str]`, *optional*, defaults to [96, 192, 384, 768]): @@ -125,6 +128,7 @@ def __init__( image_size=384, patch_size=16, num_channels=3, + embedding_type="patch_embedding", qkv_bias=True, backbone_out_indices=[2, 5, 8, 11], readout_type="project", @@ -142,6 +146,25 @@ def __init__( super().__init__(**kwargs) self.hidden_size = hidden_size + + self.embedding_type = embedding_type + if embedding_type not in ["patch_embedding", "vit_hybrid"]: + raise ValueError("Embedding type must be one of ['patch_embedding', 'vit_hybrid']") + if embedding_type == "vit_hybrid": + logger.info("Initializing the config with a `BiT` backbone.") + backbone_config = { + "global_padding": "same", + "layer_type": "bottleneck", + "depths": [3, 4, 9], + "out_features": ["stage1", "stage2", "stage3"], + "embedding_dynamic_padding": True, + } + self.backbone_config = BitConfig(**backbone_config) + self.is_hybrid = True + else: + self.backbone_config = None + self.is_hybrid = False + self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.intermediate_size = intermediate_size diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index 1f0cb869f482..f587848abcaa 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -41,7 +41,9 @@ BaseModelOutputWithPooling, DepthEstimatorOutput, SemanticSegmenterOutput, + BaseModelOutputWithIntermediateActivations, ) +from ..auto import AutoBackbone from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import logging @@ -65,6 +67,109 @@ ] +# Copied from transformers.models.vit_hybrid.modeling_vit_hybrid.ViTHybridPatchEmbeddings with ViTHybrid->DPTViTHybrid +class DPTViTHybridEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. 
+ """ + + def __init__(self, config, feature_size=None): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + + self.backbone = AutoBackbone.from_config(config.backbone_config) + feature_dim = self.backbone.channels[-1] + if len(config.backbone_config.out_features) != 3: + raise ValueError( + f"Expected backbone to have 3 output features, got {len(config.backbone_config.out_features)}" + ) + self.residual_feature_map_index = [0, 1] # Always take the output of the first and second backbone stage + + if feature_size is None: + with torch.no_grad(): + # NOTE Most reliable way of determining spatial output dimensions is to run forward pass + training = self.backbone.training + if training: + self.backbone.eval() + feature_map = self.backbone(torch.zeros(1, num_channels, image_size[0], image_size[1])).feature_maps[ + -1 + ] + feature_size = feature_map.shape[-2:] + feature_dim = feature_map.shape[1] + self.backbone.train(training) + else: + feature_size = ( + feature_size if isinstance(feature_size, collections.abc.Iterable) else (feature_size, feature_size) + ) + feature_dim = self.backbone.channels[-1] + + self.image_size = image_size + self.patch_size = patch_size[0] + self.num_channels = num_channels + + self.projection = nn.Conv2d(feature_dim, hidden_size, kernel_size=1) + + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) + + def _resize_pos_embed(self, posemb, grid_size_height, grid_size_width, start_index=1): + posemb_tok = posemb[:, :start_index] + posemb_grid = posemb[0, start_index:] + + old_grid_size = int(math.sqrt(len(posemb_grid))) + + posemb_grid = posemb_grid.reshape(1, old_grid_size, old_grid_size, -1).permute(0, 3, 1, 2) + posemb_grid = nn.functional.interpolate(posemb_grid, size=(grid_size_height, grid_size_width), mode="bilinear") + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, grid_size_height * grid_size_width, -1) + + posemb = torch.cat([posemb_tok, posemb_grid], dim=1) + + return posemb + + def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + if not interpolate_pos_encoding: + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" + f" ({self.image_size[0]}*{self.image_size[1]})." 
+ ) + + position_embeddings = self._resize_pos_embed( + self.position_embeddings, height // self.patch_size, width // self.patch_size + ) + + backbone_output = self.backbone(pixel_values) + + features = backbone_output.feature_maps[-1] + + output_hidden_states = [backbone_output.feature_maps[index] for index in self.residual_feature_map_index] + + embeddings = self.projection(features).flatten(2).transpose(1, 2) + + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + embeddings = embeddings + position_embeddings + + return BaseModelOutputWithIntermediateActivations( + last_hidden_states=embeddings, + intermediate_activations=output_hidden_states, + ) + + class DPTViTEmbeddings(nn.Module): """ Construct the CLS token, position and patch embeddings. @@ -117,7 +222,9 @@ def forward(self, pixel_values): embeddings = self.dropout(embeddings) - return embeddings + return BaseModelOutputWithIntermediateActivations( + last_hidden_states=embeddings + ) class DPTViTPatchEmbeddings(nn.Module): @@ -429,6 +536,35 @@ def __init__(self, config): self.config = config self.layers = nn.ModuleList() + if config.embedding_type == "vit_hybrid": + self._init_reassemble_dpt_hybrid(config) + else: + self._init_reassemble_dpt(config) + + def _init_reassemble_dpt_hybrid(self, config): + r"""" + This needs to be re-defined since for `DPTHybrid` the first 2 reassemble layers are set to + `nn.Identity()`. + """ + for i, factor in zip(range(len(config.neck_hidden_sizes)), config.reassemble_factors): + if i <= 1: + self.layers.append(nn.Identity()) + elif i > 1: + self.layers.append( + DPTReassembleLayer(config, channels=config.neck_hidden_sizes[i], factor=factor) + ) + + if config.readout_type == "project": + self.readout_projects = nn.ModuleList() + for i in range(len(config.neck_hidden_sizes)): + if i <= 1: + self.readout_projects.append(nn.Sequential(nn.Identity())) + elif i > 1: + self.readout_projects.append( + nn.Sequential(nn.Linear(2 * config.hidden_size, config.hidden_size), ACT2FN[config.hidden_act]) + ) + + def _init_reassemble_dpt(self, config): for i, factor in zip(range(len(config.neck_hidden_sizes)), config.reassemble_factors): self.layers.append(DPTReassembleLayer(config, channels=config.neck_hidden_sizes[i], factor=factor)) @@ -439,35 +575,38 @@ def __init__(self, config): nn.Sequential(nn.Linear(2 * config.hidden_size, config.hidden_size), ACT2FN[config.hidden_act]) ) - def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]: + def forward(self, hidden_states: List[torch.Tensor], ignore_index: Optional[List]=[]) -> List[torch.Tensor]: """ Args: hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`): List of hidden states from the backbone. + ignore_index (`List[int]`, *optional*): + List of indices to ignore when reassembling the hidden states. 
""" out = [] for i, hidden_state in enumerate(hidden_states): - # reshape to (B, C, H, W) - hidden_state, cls_token = hidden_state[:, 1:], hidden_state[:, 0] - batch_size, sequence_length, num_channels = hidden_state.shape - size = int(math.sqrt(sequence_length)) - hidden_state = hidden_state.reshape(batch_size, size, size, num_channels) - hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() - - feature_shape = hidden_state.shape - if self.config.readout_type == "project": - # reshape to (B, H*W, C) - hidden_state = hidden_state.flatten(2).permute((0, 2, 1)) - readout = cls_token.unsqueeze(1).expand_as(hidden_state) - # concatenate the readout token to the hidden states and project - hidden_state = self.readout_projects[i](torch.cat((hidden_state, readout), -1)) - # reshape back to (B, C, H, W) - hidden_state = hidden_state.permute(0, 2, 1).reshape(feature_shape) - elif self.config.readout_type == "add": - hidden_state = hidden_state.flatten(2) + cls_token.unsqueeze(-1) - hidden_state = hidden_state.reshape(feature_shape) - hidden_state = self.layers[i](hidden_state) + if i not in ignore_index: + # reshape to (B, C, H, W) + hidden_state, cls_token = hidden_state[:, 1:], hidden_state[:, 0] + batch_size, sequence_length, num_channels = hidden_state.shape + size = int(math.sqrt(sequence_length)) + hidden_state = hidden_state.reshape(batch_size, size, size, num_channels) + hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + + feature_shape = hidden_state.shape + if self.config.readout_type == "project": + # reshape to (B, H*W, C) + hidden_state = hidden_state.flatten(2).permute((0, 2, 1)) + readout = cls_token.unsqueeze(1).expand_as(hidden_state) + # concatenate the readout token to the hidden states and project + hidden_state = self.readout_projects[i](torch.cat((hidden_state, readout), -1)) + # reshape back to (B, C, H, W) + hidden_state = hidden_state.permute(0, 2, 1).reshape(feature_shape) + elif self.config.readout_type == "add": + hidden_state = hidden_state.flatten(2) + cls_token.unsqueeze(-1) + hidden_state = hidden_state.reshape(feature_shape) + hidden_state = self.layers[i](hidden_state) out.append(hidden_state) return out @@ -679,9 +818,14 @@ class DPTModel(DPTPreTrainedModel): def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config + self.is_hybird = False # vit encoder - self.embeddings = DPTViTEmbeddings(config) + if config.embedding_type == "patch_embedding": + self.embeddings = DPTViTEmbeddings(config) + else: + self.embeddings = DPTViTHybridEmbeddings(config) + self.is_hybird = True self.encoder = DPTViTEncoder(config) self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -734,7 +878,7 @@ def forward( embedding_output = self.embeddings(pixel_values) encoder_outputs = self.encoder( - embedding_output, + embedding_output.last_hidden_states, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -754,6 +898,7 @@ def forward( pooler_output=pooled_output, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, + intermediate_activations=embedding_output.intermediate_activations, ) @@ -787,7 +932,6 @@ class DPTNeck(nn.Module): def __init__(self, config): super().__init__() - self.config = config # postprocessing @@ -799,6 +943,8 @@ def __init__(self, config): # fusion self.fusion_stage = DPTFeatureFusionStage(config) + self.is_using_hybrid = config.embedding_type == "vit_hybrid" + def forward(self, hidden_states: 
List[torch.Tensor]) -> List[torch.Tensor]: if not isinstance(hidden_states, list): raise ValueError("hidden_states should be a list of tensors") @@ -807,7 +953,10 @@ def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]: raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.") # postprocess hidden states - features = self.reassemble_stage(hidden_states) + if self.is_using_hybrid: + features = self.reassemble_stage(hidden_states, ignore_index=[0, 1]) + else: + features = self.reassemble_stage(hidden_states) features = [self.convs[i](feature) for i, feature in enumerate(features)] @@ -939,9 +1088,15 @@ def forward( # only keep certain features based on config.backbone_out_indices # note that the hidden_states also include the initial embeddings - hidden_states = [ - feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices - ] + if not self.config.is_hybrid: + hidden_states = [ + feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices + ] + else: + backbone_hidden_states = outputs.intermediate_activations + backbone_hidden_states.extend(feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices[2:]) + + hidden_states = backbone_hidden_states hidden_states = self.neck(hidden_states) From dbffb59cd95870377311e3c308fec88a4fd4a0cd Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 6 Dec 2022 10:29:47 +0100 Subject: [PATCH 85/88] add clean --- src/transformers/modeling_outputs.py | 1 + .../models/dpt/configuration_dpt.py | 19 +- .../dpt/convert_dpt_hybrid_to_pytorch.py | 334 ++++++++++++++++++ src/transformers/models/dpt/modeling_dpt.py | 32 +- 4 files changed, 366 insertions(+), 20 deletions(-) create mode 100644 src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py diff --git a/src/transformers/modeling_outputs.py b/src/transformers/modeling_outputs.py index 0847fdfe21d4..6f6249c6efba 100644 --- a/src/transformers/modeling_outputs.py +++ b/src/transformers/modeling_outputs.py @@ -76,6 +76,7 @@ class BaseModelOutputWithIntermediateActivations(ModelOutput): intermediate_activations (`tuple(torch.FloatTensor)`, *optional*): Intermediate activations that can be used to compute hidden states of the model at various layers. """ + last_hidden_states: torch.FloatTensor = None intermediate_activations: Optional[Tuple[torch.FloatTensor]] = None diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py index c280ca3fa0f5..1a5708eb17f6 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -14,6 +14,8 @@ # limitations under the License. 
""" DPT model configuration""" +import copy + from ...configuration_utils import PretrainedConfig from ...utils import logging from ..bit import BitConfig @@ -146,7 +148,7 @@ def __init__( super().__init__(**kwargs) self.hidden_size = hidden_size - + self.embedding_type = embedding_type if embedding_type not in ["patch_embedding", "vit_hybrid"]: raise ValueError("Embedding type must be one of ['patch_embedding', 'vit_hybrid']") @@ -164,7 +166,7 @@ def __init__( else: self.backbone_config = None self.is_hybrid = False - + self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.intermediate_size = intermediate_size @@ -191,3 +193,16 @@ def __init__( self.auxiliary_loss_weight = auxiliary_loss_weight self.semantic_loss_ignore_index = semantic_loss_ignore_index self.semantic_classifier_dropout = semantic_classifier_dropout + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + if self.backbone_config is not None: + self.backbone_config = self.backbone_config.to_dict() + + output = copy.deepcopy(self.__dict__) + + output["model_type"] = self.__class__.model_type + return output diff --git a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py new file mode 100644 index 000000000000..43561350cf90 --- /dev/null +++ b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py @@ -0,0 +1,334 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert DPT checkpoints from the original repository. 
URL: https://github.com/isl-org/DPT""" + + +import argparse +import json +from pathlib import Path + +import torch +from PIL import Image + +import requests +from huggingface_hub import cached_download, hf_hub_url +from transformers import ( + DPTFeatureExtractor, + DPTConfig, + DPTForDepthEstimation, + DPTForSemanticSegmentation, +) +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def get_dpt_config(checkpoint_url): + config = DPTConfig(embedding_type="vit_hybrid") + + if "large" in checkpoint_url: + config.hidden_size = 1024 + config.intermediate_size = 4096 + config.num_hidden_layers = 24 + config.num_attention_heads = 16 + config.backbone_out_indices = [5, 11, 17, 23] + config.neck_hidden_sizes = [256, 512, 1024, 1024] + expected_shape = (1, 384, 384) + + if "nyu" or "midas" in checkpoint_url: + config.hidden_size = 768 + config.reassemble_factors = [1, 1, 1, 0.5] + config.neck_hidden_sizes = [256, 512, 768, 768] + config.num_labels = 150 + config.patch_size = 16 + expected_shape = (1, 384, 384) + config.use_batch_norm_in_fusion_residual = False + config.readout_type = "project" + + if "ade" in checkpoint_url: + config.use_batch_norm_in_fusion_residual = True + config.hidden_size = 768 + config.reassemble_stage = [1, 1, 1, 0.5] + config.num_labels = 150 + config.patch_size = 16 + repo_id = "huggingface/label-files" + filename = "ade20k-id2label.json" + id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + expected_shape = [1, 150, 480, 480] + + return config, expected_shape + + +def remove_ignore_keys_(state_dict): + ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"] + for k in ignore_keys: + state_dict.pop(k, None) + + +def rename_key(name): + if ( + "pretrained.model" in name + and "cls_token" not in name + and "pos_embed" not in name + and "patch_embed" not in name + ): + name = name.replace("pretrained.model", "dpt.encoder") + if "pretrained.model" in name: + name = name.replace("pretrained.model", "dpt.embeddings") + if "patch_embed" in name: + name = name.replace("patch_embed", "") + if "pos_embed" in name: + name = name.replace("pos_embed", "position_embeddings") + if "attn.proj" in name: + name = name.replace("attn.proj", "attention.output.dense") + if "proj" in name and "project" not in name: + name = name.replace("proj", "projection") + if "blocks" in name: + name = name.replace("blocks", "layer") + if "mlp.fc1" in name: + name = name.replace("mlp.fc1", "intermediate.dense") + if "mlp.fc2" in name: + name = name.replace("mlp.fc2", "output.dense") + if "norm1" in name and "backbone" not in name: + name = name.replace("norm1", "layernorm_before") + if "norm2" in name and "backbone" not in name: + name = name.replace("norm2", "layernorm_after") + if "scratch.output_conv" in name: + name = name.replace("scratch.output_conv", "head") + if "scratch" in name: + name = name.replace("scratch", "neck") + if "layer1_rn" in name: + name = name.replace("layer1_rn", "convs.0") + if "layer2_rn" in name: + name = name.replace("layer2_rn", "convs.1") + if "layer3_rn" in name: + name = name.replace("layer3_rn", "convs.2") + if "layer4_rn" in name: + name = name.replace("layer4_rn", "convs.3") + if "refinenet" in name: + layer_idx = int(name[len("neck.refinenet") : len("neck.refinenet") + 1]) + # tricky here: we need 
to map 4 to 0, 3 to 1, 2 to 2 and 1 to 3 + name = name.replace(f"refinenet{layer_idx}", f"fusion_stage.layers.{abs(layer_idx-4)}") + if "out_conv" in name: + name = name.replace("out_conv", "projection") + if "resConfUnit1" in name: + name = name.replace("resConfUnit1", "residual_layer1") + if "resConfUnit2" in name: + name = name.replace("resConfUnit2", "residual_layer2") + if "conv1" in name: + name = name.replace("conv1", "convolution1") + if "conv2" in name: + name = name.replace("conv2", "convolution2") + # readout blocks + if "pretrained.act_postprocess1.0.project.0" in name: + name = name.replace("pretrained.act_postprocess1.0.project.0", "neck.reassemble_stage.readout_projects.0.0") + if "pretrained.act_postprocess2.0.project.0" in name: + name = name.replace("pretrained.act_postprocess2.0.project.0", "neck.reassemble_stage.readout_projects.1.0") + if "pretrained.act_postprocess3.0.project.0" in name: + name = name.replace("pretrained.act_postprocess3.0.project.0", "neck.reassemble_stage.readout_projects.2.0") + if "pretrained.act_postprocess4.0.project.0" in name: + name = name.replace("pretrained.act_postprocess4.0.project.0", "neck.reassemble_stage.readout_projects.3.0") + + # resize blocks + if "pretrained.act_postprocess1.3" in name: + name = name.replace("pretrained.act_postprocess1.3", "neck.reassemble_stage.layers.0.projection") + if "pretrained.act_postprocess1.4" in name: + name = name.replace("pretrained.act_postprocess1.4", "neck.reassemble_stage.layers.0.resize") + if "pretrained.act_postprocess2.3" in name: + name = name.replace("pretrained.act_postprocess2.3", "neck.reassemble_stage.layers.1.projection") + if "pretrained.act_postprocess2.4" in name: + name = name.replace("pretrained.act_postprocess2.4", "neck.reassemble_stage.layers.1.resize") + if "pretrained.act_postprocess3.3" in name: + name = name.replace("pretrained.act_postprocess3.3", "neck.reassemble_stage.layers.2.projection") + if "pretrained.act_postprocess4.3" in name: + name = name.replace("pretrained.act_postprocess4.3", "neck.reassemble_stage.layers.3.projection") + if "pretrained.act_postprocess4.4" in name: + name = name.replace("pretrained.act_postprocess4.4", "neck.reassemble_stage.layers.3.resize") + if "pretrained" in name: + name = name.replace("pretrained", "dpt") + if "bn" in name: + name = name.replace("bn", "batch_norm") + if "head" in name: + name = name.replace("head", "head.head") + if "encoder.norm" in name: + name = name.replace("encoder.norm", "layernorm") + if "auxlayer" in name: + name = name.replace("auxlayer", "auxiliary_head.head") + if "backbone" in name: + name = name.replace("backbone", "backbone.bit.encoder") + + if ".." 
in name: + name = name.replace("..", ".") + + if "stem.conv" in name: + name = name.replace("stem.conv", "bit.embedder.convolution") + if "blocks" in name: + name = name.replace("blocks", "layers") + if "convolution" in name and "backbone" in name: + name = name.replace("convolution", "conv") + if "layer" in name and "backbone" in name: + name = name.replace("layer", "layers") + if "backbone.bit.encoder.bit" in name: + name = name.replace("backbone.bit.encoder.bit", "backbone.bit") + if "embedder.conv" in name: + name = name.replace("embedder.conv", "embedder.convolution") + if "backbone.bit.encoder.stem.norm" in name: + name = name.replace("backbone.bit.encoder.stem.norm", "backbone.bit.embedder.norm") + return name + + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config): + for i in range(config.num_hidden_layers): + # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ + : config.hidden_size, : + ] + state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] + state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ + config.hidden_size : config.hidden_size * 2, : + ] + state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ + config.hidden_size : config.hidden_size * 2 + ] + state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ + -config.hidden_size :, : + ] + state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[ + -config.hidden_size : + ] + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@torch.no_grad() +def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub, model_name, show_prediction): + """ + Copy/paste/tweak model's weights to our DPT structure. 
+ """ + + # define DPT configuration based on URL + config, expected_shape = get_dpt_config(checkpoint_url) + # load original state_dict from URL + # state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") + state_dict = torch.load(checkpoint_url, map_location="cpu") + # remove certain keys + remove_ignore_keys_(state_dict) + # rename keys + for key in state_dict.copy().keys(): + val = state_dict.pop(key) + state_dict[rename_key(key)] = val + # read in qkv matrices + read_in_q_k_v(state_dict, config) + + # load HuggingFace model + model = ( + DPTForSemanticSegmentation(config) if "ade" in checkpoint_url else DPTForDepthEstimation(config) + ) + model.load_state_dict(state_dict) + model.eval() + + # Check outputs on an image + size = 480 if "ade" in checkpoint_url else 384 + feature_extractor = DPTFeatureExtractor(size=size) + + image = prepare_img() + encoding = feature_extractor(image, return_tensors="pt") + + # forward pass + outputs = model(**encoding).logits if "ade" in checkpoint_url else model(**encoding).predicted_depth + + if show_prediction: + prediction = ( + torch.nn.functional.interpolate( + outputs.unsqueeze(1), + size=(image.size[1], image.size[0]), + mode="bicubic", + align_corners=False, + ) + .squeeze() + .cpu() + .numpy() + ) + + Image.fromarray((prediction / prediction.max()) * 255).show() + + # # Assert logits + # expected_slice = torch.tensor([[6.3199, 6.3629, 6.4148], [6.3850, 6.3615, 6.4166], [6.3519, 6.3176, 6.3575]]) + # if "ade" in checkpoint_url: + # expected_slice = torch.tensor([[4.0480, 4.2420, 4.4360], [4.3124, 4.5693, 4.8261], [4.5768, 4.8965, 5.2163]]) + # assert outputs.shape == torch.Size(expected_shape) + # assert ( + # torch.allclose(outputs[0, 0, :3, :3], expected_slice, atol=1e-4) + # if "ade" in checkpoint_url + # else torch.allclose(outputs[0, :3, :3], expected_slice) + # ) + + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving feature extractor to {pytorch_dump_folder_path}") + feature_extractor.save_pretrained(pytorch_dump_folder_path) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--checkpoint_url", + default="https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt", + type=str, + help="URL of the original DPT checkpoint you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=str, + required=True, + help="Path to the output PyTorch model directory.", + ) + parser.add_argument( + "--push_to_hub", + action="store_true", + ) + parser.add_argument( + "--model_name", + default="dpt-large", + type=str, + help="Name of the model, in case you're pushing to the hub.", + ) + parser.add_argument( + "--show_prediction", + action="store_true", + ) + + args = parser.parse_args() + convert_dpt_checkpoint( + args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub, args.model_name, args.show_prediction + ) \ No newline at end of file diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index f587848abcaa..01e9dfe9a0ef 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -38,15 +38,15 @@ ) from ...modeling_outputs import ( BaseModelOutput, + BaseModelOutputWithIntermediateActivations, BaseModelOutputWithPooling, DepthEstimatorOutput, 
SemanticSegmenterOutput, - BaseModelOutputWithIntermediateActivations, ) -from ..auto import AutoBackbone from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import logging +from ..auto import AutoBackbone from .configuration_dpt import DPTConfig @@ -67,7 +67,6 @@ ] -# Copied from transformers.models.vit_hybrid.modeling_vit_hybrid.ViTHybridPatchEmbeddings with ViTHybrid->DPTViTHybrid class DPTViTHybridEmbeddings(nn.Module): """ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial @@ -90,7 +89,7 @@ def __init__(self, config, feature_size=None): raise ValueError( f"Expected backbone to have 3 output features, got {len(config.backbone_config.out_features)}" ) - self.residual_feature_map_index = [0, 1] # Always take the output of the first and second backbone stage + self.residual_feature_map_index = [0, 1] # Always take the output of the first and second backbone stage if feature_size is None: with torch.no_grad(): @@ -151,7 +150,7 @@ def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = F ) backbone_output = self.backbone(pixel_values) - + features = backbone_output.feature_maps[-1] output_hidden_states = [backbone_output.feature_maps[index] for index in self.residual_feature_map_index] @@ -222,9 +221,7 @@ def forward(self, pixel_values): embeddings = self.dropout(embeddings) - return BaseModelOutputWithIntermediateActivations( - last_hidden_states=embeddings - ) + return BaseModelOutputWithIntermediateActivations(last_hidden_states=embeddings) class DPTViTPatchEmbeddings(nn.Module): @@ -540,19 +537,16 @@ def __init__(self, config): self._init_reassemble_dpt_hybrid(config) else: self._init_reassemble_dpt(config) - + def _init_reassemble_dpt_hybrid(self, config): - r"""" - This needs to be re-defined since for `DPTHybrid` the first 2 reassemble layers are set to - `nn.Identity()`. + r""" " + This needs to be re-defined since for `DPTHybrid` the first 2 reassemble layers are set to `nn.Identity()`. 
""" for i, factor in zip(range(len(config.neck_hidden_sizes)), config.reassemble_factors): if i <= 1: self.layers.append(nn.Identity()) elif i > 1: - self.layers.append( - DPTReassembleLayer(config, channels=config.neck_hidden_sizes[i], factor=factor) - ) + self.layers.append(DPTReassembleLayer(config, channels=config.neck_hidden_sizes[i], factor=factor)) if config.readout_type == "project": self.readout_projects = nn.ModuleList() @@ -563,7 +557,7 @@ def _init_reassemble_dpt_hybrid(self, config): self.readout_projects.append( nn.Sequential(nn.Linear(2 * config.hidden_size, config.hidden_size), ACT2FN[config.hidden_act]) ) - + def _init_reassemble_dpt(self, config): for i, factor in zip(range(len(config.neck_hidden_sizes)), config.reassemble_factors): self.layers.append(DPTReassembleLayer(config, channels=config.neck_hidden_sizes[i], factor=factor)) @@ -575,7 +569,7 @@ def _init_reassemble_dpt(self, config): nn.Sequential(nn.Linear(2 * config.hidden_size, config.hidden_size), ACT2FN[config.hidden_act]) ) - def forward(self, hidden_states: List[torch.Tensor], ignore_index: Optional[List]=[]) -> List[torch.Tensor]: + def forward(self, hidden_states: List[torch.Tensor], ignore_index: Optional[List] = []) -> List[torch.Tensor]: """ Args: hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`): @@ -1094,7 +1088,9 @@ def forward( ] else: backbone_hidden_states = outputs.intermediate_activations - backbone_hidden_states.extend(feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices[2:]) + backbone_hidden_states.extend( + feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices[2:] + ) hidden_states = backbone_hidden_states From e90fe901ff9bc3bcb68db2477c82faac8dabffe1 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 6 Dec 2022 14:13:37 +0100 Subject: [PATCH 86/88] fix copies --- README_hd.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README_hd.md b/README_hd.md index e17c5afb9978..c568e53813af 100644 --- a/README_hd.md +++ b/README_hd.md @@ -245,6 +245,7 @@ conda install -c huggingface transformers 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (गूगल रिसर्च से) साथ वाला पेपर [बिग बर्ड: ट्रांसफॉर्मर्स फॉर लॉन्गर सीक्वेंस](https://arxiv .org/abs/2007.14062) मंज़िल ज़हीर, गुरु गुरुगणेश, अविनावा दुबे, जोशुआ आइंस्ली, क्रिस अल्बर्टी, सैंटियागो ओंटानोन, फिलिप फाम, अनिरुद्ध रावुला, किफ़ान वांग, ली यांग, अमर अहमद द्वारा। 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (गूगल रिसर्च से) साथ में पेपर [बिग बर्ड: ट्रांसफॉर्मर्स फॉर लॉन्गर सीक्वेंस](https://arxiv.org/abs/2007.14062) मंज़िल ज़हीर, गुरु गुरुगणेश, अविनावा दुबे, जोशुआ आइंस्ली, क्रिस अल्बर्टी, सैंटियागो ओंटानन, फिलिप फाम द्वारा , अनिरुद्ध रावुला, किफ़ान वांग, ली यांग, अमर अहमद द्वारा पोस्ट किया गया। 1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. 
**[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (फेसबुक से) साथ में कागज [एक ओपन-डोमेन चैटबॉट बनाने की विधि](https://arxiv.org /abs/2004.13637) स्टीफन रोलर, एमिली दीनन, नमन गोयल, दा जू, मैरी विलियमसन, यिनहान लियू, जिंग जू, मायल ओट, कर्ट शस्टर, एरिक एम। स्मिथ, वाई-लैन बॉरो, जेसन वेस्टन द्वारा। 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (फेसबुक से) साथ में पेपर [एक ओपन-डोमेन चैटबॉट बनाने की रेसिपी](https://arxiv .org/abs/2004.13637) स्टीफन रोलर, एमिली दीनन, नमन गोयल, दा जू, मैरी विलियमसन, यिनहान लियू, जिंग जू, मायल ओट, कर्ट शस्टर, एरिक एम स्मिथ, वाई-लैन बॉरो, जेसन वेस्टन द्वारा। 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigSicence Workshop](https://bigscience.huggingface.co/). @@ -376,6 +377,7 @@ conda install -c huggingface transformers 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (NAVER AI Lab/Kakao Enterprise/Kakao Brain से) साथ में कागज [ViLT: Vision-and-Language Transformer बिना कनवल्शन या रीजन सुपरविजन](https://arxiv.org/abs/2102.03334) वोनजे किम, बोक्यूंग सोन, इल्डू किम द्वारा पोस्ट किया गया। 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (गूगल एआई से) कागज के साथ [एक इमेज इज़ वर्थ 16x16 वर्ड्स: ट्रांसफॉर्मर्स फॉर इमेज रिकॉग्निशन एट स्केल](https://arxiv.org/abs/2010.11929) एलेक्सी डोसोवित्स्की, लुकास बेयर, अलेक्जेंडर कोलेसनिकोव, डिर्क वीसेनबोर्न, शियाओहुआ झाई, थॉमस अनटरथिनर, मुस्तफा देहघानी, मैथियास मिंडरर, जॉर्ज हेगोल्ड, सिल्वेन गेली, जैकब उस्ज़कोरेइट द्वारा हॉल्सबी द्वारा पोस्ट किया गया। 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (UCLA NLP से) साथ वाला पेपर [VisualBERT: A Simple and Performant Baseline for Vision and Language](https:/ /arxiv.org/pdf/1908.03557) लियुनियन हेरोल्ड ली, मार्क यात्स्कर, दा यिन, चो-जुई हसीह, काई-वेई चांग द्वारा। +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (मेटा एआई से) साथ में कागज [मास्कड ऑटोएन्कोडर स्केलेबल विजन लर्नर्स हैं](https://arxiv.org/ एब्स/2111.06377) कैमिंग हे, ज़िनेली चेन, सेनिंग ज़ी, यांगहो ली, पिओट्र डॉलर, रॉस गिर्शिक द्वारा। 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (मेटा एआई से) साथ में कागज [लेबल-कुशल सीखने के लिए मास्क्ड स्याम देश के नेटवर्क](https://arxiv. org/abs/2204.07141) महमूद असरान, मथिल्डे कैरन, ईशान मिश्रा, पियोट्र बोजानोवस्की, फ्लोरियन बोर्डेस, पास्कल विंसेंट, आर्मंड जौलिन, माइकल रब्बत, निकोलस बल्लास द्वारा। 1. 
**[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (फेसबुक एआई से) साथ में पेपर [wav2vec 2.0: ए फ्रेमवर्क फॉर सेल्फ-सुपरवाइज्ड लर्निंग ऑफ स्पीच रिप्रेजेंटेशन] (https://arxiv.org/abs/2006.11477) एलेक्सी बेवस्की, हेनरी झोउ, अब्देलरहमान मोहम्मद, माइकल औली द्वारा। From fb1b84b56b0f45df4cfe4e6ac762c50b7a58cd0b Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 6 Dec 2022 15:16:26 +0100 Subject: [PATCH 87/88] final fix --- .../models/dpt/configuration_dpt.py | 6 +-- .../dpt/convert_dpt_hybrid_to_pytorch.py | 42 +++++++------------ 2 files changed, 18 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py index 1a5708eb17f6..dccb9d726824 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -199,10 +199,10 @@ def to_dict(self): Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. Returns: `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, """ - if self.backbone_config is not None: - self.backbone_config = self.backbone_config.to_dict() - output = copy.deepcopy(self.__dict__) + if output["backbone_config"] is not None: + output["backbone_config"] = self.backbone_config.to_dict() + output["model_type"] = self.__class__.model_type return output diff --git a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py index 43561350cf90..7c9ab904d9bc 100644 --- a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py +++ b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py @@ -24,12 +24,7 @@ import requests from huggingface_hub import cached_download, hf_hub_url -from transformers import ( - DPTFeatureExtractor, - DPTConfig, - DPTForDepthEstimation, - DPTForSemanticSegmentation, -) +from transformers import DPTConfig, DPTFeatureExtractor, DPTForDepthEstimation, DPTForSemanticSegmentation from transformers.utils import logging @@ -201,9 +196,7 @@ def read_in_q_k_v(state_dict, config): in_proj_weight = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.weight") in_proj_bias = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.bias") # next, add query, keys and values (in that order) to the state dict - state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] + state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ config.hidden_size : config.hidden_size * 2, : @@ -214,9 +207,7 @@ def read_in_q_k_v(state_dict, config): state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ -config.hidden_size :, : ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[ - -config.hidden_size : - ] + state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] # We will verify our results on an image of cute cats @@ -247,9 +238,7 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub read_in_q_k_v(state_dict, config) # load HuggingFace model - model = ( - DPTForSemanticSegmentation(config) if "ade" in checkpoint_url else 
DPTForDepthEstimation(config) - ) + model = DPTForSemanticSegmentation(config) if "ade" in checkpoint_url else DPTForDepthEstimation(config) model.load_state_dict(state_dict) model.eval() @@ -265,17 +254,17 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub if show_prediction: prediction = ( - torch.nn.functional.interpolate( - outputs.unsqueeze(1), - size=(image.size[1], image.size[0]), - mode="bicubic", - align_corners=False, - ) - .squeeze() - .cpu() - .numpy() + torch.nn.functional.interpolate( + outputs.unsqueeze(1), + size=(image.size[1], image.size[0]), + mode="bicubic", + align_corners=False, ) - + .squeeze() + .cpu() + .numpy() + ) + Image.fromarray((prediction / prediction.max()) * 255).show() # # Assert logits @@ -296,7 +285,6 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub feature_extractor.save_pretrained(pytorch_dump_folder_path) - if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters @@ -331,4 +319,4 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub args = parser.parse_args() convert_dpt_checkpoint( args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub, args.model_name, args.show_prediction - ) \ No newline at end of file + ) From d9427140d8395fee53d82c044aa8a845b08281e0 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 6 Dec 2022 16:52:01 +0100 Subject: [PATCH 88/88] add patch --- src/transformers/models/dpt/modeling_dpt.py | 27 ++++++++++++--------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index 01e9dfe9a0ef..cead25daa470 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -92,17 +92,22 @@ def __init__(self, config, feature_size=None): self.residual_feature_map_index = [0, 1] # Always take the output of the first and second backbone stage if feature_size is None: - with torch.no_grad(): - # NOTE Most reliable way of determining spatial output dimensions is to run forward pass - training = self.backbone.training - if training: - self.backbone.eval() - feature_map = self.backbone(torch.zeros(1, num_channels, image_size[0], image_size[1])).feature_maps[ - -1 - ] - feature_size = feature_map.shape[-2:] - feature_dim = feature_map.shape[1] - self.backbone.train(training) + # with torch.no_grad(): + # # NOTE Most reliable way of determining spatial output dimensions is to run forward pass + # training = self.backbone.training + # if training: + # self.backbone.eval() + # feature_map = self.backbone(torch.zeros(1, num_channels, image_size[0], image_size[1])).feature_maps[ + # -1 + # ] + # feature_size = feature_map.shape[-2:] + # feature_dim = feature_map.shape[1] + # self.backbone.train(training) + + # TODO: add it on the config + feat_map_shape = (1, 1024, 24, 24) + feature_size = feat_map_shape[-2:] + feature_dim = feat_map_shape[1] else: feature_size = ( feature_size if isinstance(feature_size, collections.abc.Iterable) else (feature_size, feature_size)
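For the hard-coded feat_map_shape above, a minimal sketch of what the "TODO: add it on the config" could look like. `backbone_featmap_shape` is an assumed attribute name, not an existing DPTConfig field, and (1, 1024, 24, 24) is presumably just the shape the removed backbone probe returned for the default 384x384 hybrid setup:

from types import SimpleNamespace

# Stand-in for the DPT-hybrid config; `backbone_featmap_shape` is a hypothetical
# attribute used only for this sketch.
config = SimpleNamespace(backbone_featmap_shape=(1, 1024, 24, 24))

feat_map_shape = getattr(config, "backbone_featmap_shape", (1, 1024, 24, 24))
feature_size = feat_map_shape[-2:]  # (24, 24): spatial size of the last backbone feature map
feature_dim = feat_map_shape[1]     # 1024: channel dimension of that feature map
print(feature_size, feature_dim)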
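The configuration_dpt.py change in PATCH 87/88 moves the nested-config serialization onto a copied dict, so `to_dict()` no longer mutates the live `backbone_config` attribute. A self-contained sketch of the pattern, using toy classes rather than the real DPTConfig:

import copy


class ToyBackboneConfig:
    def __init__(self, hidden_size=1024):
        self.hidden_size = hidden_size

    def to_dict(self):
        return dict(self.__dict__)


class ToyConfig:
    model_type = "toy"

    def __init__(self, backbone_config=None):
        self.backbone_config = backbone_config

    def to_dict(self):
        # Serialize on a deep copy; the nested config object on `self` stays intact.
        output = copy.deepcopy(self.__dict__)
        if output["backbone_config"] is not None:
            output["backbone_config"] = self.backbone_config.to_dict()
        output["model_type"] = self.__class__.model_type
        return output


config = ToyConfig(backbone_config=ToyBackboneConfig())
first = config.to_dict()
second = config.to_dict()  # safe to call repeatedly
assert isinstance(config.backbone_config, ToyBackboneConfig)
assert first == second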
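read_in_q_k_v() above splits each fused qkv projection from the original checkpoint into the separate query/key/value weights expected by the HF attention layers. A toy illustration of the slicing, with a toy hidden size (the real value comes from the config):

import torch

hidden_size = 8  # toy value for illustration
in_proj_weight = torch.randn(3 * hidden_size, hidden_size)  # fused qkv weight
in_proj_bias = torch.randn(3 * hidden_size)                 # fused qkv bias

query_weight = in_proj_weight[:hidden_size, :]
key_weight = in_proj_weight[hidden_size : hidden_size * 2, :]
value_weight = in_proj_weight[-hidden_size:, :]

query_bias = in_proj_bias[:hidden_size]
key_bias = in_proj_bias[hidden_size : hidden_size * 2]
value_bias = in_proj_bias[-hidden_size:]

# The three slices recover the original fused tensors when stacked back in q, k, v order.
assert torch.equal(torch.cat([query_weight, key_weight, value_weight], dim=0), in_proj_weight)
assert torch.equal(torch.cat([query_bias, key_bias, value_bias], dim=0), in_proj_bias)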
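Once the conversion script has run, the dumped folder can be loaded back like any other checkpoint. A usage sketch, assuming a placeholder path for --pytorch_dump_folder_path and the COCO cats image typically downloaded by prepare_img() (URL assumed):

import requests
import torch
from PIL import Image

from transformers import DPTFeatureExtractor, DPTForDepthEstimation

dump_folder = "path/to/dpt-hybrid-dump"  # whatever was passed as --pytorch_dump_folder_path
model = DPTForDepthEstimation.from_pretrained(dump_folder)
feature_extractor = DPTFeatureExtractor.from_pretrained(dump_folder)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = feature_extractor(images=image, return_tensors="pt")
with torch.no_grad():
    predicted_depth = model(**inputs).predicted_depth

# Resize the prediction back to the input resolution, mirroring show_prediction above.
prediction = torch.nn.functional.interpolate(
    predicted_depth.unsqueeze(1),
    size=image.size[::-1],  # PIL size is (width, height)
    mode="bicubic",
    align_corners=False,
).squeeze()

# Cast to uint8 so PIL renders the normalized depth map as a grayscale image.
Image.fromarray((prediction / prediction.max() * 255).cpu().numpy().astype("uint8")).show()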