diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 9d1c33900c10..e49c5ba31b04 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -27,151 +27,150 @@
title: Generation with LLMs
title: Tutorials
- sections:
- - sections:
- - local: tasks/sequence_classification
- title: Text classification
- - local: tasks/token_classification
- title: Token classification
- - local: tasks/question_answering
- title: Question answering
- - local: tasks/language_modeling
- title: Causal language modeling
- - local: tasks/masked_language_modeling
- title: Masked language modeling
- - local: tasks/translation
- title: Translation
- - local: tasks/summarization
- title: Summarization
- - local: tasks/multiple_choice
- title: Multiple choice
+ - isExpanded: false
+ sections:
+ - local: tasks/sequence_classification
+ title: Text classification
+ - local: tasks/token_classification
+ title: Token classification
+ - local: tasks/question_answering
+ title: Question answering
+ - local: tasks/language_modeling
+ title: Causal language modeling
+ - local: tasks/masked_language_modeling
+ title: Masked language modeling
+ - local: tasks/translation
+ title: Translation
+ - local: tasks/summarization
+ title: Summarization
+ - local: tasks/multiple_choice
+ title: Multiple choice
title: Natural Language Processing
- isExpanded: false
- - sections:
- - local: tasks/audio_classification
- title: Audio classification
- - local: tasks/asr
- title: Automatic speech recognition
+ - isExpanded: false
+ sections:
+ - local: tasks/audio_classification
+ title: Audio classification
+ - local: tasks/asr
+ title: Automatic speech recognition
title: Audio
- isExpanded: false
- - sections:
- - local: tasks/image_classification
- title: Image classification
- - local: tasks/semantic_segmentation
- title: Semantic segmentation
- - local: tasks/video_classification
- title: Video classification
- - local: tasks/object_detection
- title: Object detection
- - local: tasks/zero_shot_object_detection
- title: Zero-shot object detection
- - local: tasks/zero_shot_image_classification
- title: Zero-shot image classification
- - local: tasks/monocular_depth_estimation
- title: Depth estimation
+ - isExpanded: false
+ sections:
+ - local: tasks/image_classification
+ title: Image classification
+ - local: tasks/semantic_segmentation
+ title: Semantic segmentation
+ - local: tasks/video_classification
+ title: Video classification
+ - local: tasks/object_detection
+ title: Object detection
+ - local: tasks/zero_shot_object_detection
+ title: Zero-shot object detection
+ - local: tasks/zero_shot_image_classification
+ title: Zero-shot image classification
+ - local: tasks/monocular_depth_estimation
+ title: Depth estimation
title: Computer Vision
- isExpanded: false
- - sections:
- - local: tasks/image_captioning
- title: Image captioning
- - local: tasks/document_question_answering
- title: Document Question Answering
- - local: tasks/visual_question_answering
- title: Visual Question Answering
- - local: tasks/text-to-speech
- title: Text to speech
+ - isExpanded: false
+ sections:
+ - local: tasks/image_captioning
+ title: Image captioning
+ - local: tasks/document_question_answering
+ title: Document Question Answering
+ - local: tasks/visual_question_answering
+ title: Visual Question Answering
+ - local: tasks/text-to-speech
+ title: Text to speech
title: Multimodal
- isExpanded: false
- - sections:
- - local: generation_strategies
- title: Customize the generation strategy
+ - isExpanded: false
+ sections:
+ - local: generation_strategies
+ title: Customize the generation strategy
title: Generation
- isExpanded: false
title: Task Guides
- sections:
- - local: fast_tokenizers
- title: Use fast tokenizers from 🤗 Tokenizers
- - local: multilingual
- title: Run inference with multilingual models
- - local: create_a_model
- title: Use model-specific APIs
- - local: custom_models
- title: Share a custom model
- - local: sagemaker
- title: Run training on Amazon SageMaker
- - local: serialization
- title: Export to ONNX
- - local: tflite
- title: Export to TFLite
- - local: torchscript
- title: Export to TorchScript
- - local: benchmarks
- title: Benchmarks
- - local: notebooks
- title: Notebooks with examples
- - local: community
- title: Community resources
- - local: custom_tools
- title: Custom Tools and Prompts
- - local: troubleshooting
- title: Troubleshoot
+ - local: fast_tokenizers
+ title: Use fast tokenizers from 🤗 Tokenizers
+ - local: multilingual
+ title: Run inference with multilingual models
+ - local: create_a_model
+ title: Use model-specific APIs
+ - local: custom_models
+ title: Share a custom model
+ - local: sagemaker
+ title: Run training on Amazon SageMaker
+ - local: serialization
+ title: Export to ONNX
+ - local: tflite
+ title: Export to TFLite
+ - local: torchscript
+ title: Export to TorchScript
+ - local: benchmarks
+ title: Benchmarks
+ - local: notebooks
+ title: Notebooks with examples
+ - local: community
+ title: Community resources
+ - local: custom_tools
+ title: Custom Tools and Prompts
+ - local: troubleshooting
+ title: Troubleshoot
title: Developer guides
- sections:
- - local: performance
- title: Overview
- - sections:
- - local: perf_train_gpu_one
- title: Methods and tools for efficient training on a single GPU
- - local: perf_train_gpu_many
- title: Multiple GPUs and parallelism
- - local: perf_train_cpu
- title: Efficient training on CPU
- - local: perf_train_cpu_many
- title: Distributed CPU training
- - local: perf_train_tpu
- title: Training on TPUs
- - local: perf_train_tpu_tf
- title: Training on TPU with TensorFlow
- - local: perf_train_special
- title: Training on Specialized Hardware
- - local: perf_hardware
- title: Custom hardware for training
- - local: hpo_train
- title: Hyperparameter Search using Trainer API
- title: Efficient training techniques
- - sections:
- - local: perf_infer_cpu
- title: Inference on CPU
- - local: perf_infer_gpu_one
- title: Inference on one GPU
- - local: perf_infer_gpu_many
- title: Inference on many GPUs
- - local: perf_infer_special
- title: Inference on Specialized Hardware
- title: Optimizing inference
- - local: big_models
- title: Instantiating a big model
- - local: debugging
- title: Troubleshooting
- - local: tf_xla
- title: XLA Integration for TensorFlow Models
- - local: perf_torch_compile
- title: Optimize inference using `torch.compile()`
+ - local: performance
+ title: Overview
+ - sections:
+ - local: perf_train_gpu_one
+ title: Methods and tools for efficient training on a single GPU
+ - local: perf_train_gpu_many
+ title: Multiple GPUs and parallelism
+ - local: perf_train_cpu
+ title: Efficient training on CPU
+ - local: perf_train_cpu_many
+ title: Distributed CPU training
+ - local: perf_train_tpu
+ title: Training on TPUs
+ - local: perf_train_tpu_tf
+ title: Training on TPU with TensorFlow
+ - local: perf_train_special
+ title: Training on Specialized Hardware
+ - local: perf_hardware
+ title: Custom hardware for training
+ - local: hpo_train
+ title: Hyperparameter Search using Trainer API
+ title: Efficient training techniques
+ - sections:
+ - local: perf_infer_cpu
+ title: Inference on CPU
+ - local: perf_infer_gpu_one
+ title: Inference on one GPU
+ - local: perf_infer_gpu_many
+ title: Inference on many GPUs
+ - local: perf_infer_special
+ title: Inference on Specialized Hardware
+ title: Optimizing inference
+ - local: big_models
+ title: Instantiating a big model
+ - local: debugging
+ title: Troubleshooting
+ - local: tf_xla
+ title: XLA Integration for TensorFlow Models
+ - local: perf_torch_compile
+ title: Optimize inference using `torch.compile()`
title: Performance and scalability
- sections:
- - local: contributing
- title: How to contribute to transformers?
- - local: add_new_model
- title: How to add a model to 🤗 Transformers?
- - local: add_tensorflow_model
- title: How to convert a 🤗 Transformers model to TensorFlow?
- - local: add_new_pipeline
- title: How to add a pipeline to 🤗 Transformers?
- - local: testing
- title: Testing
- - local: pr_checks
- title: Checks on a Pull Request
+ - local: contributing
+ title: How to contribute to transformers?
+ - local: add_new_model
+ title: How to add a model to 🤗 Transformers?
+ - local: add_tensorflow_model
+ title: How to convert a 🤗 Transformers model to TensorFlow?
+ - local: add_new_pipeline
+ title: How to add a pipeline to 🤗 Transformers?
+ - local: testing
+ title: Testing
+ - local: pr_checks
+ title: Checks on a Pull Request
title: Contribute
-
- sections:
- local: philosophy
title: Philosophy
@@ -533,6 +532,8 @@
title: ResNet
- local: model_doc/segformer
title: SegFormer
+ - local: model_doc/superglue
+ title: SuperGlue
- local: model_doc/swiftformer
title: SwiftFormer
- local: model_doc/swin
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 695e51fbe293..dd68c08aa3b9 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -123,6 +123,7 @@
],
"models": [],
# Models
+    "models.superglue": ["SUPERGLUE_PRETRAINED_CONFIG_ARCHIVE_MAP", "SuperGlueConfig"],
"models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"],
"models.align": [
"ALIGN_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -478,6 +479,7 @@
"models.regnet": ["REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "RegNetConfig"],
"models.rembert": ["REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RemBertConfig"],
"models.resnet": ["RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "ResNetConfig"],
+ "models.superglue": ["SUPERGLUE_PRETRAINED_CONFIG_ARCHIVE_MAP", "SuperGlueConfig"],
"models.roberta": ["ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "RobertaConfig", "RobertaTokenizer"],
"models.roberta_prelayernorm": ["ROBERTA_PRELAYERNORM_PRETRAINED_CONFIG_ARCHIVE_MAP", "RobertaPreLayerNormConfig"],
"models.roc_bert": ["ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RoCBertConfig", "RoCBertTokenizer"],
@@ -789,6 +791,7 @@
]
else:
# Fast tokenizers structure
+ _import_structure["models.superglue"].append("SuperGlueTokenizerFast")
_import_structure["models.albert"].append("AlbertTokenizerFast")
_import_structure["models.bart"].append("BartTokenizerFast")
_import_structure["models.barthez"].append("BarthezTokenizerFast")
@@ -1027,6 +1030,22 @@
_import_structure["modeling_utils"] = ["PreTrainedModel"]
# PyTorch models structure
+
+ _import_structure["models.superglue"].extend(
+ [
+ "SUPERGLUE_PRETRAINED_MODEL_ARCHIVE_LIST",
+ "SuperGlueForMaskedLM",
+ "SuperGlueForCausalLM",
+ "SuperGlueForMultipleChoice",
+ "SuperGlueForQuestionAnswering",
+ "SuperGlueForSequenceClassification",
+ "SuperGlueForTokenClassification",
+ "SuperGlueLayer",
+ "SuperGlueModel",
+ "SuperGluePreTrainedModel",
+ "load_tf_weights_in_superglue",
+ ]
+ )
_import_structure["models.albert"].extend(
[
"ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -2505,6 +2524,15 @@
"ResNetPreTrainedModel",
]
)
+ _import_structure["models.superglue"].extend(
+ [
+ "SUPERGLUE_PRETRAINED_MODEL_ARCHIVE_LIST",
+ "SuperGlueBackbone",
+ "SuperGlueForImageClassification",
+ "SuperGlueModel",
+ "SuperGluePreTrainedModel",
+ ]
+ )
_import_structure["models.roberta"].extend(
[
"ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -4129,6 +4157,7 @@
load_tf2_weights_in_pytorch_model,
)
from .models.albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig
+    from .models.superglue import SUPERGLUE_PRETRAINED_CONFIG_ARCHIVE_MAP, SuperGlueConfig
from .models.align import (
ALIGN_PRETRAINED_CONFIG_ARCHIVE_MAP,
AlignConfig,
@@ -4462,6 +4491,7 @@
from .models.regnet import REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP, RegNetConfig
from .models.rembert import REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RemBertConfig
from .models.resnet import RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ResNetConfig
+ from .models.superglue import SUPERGLUE_PRETRAINED_CONFIG_ARCHIVE_MAP, SuperGlueConfig
from .models.roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig, RobertaTokenizer
from .models.roberta_prelayernorm import (
ROBERTA_PRELAYERNORM_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -4749,6 +4779,7 @@
from .utils.dummy_tokenizers_objects import *
else:
# Fast tokenizers imports
+ from .models.superglue import SuperGlueTokenizerFast
from .models.albert import AlbertTokenizerFast
from .models.bart import BartTokenizerFast
from .models.barthez import BarthezTokenizerFast
@@ -4948,6 +4979,20 @@
from .modeling_utils import PreTrainedModel
# PyTorch model imports
+
+ from .models.superglue import (
+ SUPERGLUE_PRETRAINED_MODEL_ARCHIVE_LIST,
+ SuperGlueForMaskedLM,
+ SuperGlueForCausalLM,
+ SuperGlueForMultipleChoice,
+ SuperGlueForQuestionAnswering,
+ SuperGlueForSequenceClassification,
+ SuperGlueForTokenClassification,
+ SuperGlueLayer,
+ SuperGlueModel,
+ SuperGluePreTrainedModel,
+ load_tf_weights_in_superglue,
+ )
from .models.albert import (
ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
AlbertForMaskedLM,
@@ -6162,6 +6207,13 @@
ResNetModel,
ResNetPreTrainedModel,
)
+ from .models.superglue import (
+ SUPERGLUE_PRETRAINED_MODEL_ARCHIVE_LIST,
+ SuperGlueBackbone,
+ SuperGlueForImageClassification,
+ SuperGlueModel,
+ SuperGluePreTrainedModel,
+ )
from .models.roberta import (
ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
RobertaForCausalLM,
diff --git a/src/transformers/modeling_outputs.py b/src/transformers/modeling_outputs.py
index aceec7abd406..86856268bca0 100755
--- a/src/transformers/modeling_outputs.py
+++ b/src/transformers/modeling_outputs.py
@@ -1660,3 +1660,14 @@ def logits(self):
FutureWarning,
)
return self.reconstruction
+
+@dataclass
+class ImageMatchingOutput(ModelOutput):
+ """
+    Outputs of image matching models (e.g. SuperGlue): predicted keypoint matches and their matching scores for each of the two input images.
+ """
+
+ image0_matches: torch.FloatTensor = None
+ image1_matches: torch.FloatTensor = None
+ image0_matching_scores: torch.FloatTensor = None
+    image1_matching_scores: torch.FloatTensor = None
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 7af9ff766aed..accf79a8cfdf 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -13,6 +13,7 @@
# limitations under the License.
from . import (
+ superglue,
albert,
align,
altclip,
@@ -165,6 +166,7 @@
regnet,
rembert,
resnet,
+ superglue,
roberta,
roberta_prelayernorm,
roc_bert,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 7230c3f1fa19..04173788cece 100755
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -30,6 +30,7 @@
CONFIG_MAPPING_NAMES = OrderedDict(
[
# Add configs here
+ ("superglue", "SuperGlueConfig"),
("albert", "AlbertConfig"),
("align", "AlignConfig"),
("altclip", "AltCLIPConfig"),
@@ -170,6 +171,7 @@
("regnet", "RegNetConfig"),
("rembert", "RemBertConfig"),
("resnet", "ResNetConfig"),
+ ("superglue", "SuperGlueConfig"),
("retribert", "RetriBertConfig"),
("roberta", "RobertaConfig"),
("roberta-prelayernorm", "RobertaPreLayerNormConfig"),
@@ -236,6 +238,7 @@
CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict(
[
# Add archive maps here)
+ ("superglue", "SUPERGLUE_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("albert", "ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("align", "ALIGN_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("altclip", "ALTCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -366,6 +369,7 @@
("regnet", "REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("rembert", "REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("resnet", "RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP"),
+ ("superglue", "SUPERGLUE_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("retribert", "RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("roberta", "ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("roberta-prelayernorm", "ROBERTA_PRELAYERNORM_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -422,6 +426,7 @@
MODEL_NAMES_MAPPING = OrderedDict(
[
# Add full (and cased) model names here
+ ("superglue", "SuperGlue"),
("albert", "ALBERT"),
("align", "ALIGN"),
("altclip", "AltCLIP"),
@@ -584,6 +589,7 @@
("regnet", "RegNet"),
("rembert", "RemBERT"),
("resnet", "ResNet"),
+ ("superglue", "SuperGlue"),
("retribert", "RetriBERT"),
("roberta", "RoBERTa"),
("roberta-prelayernorm", "RoBERTa-PreLayerNorm"),
diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py
index 9b3ab2b1705b..a59057029ffa 100644
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@@ -75,6 +75,7 @@
("poolformer", "PoolFormerFeatureExtractor"),
("regnet", "ConvNextFeatureExtractor"),
("resnet", "ConvNextFeatureExtractor"),
+ ("superglue", "ConvNextFeatureExtractor"),
("segformer", "SegformerFeatureExtractor"),
("sew", "Wav2Vec2FeatureExtractor"),
("sew-d", "Wav2Vec2FeatureExtractor"),
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index 075fe0c96db0..dfa63fb9e121 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -89,6 +89,7 @@
("pvt", "PvtImageProcessor"),
("regnet", "ConvNextImageProcessor"),
("resnet", "ConvNextImageProcessor"),
+ ("superglue", "ConvNextImageProcessor"),
("sam", "SamImageProcessor"),
("segformer", "SegformerImageProcessor"),
("swiftformer", "ViTImageProcessor"),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index aec9eacc2a7a..8eb481b71a12 100755
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -28,6 +28,7 @@
MODEL_MAPPING_NAMES = OrderedDict(
[
# Base model mapping
+ ("superglue", "SuperGlueModel"),
("albert", "AlbertModel"),
("align", "AlignModel"),
("altclip", "AltCLIPModel"),
@@ -162,6 +163,7 @@
("regnet", "RegNetModel"),
("rembert", "RemBertModel"),
("resnet", "ResNetModel"),
+ ("superglue", "SuperGlueModel"),
("retribert", "RetriBertModel"),
("roberta", "RobertaModel"),
("roberta-prelayernorm", "RobertaPreLayerNormModel"),
@@ -290,6 +292,7 @@
MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
[
# Model with LM heads mapping
+        ("superglue", "SuperGlueForMaskedLM"),
("albert", "AlbertForMaskedLM"),
("bart", "BartForConditionalGeneration"),
("bert", "BertForMaskedLM"),
@@ -372,6 +375,7 @@
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
[
# Model for Causal LM mapping
+ ("superglue", "SuperGlueForCausalLM"),
("bart", "BartForCausalLM"),
("bert", "BertLMHeadModel"),
("bert-generation", "BertGenerationDecoder"),
@@ -490,6 +494,7 @@
("pvt", "PvtForImageClassification"),
("regnet", "RegNetForImageClassification"),
("resnet", "ResNetForImageClassification"),
+ ("superglue", "SuperGlueForImageClassification"),
("segformer", "SegformerForImageClassification"),
("swiftformer", "SwiftFormerForImageClassification"),
("swin", "SwinForImageClassification"),
@@ -563,6 +568,7 @@
MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
[
# Model for Masked LM mapping
+        ("superglue", "SuperGlueForMaskedLM"),
("albert", "AlbertForMaskedLM"),
("bart", "BartForConditionalGeneration"),
("bert", "BertForMaskedLM"),
@@ -678,6 +684,7 @@
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
[
# Model for Sequence Classification mapping
+ ("superglue", "SuperGlueForSequenceClassification"),
("albert", "AlbertForSequenceClassification"),
("bart", "BartForSequenceClassification"),
("bert", "BertForSequenceClassification"),
@@ -757,6 +764,7 @@
MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
[
# Model for Question Answering mapping
+ ("superglue", "SuperGlueForQuestionAnswering"),
("albert", "AlbertForQuestionAnswering"),
("bart", "BartForQuestionAnswering"),
("bert", "BertForQuestionAnswering"),
@@ -846,6 +854,7 @@
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
[
# Model for Token Classification mapping
+        ("superglue", "SuperGlueForTokenClassification"),
("albert", "AlbertForTokenClassification"),
("bert", "BertForTokenClassification"),
("big_bird", "BigBirdForTokenClassification"),
@@ -906,6 +915,7 @@
MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict(
[
# Model for Multiple Choice mapping
+        ("superglue", "SuperGlueForMultipleChoice"),
("albert", "AlbertForMultipleChoice"),
("bert", "BertForMultipleChoice"),
("big_bird", "BigBirdForMultipleChoice"),
@@ -1037,6 +1047,7 @@
("maskformer-swin", "MaskFormerSwinBackbone"),
("nat", "NatBackbone"),
("resnet", "ResNetBackbone"),
+ ("superglue", "SuperGlueBackbone"),
("swin", "SwinBackbone"),
("timm_backbone", "TimmBackbone"),
]
diff --git a/src/transformers/models/superglue/__init__.py b/src/transformers/models/superglue/__init__.py
new file mode 100644
index 000000000000..d7db1e2c9944
--- /dev/null
+++ b/src/transformers/models/superglue/__init__.py
@@ -0,0 +1,61 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import (
+ OptionalDependencyNotAvailable,
+ _LazyModule,
+ is_torch_available,
+)
+
+
+_import_structure = {
+ "configuration_superglue": ["SUPERGLUE_PRETRAINED_CONFIG_ARCHIVE_MAP", "SuperGlueConfig", "SuperGlueOnnxConfig"]
+}
+
+try:
+ if not is_torch_available():
+ raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+ pass
+else:
+ _import_structure["modeling_superglue"] = [
+ "SUPERGLUE_PRETRAINED_MODEL_ARCHIVE_LIST",
+ "SuperGlueForImageClassification",
+ "SuperGlueModel",
+ "SuperGluePreTrainedModel",
+ "SuperGlueBackbone",
+ ]
+
+if TYPE_CHECKING:
+ from .configuration_superglue import SUPERGLUE_PRETRAINED_CONFIG_ARCHIVE_MAP, SuperGlueConfig, SuperGlueOnnxConfig
+
+ try:
+ if not is_torch_available():
+ raise OptionalDependencyNotAvailable()
+ except OptionalDependencyNotAvailable:
+ pass
+ else:
+ from .modeling_superglue import (
+ SUPERGLUE_PRETRAINED_MODEL_ARCHIVE_LIST,
+ SuperGlueBackbone,
+ SuperGlueForImageClassification,
+ SuperGlueModel,
+ SuperGluePreTrainedModel,
+ )
+
+else:
+ import sys
+
+ sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
diff --git a/src/transformers/models/superglue/configuration_superglue.py b/src/transformers/models/superglue/configuration_superglue.py
new file mode 100644
index 000000000000..99cc2d4ccfdf
--- /dev/null
+++ b/src/transformers/models/superglue/configuration_superglue.py
@@ -0,0 +1,34 @@
+from typing import List
+
+from transformers import PretrainedConfig
+
+
+class SuperGlueConfig(PretrainedConfig):
+
+ def __init__(
+ self,
+ descriptor_dim: int = 256,
+ keypoint_encoder_sizes: List[int] = [32, 64, 128, 256],
+        gnn_layers_types: List[str] = ["self", "cross"] * 9,
+ num_heads: int = 4,
+ sinkhorn_iterations: int = 100,
+ matching_threshold: float = 0.2,
+ model_version: str = "indoor",
+ **kwargs,
+ ):
+ # Check whether all gnn_layers_types are either 'self' or 'cross'
+        if not all(layer_type in ["self", "cross"] for layer_type in gnn_layers_types):
+            raise ValueError("All gnn_layers_types must be either 'self' or 'cross'")
+
+        if model_version not in ("indoor", "outdoor"):
+ raise ValueError("model_version must be either 'indoor' or 'outdoor'")
+
+ self.descriptor_dim = descriptor_dim
+ self.keypoint_encoder_sizes = keypoint_encoder_sizes
+ self.gnn_layers_types = gnn_layers_types
+ self.num_heads = num_heads
+ self.sinkhorn_iterations = sinkhorn_iterations
+ self.matching_threshold = matching_threshold
+ self.model_version = model_version
+
+ super().__init__(**kwargs)
diff --git a/src/transformers/models/superglue/convert_superglue_to_pytorch.py b/src/transformers/models/superglue/convert_superglue_to_pytorch.py
new file mode 100644
index 000000000000..dd318435aae7
--- /dev/null
+++ b/src/transformers/models/superglue/convert_superglue_to_pytorch.py
@@ -0,0 +1,121 @@
+import argparse
+
+import torch
+
+from transformers import SuperGlueConfig, SuperGlueModel
+
+
+def get_superglue_config(checkpoint_url):
+ config = SuperGlueConfig(
+ descriptor_dim=256,
+ keypoint_encoder_sizes=[32, 64, 128, 256],
+ gnn_layers_types=['self', 'cross'] * 9,
+ sinkhorn_iterations=100,
+ matching_threshold=0.2,
+ )
+
+ if "superglue_indoor" in checkpoint_url:
+ config.model_version = "indoor"
+ elif "superglue_outdoor" in checkpoint_url:
+ config.model_version = "outdoor"
+
+ return config
+
+
+def create_rename_keys(config, state_dict):
+ rename_keys = []
+
+ # keypoint encoder
+ n = len([3] + config.keypoint_encoder_sizes + [config.descriptor_dim])
+ for i in range(n * 2 + 1):
+ if ((i + 1) % 3) != 0:
+ rename_keys.append((f"kenc.encoder.{i}.weight", f"keypoint_encoder.encoder.layers.{i}.weight"))
+ rename_keys.append((f"kenc.encoder.{i}.bias", f"keypoint_encoder.encoder.layers.{i}.bias"))
+ if ((i % 3) - 1) == 0:
+ rename_keys.append((f"kenc.encoder.{i}.running_mean",
+ f"keypoint_encoder.encoder.layers.{i}.running_mean"))
+ rename_keys.append((f"kenc.encoder.{i}.running_var",
+ f"keypoint_encoder.encoder.layers.{i}.running_var"))
+ rename_keys.append((f"kenc.encoder.{i}.num_batches_tracked",
+ f"keypoint_encoder.encoder.layers.{i}.num_batches_tracked"))
+
+ # gnn
+ for i in range(len(config.gnn_layers_types)):
+ rename_keys.append((f"gnn.layers.{i}.attn.merge.weight", f"gnn.layers.{i}.attention.merge.weight"))
+ rename_keys.append((f"gnn.layers.{i}.attn.merge.bias", f"gnn.layers.{i}.attention.merge.bias"))
+ for j in range(3):
+ rename_keys.append((f"gnn.layers.{i}.attn.proj.{j}.weight", f"gnn.layers.{i}.attention.proj.{j}.weight"))
+ rename_keys.append((f"gnn.layers.{i}.attn.proj.{j}.bias", f"gnn.layers.{i}.attention.proj.{j}.bias"))
+ for j in range(len([config.descriptor_dim * 2, config.descriptor_dim * 2, config.descriptor_dim]) + 1):
+            if j != 2:
+ rename_keys.append((f"gnn.layers.{i}.mlp.{j}.weight", f"gnn.layers.{i}.mlp.layers.{j}.weight"))
+ rename_keys.append((f"gnn.layers.{i}.mlp.{j}.bias", f"gnn.layers.{i}.mlp.layers.{j}.bias"))
+ if j == 1:
+ rename_keys.append((f"gnn.layers.{i}.mlp.{j}.running_mean",
+ f"gnn.layers.{i}.mlp.layers.{j}.running_mean"))
+ rename_keys.append((f"gnn.layers.{i}.mlp.{j}.running_var",
+ f"gnn.layers.{i}.mlp.layers.{j}.running_var"))
+ rename_keys.append((f"gnn.layers.{i}.mlp.{j}.num_batches_tracked",
+ f"gnn.layers.{i}.mlp.layers.{j}.num_batches_tracked"))
+ return rename_keys
+
+
+# Copied from transformers.models.dinov2.convert_dinov2_to_hf
+def rename_key(dct, old, new):
+ val = dct.pop(old)
+ dct[new] = val
+
+
+@torch.no_grad()
+def convert_superglue_checkpoint(checkpoint_url, pytorch_dump_folder_path, save_model, push_to_hub):
+ """
+    Convert an original MagicLeap SuperGlue checkpoint into a Hugging Face SuperGlueModel and verify that the renamed weights load.
+ """
+
+ print("Downloading original model from checkpoint...")
+ config = get_superglue_config(checkpoint_url)
+
+ original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)
+ print(original_state_dict)
+
+ print("Converting model parameters...")
+ rename_keys = create_rename_keys(config, original_state_dict)
+ new_state_dict = original_state_dict.copy()
+ for src, dest in rename_keys:
+ rename_key(new_state_dict, src, dest)
+
+ for key in new_state_dict.copy().keys():
+ val = new_state_dict.pop(key)
+ if not key.startswith("superglue"):
+ key = "superglue." + key
+ new_state_dict[key] = val
+
+ model = SuperGlueModel(config)
+ model.load_state_dict(new_state_dict)
+ model.eval()
+ print("Successfully loaded weights in the model")
+
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ # Required parameters
+ parser.add_argument(
+ "--checkpoint_url",
+ default="https://github.com/magicleap/SuperGluePretrainedNetwork/raw/master/models/weights/superglue_indoor.pth",
+ type=str,
+ help="URL of the original SuperGlue checkpoint you'd like to convert.",
+ )
+ parser.add_argument(
+ "--pytorch_dump_folder_path",
+ default="model",
+ type=str,
+ help="Path to the output PyTorch model directory.",
+ )
+ parser.add_argument("--save_model", action="store_true", help="Save model to local")
+ parser.add_argument("--push_to_hub", action="store_true", help="Push model and image preprocessor to the hub")
+
+ args = parser.parse_args()
+ convert_superglue_checkpoint(
+ args.checkpoint_url, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub
+ )
diff --git a/src/transformers/models/superglue/modeling_superglue.py b/src/transformers/models/superglue/modeling_superglue.py
new file mode 100644
index 000000000000..d362759236d8
--- /dev/null
+++ b/src/transformers/models/superglue/modeling_superglue.py
@@ -0,0 +1,364 @@
+from copy import deepcopy
+from typing import List, Tuple, Optional, Union
+
+import torch
+from torch import nn, Tensor
+
+from transformers import PreTrainedModel
+from transformers.models.superglue.configuration_superglue import SuperGlueConfig
+from transformers.modeling_outputs import ImageMatchingOutput
+
+
+class SuperGlueMultiLayerPerceptron(nn.Module):
+    """
+    Channel-wise MLP built from 1x1 Conv1d layers, acting on tensors of shape
+    (batch, channels, num_points). Every layer except the last is followed by
+    an optional BatchNorm1d and a ReLU; the final Conv1d's bias is
+    zero-initialized.
+    """
+    def __init__(
+        self,
+        channels: List[int],
+        do_batch_norm: bool = True,
+    ):
+        super().__init__()
+        num_layers = len(channels)
+        layers = []
+        for i in range(1, num_layers):
+            layers.append(
+                nn.Conv1d(
+                    channels[i - 1],
+                    channels[i],
+                    kernel_size=1,
+                    bias=True
+                )
+            )
+            # No norm/activation after the final convolution.
+            if i < (num_layers - 1):
+                if do_batch_norm:
+                    layers.append(nn.BatchNorm1d(channels[i]))
+                layers.append(nn.ReLU())
+        self.layers = nn.Sequential(*layers)
+        # layers[-1] is always the last Conv1d (see the loop condition above).
+        nn.init.constant_(self.layers[-1].bias, 0.0)
+
+    def forward(self, input: Tensor) -> Tensor:
+        return self.layers(input)
+
+
+class SuperGlueKeypointEncoder(nn.Module):
+    """Embeds keypoint locations and detection scores into the descriptor space."""
+
+    def __init__(
+        self,
+        feature_dim: int = 256,
+        layers_sizes: List[int] = [32, 64, 128, 256],
+    ):
+        super().__init__()
+        # 3 input channels per keypoint: x, y and the detection score.
+        mlp_channels = [3] + layers_sizes + [feature_dim]
+        self.encoder = SuperGlueMultiLayerPerceptron(channels=mlp_channels)
+
+    def forward(self, keypoints: Tensor, scores: Tensor) -> Tensor:
+        # (batch, num_points, 2) -> (batch, 2, num_points); score as a third channel.
+        encoder_input = torch.cat(
+            [keypoints.transpose(1, 2), scores.unsqueeze(1)], dim=1
+        )
+        return self.encoder(encoder_input)
+
+
+class SuperGlueMultiHeadAttention(nn.Module):
+    """
+    Multi-head attention over descriptor maps of shape (batch, feature_dim, num_points).
+
+    Args:
+        feature_dim: total channel dimension; must be divisible by `num_heads`.
+        num_heads: number of attention heads.
+    """
+    def __init__(
+        self,
+        feature_dim: int,
+        num_heads: int
+    ):
+        super().__init__()
+        # Validate with an explicit exception instead of `assert`, which is
+        # silently stripped when Python runs with -O.
+        if feature_dim % num_heads != 0:
+            raise ValueError(f"feature_dim ({feature_dim}) must be divisible by num_heads ({num_heads})")
+        self.feature_dim = feature_dim
+        self.num_heads = num_heads
+        self.dim = feature_dim // num_heads
+        self.merge = nn.Conv1d(feature_dim, feature_dim, kernel_size=1)
+        # Independent q/k/v projections, cloned from `merge` (deep copies, so
+        # the weights are not shared after initialization).
+        self.proj = nn.ModuleList([deepcopy(self.merge) for _ in range(3)])
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor
+    ) -> torch.Tensor:
+        """Project q/k/v, attend per head, then merge heads back to feature_dim channels."""
+        batch_dim = query.size(0)
+        # Split channels into (head_dim, num_heads) for per-head attention.
+        query, key, value = [
+            layer(x).view(batch_dim, self.dim, self.num_heads, -1)
+            for layer, x in zip(self.proj, (query, key, value))
+        ]
+        x, _ = self.attention(query, key, value)
+        output = self.merge(x.contiguous().view(batch_dim, self.dim * self.num_heads, -1))
+        return output
+
+    def attention(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Scaled dot-product attention; returns (output, attention probabilities)."""
+        dim = query.shape[1]
+        scores = torch.einsum('bdhn,bdhm->bhnm', query, key) / dim ** .5
+        prob = torch.nn.functional.softmax(scores, dim=-1)
+        output = torch.einsum('bhnm,bdhm->bdhn', prob, value)
+        return output, prob
+
+
+class SuperGlueAttentionalPropagation(nn.Module):
+    """One GNN message-passing step: attend over `source`, then fuse with an MLP."""
+    def __init__(
+        self,
+        feature_dim: int,
+        num_heads: int
+    ):
+        super().__init__()
+        self.feature_dim = feature_dim
+        self.num_heads = num_heads
+        self.attention = SuperGlueMultiHeadAttention(
+            feature_dim=feature_dim,
+            num_heads=num_heads
+        )
+        # MLP input is [x ; attention message], hence feature_dim * 2 channels.
+        self.mlp = SuperGlueMultiLayerPerceptron(
+            [feature_dim * 2, feature_dim * 2, feature_dim]
+        )
+        # SuperGlueMultiLayerPerceptron already zero-initializes its final bias,
+        # so the redundant re-initialization that used to follow here is removed.
+
+    def forward(self, x: torch.Tensor, source: torch.Tensor) -> torch.Tensor:
+        """Compute the residual message for `x` attending over `source`."""
+        message = self.attention(x, source, source)
+        message = torch.cat([x, message], dim=1)
+        message = self.mlp(message)
+        return message
+
+
+class SuperGlueAttentionalGNN(nn.Module):
+    """Alternating self-/cross-attention GNN over the two images' descriptors."""
+    def __init__(
+        self,
+        feature_dim: int,
+        num_heads: int,
+        layers_types: List[str],
+    ):
+        super().__init__()
+        self.feature_dim = feature_dim
+        self.num_heads = num_heads
+        self.layers_types = layers_types
+        self.num_layers = len(self.layers_types)
+        self.layers = nn.ModuleList(
+            [
+                SuperGlueAttentionalPropagation(
+                    self.feature_dim,
+                    self.num_heads
+                )
+                for _ in range(self.num_layers)
+            ]
+        )
+
+    def forward(self, descriptors_0: torch.Tensor, descriptors_1: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        # `layer_type` instead of `type` to avoid shadowing the builtin.
+        for gnn_layer, layer_type in zip(self.layers, self.layers_types):
+            # 'cross' layers attend to the other image; 'self' layers to their own.
+            if layer_type == 'cross':
+                source_0, source_1 = descriptors_1, descriptors_0
+            else:  # layer_type == 'self'
+                source_0, source_1 = descriptors_0, descriptors_1
+            delta0, delta1 = gnn_layer(descriptors_0, source_0), gnn_layer(descriptors_1, source_1)
+            # Bug fix: this assignment previously wrote to a typo name
+            # `descriptor_0`, silently discarding the residual update of
+            # descriptors_0 at every layer.
+            descriptors_0, descriptors_1 = (descriptors_0 + delta0), (descriptors_1 + delta1)
+        return descriptors_0, descriptors_1
+
+
+class SuperGlue(nn.Module):
+    """SuperGlue feature matching middle-end
+
+    Given two sets of keypoints and locations, we determine the
+    correspondences by:
+      1. Keypoint Encoding (normalization + visual feature and location fusion)
+      2. Graph Neural Network with multiple self and cross-attention layers
+      3. Final projection layer
+      4. Optimal Transport Layer (a differentiable Hungarian matching algorithm)
+      5. Thresholding matrix based on mutual exclusivity and a match_threshold
+
+    The correspondence ids use -1 to indicate non-matching points.
+
+    Paul-Edouard Sarlin, Daniel DeTone, Tomasz Malisiewicz, and Andrew
+    Rabinovich. SuperGlue: Learning Feature Matching with Graph Neural
+    Networks. In CVPR, 2020. https://arxiv.org/abs/1911.11763
+
+    """
+
+    def __init__(
+        self,
+        descriptor_dim: int = 256,
+        keypoint_encoder_sizes: List[int] = [32, 64, 128, 256],
+        gnn_layers_types: List[str] = ['self', 'cross'] * 9,
+        num_heads: int = 4,
+        sinkhorn_iterations: int = 100,
+        matching_threshold: float = 0.2,
+    ):
+        super().__init__()
+
+        self.descriptor_dim = descriptor_dim
+        self.keypoint_encoder_sizes = keypoint_encoder_sizes
+        self.gnn_layers_types = gnn_layers_types
+        self.num_heads = num_heads
+        self.sinkhorn_iterations = sinkhorn_iterations
+        self.match_threshold = matching_threshold
+
+        self.keypoint_encoder = SuperGlueKeypointEncoder(
+            feature_dim=self.descriptor_dim,
+            layers_sizes=self.keypoint_encoder_sizes
+        )
+
+        self.gnn = SuperGlueAttentionalGNN(
+            feature_dim=self.descriptor_dim,
+            num_heads=self.num_heads,
+            layers_types=self.gnn_layers_types
+        )
+
+        self.final_proj = nn.Conv1d(
+            self.descriptor_dim, self.descriptor_dim,
+            kernel_size=1, bias=True)
+
+        # Learnable score of the "dustbin" row/column used by optimal transport
+        # to absorb unmatched keypoints.
+        bin_score = torch.nn.Parameter(torch.tensor(1.))
+        self.register_parameter('bin_score', bin_score)
+
+    def forward(
+        self,
+        keypoints_0: Tensor,
+        scores_0: Tensor,
+        descriptors_0: Tensor,
+        keypoints_1: Tensor,
+        scores_1: Tensor,
+        descriptors_1: Tensor
+    ):
+        """Run SuperGlue on a pair of keypoints and descriptors"""
+        if keypoints_0.shape[1] == 0 or keypoints_1.shape[1] == 0:  # no keypoints
+            shape0, shape1 = keypoints_0.shape[:-1], keypoints_1.shape[:-1]
+            return tuple([
+                keypoints_0.new_full(shape0, -1, dtype=torch.int),
+                keypoints_1.new_full(shape1, -1, dtype=torch.int),
+                keypoints_0.new_zeros(shape0),
+                keypoints_1.new_zeros(shape1)
+            ])
+
+        # Keypoint MLP encoder.
+        descriptors_0 = descriptors_0 + self.keypoint_encoder(keypoints_0, scores_0)
+        descriptors_1 = descriptors_1 + self.keypoint_encoder(keypoints_1, scores_1)
+
+        # Multi-layer Transformer network.
+        descriptors_0, descriptors_1 = self.gnn(descriptors_0, descriptors_1)
+
+        # Final MLP projection.
+        projected_descriptors_0, projected_descriptors_1 = self.final_proj(descriptors_0), self.final_proj(descriptors_1)
+
+        # Compute matching descriptor distance.
+        scores = torch.einsum('bdn,bdm->bnm', projected_descriptors_0, projected_descriptors_1)
+        scores = scores / self.descriptor_dim ** .5
+
+        # Run the optimal transport.
+        scores = self.log_optimal_transport(
+            scores,
+            self.bin_score,
+            iters=self.sinkhorn_iterations
+        )
+
+        # Get the matches with score above "match_threshold".
+        max0, max1 = scores[:, :-1, :-1].max(2), scores[:, :-1, :-1].max(1)
+        indices0, indices1 = max0.indices, max1.indices
+        # Keep only mutually-best matches (i -> j and j -> i agree).
+        mutual0 = self.arange_like(indices0, 1)[None] == indices1.gather(1, indices0)
+        mutual1 = self.arange_like(indices1, 1)[None] == indices0.gather(1, indices1)
+        zero = scores.new_tensor(0)
+        matching_scores_0 = torch.where(mutual0, max0.values.exp(), zero)
+        matching_scores_1 = torch.where(mutual1, matching_scores_0.gather(1, indices1), zero)
+        valid0 = mutual0 & (matching_scores_0 > self.match_threshold)
+        valid1 = mutual1 & valid0.gather(1, indices1)
+        matches_0 = torch.where(valid0, indices0, indices0.new_tensor(-1))
+        matches_1 = torch.where(valid1, indices1, indices1.new_tensor(-1))
+
+        return matches_0, matches_1, matching_scores_0, matching_scores_1
+
+    @staticmethod
+    def normalize_keypoints(
+        kpts: Tensor,
+        height: int,
+        width: int
+    ):
+        """ Normalize keypoints locations based on image image_shape"""
+        # NOTE(review): not called anywhere in this module — presumably invoked
+        # by callers before forward(); verify against the image processor.
+        one = kpts.new_tensor(1)
+        size = torch.stack([one * width, one * height])[None]
+        center = size / 2
+        scaling = size.max(1, keepdim=True).values * 0.7
+        return (kpts - center[:, None, :]) / scaling[:, None, :]
+
+    @staticmethod
+    def log_sinkhorn_iterations(
+        Z: torch.Tensor,
+        log_mu: torch.Tensor,
+        log_nu: torch.Tensor,
+        iters: int
+    ) -> torch.Tensor:
+        """ Perform Sinkhorn Normalization in Log-space for stability"""
+        u, v = torch.zeros_like(log_mu), torch.zeros_like(log_nu)
+        for _ in range(iters):
+            u = log_mu - torch.logsumexp(Z + v.unsqueeze(1), dim=2)
+            v = log_nu - torch.logsumexp(Z + u.unsqueeze(2), dim=1)
+        return Z + u.unsqueeze(2) + v.unsqueeze(1)
+
+    @staticmethod
+    def log_optimal_transport(scores: torch.Tensor, alpha: torch.Tensor, iters: int) -> torch.Tensor:
+        """ Perform Differentiable Optimal Transport in Log-space for stability"""
+        b, m, n = scores.shape
+        one = scores.new_tensor(1)
+        ms, ns = (m * one).to(scores), (n * one).to(scores)
+
+        # Augment the score matrix with dustbin row/column scored by alpha.
+        bins0 = alpha.expand(b, m, 1)
+        bins1 = alpha.expand(b, 1, n)
+        alpha = alpha.expand(b, 1, 1)
+
+        couplings = torch.cat([torch.cat([scores, bins0], -1),
+                               torch.cat([bins1, alpha], -1)], 1)
+
+        norm = - (ms + ns).log()
+        log_mu = torch.cat([norm.expand(m), ns.log()[None] + norm])
+        log_nu = torch.cat([norm.expand(n), ms.log()[None] + norm])
+        log_mu, log_nu = log_mu[None].expand(b, -1), log_nu[None].expand(b, -1)
+
+        # Bug fix: this staticmethod is defined on SuperGlue, not on
+        # SuperGlueModel — the previous reference raised AttributeError.
+        Z = SuperGlue.log_sinkhorn_iterations(couplings, log_mu, log_nu, iters)
+        Z = Z - norm  # multiply probabilities by M+N
+        return Z
+
+    @staticmethod
+    def arange_like(x, dim: int):
+        return x.new_ones(x.shape[dim]).cumsum(0) - 1
+
+
+class SuperGlueModel(PreTrainedModel):
+    """
+    Transformers wrapper around the SuperGlue matcher: consumes keypoints,
+    detection scores and descriptors for an image pair and returns mutual
+    matches with their matching scores.
+    """
+    config_class = SuperGlueConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        # NOTE(review): num_heads is not forwarded from the config, so the GNN
+        # always uses SuperGlue's default of 4 heads — confirm against
+        # SuperGlueConfig.
+        self.superglue = SuperGlue(
+            descriptor_dim=config.descriptor_dim,
+            keypoint_encoder_sizes=config.keypoint_encoder_sizes,
+            gnn_layers_types=config.gnn_layers_types,
+            sinkhorn_iterations=config.sinkhorn_iterations,
+            matching_threshold=config.matching_threshold,
+        )
+
+    def forward(
+        self,
+        image0_keypoints: Tensor = None,
+        image0_scores: Tensor = None,
+        image0_descriptors: Tensor = None,
+        image1_keypoints: Tensor = None,
+        image1_scores: Tensor = None,
+        image1_descriptors: Tensor = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, ImageMatchingOutput]:
+        # Bug fix: the submodule is stored as `self.superglue` (there is no
+        # `self.model` attribute), and SuperGlue.forward takes positional
+        # (keypoints, scores, descriptors) pairs — the previous `image0_*`
+        # keywords would have raised a TypeError.
+        image0_matches, image1_matches, image0_matching_scores, image1_matching_scores = self.superglue(
+            image0_keypoints,
+            image0_scores,
+            image0_descriptors,
+            image1_keypoints,
+            image1_scores,
+            image1_descriptors,
+        )
+        if not return_dict:
+            return image0_matches, image1_matches, image0_matching_scores, image1_matching_scores
+
+        return ImageMatchingOutput(
+            image0_matches=image0_matches,
+            image1_matches=image1_matches,
+            image0_matching_scores=image0_matching_scores,
+            image1_matching_scores=image1_matching_scores,
+        )
diff --git a/src/transformers/models/superglue/tokenization_superglue.py b/src/transformers/models/superglue/tokenization_superglue.py
new file mode 100644
index 000000000000..c079afeb5a91
--- /dev/null
+++ b/src/transformers/models/superglue/tokenization_superglue.py
@@ -0,0 +1,251 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for SuperGlue."""
+from typing import List, Optional
+
+from tokenizers import ByteLevelBPETokenizer
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
+
+# NOTE(review): "brand-new-bert" entries below are unmodified add-new-model
+# template placeholders — they must be replaced with real SuperGlue resources
+# (if a tokenizer is needed at all for this vision model).
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "brand-new-bert-base-cased": "https://huggingface.co/brand-new-bert-base-cased/resolve/main/vocab.txt",
+    },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "brand-new-bert-base-cased": 1024,
+}
+
+class SuperGlueTokenizer(PreTrainedTokenizer):
+    """
+    Construct a SuperGlue tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+    """
+
+    # NOTE(review): this class looks like an unmodified add-new-model template
+    # leftover — SuperGlue is a keypoint-matching vision model with no text
+    # input. Confirm whether this file should exist at all; as written, the
+    # stub methods below implicitly return None.
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="<|endoftext|>",
+        bos_token="<|endoftext|>",
+        eos_token="<|endoftext|>",
+        **kwargs
+    ):
+        # Wrap plain-string special tokens in AddedToken so whitespace handling
+        # around them is explicit.
+        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+        super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
+
+    # NOTE(review): stray template placeholder string below — it is a no-op
+    # class-level expression and should be deleted.
+    """ Initialisation """
+
+    # NOTE(review): the following methods are unimplemented template stubs;
+    # each implicitly returns None and will break any caller.
+    @property
+    def vocab_size(self):
+        """ Returns vocab size """
+
+    def get_vocab(self):
+        """ Returns vocab as a dict """
+
+    def _tokenize(self, text):
+        """ Returns a tokenized string. """
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str) in an id using the vocab. """
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) in a single string. """
+
+    def save_vocabulary(self, save_directory):
+        """
+        Save the vocabulary and special tokens file to a directory.
+
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+
+    # NOTE(review): relies on cls/sep tokens that __init__ never configures
+    # (only bos/eos/unk are set), so cls_token_id/sep_token_id will be None.
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+        by concatenating and adding special tokens.
+        A SuperGlue sequence has the following format:
+
+        - single sequence: ` X `
+        - pair of sequences: ` A B `
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task.
+        SuperGlue does not make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of zeros.
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+    # NOTE(review): reads self.add_prefix_space, which __init__ never sets —
+    # this would raise AttributeError at runtime.
+    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
+        if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
+            text = " " + text
+        return (text, kwargs)
+
+class SuperGlueTokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a "fast" SuperGlue tokenizer (backed by HuggingFace's *tokenizers* library).
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+    """
+
+    # NOTE(review): duplicate definition — the same class is also declared in
+    # tokenization_superglue_fast.py; a fast tokenizer does not belong in the
+    # slow-tokenizer module and this copy should be removed.
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        unk_token="<|endoftext|>",
+        bos_token="<|endoftext|>",
+        eos_token="<|endoftext|>",
+        add_prefix_space=False,
+        trim_offsets=True,
+        **kwargs
+    ):
+        # NOTE(review): current PreTrainedTokenizerFast expects the backing
+        # tokenizer via the `tokenizer_object` keyword — confirm this
+        # positional form is supported by the targeted transformers version.
+        super().__init__(
+            ByteLevelBPETokenizer(
+                vocab_file=vocab_file,
+                merges_file=merges_file,
+                add_prefix_space=add_prefix_space,
+                trim_offsets=trim_offsets,
+            ),
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            **kwargs,
+        )
+        self.add_prefix_space = add_prefix_space
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
+        if token_ids_1 is None:
+            return output
+
+        return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
+
+
+    # NOTE(review): relies on cls/sep tokens that __init__ never configures
+    # (only bos/eos/unk are set), so cls_token_id/sep_token_id will be None.
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task.
+        SuperGlue does not make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of zeros.
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+
diff --git a/src/transformers/models/superglue/tokenization_superglue_fast.py b/src/transformers/models/superglue/tokenization_superglue_fast.py
new file mode 100644
index 000000000000..319a3d977839
--- /dev/null
+++ b/src/transformers/models/superglue/tokenization_superglue_fast.py
@@ -0,0 +1,113 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for SuperGlue."""
+from typing import List, Optional
+
+from tokenizers import ByteLevelBPETokenizer
+
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import logging
+from .tokenization_superglue import SuperGlueTokenizer
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
+
+# NOTE(review): "brand-new-bert" entries below are unmodified add-new-model
+# template placeholders — they must be replaced with real SuperGlue resources
+# (if a tokenizer is needed at all for this vision model).
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "brand-new-bert-base-cased": "https://huggingface.co/brand-new-bert-base-cased/resolve/main/vocab.txt",
+    },
+    "tokenizer_file": {
+        "brand-new-bert-base-cased": "https://huggingface.co/brand-new-bert-base-cased/resolve/main/tokenizer.json",
+    },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "brand-new-bert-base-cased": 1024,
+}
+
+class SuperGlueTokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a "fast" SuperGlue tokenizer (backed by HuggingFace's *tokenizers* library).
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+    """
+
+    # NOTE(review): this class looks like an unmodified add-new-model template
+    # leftover — SuperGlue is a keypoint-matching vision model with no text
+    # input. Confirm whether this file should exist at all.
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    slow_tokenizer_class = SuperGlueTokenizer
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        unk_token="<|endoftext|>",
+        bos_token="<|endoftext|>",
+        eos_token="<|endoftext|>",
+        add_prefix_space=False,
+        trim_offsets=True,
+        **kwargs
+    ):
+        # NOTE(review): current PreTrainedTokenizerFast expects the backing
+        # tokenizer via the `tokenizer_object` keyword — confirm this
+        # positional form is supported by the targeted transformers version.
+        super().__init__(
+            ByteLevelBPETokenizer(
+                vocab_file=vocab_file,
+                merges_file=merges_file,
+                add_prefix_space=add_prefix_space,
+                trim_offsets=trim_offsets,
+            ),
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            **kwargs,
+        )
+        self.add_prefix_space = add_prefix_space
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
+        if token_ids_1 is None:
+            return output
+
+        return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
+
+
+    # NOTE(review): relies on cls/sep tokens that __init__ never configures
+    # (only bos/eos/unk are set), so cls_token_id/sep_token_id will be None.
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task.
+        SuperGlue does not make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of zeros.
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+
+