huggingface · Rocketknight1 · Apr 4, 2023 · Mar 10, 2023 · Mar 14, 2023 · Mar 20, 2023
diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx
@@ -269,7 +269,7 @@ Flax), PyTorch, and/or TensorFlow.
 |              BiT              |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |          Blenderbot           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 |        BlenderbotSmall        |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
-|             BLIP              |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+|             BLIP              |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 |            BLIP-2             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |             BLOOM             |       ❌       |       ✅       |       ✅        |         ❌         |      ❌      |
 |          BridgeTower          |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |

diff --git a/docs/source/en/model_doc/blip.mdx b/docs/source/en/model_doc/blip.mdx
@@ -1,4 +1,4 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -93,4 +93,40 @@ The original code can be found [here](https://github.com/salesforce/BLIP).
 ## BlipForQuestionAnswering
 
 [[autodoc]] BlipForQuestionAnswering
-    - forward
+    - forward
+
+## TFBlipModel
+
+[[autodoc]] TFBlipModel
+    - call
+    - get_text_features
+    - get_image_features
+
+## TFBlipTextModel
+
+[[autodoc]] TFBlipTextModel
+    - call
+
+
+## TFBlipVisionModel
+
+[[autodoc]] TFBlipVisionModel
+    - call
+
+
+## TFBlipForConditionalGeneration
+
+[[autodoc]] TFBlipForConditionalGeneration
+    - call
+
+
+## TFBlipForImageTextRetrieval
+
+[[autodoc]] TFBlipForImageTextRetrieval
+    - call
+
+
+## TFBlipForQuestionAnswering
+
+[[autodoc]] TFBlipForQuestionAnswering
+    - call
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
@@ -2902,6 +2902,18 @@
     _import_structure["models.blenderbot_small"].extend(
         ["TFBlenderbotSmallForConditionalGeneration", "TFBlenderbotSmallModel", "TFBlenderbotSmallPreTrainedModel"]
     )
+    _import_structure["models.blip"].extend(
+        [
+            "TF_BLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "TFBlipForConditionalGeneration",
+            "TFBlipForImageTextRetrieval",
+            "TFBlipForQuestionAnswering",
+            "TFBlipModel",
+            "TFBlipPreTrainedModel",
+            "TFBlipTextModel",
+            "TFBlipVisionModel",
+        ]
+    )
     _import_structure["models.camembert"].extend(
         [
             "TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -6143,6 +6155,16 @@
             TFBlenderbotSmallModel,
             TFBlenderbotSmallPreTrainedModel,
         )
+        from .models.blip import (
+            TF_BLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
+            TFBlipForConditionalGeneration,
+            TFBlipForImageTextRetrieval,
+            TFBlipForQuestionAnswering,
+            TFBlipModel,
+            TFBlipPreTrainedModel,
+            TFBlipTextModel,
+            TFBlipVisionModel,
+        )
         from .models.camembert import (
             TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
             TFCamembertForCausalLM,

diff --git a/src/transformers/commands/pt_to_tf.py b/src/transformers/commands/pt_to_tf.py
@@ -196,7 +196,7 @@ def __init__(
         self._extra_commit_description = extra_commit_description
         self._override_model_class = override_model_class
 
-    def get_inputs(self, pt_model, config):
+    def get_inputs(self, pt_model, tf_dummy_inputs, config):
         """
         Returns the right inputs for the model, based on its signature.
         """
@@ -255,7 +255,11 @@ def _get_audio_input():
         tf_input = processor(**processor_inputs, return_tensors="tf")
 
         # Extra input requirements, in addition to the input modality
-        if config.is_encoder_decoder or (hasattr(pt_model, "encoder") and hasattr(pt_model, "decoder")):
+        if (
+            config.is_encoder_decoder
+            or (hasattr(pt_model, "encoder") and hasattr(pt_model, "decoder"))
+            or "decoder_input_ids" in tf_dummy_inputs
+        ):
             decoder_input_ids = np.asarray([[1], [1]], dtype=int) * (pt_model.config.decoder_start_token_id or 0)
             pt_input.update({"decoder_input_ids": torch.tensor(decoder_input_ids)})
             tf_input.update({"decoder_input_ids": tf.convert_to_tensor(decoder_input_ids)})
@@ -306,18 +310,24 @@ def run(self):
             except AttributeError:
                 raise AttributeError(f"The TensorFlow equivalent of {architectures[0]} doesn't exist in transformers.")
 
-        # Load models and acquire a basic input compatible with the model.
+        # Check the TF dummy inputs to see what keys we need in the forward pass
+        tf_from_pt_model = tf_class.from_config(config)
+        tf_dummy_inputs = tf_from_pt_model.dummy_inputs
+
+        del tf_from_pt_model  # Try to keep only one model in memory at a time
+
+        # Load the model and get some basic inputs
         pt_model = pt_class.from_pretrained(self._local_dir)
         pt_model.eval()
 
-        pt_input, tf_input = self.get_inputs(pt_model, config)
+        pt_input, tf_input = self.get_inputs(pt_model, tf_dummy_inputs, config)
 
         with torch.no_grad():
             pt_outputs = pt_model(**pt_input, output_hidden_states=True)
         del pt_model  # will no longer be used, and may have a large memory footprint
 
         tf_from_pt_model = tf_class.from_pretrained(self._local_dir, from_pt=True)
-        tf_from_pt_outputs = tf_from_pt_model(**tf_input, output_hidden_states=True)
+        tf_from_pt_outputs = tf_from_pt_model(**tf_input, output_hidden_states=True, training=False)
 
         # Confirms that cross loading PT weights into TF worked.
         crossload_differences = self.find_pt_tf_differences(pt_outputs, tf_from_pt_outputs)

diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
@@ -406,6 +406,7 @@ def unpack_inputs(func):
         func (`callable`):
             The callable function of the TensorFlow model.
 
+
     Returns:
         A callable that wraps the original `func` with the behavior described above.
     """
@@ -1157,6 +1158,38 @@ def _from_config(cls, config, **kwargs):
         """
         return cls(config, **kwargs)
 
+    def get_head_mask(self, head_mask: Optional[tf.Tensor], num_hidden_layers: int) -> tf.Tensor:
+        """
+        Build the per-layer head mask, or a list of `None` placeholders when no mask is given.
+
+        Args:
+            head_mask (`tf.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*):
+                Mask telling which attention heads to keep (1.0 for keep, 0.0 for discard).
+            num_hidden_layers (`int`):
+                How many hidden layers the model has; one mask entry is produced per layer.
+
+        Returns:
+            `tf.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` when a mask
+            is supplied, otherwise a list holding `None` once per layer.
+        """
+        if head_mask is None:
+            # Nothing to mask: hand back one `None` per layer.
+            return [None] * num_hidden_layers
+
+        # A mask was supplied: expand it to the 5D layout.
+        return self._convert_head_mask_to_5d(head_mask, num_hidden_layers)
+
+    def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers):
+        """
+        Expand a head mask to rank 5: `[num_hidden_layers x batch x num_heads x seq_length x seq_length]`.
+
+        Args:
+            head_mask (`tf.Tensor`):
+                Rank-1 mask (`[num_heads]`), rank-2 mask (`[num_hidden_layers x num_heads]`), or a mask that is
+                already rank 5.
+            num_hidden_layers (`int`):
+                Number of times a rank-1 mask is repeated along the leading (layer) axis.
+
+        Returns:
+            `tf.Tensor`: the rank-5 mask cast to `tf.float32`.
+
+        Raises:
+            AssertionError: If the mask is not rank 5 after expansion (i.e. the input rank was not 1, 2 or 5).
+        """
+        if head_mask.shape.rank == 1:
+            # [num_heads] -> [1, 1, num_heads, 1, 1], then repeat once per layer along axis 0.
+            head_mask = head_mask[None, None, :, None, None]
+            head_mask = tf.repeat(head_mask, repeats=num_hidden_layers, axis=0)
+        elif head_mask.shape.rank == 2:
+            # [num_hidden_layers, num_heads] -> [num_hidden_layers, 1, num_heads, 1, 1].
+            head_mask = head_mask[:, None, :, None, None]
+        # `tf.Tensor` has no `.dim()` method (that is the PyTorch API); report the rank through `shape.rank` so
+        # that a failing check raises the intended AssertionError rather than an AttributeError.
+        assert head_mask.shape.rank == 5, f"head_mask.dim != 5, instead {head_mask.shape.rank}"
+        head_mask = tf.cast(head_mask, tf.float32)  # switch to float if need + fp16 compatibility
+        return head_mask
+
     def eager_serving(self, inputs):
         """
         Method used for serving the model. Intended not to be compiled with a tf.function decorator so that we can use

diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py
@@ -34,6 +34,7 @@
         ("bert", "TFBertModel"),
         ("blenderbot", "TFBlenderbotModel"),
         ("blenderbot-small", "TFBlenderbotSmallModel"),
+        ("blip", "TFBlipModel"),
         ("camembert", "TFCamembertModel"),
         ("clip", "TFCLIPModel"),
         ("convbert", "TFConvBertModel"),
@@ -213,6 +214,7 @@
 TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
     [
         # Model for Zero Shot Image Classification mapping
+        ("blip", "TFBlipModel"),
         ("clip", "TFCLIPModel"),
     ]
 )

diff --git a/src/transformers/models/blip/__init__.py b/src/transformers/models/blip/__init__.py
@@ -13,7 +13,13 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
+from ...utils import (
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    is_tf_available,
+    is_torch_available,
+    is_vision_available,
+)
 
 
 _import_structure = {
@@ -52,6 +58,23 @@
         "BlipForImageTextRetrieval",
     ]
 
+try:
+    if not is_tf_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_tf_blip"] = [
+        "TF_BLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "TFBlipModel",
+        "TFBlipPreTrainedModel",
+        "TFBlipForConditionalGeneration",
+        "TFBlipForQuestionAnswering",
+        "TFBlipVisionModel",
+        "TFBlipTextModel",
+        "TFBlipForImageTextRetrieval",
+    ]
+
 if TYPE_CHECKING:
     from .configuration_blip import BLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, BlipConfig, BlipTextConfig, BlipVisionConfig
     from .processing_blip import BlipProcessor
@@ -81,6 +104,23 @@
             BlipVisionModel,
         )
 
+    try:
+        if not is_tf_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_tf_blip import (
+            TF_BLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
+            TFBlipForConditionalGeneration,
+            TFBlipForImageTextRetrieval,
+            TFBlipForQuestionAnswering,
+            TFBlipModel,
+            TFBlipPreTrainedModel,
+            TFBlipTextModel,
+            TFBlipVisionModel,
+        )
+
 else:
     import sys
 

diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py
@@ -313,17 +313,12 @@ def forward(
 
         bsz, tgt_len, embed_dim = hidden_states.size()
 
-        mixed_qkv = self.qkv(hidden_states)
         mixed_qkv = (
             self.qkv(hidden_states)
             .reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads)
             .permute(2, 0, 3, 1, 4)
         )
-        query_states, key_states, value_states = (
-            mixed_qkv[0],
-            mixed_qkv[1],
-            mixed_qkv[2],
-        )
+        query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]
 
         # Take the dot product between "query" and "key" to get the raw attention scores.
         attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2))
@@ -587,9 +582,7 @@ def forward(
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
-                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-                than the model's internal embedding lookup matrix.
+                Embedded representation of the inputs. Should be float, not int tokens.
             attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                 Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
 
@@ -824,10 +817,7 @@ def get_image_features(
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        vision_outputs = self.vision_model(
-            pixel_values=pixel_values,
-            return_dict=return_dict,
-        )
+        vision_outputs = self.vision_model(pixel_values=pixel_values, return_dict=return_dict)
 
         pooled_output = vision_outputs[1]  # pooled_output
         image_features = self.visual_projection(pooled_output)
@@ -993,6 +983,10 @@ def forward(
         ```"""
 
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
 
         vision_outputs = self.vision_model(
             pixel_values=pixel_values,
@@ -1037,7 +1031,7 @@ def generate(
         Overrides *generate* function to be able to use the model as a conditional generator
 
         Parameters:
-            pixel_values (*torch.FloatTensor* of shape *(batch_size, image_width, image_height)*:
+            pixel_values (*torch.FloatTensor* of shape *(batch_size, num_channels, image_height, image_width)*):
                 Input image to be processed
             input_ids (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
                 The sequence used as a prompt for the generation.
@@ -1066,9 +1060,7 @@ def generate(
         """
 
         batch_size = pixel_values.shape[0]
-        vision_outputs = self.vision_model(
-            pixel_values=pixel_values,
-        )
+        vision_outputs = self.vision_model(pixel_values=pixel_values)
 
         image_embeds = vision_outputs[0]
 
@@ -1198,6 +1190,10 @@ def forward(
             )
 
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
 
         vision_outputs = self.vision_model(
             pixel_values=pixel_values,
@@ -1266,7 +1262,7 @@ def generate(
         Parameters:
             input_ids (*torch.LongTensor* of shape *(batch_size, sequence_length)*):
                 The sequence used as a prompt for the generation.
-            pixel_values (*torch.FloatTensor* of shape *(batch_size, image_width, image_height)*:
+            pixel_values (*torch.FloatTensor* of shape *(batch_size, num_channels, image_height, image_width)*):
                 Input image to be processed
             attention_mask (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
                 Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`. `1` for
@@ -1295,9 +1291,7 @@ def generate(
         2
         ```
         """
-        vision_outputs = self.vision_model(
-            pixel_values=pixel_values,
-        )
+        vision_outputs = self.vision_model(pixel_values=pixel_values)
 
         image_embeds = vision_outputs[0]
 
@@ -1412,6 +1406,10 @@ def forward(
         ```
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
 
         vision_outputs = self.vision_model(
             pixel_values=pixel_values,