78 commits
f4378b4
Merge changes
jplu Oct 12, 2020
8d03e57
Merge changes
jplu Oct 12, 2020
8b38cdb
Fix conflict
jplu Oct 6, 2020
84853af
Fix conflict
jplu Oct 6, 2020
df612c4
Fix conflict
jplu Oct 6, 2020
d918584
Fix test
jplu Sep 28, 2020
a07ee24
Fix conflict
jplu Oct 6, 2020
c483bf5
Fix conflict
jplu Oct 6, 2020
245999c
Make return_dict the default behavior and display a warning message
jplu Sep 29, 2020
64a43f3
Revert
jplu Sep 29, 2020
336bc7a
Replace wrong keyword
jplu Sep 29, 2020
335b6f8
Revert code
jplu Sep 29, 2020
3c73516
Add einsum layers
jplu Sep 29, 2020
0bc0bb4
Add einsum layers to Electra, Longformer and Roberta
jplu Sep 29, 2020
1b13b2f
Rework a bit Longformer
jplu Sep 29, 2020
e9da1c2
Fix rebase issue
jplu Oct 6, 2020
82230df
Fix rebase issue
jplu Oct 6, 2020
de92e54
Apply style
jplu Oct 6, 2020
a268e7b
Better model design for BERT + LM pretraining for BERT + more robust …
jplu Oct 9, 2020
4502fb3
Fix return_dict to True
jplu Oct 9, 2020
857c24b
Make the layers created or not on the fly
jplu Oct 9, 2020
ac6a750
changes for return_dict
jplu Oct 12, 2020
64456a3
Implement new output logic
jplu Oct 12, 2020
42b7969
Finish new TF model outputs
jplu Oct 12, 2020
c2ec22f
Improve TF models input
jplu Oct 13, 2020
6dedd40
Inputs parsing function now returns list of values ordered by keys
jplu Oct 13, 2020
14cc4ec
Apply style
jplu Oct 13, 2020
96a6fa5
Fix check_copies
jplu Oct 13, 2020
768d078
fix Boolean
jplu Oct 13, 2020
4fef7d7
Rollback on naming
jplu Oct 13, 2020
3b7606d
Address few Sylvain's comments
jplu Oct 13, 2020
cd09cf0
Update src/transformers/modeling_tf_bert.py
jplu Oct 13, 2020
176b2be
Finish to address Sylvain's comments
jplu Oct 13, 2020
b9bca81
Several bug fix
jplu Oct 13, 2020
275a4b3
Address Patrick's comments
jplu Oct 14, 2020
d2e95bf
Put the einsum equations in lower case to make ONNX happy
jplu Oct 15, 2020
eee2c35
Apply style
jplu Oct 15, 2020
c549750
Revert the ModelOutput
jplu Oct 16, 2020
f952a78
Update src/transformers/modeling_tf_bert.py
jplu Oct 16, 2020
0b6bb3d
Fix style
jplu Oct 16, 2020
d8978d1
Fix the tests
jplu Oct 16, 2020
6f00c96
Apply style
jplu Oct 16, 2020
b3fb2f6
Fix tests
jplu Oct 16, 2020
291321b
Improve embeddings computation
jplu Oct 26, 2020
7bed1a5
Apply style
jplu Oct 26, 2020
5a1fcf6
Fix tests
jplu Oct 26, 2020
2d01bb5
Fix tests
jplu Oct 26, 2020
11ad4d0
Improve the perf of the TFBertLayer
jplu Oct 27, 2020
04af391
Optimize MaskedLM computation
jplu Oct 28, 2020
c45c0b6
Review boolean handling
jplu Oct 30, 2020
0d3ed86
Fix tests
jplu Oct 30, 2020
c29eefd
Apply style
jplu Oct 30, 2020
a1b9785
Apply style + fix ONNX tests
jplu Oct 30, 2020
ccaa787
Use the embeddings as utils
jplu Oct 30, 2020
fe7c9ae
Fix conflict
jplu Oct 30, 2020
f899b9b
Apply style
jplu Oct 30, 2020
ed73dc2
Apply style
jplu Oct 30, 2020
f974a79
Reput loss to next sentence output
jplu Oct 30, 2020
36ec3d4
Remove useless print
jplu Oct 30, 2020
e5f618e
Apply style
jplu Oct 30, 2020
7c3514b
Make return_dict=True by default in PreTrainedConfig
jplu Nov 2, 2020
2a41cee
Remove duplicate layers
jplu Nov 2, 2020
948396c
Fix TFLongformer tests
jplu Nov 3, 2020
e80b20d
Apply style
jplu Nov 3, 2020
22a7830
Fix some Flax/PyTorch tests
jplu Nov 3, 2020
0ed1f37
Fix some Flax/PyTorch tests
jplu Nov 3, 2020
687ad7c
Remove naming exceptions
jplu Nov 4, 2020
2d0b524
Apply style
jplu Nov 4, 2020
f41bd1a
Add a warning message + revert to global default return_dict behavior…
jplu Nov 6, 2020
06aa90f
Fix few forgotten tests
jplu Nov 6, 2020
accc933
Apply style
jplu Nov 6, 2020
32f6855
Fix Longformer tests
jplu Nov 6, 2020
ae6e3d4
Fix T5
jplu Nov 6, 2020
781110c
Fix conflict
jplu Nov 9, 2020
982d0f6
Restore example
jplu Nov 11, 2020
dec9e7b
Restore TF Electra
jplu Nov 11, 2020
d27965a
Restore longformer
jplu Nov 11, 2020
7b5da2e
Rebase
jplu Nov 11, 2020
1,237 changes: 767 additions & 470 deletions src/transformers/modeling_tf_bert.py

Large diffs are not rendered by default.
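
Since the modeling_tf_bert.py diff is collapsed, here is a minimal standalone sketch of the central layer change it introduces, mirroring the RoBERTa hunks further down: a single `tf.keras.layers.experimental.EinsumDense` projection replaces the old `Dense` + reshape + transpose (`transpose_for_scores`) pair. The shapes and values below are illustrative assumptions, not the PR's exact code.

```python
import tensorflow as tf

hidden_size, num_heads = 768, 12          # assumed BERT-base-style dimensions
head_size = hidden_size // num_heads

# New style: project [batch, seq, hidden] straight to [batch, seq, heads, head_size].
query = tf.keras.layers.experimental.EinsumDense(
    equation="abc,cde->abde",
    output_shape=(None, num_heads, head_size),  # None stands for the sequence length
    bias_axes="de",
    name="query",
)

# Old style, shown for comparison: Dense to all_head_size, then reshape + transpose.
old_query = tf.keras.layers.Dense(num_heads * head_size, name="old_query")

x = tf.random.uniform((2, 5, hidden_size))       # [batch, seq, hidden]
new_q = query(x)                                 # -> (2, 5, 12, 64)
old_q = tf.transpose(
    tf.reshape(old_query(x), (2, -1, num_heads, head_size)), perm=[0, 2, 1, 3]
)                                                # -> (2, 12, 5, 64)
print(new_q.shape, old_q.shape)
```

The new layout keeps the head dimension next to the sequence dimension, which is exactly what the `tf.einsum`-based attention in the RoBERTa diff below consumes directly.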

13 changes: 11 additions & 2 deletions src/transformers/modeling_tf_pytorch_utils.py
@@ -216,6 +216,10 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
for pat in tf_model.authorized_unexpected_keys:
unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]

if tf_model.authorized_unexpected_keys is not None:
for pat in tf_model.authorized_unexpected_keys:
unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]

if len(unexpected_keys) > 0:
logger.warning(
f"Some weights of the PyTorch model were not used when "
@@ -265,7 +269,7 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs

import transformers

from .modeling_tf_utils import load_tf_weights
from .modeling_tf_utils import load_tf_weights, old_load_tf_weights

logger.info("Loading TensorFlow weights from {}".format(tf_checkpoint_path))

@@ -280,7 +284,12 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs
if tf_inputs is not None:
tf_model(tf_inputs, training=False) # Make sure model is built

load_tf_weights(tf_model, tf_checkpoint_path)
# Temporary fix in order to detect if the loaded model adopts the new TF code design or not.
# This will be removed once all the TF models will be updated to the new design
if tf_model.base_model_prefix in ["bert"]:
load_tf_weights(tf_model, tf_checkpoint_path)
else:
old_load_tf_weights(tf_model, tf_checkpoint_path)

return load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=allow_missing_keys)

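As context for the `authorized_unexpected_keys` hunk above, here is a tiny self-contained illustration (the weight names and pattern are made up) of how the regex filter drops keys a model class explicitly allows to be unexpected, so they do not show up in the warning:

```python
import re

# Hypothetical values; in the library they come from the TF model class and the
# PyTorch state dict being loaded.
unexpected_keys = ["cls.predictions.decoder.weight", "bert.pooler.dense.weight"]
authorized_unexpected_keys = [r"cls\.predictions"]  # patterns that are fine to ignore

for pat in authorized_unexpected_keys:
    unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]

print(unexpected_keys)  # ['bert.pooler.dense.weight'] is the only key left to warn about
```
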
106 changes: 58 additions & 48 deletions src/transformers/modeling_tf_roberta.py
@@ -223,7 +223,6 @@ def _linear(self, inputs):
return tf.reshape(logits, [batch_size, length, self.vocab_size])


# Copied from transformers.modeling_tf_bert.TFBertPooler
class TFRobertaPooler(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
@@ -244,7 +243,6 @@ def call(self, hidden_states):
return pooled_output


# Copied from transformers.modeling_tf_bert.TFBertSelfAttention
class TFRobertaSelfAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
@@ -256,40 +254,48 @@ def __init__(self, config, **kwargs):
)

self.num_attention_heads = config.num_attention_heads

assert config.hidden_size % config.num_attention_heads == 0

self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
self.query = tf.keras.layers.experimental.EinsumDense(
equation="abc,cde->abde",
output_shape=(None, config.num_attention_heads, self.attention_head_size),
bias_axes="de",
kernel_initializer=get_initializer(config.initializer_range),
name="query",
)
self.key = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
self.key = tf.keras.layers.experimental.EinsumDense(
equation="abc,cde->abde",
output_shape=(None, config.num_attention_heads, self.attention_head_size),
bias_axes="de",
kernel_initializer=get_initializer(config.initializer_range),
name="key",
)
self.value = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
self.value = tf.keras.layers.experimental.EinsumDense(
equation="abc,cde->abde",
output_shape=(None, config.num_attention_heads, self.attention_head_size),
bias_axes="de",
kernel_initializer=get_initializer(config.initializer_range),
name="value",
)
self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)

def transpose_for_scores(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False):
query_tensor = self.query(hidden_states)

return tf.transpose(x, perm=[0, 2, 1, 3])
# `key_tensor` = [B, S, N, H]
key_tensor = self.key(hidden_states)

def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False):
batch_size = shape_list(hidden_states)[0]
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)
query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)

# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = tf.matmul(
query_layer, key_layer, transpose_b=True
) # (batch size, num_heads, seq_len_q, seq_len_k)
dk = tf.cast(shape_list(key_layer)[-1], attention_scores.dtype) # scale attention_scores
attention_scores = attention_scores / tf.math.sqrt(dk)
# `value_tensor` = [B, S, N, H]
value_tensor = self.value(hidden_states)

# Take the dot product between "query" and "key" to get the raw
# attention scores.
attention_scores = tf.einsum("BSNH,BTNH->BNTS", key_tensor, query_tensor)
dk = tf.cast(self.attention_head_size, dtype=attention_scores.dtype) # scale attention_scores
attention_scores = tf.multiply(attention_scores, tf.math.rsqrt(dk))

if attention_mask is not None:
# Apply the attention mask is (precomputed for all layers in TFBertModel call() function)
@@ -306,23 +312,28 @@ def call(self, hidden_states, attention_mask, head_mask, output_attentions, trai
if head_mask is not None:
attention_probs = attention_probs * head_mask

context_layer = tf.matmul(attention_probs, value_layer)
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
context_layer = tf.reshape(
context_layer, (batch_size, -1, self.all_head_size)
) # (batch_size, seq_len_q, all_head_size)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
attention_output = tf.einsum("BNTS,BSNH->BTNH", attention_probs, value_tensor)
outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)

return outputs


# Copied from transformers.modeling_tf_bert.TFBertSelfOutput
class TFRobertaSelfOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)

self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
self.num_attention_heads = config.num_attention_heads

assert config.hidden_size % config.num_attention_heads == 0

self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.dense = tf.keras.layers.experimental.EinsumDense(
equation="abcd,cde->abe",
output_shape=(None, self.all_head_size),
bias_axes="e",
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
@@ -335,7 +346,6 @@ def call(self, hidden_states, input_tensor, training=False):
return hidden_states


# Copied from transformers.modeling_tf_bert.TFBertAttention with Bert->Roberta
class TFRobertaAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
@@ -356,13 +366,16 @@ def call(self, input_tensor, attention_mask, head_mask, output_attentions, train
return outputs


# Copied from transformers.modeling_tf_bert.TFBertIntermediate
class TFRobertaIntermediate(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)

self.dense = tf.keras.layers.Dense(
config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
self.dense = tf.keras.layers.experimental.EinsumDense(
equation="abc,cd->abd",
output_shape=(None, config.intermediate_size),
bias_axes="d",
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
)

if isinstance(config.hidden_act, str):
@@ -377,13 +390,16 @@ def call(self, hidden_states):
return hidden_states


# Copied from transformers.modeling_tf_bert.TFBertOutput
class TFRobertaOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)

self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
self.dense = tf.keras.layers.experimental.EinsumDense(
equation="abc,cd->abd",
bias_axes="d",
output_shape=(None, config.hidden_size),
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
@@ -396,7 +412,6 @@ def call(self, hidden_states, input_tensor, training=False):
return hidden_states


# Copied from transformers.modeling_tf_bert.TFBertLayer with Bert->Roberta
class TFRobertaLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
@@ -417,7 +432,6 @@ def call(self, hidden_states, attention_mask, head_mask, output_attentions, trai
return outputs


# Copied from transformers.modeling_tf_bert.TFBertEncoder with Bert->Roberta
class TFRobertaEncoder(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
@@ -478,24 +492,20 @@ def __init__(self, config, **kwargs):
# The embeddings must be the last declaration in order to follow the weights order
self.embeddings = TFRobertaEmbeddings(config, name="embeddings")

# Copied from transformers.modeling_tf_bert.TFBertMainLayer.get_input_embeddings
def get_input_embeddings(self):
return self.embeddings

# Copied from transformers.modeling_tf_bert.TFBertMainLayer.set_input_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
self.embeddings.vocab_size = value.shape[0]

# Copied from transformers.modeling_tf_bert.TFBertMainLayer._prune_heads
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
raise NotImplementedError

# Copied from transformers.modeling_tf_bert.TFBertMainLayer.call
def call(
self,
inputs,
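To make the einsum rewrite of the RoBERTa self-attention concrete, the following standalone check (random tensors and assumed BERT-base-style head sizes; not part of the PR) verifies that the two `tf.einsum` contractions used above reproduce the old transpose + `tf.matmul` attention path:

```python
import numpy as np
import tensorflow as tf

B, T, N, H = 2, 5, 12, 64  # batch, sequence length, attention heads, head size

q = tf.random.uniform((B, T, N, H))
k = tf.random.uniform((B, T, N, H))
v = tf.random.uniform((B, T, N, H))

# New path: scores and context computed directly on [batch, seq, heads, head_size].
scores_new = tf.einsum("BSNH,BTNH->BNTS", k, q)
scores_new = tf.multiply(scores_new, tf.math.rsqrt(tf.cast(H, tf.float32)))
probs_new = tf.nn.softmax(scores_new, axis=-1)
context_new = tf.einsum("BNTS,BSNH->BTNH", probs_new, v)

# Old path: transpose to [batch, heads, seq, head_size], matmul, transpose back.
qt, kt, vt = (tf.transpose(t, perm=[0, 2, 1, 3]) for t in (q, k, v))
scores_old = tf.matmul(qt, kt, transpose_b=True) / tf.math.sqrt(tf.cast(H, tf.float32))
probs_old = tf.nn.softmax(scores_old, axis=-1)
context_old = tf.transpose(tf.matmul(probs_old, vt), perm=[0, 2, 1, 3])

np.testing.assert_allclose(context_new.numpy(), context_old.numpy(), rtol=1e-4, atol=1e-4)
print("einsum attention matches the matmul/transpose attention")
```

Avoiding the explicit reshapes and transposes is the performance motivation behind commits such as "Improve the perf of the TFBertLayer" in the list above.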
8 changes: 0 additions & 8 deletions src/transformers/modeling_tf_t5.py
@@ -1196,7 +1196,6 @@ def call(
output_hidden_states=None,
return_dict=None,
training=False,
**kwargs,
):
r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
@@ -1268,13 +1267,6 @@
else:
input_ids = inputs

if "past_key_value_states" in kwargs:
warnings.warn(
"The `past_key_value_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
FutureWarning,
)
past_key_values = kwargs.pop("past_key_value_states")

use_cache = use_cache if use_cache is not None else self.config.use_cache
output_attentions = output_attentions if output_attentions else self.config.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states else self.config.output_hidden_states
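The T5 hunk above removes the `past_key_value_states` deprecation shim along with the `**kwargs` catch-all that carried it, so `past_key_values` is the only accepted name going forward. For reference, the removed logic followed the usual keyword-renaming pattern, sketched here in isolation (generic code, not T5-specific):

```python
import warnings

def call(past_key_values=None, **kwargs):
    # What the removed shim did: accept the old keyword, warn, and forward it.
    if "past_key_value_states" in kwargs:
        warnings.warn(
            "The `past_key_value_states` argument is deprecated and will be removed in a "
            "future version, use `past_key_values` instead.",
            FutureWarning,
        )
        past_key_values = kwargs.pop("past_key_value_states")
    return past_key_values

print(call(past_key_value_states="cached"))  # emits a FutureWarning, returns "cached"
```

After this change, passing `past_key_value_states` to the TF T5 `call` should simply raise a `TypeError`, since there is no `**kwargs` left to absorb it.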