78 commits
f4378b4
Merge changes
jplu Oct 12, 2020
8d03e57
Merge changes
jplu Oct 12, 2020
8b38cdb
Fix conflict
jplu Oct 6, 2020
84853af
Fix conflict
jplu Oct 6, 2020
df612c4
Fix conflict
jplu Oct 6, 2020
d918584
Fix test
jplu Sep 28, 2020
a07ee24
Fix conflict
jplu Oct 6, 2020
c483bf5
Fix conflict
jplu Oct 6, 2020
245999c
Make return_dict the default behavior and display a warning message
jplu Sep 29, 2020
64a43f3
Revert
jplu Sep 29, 2020
336bc7a
Replace wrong keyword
jplu Sep 29, 2020
335b6f8
Revert code
jplu Sep 29, 2020
3c73516
Add einsum layers
jplu Sep 29, 2020
0bc0bb4
Add einsum layers to Electra, Longformer and Roberta
jplu Sep 29, 2020
1b13b2f
Rework a bit Longformer
jplu Sep 29, 2020
e9da1c2
Fix rebase issue
jplu Oct 6, 2020
82230df
Fix rebase issue
jplu Oct 6, 2020
de92e54
Apply style
jplu Oct 6, 2020
a268e7b
Better model design for BERT + LM pretraining for BERT + more robust …
jplu Oct 9, 2020
4502fb3
Fix return_dict to True
jplu Oct 9, 2020
857c24b
Make the layers created or not on the fly
jplu Oct 9, 2020
ac6a750
changes for return_dict
jplu Oct 12, 2020
64456a3
Implement new output logic
jplu Oct 12, 2020
42b7969
Finish new TF model outputs
jplu Oct 12, 2020
c2ec22f
Improve TF models input
jplu Oct 13, 2020
6dedd40
Inputs parsing function now returns list of values ordered by keys
jplu Oct 13, 2020
14cc4ec
Apply style
jplu Oct 13, 2020
96a6fa5
Fix check_copies
jplu Oct 13, 2020
768d078
fix Boolean
jplu Oct 13, 2020
4fef7d7
Rollback on naming
jplu Oct 13, 2020
3b7606d
Address few Sylvain's comments
jplu Oct 13, 2020
cd09cf0
Update src/transformers/modeling_tf_bert.py
jplu Oct 13, 2020
176b2be
Finish to address Sylvain's comments
jplu Oct 13, 2020
b9bca81
Several bug fix
jplu Oct 13, 2020
275a4b3
Address Patrick's comments
jplu Oct 14, 2020
d2e95bf
Put the einsum equations in lower case to make ONNX happy
jplu Oct 15, 2020
eee2c35
Apply style
jplu Oct 15, 2020
c549750
Revert the ModelOutput
jplu Oct 16, 2020
f952a78
Update src/transformers/modeling_tf_bert.py
jplu Oct 16, 2020
0b6bb3d
Fix style
jplu Oct 16, 2020
d8978d1
Fix the tests
jplu Oct 16, 2020
6f00c96
Apply style
jplu Oct 16, 2020
b3fb2f6
Fix tests
jplu Oct 16, 2020
291321b
Improve embeddings computation
jplu Oct 26, 2020
7bed1a5
Apply style
jplu Oct 26, 2020
5a1fcf6
Fix tests
jplu Oct 26, 2020
2d01bb5
Fix tests
jplu Oct 26, 2020
11ad4d0
Improve the perf of the TFBertLayer
jplu Oct 27, 2020
04af391
Optimize MaskedLM computation
jplu Oct 28, 2020
c45c0b6
Review boolean handling
jplu Oct 30, 2020
0d3ed86
Fix tests
jplu Oct 30, 2020
c29eefd
Apply style
jplu Oct 30, 2020
a1b9785
Apply style + fix ONNX tests
jplu Oct 30, 2020
ccaa787
Use the embeddings as utils
jplu Oct 30, 2020
fe7c9ae
Fix conflict
jplu Oct 30, 2020
f899b9b
Apply style
jplu Oct 30, 2020
ed73dc2
Apply style
jplu Oct 30, 2020
f974a79
Reput loss to next sentence output
jplu Oct 30, 2020
36ec3d4
Remove useless print
jplu Oct 30, 2020
e5f618e
Apply style
jplu Oct 30, 2020
7c3514b
Make return_dict=True by default in PreTrainedConfig
jplu Nov 2, 2020
2a41cee
Remove duplicate layers
jplu Nov 2, 2020
948396c
Fix TFLongformer tests
jplu Nov 3, 2020
e80b20d
Apply style
jplu Nov 3, 2020
22a7830
Fix some Flax/PyTorch tests
jplu Nov 3, 2020
0ed1f37
Fix some Flax/PyTorch tests
jplu Nov 3, 2020
687ad7c
Remove naming exceptions
jplu Nov 4, 2020
2d0b524
Apply style
jplu Nov 4, 2020
f41bd1a
Add a warning message + revert to global default return_dict behavior…
jplu Nov 6, 2020
06aa90f
Fix few forgotten tests
jplu Nov 6, 2020
accc933
Apply style
jplu Nov 6, 2020
32f6855
Fix Longformer tests
jplu Nov 6, 2020
ae6e3d4
Fix T5
jplu Nov 6, 2020
781110c
Fix conflict
jplu Nov 9, 2020
982d0f6
Restore example
jplu Nov 11, 2020
dec9e7b
Restore TF Electra
jplu Nov 11, 2020
d27965a
Restore longformer
jplu Nov 11, 2020
7b5da2e
Rebase
jplu Nov 11, 2020
1,237 changes: 767 additions & 470 deletions src/transformers/modeling_tf_bert.py

Large diffs are not rendered by default.
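
Since the modeling_tf_bert.py diff is collapsed, here is a minimal standalone sketch of the central layer change it introduces, mirroring the RoBERTa hunks further down: a single `tf.keras.layers.experimental.EinsumDense` projection replaces the old `Dense` + reshape + transpose (`transpose_for_scores`) pair. The shapes and values below are illustrative assumptions, not the PR's exact code.

```python
import tensorflow as tf

hidden_size, num_heads = 768, 12          # assumed BERT-base-style dimensions
head_size = hidden_size // num_heads

# New style: project [batch, seq, hidden] straight to [batch, seq, heads, head_size].
query = tf.keras.layers.experimental.EinsumDense(
    equation="abc,cde->abde",
    output_shape=(None, num_heads, head_size),  # None stands for the sequence length
    bias_axes="de",
    name="query",
)

# Old style, shown for comparison: Dense to all_head_size, then reshape + transpose.
old_query = tf.keras.layers.Dense(num_heads * head_size, name="old_query")

x = tf.random.uniform((2, 5, hidden_size))       # [batch, seq, hidden]
new_q = query(x)                                 # -> (2, 5, 12, 64)
old_q = tf.transpose(
    tf.reshape(old_query(x), (2, -1, num_heads, head_size)), perm=[0, 2, 1, 3]
)                                                # -> (2, 12, 5, 64)
print(new_q.shape, old_q.shape)
```

The new layout keeps the head dimension next to the sequence dimension, which is exactly what the `tf.einsum`-based attention in the RoBERTa diff below consumes directly.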

13 changes: 11 additions & 2 deletions src/transformers/modeling_tf_pytorch_utils.py
@@ -216,6 +216,10 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
for pat in tf_model.authorized_unexpected_keys:
unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]

if tf_model.authorized_unexpected_keys is not None:
for pat in tf_model.authorized_unexpected_keys:
unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]

if len(unexpected_keys) > 0:
logger.warning(
f"Some weights of the PyTorch model were not used when "
@@ -265,7 +269,7 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs

import transformers

from .modeling_tf_utils import load_tf_weights
from .modeling_tf_utils import load_tf_weights, old_load_tf_weights

logger.info("Loading TensorFlow weights from {}".format(tf_checkpoint_path))

@@ -280,7 +284,12 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs
if tf_inputs is not None:
tf_model(tf_inputs, training=False) # Make sure model is built

load_tf_weights(tf_model, tf_checkpoint_path)
# Temporary fix in order to detect if the loaded model adopts the new TF code design or not.
# This will be removed once all the TF models will be updated to the new design
if tf_model.base_model_prefix in ["bert"]:
load_tf_weights(tf_model, tf_checkpoint_path)
else:
old_load_tf_weights(tf_model, tf_checkpoint_path)

return load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=allow_missing_keys)

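As context for the `authorized_unexpected_keys` hunk above, here is a tiny self-contained illustration (the weight names and pattern are made up) of how the regex filter drops keys a model class explicitly allows to be unexpected, so they do not show up in the warning:

```python
import re

# Hypothetical values; in the library they come from the TF model class and the
# PyTorch state dict being loaded.
unexpected_keys = ["cls.predictions.decoder.weight", "bert.pooler.dense.weight"]
authorized_unexpected_keys = [r"cls\.predictions"]  # patterns that are fine to ignore

for pat in authorized_unexpected_keys:
    unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]

print(unexpected_keys)  # ['bert.pooler.dense.weight'] is the only key left to warn about
```
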
106 changes: 58 additions & 48 deletions src/transformers/modeling_tf_roberta.py
@@ -223,7 +223,6 @@ def _linear(self, inputs):
return tf.reshape(logits, [batch_size, length, self.vocab_size])


# Copied from transformers.modeling_tf_bert.TFBertPooler
class TFRobertaPooler(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
@@ -244,7 +243,6 @@ def call(self, hidden_states):
return pooled_output


# Copied from transformers.modeling_tf_bert.TFBertSelfAttention
class TFRobertaSelfAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
@@ -256,40 +254,48 @@ def __init__(self, config, **kwargs):
)

self.num_attention_heads = config.num_attention_heads

assert config.hidden_size % config.num_attention_heads == 0

self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
self.query = tf.keras.layers.experimental.EinsumDense(
equation="abc,cde->abde",
output_shape=(None, config.num_attention_heads, self.attention_head_size),
bias_axes="de",
kernel_initializer=get_initializer(config.initializer_range),
name="query",
)
self.key = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
self.key = tf.keras.layers.experimental.EinsumDense(
equation="abc,cde->abde",
output_shape=(None, config.num_attention_heads, self.attention_head_size),
bias_axes="de",
kernel_initializer=get_initializer(config.initializer_range),
name="key",
)
self.value = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
self.value = tf.keras.layers.experimental.EinsumDense(
equation="abc,cde->abde",
output_shape=(None, config.num_attention_heads, self.attention_head_size),
bias_axes="de",
kernel_initializer=get_initializer(config.initializer_range),
name="value",
)
self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)

def transpose_for_scores(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False):
query_tensor = self.query(hidden_states)

return tf.transpose(x, perm=[0, 2, 1, 3])
# `key_tensor` = [B, S, N, H]
key_tensor = self.key(hidden_states)

def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False):
batch_size = shape_list(hidden_states)[0]
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)
query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)

# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = tf.matmul(
query_layer, key_layer, transpose_b=True
) # (batch size, num_heads, seq_len_q, seq_len_k)
dk = tf.cast(shape_list(key_layer)[-1], attention_scores.dtype) # scale attention_scores
attention_scores = attention_scores / tf.math.sqrt(dk)
# `value_tensor` = [B, S, N, H]
value_tensor = self.value(hidden_states)

# Take the dot product between "query" and "key" to get the raw
# attention scores.
attention_scores = tf.einsum("BSNH,BTNH->BNTS", key_tensor, query_tensor)
dk = tf.cast(self.attention_head_size, dtype=attention_scores.dtype) # scale attention_scores
attention_scores = tf.multiply(attention_scores, tf.math.rsqrt(dk))

if attention_mask is not None:
# Apply the attention mask is (precomputed for all layers in TFBertModel call() function)
@@ -306,23 +312,28 @@ def call(self, hidden_states, attention_mask, head_mask, output_attentions, trai
if head_mask is not None:
attention_probs = attention_probs * head_mask

context_layer = tf.matmul(attention_probs, value_layer)
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
context_layer = tf.reshape(
context_layer, (batch_size, -1, self.all_head_size)
) # (batch_size, seq_len_q, all_head_size)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
attention_output = tf.einsum("BNTS,BSNH->BTNH", attention_probs, value_tensor)
outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)

return outputs


# Copied from transformers.modeling_tf_bert.TFBertSelfOutput
class TFRobertaSelfOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)

self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
self.num_attention_heads = config.num_attention_heads

assert config.hidden_size % config.num_attention_heads == 0

self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.dense = tf.keras.layers.experimental.EinsumDense(
equation="abcd,cde->abe",
output_shape=(None, self.all_head_size),
bias_axes="e",
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
@@ -335,7 +346,6 @@ def call(self, hidden_states, input_tensor, training=False):
return hidden_states


# Copied from transformers.modeling_tf_bert.TFBertAttention with Bert->Roberta
class TFRobertaAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
@@ -356,13 +366,16 @@ def call(self, input_tensor, attention_mask, head_mask, output_attentions, train
return outputs


# Copied from transformers.modeling_tf_bert.TFBertIntermediate
class TFRobertaIntermediate(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)

self.dense = tf.keras.layers.Dense(
config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
self.dense = tf.keras.layers.experimental.EinsumDense(
equation="abc,cd->abd",
output_shape=(None, config.intermediate_size),
bias_axes="d",
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
)

if isinstance(config.hidden_act, str):
@@ -377,13 +390,16 @@ def call(self, hidden_states):
return hidden_states


# Copied from transformers.modeling_tf_bert.TFBertOutput
class TFRobertaOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)

self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
self.dense = tf.keras.layers.experimental.EinsumDense(
equation="abc,cd->abd",
bias_axes="d",
output_shape=(None, config.hidden_size),
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
@@ -396,7 +412,6 @@ def call(self, hidden_states, input_tensor, training=False):
return hidden_states


# Copied from transformers.modeling_tf_bert.TFBertLayer with Bert->Roberta
class TFRobertaLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
@@ -417,7 +432,6 @@ def call(self, hidden_states, attention_mask, head_mask, output_attentions, trai
return outputs


# Copied from transformers.modeling_tf_bert.TFBertEncoder with Bert->Roberta
class TFRobertaEncoder(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
@@ -478,24 +492,20 @@ def __init__(self, config, **kwargs):
# The embeddings must be the last declaration in order to follow the weights order
self.embeddings = TFRobertaEmbeddings(config, name="embeddings")

# Copied from transformers.modeling_tf_bert.TFBertMainLayer.get_input_embeddings
def get_input_embeddings(self):
return self.embeddings

# Copied from transformers.modeling_tf_bert.TFBertMainLayer.set_input_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
self.embeddings.vocab_size = value.shape[0]

# Copied from transformers.modeling_tf_bert.TFBertMainLayer._prune_heads
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
raise NotImplementedError

# Copied from transformers.modeling_tf_bert.TFBertMainLayer.call
def call(
self,
inputs,
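To make the einsum rewrite of the RoBERTa self-attention concrete, the following standalone check (random tensors and assumed BERT-base-style head sizes; not part of the PR) verifies that the two `tf.einsum` contractions used above reproduce the old transpose + `tf.matmul` attention path:

```python
import numpy as np
import tensorflow as tf

B, T, N, H = 2, 5, 12, 64  # batch, sequence length, attention heads, head size

q = tf.random.uniform((B, T, N, H))
k = tf.random.uniform((B, T, N, H))
v = tf.random.uniform((B, T, N, H))

# New path: scores and context computed directly on [batch, seq, heads, head_size].
scores_new = tf.einsum("BSNH,BTNH->BNTS", k, q)
scores_new = tf.multiply(scores_new, tf.math.rsqrt(tf.cast(H, tf.float32)))
probs_new = tf.nn.softmax(scores_new, axis=-1)
context_new = tf.einsum("BNTS,BSNH->BTNH", probs_new, v)

# Old path: transpose to [batch, heads, seq, head_size], matmul, transpose back.
qt, kt, vt = (tf.transpose(t, perm=[0, 2, 1, 3]) for t in (q, k, v))
scores_old = tf.matmul(qt, kt, transpose_b=True) / tf.math.sqrt(tf.cast(H, tf.float32))
probs_old = tf.nn.softmax(scores_old, axis=-1)
context_old = tf.transpose(tf.matmul(probs_old, vt), perm=[0, 2, 1, 3])

np.testing.assert_allclose(context_new.numpy(), context_old.numpy(), rtol=1e-4, atol=1e-4)
print("einsum attention matches the matmul/transpose attention")
```

Avoiding the explicit reshapes and transposes is the performance motivation behind commits such as "Improve the perf of the TFBertLayer" in the list above.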
8 changes: 0 additions & 8 deletions src/transformers/modeling_tf_t5.py
@@ -1196,7 +1196,6 @@ def call(
output_hidden_states=None,
return_dict=None,
training=False,
**kwargs,
):
r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
@@ -1268,13 +1267,6 @@
else:
input_ids = inputs

if "past_key_value_states" in kwargs:
warnings.warn(
"The `past_key_value_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
FutureWarning,
)
past_key_values = kwargs.pop("past_key_value_states")

use_cache = use_cache if use_cache is not None else self.config.use_cache
output_attentions = output_attentions if output_attentions else self.config.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states else self.config.output_hidden_states
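The T5 hunk above removes the `past_key_value_states` deprecation shim along with the `**kwargs` catch-all that carried it, so `past_key_values` is the only accepted name going forward. For reference, the removed logic followed the usual keyword-renaming pattern, sketched here in isolation (generic code, not T5-specific):

```python
import warnings

def call(past_key_values=None, **kwargs):
    # What the removed shim did: accept the old keyword, warn, and forward it.
    if "past_key_value_states" in kwargs:
        warnings.warn(
            "The `past_key_value_states` argument is deprecated and will be removed in a "
            "future version, use `past_key_values` instead.",
            FutureWarning,
        )
        past_key_values = kwargs.pop("past_key_value_states")
    return past_key_values

print(call(past_key_value_states="cached"))  # emits a FutureWarning, returns "cached"
```

After this change, passing `past_key_value_states` to the TF T5 `call` should simply raise a `TypeError`, since there is no `**kwargs` left to absorb it.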