From a038099c5d30a5cc13264f777cac5f650c1adbbe Mon Sep 17 00:00:00 2001
From: Joao Gante
Date: Tue, 9 Aug 2022 13:32:25 +0000
Subject: [PATCH 1/5] fix deberta issues

---
 .../models/deberta/modeling_tf_deberta.py    | 42 +++++++--------
 .../deberta_v2/modeling_tf_deberta_v2.py     | 54 +++++++++----------
 2 files changed, 44 insertions(+), 52 deletions(-)

diff --git a/src/transformers/models/deberta/modeling_tf_deberta.py b/src/transformers/models/deberta/modeling_tf_deberta.py
index 1d8c01e24acd..01bd83c56e90 100644
--- a/src/transformers/models/deberta/modeling_tf_deberta.py
+++ b/src/transformers/models/deberta/modeling_tf_deberta.py
@@ -101,27 +101,6 @@ def call(self, inputs: tf.Tensor, mask: tf.Tensor):
         return output
 
 
-def get_mask(input, dropout):
-    mask = tf.cast(
-        1 - tf.compat.v1.distributions.Bernoulli(probs=1 - dropout).sample(sample_shape=shape_list(input)), tf.bool
-    )
-    return mask, dropout
-
-
-@tf.custom_gradient
-def TFDebertaXDropout(input, local_ctx):
-    mask, dropout = get_mask(input, local_ctx)
-    scale = tf.convert_to_tensor(1.0 / (1 - dropout), dtype=tf.float32)
-    input = tf.cond(dropout > 0, lambda: tf.where(mask, 0.0, input) * scale, lambda: input)
-
-    def custom_grad(upstream_grad):
-        return tf.cond(
-            scale > 1, lambda: (tf.where(mask, 0.0, upstream_grad) * scale, None), lambda: (upstream_grad, None)
-        )
-
-    return input, custom_grad
-
-
 class TFDebertaStableDropout(tf.keras.layers.Layer):
     """
     Optimized dropout module for stabilizing the training
@@ -134,9 +113,26 @@ def __init__(self, drop_prob, **kwargs):
         super().__init__(**kwargs)
         self.drop_prob = tf.convert_to_tensor(drop_prob, dtype=tf.float32)
 
+    @tf.custom_gradient
+    def xdropout(self, input):
+        """
+        Applies dropout to the input, as vanilla dropout, but also scales the remaining elements up by 1/drop_prob.
+        """
+        mask = tf.cast(
+            1 - tf.compat.v1.distributions.Bernoulli(probs=1 - self.drop_prob).sample(sample_shape=shape_list(input)),
+            tf.bool,
+        )
+        scale = tf.convert_to_tensor(1.0 / (1 - self.drop_prob), dtype=tf.float32)
+        input = tf.cond(self.drop_prob > 0, lambda: tf.where(mask, 0.0, input) * scale, lambda: input)
+
+        def grad(upstream):
+            return tf.cond(scale > 1, lambda: tf.where(mask, 0.0, upstream) * scale, lambda: upstream)
+
+        return input, grad
+
     def call(self, inputs: tf.Tensor, training: tf.Tensor = False):
-        if training and self.drop_prob > 0:
-            return TFDebertaXDropout(inputs, self.drop_prob)
+        if training:
+            return self.xdropout(inputs)
         return inputs
 
 
diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
index aabb3b2d380e..729ebaf09ed4 100644
--- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
@@ -102,29 +102,6 @@ def call(self, inputs: tf.Tensor, mask: tf.Tensor):
         return output
 
 
-# Copied from transformers.models.deberta.modeling_tf_deberta.get_mask
-def get_mask(input, dropout):
-    mask = tf.cast(
-        1 - tf.compat.v1.distributions.Bernoulli(probs=1 - dropout).sample(sample_shape=shape_list(input)), tf.bool
-    )
-    return mask, dropout
-
-
-@tf.custom_gradient
-# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaXDropout
-def TFDebertaV2XDropout(input, local_ctx):
-    mask, dropout = get_mask(input, local_ctx)
-    scale = tf.convert_to_tensor(1.0 / (1 - dropout), dtype=tf.float32)
-    input = tf.cond(dropout > 0, lambda: tf.where(mask, 0.0, input) * scale, lambda: input)
-
-    def custom_grad(upstream_grad):
-        return tf.cond(
-            scale > 1, lambda: (tf.where(mask, 0.0, upstream_grad) * scale, None), lambda: (upstream_grad, None)
-        )
-
-    return input, custom_grad
-
-
 # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaStableDropout with Deberta->DebertaV2
 class TFDebertaV2StableDropout(tf.keras.layers.Layer):
     """
     Optimized dropout module for stabilizing the training
@@ -138,9 +115,26 @@ def __init__(self, drop_prob, **kwargs):
         super().__init__(**kwargs)
         self.drop_prob = tf.convert_to_tensor(drop_prob, dtype=tf.float32)
 
+    @tf.custom_gradient
+    def xdropout(self, input):
+        """
+        Applies dropout to the input, as vanilla dropout, but also scales the remaining elements up by 1/drop_prob.
+        """
+        mask = tf.cast(
+            1 - tf.compat.v1.distributions.Bernoulli(probs=1 - self.drop_prob).sample(sample_shape=shape_list(input)),
+            tf.bool,
+        )
+        scale = tf.convert_to_tensor(1.0 / (1 - self.drop_prob), dtype=tf.float32)
+        input = tf.cond(self.drop_prob > 0, lambda: tf.where(mask, 0.0, input) * scale, lambda: input)
+
+        def grad(upstream):
+            return tf.cond(scale > 1, lambda: tf.where(mask, 0.0, upstream) * scale, lambda: upstream)
+
+        return input, grad
+
     def call(self, inputs: tf.Tensor, training: tf.Tensor = False):
-        if training and self.drop_prob > 0:
-            return TFDebertaV2XDropout(inputs, self.drop_prob)
+        if training:
+            return self.xdropout(inputs)
         return inputs
 
 
@@ -525,10 +519,12 @@ def pos_dynamic_expand(pos_index, p2c_att, key_layer):
 
 def take_along_axis(x, indices):
     # Only a valid port of np.take_along_axis when the gather axis is -1
-    flat_x = tf.reshape(x, (-1, x.shape[-1]))
-    flat_indices = tf.reshape(indices, (-1, indices.shape[-1]))
-    gathered = tf.gather(flat_x, flat_indices, batch_dims=1)
-    gathered = tf.reshape(gathered, indices.shape)
+    # [B, S, P] -> [B, S, P, D]
+    one_hot_indices = tf.one_hot(indices, depth=x.shape[-1], dtype=x.dtype)
+
+    # if we ignore the first two dims, this is equivalent to multiplying a matrix (one hot) by a vector (x)
+    # grossly abusing notation: [B, S, P, D] . [B, S, D] = [B, S, P]
+    gathered = tf.einsum("ijkl,ijl->ijk", one_hot_indices, x)
 
     return gathered
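The patch above folds the standalone `TFDebertaXDropout` function into the layer as an `xdropout` method decorated with `tf.custom_gradient`, so the dropout mask built in the forward pass is reused verbatim in the backward pass. As a reference, here is a minimal, self-contained sketch of that pattern (TF 2.x assumed; the uniform-noise mask, toy `drop_prob`, and tensor shapes are illustrative stand-ins and not code from the patch, which samples its mask from `tf.compat.v1.distributions.Bernoulli`):

    import tensorflow as tf

    drop_prob = 0.1  # illustrative value
    scale = 1.0 / (1.0 - drop_prob)

    @tf.custom_gradient
    def scaled_dropout(x):
        # Drop each element with probability drop_prob, then rescale the
        # survivors so the expected value of the output matches the input.
        mask = tf.random.uniform(tf.shape(x)) < drop_prob
        out = tf.where(mask, 0.0, x) * scale

        def grad(upstream):
            # The backward pass reuses the same mask and scale, so gradients
            # flow only through the elements kept in the forward pass.
            return tf.where(mask, 0.0, upstream) * scale

        return out, grad

    x = tf.ones((2, 4))
    with tf.GradientTape() as tape:
        tape.watch(x)
        y = scaled_dropout(x)
    print(tape.gradient(y, x))  # 0.0 where dropped, `scale` elsewhere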
From 64aad9a950bdf621dce67eb9c3f6efcfc0cf7b12 Mon Sep 17 00:00:00 2001
From: Joao Gante
Date: Tue, 9 Aug 2022 15:46:35 +0000
Subject: [PATCH 2/5] add different code paths for gpu and tpu

---
 .../deberta_v2/modeling_tf_deberta_v2.py     | 21 +++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
index 729ebaf09ed4..fbac13821b0a 100644
--- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
@@ -519,12 +519,21 @@ def pos_dynamic_expand(pos_index, p2c_att, key_layer):
 
 def take_along_axis(x, indices):
     # Only a valid port of np.take_along_axis when the gather axis is -1
-    # [B, S, P] -> [B, S, P, D]
-    one_hot_indices = tf.one_hot(indices, depth=x.shape[-1], dtype=x.dtype)
-
-    # if we ignore the first two dims, this is equivalent to multiplying a matrix (one hot) by a vector (x)
-    # grossly abusing notation: [B, S, P, D] . [B, S, D] = [B, S, P]
-    gathered = tf.einsum("ijkl,ijl->ijk", one_hot_indices, x)
+    # TPU + gathers and reshapes don't go along well -- see https://github.com/huggingface/transformers/issues/18239
+    if isinstance(tf.distribute.get_strategy(), tf.distribute.TPUStrategy):
+        # [B, S, P] -> [B, S, P, D]
+        one_hot_indices = tf.one_hot(indices, depth=x.shape[-1], dtype=x.dtype)
+
+        # if we ignore the first two dims, this is equivalent to multiplying a matrix (one hot) by a vector (x)
+        # grossly abusing notation: [B, S, P, D] . [B, S, D] = [B, S, P]
+        gathered = tf.einsum("ijkl,ijl->ijk", one_hot_indices, x)
+
+    # GPUs, on the other hand, prefer gathers instead of large one-hot+matmuls
+    else:
+        flat_x = tf.reshape(x, (-1, x.shape[-1]))
+        flat_indices = tf.reshape(indices, (-1, indices.shape[-1]))
+        gathered = tf.gather(flat_x, flat_indices, batch_dims=1)
+        gathered = tf.reshape(gathered, shape_list(indices))
 
     return gathered

From b69a94dc95fa176cac35c71f94e9fe7c980e90c3 Mon Sep 17 00:00:00 2001
From: Joao Gante
Date: Wed, 10 Aug 2022 09:30:58 +0000
Subject: [PATCH 3/5] shorter gpu take along axis

---
 src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
index fbac13821b0a..13ff99d0dbc8 100644
--- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
@@ -530,10 +530,7 @@ def take_along_axis(x, indices):
 
     # GPUs, on the other hand, prefer gathers instead of large one-hot+matmuls
     else:
-        flat_x = tf.reshape(x, (-1, x.shape[-1]))
-        flat_indices = tf.reshape(indices, (-1, indices.shape[-1]))
-        gathered = tf.gather(flat_x, flat_indices, batch_dims=1)
-        gathered = tf.reshape(gathered, shape_list(indices))
+        gathered = tf.gather(x, indices, batch_dims=2)
 
     return gathered
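Both branches of `take_along_axis` above compute the same thing: a batched gather along the last axis. A quick equivalence check against `np.take_along_axis` (a standalone sketch with illustrative shapes, not part of the patches; both branches are plain TF ops and run on any device):

    import numpy as np
    import tensorflow as tf

    x = tf.random.normal((2, 3, 5))                                   # [B, S, D]
    indices = tf.random.uniform((2, 3, 4), maxval=5, dtype=tf.int32)  # [B, S, P]

    # TPU-friendly branch: one-hot + einsum, no gathers or reshapes
    one_hot_indices = tf.one_hot(indices, depth=x.shape[-1], dtype=x.dtype)  # [B, S, P, D]
    via_einsum = tf.einsum("ijkl,ijl->ijk", one_hot_indices, x)

    # GPU-friendly branch, as shortened in PATCH 3/5: a single batched gather
    via_gather = tf.gather(x, indices, batch_dims=2)

    reference = np.take_along_axis(x.numpy(), indices.numpy(), axis=-1)
    print(np.allclose(via_einsum.numpy(), reference))  # True
    print(np.allclose(via_gather.numpy(), reference))  # True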
""" mask = tf.cast( - 1 - tf.compat.v1.distributions.Bernoulli(probs=1 - self.drop_prob).sample(sample_shape=shape_list(input)), + 1 - tf.compat.v1.distributions.Bernoulli(probs=1 - self.drop_prob).sample(sample_shape=shape_list(inputs)), tf.bool, ) scale = tf.convert_to_tensor(1.0 / (1 - self.drop_prob), dtype=tf.float32) - input = tf.cond(self.drop_prob > 0, lambda: tf.where(mask, 0.0, input) * scale, lambda: input) + if self.drop_prob > 0: + inputs = tf.where(mask, 0.0, inputs) * scale def grad(upstream): - return tf.cond(scale > 1, lambda: tf.where(mask, 0.0, upstream) * scale, lambda: upstream) + if self.drop_prob > 0: + return tf.where(mask, 0.0, upstream) * scale + else: + return upstream - return input, grad + return inputs, grad def call(self, inputs: tf.Tensor, training: tf.Tensor = False): if training: diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py index 13ff99d0dbc8..5a0f16be9810 100644 --- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py @@ -113,24 +113,28 @@ class TFDebertaV2StableDropout(tf.keras.layers.Layer): def __init__(self, drop_prob, **kwargs): super().__init__(**kwargs) - self.drop_prob = tf.convert_to_tensor(drop_prob, dtype=tf.float32) + self.drop_prob = drop_prob @tf.custom_gradient - def xdropout(self, input): + def xdropout(self, inputs): """ - Applies dropout to the input, as vanilla dropout, but also scales the remaining elements up by 1/drop_prob. + Applies dropout to the inputs, as vanilla dropout, but also scales the remaining elements up by 1/drop_prob. """ mask = tf.cast( - 1 - tf.compat.v1.distributions.Bernoulli(probs=1 - self.drop_prob).sample(sample_shape=shape_list(input)), + 1 - tf.compat.v1.distributions.Bernoulli(probs=1 - self.drop_prob).sample(sample_shape=shape_list(inputs)), tf.bool, ) scale = tf.convert_to_tensor(1.0 / (1 - self.drop_prob), dtype=tf.float32) - input = tf.cond(self.drop_prob > 0, lambda: tf.where(mask, 0.0, input) * scale, lambda: input) + if self.drop_prob > 0: + inputs = tf.where(mask, 0.0, inputs) * scale def grad(upstream): - return tf.cond(scale > 1, lambda: tf.where(mask, 0.0, upstream) * scale, lambda: upstream) + if self.drop_prob > 0: + return tf.where(mask, 0.0, upstream) * scale + else: + return upstream - return input, grad + return inputs, grad def call(self, inputs: tf.Tensor, training: tf.Tensor = False): if training: From 923f5488d7f18b8a72e6084e772f9e64762eda53 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Wed, 10 Aug 2022 11:44:17 +0000 Subject: [PATCH 5/5] variable must be float --- src/transformers/models/deberta/modeling_tf_deberta.py | 3 ++- src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/deberta/modeling_tf_deberta.py b/src/transformers/models/deberta/modeling_tf_deberta.py index f97232b3f6d8..edb9b2b08555 100644 --- a/src/transformers/models/deberta/modeling_tf_deberta.py +++ b/src/transformers/models/deberta/modeling_tf_deberta.py @@ -119,7 +119,8 @@ def xdropout(self, inputs): Applies dropout to the inputs, as vanilla dropout, but also scales the remaining elements up by 1/drop_prob. 
""" mask = tf.cast( - 1 - tf.compat.v1.distributions.Bernoulli(probs=1 - self.drop_prob).sample(sample_shape=shape_list(inputs)), + 1 + - tf.compat.v1.distributions.Bernoulli(probs=1.0 - self.drop_prob).sample(sample_shape=shape_list(inputs)), tf.bool, ) scale = tf.convert_to_tensor(1.0 / (1 - self.drop_prob), dtype=tf.float32) diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py index 5a0f16be9810..fa9a202427e5 100644 --- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py @@ -121,7 +121,8 @@ def xdropout(self, inputs): Applies dropout to the inputs, as vanilla dropout, but also scales the remaining elements up by 1/drop_prob. """ mask = tf.cast( - 1 - tf.compat.v1.distributions.Bernoulli(probs=1 - self.drop_prob).sample(sample_shape=shape_list(inputs)), + 1 + - tf.compat.v1.distributions.Bernoulli(probs=1.0 - self.drop_prob).sample(sample_shape=shape_list(inputs)), tf.bool, ) scale = tf.convert_to_tensor(1.0 / (1 - self.drop_prob), dtype=tf.float32)