From a10c89a0bd4463de29045c7bd0061b18e5ba7a3d Mon Sep 17 00:00:00 2001 From: zburning <798672141@qq.com> Date: Fri, 25 Oct 2019 11:25:38 +0800 Subject: [PATCH 1/5] Update transformer.py --- .../language_model/transformer/transformer.py | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/scripts/language_model/transformer/transformer.py b/scripts/language_model/transformer/transformer.py index 82e16705c0..32d5909c77 100644 --- a/scripts/language_model/transformer/transformer.py +++ b/scripts/language_model/transformer/transformer.py @@ -486,6 +486,45 @@ def hybrid_forward(self, F, inputs, pos_emb, mem_value, mask, segments): class _BaseXLNet(mx.gluon.HybridBlock): + """ + Parameters + ---------- + vocab_size : int or None, default None + The size of the vocabulary. + num_layers : int + units : int + hidden_size : int + number of units in the hidden layer of position-wise feed-forward networks + num_heads : int + Number of heads in multi-head attention + activation + Activation function used for the position-wise feed-forward networks + two_stream + If True, use Two-Stream Self-Attention. Typically set to True for + pre-training and False during finetuning. + scaled : bool + Whether to scale the softmax input by the sqrt of the input dimension + in multi-head attention + dropout : float + attention_dropout : float + use_residual : bool + clamp_len : int + Clamp all relative distances larger than clamp_len + use_decoder : bool, default True + Whether to include the decoder for language model prediction. + tie_decoder_weight : bool, default True + Whether to tie the decoder weight with the input embeddings + weight_initializer : str or Initializer + Initializer for the input weights matrix, used for the linear + transformation of the inputs. + bias_initializer : str or Initializer + Initializer for the bias vector. + prefix : str, default 'rnn_' + Prefix for name of `Block`s (and name of weight if params is `None`). + params : Parameter or None + Container for weight sharing between cells. Created if `None`. + + """ def __init__(self, vocab_size, num_layers=2, units=128, hidden_size=2048, num_heads=4, activation='gelu', two_stream: bool = False, scaled=True, dropout=0.0, attention_dropout=0.0, use_residual=True, clamp_len: typing.Optional[int] = None, @@ -529,6 +568,33 @@ def __init__(self, vocab_size, num_layers=2, units=128, hidden_size=2048, num_he params=self.word_embed.params if tie_decoder_weight else None) def hybrid_forward(self, F, step_input, segments, mask, pos_seq, mems, mask_embed): #pylint: disable=arguments-differ + """Transformer Decoder Attention Cell. + + Parameters + ---------- + step_input : NDArray + Input of shape [batch_size, query_length] + segments : Symbol or NDArray + One-hot vector indicating if a query-key pair is in the same + segment or not. Shape [batch_size, query_length, query_length + + memory_length, 2]. `1` indicates that the pair is not in the same + segment. + mask : Symbol or NDArray + Attention mask of shape (batch_size, length, length + mem_length) + pos_seq : Symbol or NDArray + Relative distances + mems : List of NDArray or Symbol, optional + Memory from previous forward passes containing + `num_layers` `NDArray`s or `Symbol`s each of shape [batch_size, + memory_length, units]. + + Returns + ------- + core_out : NDArray or Symbol + For use_decoder=True, logits. Otherwise output of last layer. 
+ hids : List of NDArray or Symbol + Stacking the output of each layer + """ if self._clamp_len: pos_seq = F.clip(pos_seq, a_min=0, a_max=self._clamp_len) @@ -635,6 +701,8 @@ def forward(self, step_input, token_types, mems=None, mask=None): # pylint: dis Optional memory from previous forward passes containing `num_layers` `NDArray`s or `Symbol`s each of shape [batch_size, memory_length, units]. + mask : Symbol or NDArray + Attention mask of shape (batch_size, length, length + mem_length) Returns ------- From 23be6c639dc0f1b5a7b2322e8c3f1572457eea00 Mon Sep 17 00:00:00 2001 From: zburning <798672141@qq.com> Date: Fri, 25 Oct 2019 11:33:12 +0800 Subject: [PATCH 2/5] Update transformer.py --- scripts/language_model/transformer/transformer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/language_model/transformer/transformer.py b/scripts/language_model/transformer/transformer.py index 32d5909c77..362d3c204b 100644 --- a/scripts/language_model/transformer/transformer.py +++ b/scripts/language_model/transformer/transformer.py @@ -489,7 +489,7 @@ class _BaseXLNet(mx.gluon.HybridBlock): """ Parameters ---------- - vocab_size : int or None, default None + vocab_size : int The size of the vocabulary. num_layers : int units : int @@ -568,11 +568,10 @@ def __init__(self, vocab_size, num_layers=2, units=128, hidden_size=2048, num_he params=self.word_embed.params if tie_decoder_weight else None) def hybrid_forward(self, F, step_input, segments, mask, pos_seq, mems, mask_embed): #pylint: disable=arguments-differ - """Transformer Decoder Attention Cell. - + """ Parameters ---------- - step_input : NDArray + step_input : Symbol or NDArray Input of shape [batch_size, query_length] segments : Symbol or NDArray One-hot vector indicating if a query-key pair is in the same From 090a6cd70826f2fcad2ae9d46fab2748aac2efab Mon Sep 17 00:00:00 2001 From: zburning <798672141@qq.com> Date: Fri, 25 Oct 2019 13:11:05 +0800 Subject: [PATCH 3/5] Update transformer.py --- scripts/language_model/transformer/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/language_model/transformer/transformer.py b/scripts/language_model/transformer/transformer.py index 362d3c204b..f0c86e9a81 100644 --- a/scripts/language_model/transformer/transformer.py +++ b/scripts/language_model/transformer/transformer.py @@ -568,7 +568,7 @@ def __init__(self, vocab_size, num_layers=2, units=128, hidden_size=2048, num_he params=self.word_embed.params if tie_decoder_weight else None) def hybrid_forward(self, F, step_input, segments, mask, pos_seq, mems, mask_embed): #pylint: disable=arguments-differ - """ + """ Parameters ---------- step_input : Symbol or NDArray From 358169f3cc106da7c39cd70d0855f2389e74e76a Mon Sep 17 00:00:00 2001 From: zburning <798672141@qq.com> Date: Sat, 26 Oct 2019 10:03:00 +0800 Subject: [PATCH 4/5] Update transformer.py deleting trailing white space --- scripts/language_model/transformer/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/language_model/transformer/transformer.py b/scripts/language_model/transformer/transformer.py index f0c86e9a81..e154c468fd 100644 --- a/scripts/language_model/transformer/transformer.py +++ b/scripts/language_model/transformer/transformer.py @@ -586,7 +586,7 @@ def hybrid_forward(self, F, step_input, segments, mask, pos_seq, mems, mask_embe Memory from previous forward passes containing `num_layers` `NDArray`s or `Symbol`s each of shape [batch_size, memory_length, units]. 
- + Returns ------- core_out : NDArray or Symbol From 278340c673095a83593813bee9e6763f5d9d2ee2 Mon Sep 17 00:00:00 2001 From: zburning <798672141@qq.com> Date: Tue, 29 Oct 2019 11:28:40 +0800 Subject: [PATCH 5/5] Update transformer.py --- scripts/language_model/transformer/transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/language_model/transformer/transformer.py b/scripts/language_model/transformer/transformer.py index e154c468fd..e84e143cbe 100644 --- a/scripts/language_model/transformer/transformer.py +++ b/scripts/language_model/transformer/transformer.py @@ -519,9 +519,9 @@ class _BaseXLNet(mx.gluon.HybridBlock): transformation of the inputs. bias_initializer : str or Initializer Initializer for the bias vector. - prefix : str, default 'rnn_' + prefix : str, default None Prefix for name of `Block`s (and name of weight if params is `None`). - params : Parameter or None + params : ParameterDict or None Container for weight sharing between cells. Created if `None`. """
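For orientation, the input shapes documented above for `_BaseXLNet.hybrid_forward` can be mocked up with dummy NDArrays. The following is a minimal sketch rather than part of the patch: it assumes only mxnet and numpy, the concrete sizes are illustrative, and the final call appears only in a comment because block construction and the handling of `mask_embed` are outside this diff.

# Minimal sketch of the shapes documented for _BaseXLNet.hybrid_forward.
# Sizes below are illustrative, not taken from the patch.
import numpy as np
import mxnet as mx

batch_size, query_length, memory_length = 2, 4, 6
units, num_layers, vocab_size = 128, 2, 32000

# step_input: token ids of shape [batch_size, query_length]
step_input = mx.nd.array(
    np.random.randint(0, vocab_size, size=(batch_size, query_length)))

# segments: one-hot pair indicators of shape
# [batch_size, query_length, query_length + memory_length, 2];
# all zeros here means every query-key pair is treated as "same segment".
same_segment = mx.nd.zeros((batch_size, query_length, query_length + memory_length))
segments = mx.nd.one_hot(same_segment, depth=2)

# mask: attention mask of shape (batch_size, query_length, query_length + memory_length)
mask = mx.nd.ones((batch_size, query_length, query_length + memory_length))

# pos_seq: relative distances, counting down over memory plus the current step
pos_seq = mx.nd.array(np.arange(query_length + memory_length - 1, -1, -1))

# mems: num_layers tensors of shape [batch_size, memory_length, units]
mems = [mx.nd.zeros((batch_size, memory_length, units)) for _ in range(num_layers)]

# With a constructed block `net` (constructor arguments as in the parameter list
# above), the documented call would look roughly like
#   core_out, hids = net(step_input, segments, mask, pos_seq, mems)
# where core_out holds the logits when use_decoder=True and hids stacks the
# per-layer outputs. The exact invocation (e.g. how the mask_embed parameter is
# supplied) is not shown in this diff, so that last step is indicative only.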