wenet-e2e · robin1001 · Mar 20, 2023 · Mar 20, 2023
diff --git a/wenet/transformer/decoder.py b/wenet/transformer/decoder.py
@@ -42,9 +42,6 @@ class TransformerDecoder(torch.nn.Module):
         normalize_before:
             True: use layer_norm before each sub-block of a layer.
             False: use layer_norm after each sub-block of a layer.
-        concat_after: whether to concat attention layer's input and output
-            True: x -> x + linear(concat(x, att(x)))
-            False: x -> x + att(x)
     """
     def __init__(
         self,
@@ -60,7 +57,6 @@ def __init__(
         input_layer: str = "embed",
         use_output_layer: bool = True,
         normalize_before: bool = True,
-        concat_after: bool = False,
     ):
         assert check_argument_types()
         super().__init__()
@@ -90,7 +86,6 @@ def __init__(
                                         dropout_rate),
                 dropout_rate,
                 normalize_before,
-                concat_after,
             ) for _ in range(self.num_blocks)
         ])
 
@@ -202,9 +197,6 @@ class BiTransformerDecoder(torch.nn.Module):
         normalize_before:
             True: use layer_norm before each sub-block of a layer.
             False: use layer_norm after each sub-block of a layer.
-        concat_after: whether to concat attention layer's input and output
-            True: x -> x + linear(concat(x, att(x)))
-            False: x -> x + att(x)
     """
     def __init__(
         self,
@@ -221,7 +213,6 @@ def __init__(
         input_layer: str = "embed",
         use_output_layer: bool = True,
         normalize_before: bool = True,
-        concat_after: bool = False,
     ):
 
         assert check_argument_types()
@@ -230,13 +221,13 @@ def __init__(
             vocab_size, encoder_output_size, attention_heads, linear_units,
             num_blocks, dropout_rate, positional_dropout_rate,
             self_attention_dropout_rate, src_attention_dropout_rate,
-            input_layer, use_output_layer, normalize_before, concat_after)
+            input_layer, use_output_layer, normalize_before)
 
         self.right_decoder = TransformerDecoder(
             vocab_size, encoder_output_size, attention_heads, linear_units,
             r_num_blocks, dropout_rate, positional_dropout_rate,
             self_attention_dropout_rate, src_attention_dropout_rate,
-            input_layer, use_output_layer, normalize_before, concat_after)
+            input_layer, use_output_layer, normalize_before)
 
     def forward(
         self,

diff --git a/wenet/transformer/decoder_layer.py b/wenet/transformer/decoder_layer.py
@@ -35,10 +35,6 @@ class DecoderLayer(nn.Module):
         normalize_before (bool):
             True: use layer_norm before each sub-block.
             False: to use layer_norm after each sub-block.
-        concat_after (bool): Whether to concat attention layer's inpu
-            and output.
-            True: x -> x + linear(concat(x, att(x)))
-            False: x -> x + att(x)
     """
     def __init__(
         self,
@@ -48,7 +44,6 @@ def __init__(
         feed_forward: nn.Module,
         dropout_rate: float,
         normalize_before: bool = True,
-        concat_after: bool = False,
     ):
         """Construct an DecoderLayer object."""
         super().__init__()
@@ -61,13 +56,6 @@ def __init__(
         self.norm3 = nn.LayerNorm(size, eps=1e-5)
         self.dropout = nn.Dropout(dropout_rate)
         self.normalize_before = normalize_before
-        self.concat_after = concat_after
-        if self.concat_after:
-            self.concat_linear1 = nn.Linear(size + size, size)
-            self.concat_linear2 = nn.Linear(size + size, size)
-        else:
-            self.concat_linear1 = nn.Identity()
-            self.concat_linear2 = nn.Identity()
 
     def forward(
         self,
@@ -115,26 +103,16 @@ def forward(
             residual = residual[:, -1:, :]
             tgt_q_mask = tgt_mask[:, -1:, :]
 
-        if self.concat_after:
-            tgt_concat = torch.cat(
-                (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1)
-            x = residual + self.concat_linear1(tgt_concat)
-        else:
-            x = residual + self.dropout(
-                self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0])
+        x = residual + self.dropout(
+            self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0])
         if not self.normalize_before:
             x = self.norm1(x)
 
         residual = x
         if self.normalize_before:
             x = self.norm2(x)
-        if self.concat_after:
-            x_concat = torch.cat(
-                (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1)
-            x = residual + self.concat_linear2(x_concat)
-        else:
-            x = residual + self.dropout(
-                self.src_attn(x, memory, memory, memory_mask)[0])
+        x = residual + self.dropout(
+            self.src_attn(x, memory, memory, memory_mask)[0])
         if not self.normalize_before:
             x = self.norm2(x)
 

diff --git a/wenet/transformer/encoder.py b/wenet/transformer/encoder.py
@@ -52,7 +52,6 @@ def __init__(
         input_layer: str = "conv2d",
         pos_enc_layer_type: str = "abs_pos",
         normalize_before: bool = True,
-        concat_after: bool = False,
         static_chunk_size: int = 0,
         use_dynamic_chunk: bool = False,
         global_cmvn: torch.nn.Module = None,
@@ -77,10 +76,6 @@ def __init__(
             normalize_before (bool):
                 True: use layer_norm before each sub-block of a layer.
                 False: use layer_norm after each sub-block of a layer.
-            concat_after (bool): whether to concat attention layer's input
-                and output.
-                True: x -> x + linear(concat(x, att(x)))
-                False: x -> x + att(x)
             static_chunk_size (int): chunk size for static chunk training and
                 decoding
             use_dynamic_chunk (bool): whether use dynamic chunk size for
@@ -341,7 +336,6 @@ def __init__(
         input_layer: str = "conv2d",
         pos_enc_layer_type: str = "abs_pos",
         normalize_before: bool = True,
-        concat_after: bool = False,
         static_chunk_size: int = 0,
         use_dynamic_chunk: bool = False,
         global_cmvn: torch.nn.Module = None,
@@ -356,7 +350,7 @@ def __init__(
                          linear_units, num_blocks, dropout_rate,
                          positional_dropout_rate, attention_dropout_rate,
                          input_layer, pos_enc_layer_type, normalize_before,
-                         concat_after, static_chunk_size, use_dynamic_chunk,
+                         static_chunk_size, use_dynamic_chunk,
                          global_cmvn, use_dynamic_left_chunk)
         self.encoders = torch.nn.ModuleList([
             TransformerEncoderLayer(
@@ -365,7 +359,7 @@ def __init__(
                                      attention_dropout_rate),
                 PositionwiseFeedForward(output_size, linear_units,
                                         dropout_rate), dropout_rate,
-                normalize_before, concat_after) for _ in range(num_blocks)
+                normalize_before) for _ in range(num_blocks)
         ])
 
 
@@ -384,7 +378,6 @@ def __init__(
         input_layer: str = "conv2d",
         pos_enc_layer_type: str = "rel_pos",
         normalize_before: bool = True,
-        concat_after: bool = False,
         static_chunk_size: int = 0,
         use_dynamic_chunk: bool = False,
         global_cmvn: torch.nn.Module = None,
@@ -419,7 +412,7 @@ def __init__(
                          linear_units, num_blocks, dropout_rate,
                          positional_dropout_rate, attention_dropout_rate,
                          input_layer, pos_enc_layer_type, normalize_before,
-                         concat_after, static_chunk_size, use_dynamic_chunk,
+                         static_chunk_size, use_dynamic_chunk,
                          global_cmvn, use_dynamic_left_chunk)
         activation = get_activation(activation_type)
 
@@ -457,6 +450,5 @@ def __init__(
                     *convolution_layer_args) if use_cnn_module else None,
                 dropout_rate,
                 normalize_before,
-                concat_after,
             ) for _ in range(num_blocks)
         ])
diff --git a/wenet/transformer/encoder_layer.py b/wenet/transformer/encoder_layer.py
@@ -36,11 +36,6 @@ class TransformerEncoderLayer(nn.Module):
         normalize_before (bool):
             True: use layer_norm before each sub-block.
             False: to use layer_norm after each sub-block.
-        concat_after (bool): Whether to concat attention layer's input and
-            output.
-            True: x -> x + linear(concat(x, att(x)))
-            False: x -> x + att(x)
-
     """
     def __init__(
         self,
@@ -49,7 +44,6 @@ def __init__(
         feed_forward: torch.nn.Module,
         dropout_rate: float,
         normalize_before: bool = True,
-        concat_after: bool = False,
     ):
         """Construct an EncoderLayer object."""
         super().__init__()
@@ -60,11 +54,6 @@ def __init__(
         self.dropout = nn.Dropout(dropout_rate)
         self.size = size
         self.normalize_before = normalize_before
-        self.concat_after = concat_after
-        if concat_after:
-            self.concat_linear = nn.Linear(size + size, size)
-        else:
-            self.concat_linear = nn.Identity()
 
     def forward(
         self,
@@ -101,14 +90,9 @@ def forward(
         residual = x
         if self.normalize_before:
             x = self.norm1(x)
-
         x_att, new_att_cache = self.self_attn(
             x, x, x, mask, cache=att_cache)
-        if self.concat_after:
-            x_concat = torch.cat((x, x_att), dim=-1)
-            x = residual + self.concat_linear(x_concat)
-        else:
-            x = residual + self.dropout(x_att)
+        x = residual + self.dropout(x_att)
         if not self.normalize_before:
             x = self.norm1(x)
 
@@ -141,10 +125,6 @@ class ConformerEncoderLayer(nn.Module):
         normalize_before (bool):
             True: use layer_norm before each sub-block.
             False: use layer_norm after each sub-block.
-        concat_after (bool): Whether to concat attention layer's input and
-            output.
-            True: x -> x + linear(concat(x, att(x)))
-            False: x -> x + att(x)
     """
     def __init__(
         self,
@@ -155,7 +135,6 @@ def __init__(
         conv_module: Optional[nn.Module] = None,
         dropout_rate: float = 0.1,
         normalize_before: bool = True,
-        concat_after: bool = False,
     ):
         """Construct an EncoderLayer object."""
         super().__init__()
@@ -178,11 +157,6 @@ def __init__(
         self.dropout = nn.Dropout(dropout_rate)
         self.size = size
         self.normalize_before = normalize_before
-        self.concat_after = concat_after
-        if self.concat_after:
-            self.concat_linear = nn.Linear(size + size, size)
-        else:
-            self.concat_linear = nn.Identity()
 
 
     def forward(
@@ -230,14 +204,9 @@ def forward(
         residual = x
         if self.normalize_before:
             x = self.norm_mha(x)
-
         x_att, new_att_cache = self.self_attn(
             x, x, x, mask, pos_emb, att_cache)
-        if self.concat_after:
-            x_concat = torch.cat((x, x_att), dim=-1)
-            x = residual + self.concat_linear(x_concat)
-        else:
-            x = residual + self.dropout(x_att)
+        x = residual + self.dropout(x_att)
         if not self.normalize_before:
             x = self.norm_mha(x)