diff --git a/wenet/cif/cif_decoder.py b/wenet/cif/cif_decoder.py
index 0193406a2..919e0cc00 100644
--- a/wenet/cif/cif_decoder.py
+++ b/wenet/cif/cif_decoder.py
@@ -48,11 +48,6 @@ class BaseDecoder(nn.Module):
         use_output_layer: whether to use output layer
         pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
         normalize_before: whether to use layer_norm before the first block
-        concat_after: whether to concat attention layer's input and output
-            if True, additional linear will be applied.
-            i.e. x -> x + linear(concat(x, att(x)))
-            if False, no additional linear will be applied.
-            i.e. x -> x + att(x)
     """
 
     def __init__(
@@ -177,7 +172,6 @@ def __init__(
         use_output_layer: bool = True,
         pos_enc_class=PositionalEncoding,
         normalize_before: bool = True,
-        concat_after: bool = False,
         embeds_id: int = -1,
     ):
         assert check_argument_types()
@@ -205,8 +199,7 @@ def __init__(
                 PositionwiseFeedForward(attention_dim, linear_units,
                                         dropout_rate),
                 dropout_rate,
-                normalize_before,
-                concat_after)
+                normalize_before)
             for _ in range(num_blocks)
         ])
 
@@ -294,7 +287,6 @@ def __init__(
         use_output_layer: bool = True,
         pos_enc_class=PositionalEncoding,
         normalize_before: bool = True,
-        concat_after: bool = False,
         att_layer_num: int = 6,
         kernel_size: int = 21,
         sanm_shfit: int = 0
@@ -355,7 +347,6 @@ def __init__(
                                         dropout_rate),
                 dropout_rate,
                 normalize_before,
-                concat_after,
             ) for _ in range(att_layer_num)
         ])
         if num_blocks - att_layer_num <= 0:
@@ -374,7 +365,6 @@ def __init__(
                                         dropout_rate),
                 dropout_rate,
                 normalize_before,
-                concat_after,
             ) for _ in range(num_blocks - att_layer_num)
         ])
         self.decoders3 = torch.nn.ModuleList([
@@ -386,7 +376,6 @@ def __init__(
                                         dropout_rate),
                 dropout_rate,
                 normalize_before,
-                concat_after,
             ) for _ in range(1)
         ])
 
diff --git a/wenet/cif/decoder_layer.py b/wenet/cif/decoder_layer.py
index deab28e8a..2e5c60f39 100644
--- a/wenet/cif/decoder_layer.py
+++ b/wenet/cif/decoder_layer.py
@@ -34,11 +34,6 @@ class DecoderLayerSANM(nn.Module):
         dropout_rate (float): Dropout rate.
         normalize_before (bool):
             Whether to use layer_norm before the first block.
-        concat_after (bool): Whether to concat attention layer's input and
-            output.
-            if True, additional linear will be applied.
-            i.e. x -> x + linear(concat(x, att(x)))
-            if False, no additional linear will be applied. i.e. x -> x + att(x)
     """
 
     def __init__(
@@ -49,7 +44,6 @@ def __init__(
         feed_forward: nn.Module,
         dropout_rate: float,
         normalize_before: bool = True,
-        concat_after: bool = False,
     ):
         """Construct an DecoderLayer object."""
         super(DecoderLayerSANM, self).__init__()
@@ -64,13 +58,6 @@ def __init__(
         self.norm3 = nn.LayerNorm(size, eps=1e-12)
         self.dropout = nn.Dropout(dropout_rate)
         self.normalize_before = normalize_before
-        self.concat_after = concat_after
-        if self.concat_after:
-            self.concat_linear1 = nn.Linear(size + size, size)
-            self.concat_linear2 = nn.Linear(size + size, size)
-        else:
-            self.concat_linear1 = nn.Identity()
-            self.concat_linear2 = nn.Identity()
 
     def forward(
         self,
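
For reference, the behaviour this diff removes is the concat_after residual variant described in the deleted docstrings: with the flag set, a layer computes x + linear(concat(x, att(x))); with it unset, only the plain residual x + att(x) remains. The sketch below is a minimal, self-contained illustration of those two paths, not the actual wenet DecoderLayerSANM; the module name ResidualSelfAttentionBlock and the use of torch.nn.MultiheadAttention as the attention sub-module are assumptions made purely for demonstration.

import torch
from torch import nn


class ResidualSelfAttentionBlock(nn.Module):
    """Illustrative block contrasting the two residual paths that the
    removed concat_after flag used to select between (hypothetical module,
    not part of wenet)."""

    def __init__(self, size: int, num_heads: int = 4,
                 concat_after: bool = False):
        super().__init__()
        self.att = nn.MultiheadAttention(size, num_heads, batch_first=True)
        self.concat_after = concat_after
        # The extra projection is only needed for the (now removed)
        # concat_after=True path.
        self.concat_linear = (nn.Linear(size + size, size)
                              if concat_after else nn.Identity())

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        att_out, _ = self.att(x, x, x, need_weights=False)
        if self.concat_after:
            # Removed path: x -> x + linear(concat(x, att(x)))
            return x + self.concat_linear(torch.cat((x, att_out), dim=-1))
        # Remaining path: x -> x + att(x)
        return x + att_out


if __name__ == "__main__":
    x = torch.randn(2, 10, 64)  # (batch, time, size)
    block = ResidualSelfAttentionBlock(64, concat_after=False)
    print(block(x).shape)  # torch.Size([2, 10, 64])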