[fix-doc-bug] Fix fused_attention_op english doc test=document_fix #36803

Merged
42 changes: 24 additions & 18 deletions python/paddle/incubate/nn/functional/fused_transformer.py
@@ -194,24 +194,27 @@ def fused_multi_head_attention(x,
Multi-Head Attention performs multiple parallel attention to jointly attending
to information from different representation subspaces. This API only
support self_attention. The pseudo code is as follows:
-if pre_layer_norm:
-    out = layer_norm(x);
-    out = linear(out) + qkv)bias
-else:
-    out = linear(x) + bias;
-out = transpose(out, perm=[2, 0, 3, 1, 4]);
-# extract q, k and v from out.
-q = out[0:1,::]
-k = out[1:2,::]
-v = out[2:3,::]
-out = q * k^t;
-out = attn_mask + out;
-out = softmax(out);
-out = dropout(out);
-out = out * v;
-out = transpose(out, perm=[0, 2, 1, 3]);
-out = out_linear(out);
-out = layer_norm(x + dropout(linear_bias + out));

+.. code-block:: python
+
+    if pre_layer_norm:
+        out = layer_norm(x)
+        out = linear(out) + qkv_bias
+    else:
+        out = linear(x) + bias
+    out = transpose(out, perm=[2, 0, 3, 1, 4])
+    # extract q, k and v from out.
+    q = out[0:1,::]
+    k = out[1:2,::]
+    v = out[2:3,::]
+    out = q * k^t
+    out = attn_mask + out
+    out = softmax(out)
+    out = dropout(out)
+    out = out * v
+    out = transpose(out, perm=[0, 2, 1, 3])
+    out = out_linear(out)
+    out = layer_norm(x + dropout(linear_bias + out))

Parameters:
x (Tensor): The input tensor of fused_multi_head_attention. The shape is
@@ -245,6 +248,9 @@ def fused_multi_head_attention(x,
ln_epsilon (float, optional): Small float value added to denominator of layer_norm
to avoid dividing by zero. Default is 1e-5.

+Returns:
+    Tensor: The output Tensor, the data type and shape is same as `x`.

Examples:

.. code-block:: python
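To make the rewritten pseudo code concrete, the following is a minimal, unfused sketch of the same computation using plain Paddle ops, following the pre_layer_norm branch. The toy tensor shapes, the [embed_dim, 3 * embed_dim] layout assumed for qkv_weight, the all-zero attn_mask and the zero dropout probabilities are illustrative assumptions rather than the internals of the fused kernel, and the attention scores are left unscaled exactly as in the pseudo code.

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    # Assumed toy sizes for illustration only.
    batch, seq_len, embed_dim, num_heads = 2, 4, 8, 2
    head_dim = embed_dim // num_heads

    x = paddle.randn([batch, seq_len, embed_dim])
    qkv_weight = paddle.randn([embed_dim, 3 * embed_dim])   # assumed combined QKV projection
    qkv_bias = paddle.zeros([3 * embed_dim])
    linear_weight = paddle.randn([embed_dim, embed_dim])
    linear_bias = paddle.zeros([embed_dim])
    attn_mask = paddle.zeros([batch, num_heads, seq_len, seq_len])  # nothing masked

    residual = x
    out = F.layer_norm(x, embed_dim)                    # pre_layer_norm branch
    out = paddle.matmul(out, qkv_weight) + qkv_bias     # linear(out) + qkv_bias
    out = out.reshape([batch, seq_len, 3, num_heads, head_dim])
    out = paddle.transpose(out, perm=[2, 0, 3, 1, 4])   # [3, batch, heads, seq, head_dim]
    q, k, v = out[0], out[1], out[2]                    # extract q, k and v
    out = paddle.matmul(q, k, transpose_y=True)         # q * k^t (unscaled, as in the pseudo code)
    out = attn_mask + out
    out = F.softmax(out)
    out = F.dropout(out, p=0.0)
    out = paddle.matmul(out, v)                         # [batch, heads, seq, head_dim]
    out = paddle.transpose(out, perm=[0, 2, 1, 3]).reshape([batch, seq_len, embed_dim])
    out = paddle.matmul(out, linear_weight)             # out_linear
    out = F.layer_norm(residual + F.dropout(out + linear_bias, p=0.0), embed_dim)
    print(out.shape)  # [2, 4, 8]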
14 changes: 9 additions & 5 deletions python/paddle/incubate/nn/layer/fused_transformer.py
@@ -24,11 +24,12 @@

class FusedMultiHeadAttention(Layer):
"""
-Attention mapps queries and a set of key-value pairs to outputs, and
+Attention mapps queries and a set of key-value pairs to outputs, and
Multi-Head Attention performs multiple parallel attention to jointly attending
to information from different representation subspaces.
Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_
for more details.

Parameters:
embed_dim (int): The expected feature size in the input and output.
num_heads (int): The number of heads in multi-head attention.
@@ -42,17 +43,18 @@ class FusedMultiHeadAttention(Layer):
`embed_dim`. Default None.
vdim (int, optional): The feature size in value. If None, assumed equal to
`embed_dim`. Default None.
-normalize_before (bool, optional): Indicate whether it is pre_layer_norm (True)
-    or post_layer_norm architecture (False). Default False.
+normalize_before (bool, optional): Indicate whether it is pre_layer_norm
+    (True) or post_layer_norm architecture (False). Default False.
need_weights (bool, optional): Indicate whether to return the attention
weights. Now, only False is supported. Default False.
weight_attr(ParamAttr, optional): To specify the weight parameter property.
Default: None, which means the default weight parameter property is used.
-    See usage for details in :code:`ParamAttr` .
+    See usage for details in :code:`ParamAttr`.
bias_attr (ParamAttr|bool, optional): To specify the bias parameter property.
Default: None, which means the default bias parameter property is used.
If it is set to False, this layer will not have trainable bias parameter.
-    See usage for details in :code:`ParamAttr` .
+    See usage for details in :code:`ParamAttr`.

Examples:

.. code-block:: python
@@ -139,6 +141,7 @@ def forward(self, query, key=None, value=None, attn_mask=None, cache=None):
"""
Applies multi-head attention to map queries and a set of key-value pairs
to outputs.

Parameters:
query (Tensor): The queries for multi-head attention. It is a
tensor with shape `[batch_size, query_length, embed_dim]`. The
@@ -163,6 +166,7 @@ def forward(self, query, key=None, value=None, attn_mask=None, cache=None):
nothing wanted or needed to be prevented attention to. Default None.
cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional):
Now, only None is supported. Default None.

Returns:
Tensor|tuple: It is a tensor that has the same shape and data type \
as `query`, representing attention output.
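Because the diff above only touches the docstring, a short usage sketch of the layer may help. It relies only on the constructor arguments and the forward signature documented above; the tensor shapes are hypothetical, self-attention is assumed (key and value default to query), and the fused layer generally requires a GPU build of Paddle.

.. code-block:: python

    # required: gpu
    import paddle
    from paddle.incubate.nn import FusedMultiHeadAttention

    # Hypothetical sizes for illustration.
    batch_size, query_length, embed_dim, num_heads = 2, 4, 128, 2

    # normalize_before=False selects the post_layer_norm architecture (the default).
    attn = FusedMultiHeadAttention(embed_dim=embed_dim,
                                   num_heads=num_heads,
                                   normalize_before=False)

    query = paddle.rand([batch_size, query_length, embed_dim])
    # attn_mask=None means nothing is prevented from being attended to.
    out = attn(query, attn_mask=None)  # self-attention: key and value default to query
    print(out.shape)  # [2, 4, 128]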