Fused attention op forward (#35905)

功能：本PR的目标是提高attention模块的计算性能。为了减少框架层对op的调度开销，本PR在C++层手动实现attention模块，对外提供一个attention大op；为了减少访存开销，本PR采取了两种优化方法：（1）计算q、k、v时共享输入X，将该处的gemm、transpose和bias add从三次调用减少为一次；（2）使用kernel融合优化技术，在不同cuda kernel之间通过寄存器传输数据。
PaddlePaddle · Oct 22, 2021 · d490621 · d490621
1 parent 08248db
commit d490621
Show file tree

Hide file tree

Showing 12 changed files with 931 additions and 3 deletions.
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
@@ -217,7 +217,7 @@ function(op_library TARGET)
 "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
 "sync_batch_norm_op" "sparse_attention_op"  "dgc_op" "fused_fc_elementwise_layernorm_op"
 "skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" "fusion_lstm_op"
-"fused_bn_add_activation_op" "resnet_unit_op")
+"fused_bn_add_activation_op" "fused_attention_op" "resnet_unit_op")
         if ("${TARGET}" STREQUAL "${manual_pybind_op}")
             set(pybind_flag 1)
         endif()

diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h
@@ -34,6 +34,9 @@ inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx,
     TensorCopySync(*seed, platform::CPUPlace(), &seed_cpu_tensor);
     *seed_data = static_cast<uint64_t>(seed_cpu_tensor.data<int>()[0]);
     *increment = offset;
+  } else if (seed && platform::is_cpu_place(seed->place())) {
+    *seed_data = *(seed->data<int>());
+    *increment = offset;
   } else if (gen_cuda->GetIsInitPy() && (!is_fix_seed)) {
     auto seed_offset = gen_cuda->IncrementOffset(offset);
     *seed_data = seed_offset.first;

diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt
@@ -16,6 +16,7 @@ register_operators(EXCLUDES
     fusion_gru_op
     fusion_lstm_op
     fused_bn_add_activation_op
+    fused_attention_op
     fused_transformer_op
     resnet_unit_op)
 
@@ -78,6 +79,9 @@ if (WITH_GPU OR WITH_ROCM)
         nv_test(test_fused_residual_dropout_bias SRCS fused_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory)
         nv_test(test_fused_dropout_act_bias SRCS fused_dropout_act_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory)
         nv_test(test_fused_layernorm_residual_dropout_bias SRCS fused_layernorm_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory)
+        # fused_attention_op
+        op_library(fused_attention_op)
+        file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_attention);\n")
     endif()
     # resnet_unit needs cudnn 8.0 above
     if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000))

diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc
diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu
@@ -0,0 +1,209 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cuda_fp16.h>
+#include <cub/cub.cuh>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
+
+#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+#include "paddle/fluid/operators/fused/attention_layer_norm.h"
+#include "paddle/fluid/operators/fused/attn_gemm.h"
+#include "paddle/fluid/operators/fused/fmha_ref.h"
+#include "paddle/fluid/operators/fused/fused_dropout_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+// Forward CUDA kernel of fused_attention: [layer norm ->] QKV matmul -> FMHARef -> out-linear -> bias/dropout/residual/layer norm.
+template <typename T>
+class FusedAttentionOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    using U = LayerNormParamType<T>;
+    auto *input_x = ctx.Input<Tensor>("X");
+
+    const auto pre_layer_norm = ctx.Attr<bool>("pre_layer_norm");
+    const float epsilon = ctx.Attr<float>("epsilon");
+    auto *ln_scale = ctx.Input<Tensor>("LnScale");  // optional: null-checked below
+    auto *ln_bias = ctx.Input<Tensor>("LnBias");  // optional: null-checked below
+    auto *ln_mean = ctx.Output<Tensor>("LnMean");
+    auto *ln_var = ctx.Output<Tensor>("LnVariance");
+    auto *ln_out = ctx.Output<Tensor>("LnOut");
+
+    // x: input of the QKV projection, [batch_size, seq_len, dim_embed]
+    // y: weight of the QKV projection, [3, num_head, dim_head, dim_embed]
+    auto *qkv_weight = ctx.Input<Tensor>("QKVW");
+    auto *qkv_bias = ctx.Input<Tensor>("QKVBias");  // data<T>() is called on it without a null check
+    auto *qkv_out = ctx.Output<Tensor>("QKVOut");
+    auto *qkv_bias_out = ctx.Output<Tensor>("QKVBiasOut");
+    // Tensors of the attention core; they are handed to FMHARef as tensors below.
+    auto *src_mask = ctx.Input<Tensor>("SrcMask");  // dereferenced unconditionally at the FMHARef call
+    auto *transpose_out_2 = ctx.Output<Tensor>("TransposeOut2");
+    auto *qk_out = ctx.Output<Tensor>("QKOut");
+    auto *qktv_out = ctx.Output<Tensor>("QKTVOut");
+    auto *softmax_out = ctx.Output<Tensor>("SoftmaxOut");
+    auto *attn_dropout_mask_out = ctx.Output<Tensor>("AttnDropoutMaskOut");
+    auto *attn_dropout_out = ctx.Output<Tensor>("AttnDropoutOut");
+    auto *src_mask_out = ctx.Output<Tensor>("SrcMaskOut");
+    auto *fmha_out = ctx.Output<Tensor>("FMHAOut");
+    // Tensors of the output projection (out-linear).
+    auto *out_linear_weight = ctx.Input<Tensor>("OutLinearW");
+    auto *out_linear_bias = ctx.Input<Tensor>("OutLinearBias");
+    auto *out_linear_out = ctx.Output<Tensor>("OutLinearOut");
+    // Tensors and epsilon of the final bias + dropout + residual + layer norm step.
+    auto *ln_scale_2 = ctx.Input<Tensor>("Ln2Scale");  // optional: null-checked below
+    auto *ln_bias_2 = ctx.Input<Tensor>("Ln2Bias");  // optional: null-checked below
+    auto *dropout_mask_out = ctx.Output<Tensor>("DropoutMaskOut");
+    auto *bias_dropout_residual_out =
+        ctx.Output<Tensor>("BiasDropoutResidualOut");
+    auto *ln_mean_2 = ctx.Output<Tensor>("Ln2Mean");
+    auto *ln_var_2 = ctx.Output<Tensor>("Ln2Variance");
+    const float ln_epsilon = ctx.Attr<float>("ln_epsilon");
+    // Attention-dropout settings; all of them are bundled into AttnDropoutParam below.
+    float attn_dropout_rate = ctx.Attr<float>("attn_dropout_rate");
+    bool is_test_1 = ctx.Attr<bool>("attn_dropout_is_test");
+    auto &dropout_implementation_1 =
+        ctx.Attr<std::string>("attn_dropout_implementation");
+    bool is_upscale_in_train_1 =
+        (dropout_implementation_1 == "upscale_in_train");
+    auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input<Tensor>("Seed1") : nullptr;
+    bool is_fix_seed_1 = ctx.Attr<bool>("attn_dropout_fix_seed");
+    int seed_val_1 = ctx.Attr<int>("attn_dropout_seed");
+
+    // Final output of the op.
+    auto *out = ctx.Output<Tensor>("Y");
+
+    // Dimensions and data pointers for the layer norm and QKV projection part.
+    const auto input_x_dims = input_x->dims();
+    const auto qkv_w_dims = qkv_weight->dims();
+
+    auto *x_data = input_x->data<T>();
+    auto *ln_scale_data = (ln_scale == nullptr ? nullptr : ln_scale->data<U>());
+    auto *ln_bias_data = (ln_bias == nullptr ? nullptr : ln_bias->data<U>());
+    auto *ln_mean_data = ln_mean->mutable_data<U>(ctx.GetPlace());
+    auto *ln_var_data = ln_var->mutable_data<U>(ctx.GetPlace());
+    auto *ln_out_data = ln_out->mutable_data<T>(ctx.GetPlace());
+    // NOTE: the three Ln* buffers above are requested even when pre_layer_norm is false.
+    auto *qkv_weight_data = qkv_weight->data<T>();
+    auto *qkv_bias_data = qkv_bias->data<T>();
+    auto *qkv_out_data = qkv_out->mutable_data<T>(ctx.GetPlace());
+    auto *qkv_bias_out_data = qkv_bias_out->mutable_data<T>(ctx.GetPlace());
+    // Data pointers for FMHA. Only fmha_out_data is read later in this function; the other pointers are unused.
+    // NOTE(review): the mutable_data calls presumably exist to allocate the outputs FMHARef writes - confirm.
+    auto *transpose_out_2_data =
+        transpose_out_2->mutable_data<T>(ctx.GetPlace());
+    auto *qk_out_data = qk_out->mutable_data<T>(ctx.GetPlace());
+    auto *qktv_out_data = qktv_out->mutable_data<T>(ctx.GetPlace());
+    auto *src_mask_out_data = src_mask_out->mutable_data<T>(ctx.GetPlace());
+    auto *softmax_out_data = softmax_out->mutable_data<T>(ctx.GetPlace());
+    auto *attn_dropout_mask_out_data =
+        attn_dropout_mask_out->mutable_data<uint8_t>(ctx.GetPlace());
+    auto *attn_dropout_out_data =
+        attn_dropout_out->mutable_data<T>(ctx.GetPlace());
+    auto *fmha_out_data = fmha_out->mutable_data<T>(ctx.GetPlace());
+
+    // Data pointers for out_linear.
+    auto *out_linear_weight_data = out_linear_weight->data<T>();
+    auto *out_linear_bias_data = out_linear_bias->data<T>();
+    auto *out_linear_out_data = out_linear_out->mutable_data<T>(ctx.GetPlace());
+
+    // Data pointers for bias + dropout + residual + layernorm.
+    auto *ln_scale_2_data =
+        (ln_scale_2 == nullptr ? nullptr : ln_scale_2->data<U>());
+    auto *ln_bias_2_data =
+        (ln_bias_2 == nullptr ? nullptr : ln_bias_2->data<U>());
+    auto *dropout_mask_out_data =
+        dropout_mask_out->mutable_data<uint8_t>(ctx.GetPlace());
+    auto *bias_dropout_residual_out_data =
+        bias_dropout_residual_out->mutable_data<T>(ctx.GetPlace());
+    auto *ln_mean_2_data = ln_mean_2->mutable_data<U>(ctx.GetPlace());
+    auto *ln_var_2_data = ln_var_2->mutable_data<U>(ctx.GetPlace());
+    auto *final_out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    int batch_size = input_x_dims[0];
+    int max_seq_len = input_x_dims[1];
+    int dim_embed = input_x_dims[2];
+
+    int num_head = qkv_w_dims[1];
+    int dim_head = qkv_w_dims[2];
+
+    int bsz_seq = batch_size * max_seq_len;
+    int hidden_size = num_head * dim_head;
+    int output_size = 3 * hidden_size;  // q, k and v are produced by a single matmul
+    int input_size = dim_embed;
+    // Build the helper object of every stage up front; they are run in order further below.
+    auto layer_norm_compute = AttnLayerNorm<T>(ctx.cuda_device_context(),
+                                               epsilon, bsz_seq, dim_embed);
+    // (transA, transB, compute_bias) = (false, true, true)
+    auto qkv_compute = AttnMatMul<T>(ctx.cuda_device_context(), false, true,
+                                     bsz_seq, output_size, input_size, true);
+
+    AttnDropoutParam attn_dropout_param(
+        is_test_1, dropout_implementation_1, attn_dropout_rate,
+        is_upscale_in_train_1, is_fix_seed_1, seed_val_1, seed_1);
+    auto fmha_ref_compute =
+        FMHARef<T>(ctx.cuda_device_context(), batch_size, max_seq_len, num_head,
+                   dim_head, attn_dropout_param);
+    // output_size is reused: from here on it holds hidden_size for the out-linear matmul.
+    output_size = hidden_size;
+    // (transA, transB, compute_bias) = (false, false, false)
+    auto out_linear_compute =
+        AttnMatMul<T>(ctx.cuda_device_context(), false, false, bsz_seq,
+                      output_size, input_size, false);
+    DropoutParam dropout_param2(ctx, 0);
+    FusedDropoutLayerNormHelper<T, uint8_t> fused_dropout_layernorm_helper(
+        ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param2,
+        ln_epsilon);
+    // With pre_layer_norm the QKV matmul consumes the layer-normed input (ln_out_data); otherwise x_data directly.
+    if (pre_layer_norm) {
+      layer_norm_compute.ComputeForward(x_data, ln_scale_data, ln_bias_data,
+                                        ln_out_data, ln_mean_data, ln_var_data);
+      qkv_compute.ComputeForward(qkv_weight_data, ln_out_data, qkv_bias_data,
+                                 qkv_out_data, qkv_bias_out_data);
+    } else {
+      qkv_compute.ComputeForward(qkv_weight_data, x_data, qkv_bias_data,
+                                 qkv_out_data, qkv_bias_out_data);
+    }
+    fmha_ref_compute.ComputeForward(*qkv_bias_out, *src_mask, transpose_out_2,
+                                    qk_out, src_mask_out, softmax_out,
+                                    attn_dropout_mask_out, attn_dropout_out,
+                                    qktv_out, fmha_out);
+    // fmha_out: [batch_size, seq_len, num_head, head_dim]
+    // weight:   [embed_dim, embed_dim]
+    // out_linear_out: [batch_size, seq_len, embed_dim]; no bias is passed here, out_linear_bias_data goes to the fused step below
+    out_linear_compute.ComputeForward(out_linear_weight_data, fmha_out_data,
+                                      nullptr, out_linear_out_data, nullptr);
+    // output = layernorm(residual + dropout(input + bias)); x_data is presumably the residual argument - confirm against the helper
+    fused_dropout_layernorm_helper.LayernormResidualDropoutBias(
+        ctx.cuda_device_context(), out_linear_out_data, x_data,
+        out_linear_bias_data, ln_scale_2_data, ln_bias_2_data,
+        bias_dropout_residual_out_data, dropout_mask_out_data, final_out_data,
+        ln_mean_2_data, ln_var_2_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;  // alias used for plat::float16 in the registration below
+REGISTER_OP_CUDA_KERNEL(fused_attention, ops::FusedAttentionOpKernel<float>,  // float, double and float16 instantiations
+                        ops::FusedAttentionOpKernel<double>,
+                        ops::FusedAttentionOpKernel<plat::float16>);
diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h
@@ -66,7 +66,7 @@ struct DropoutParam {
     } else {
       pre_fix = pre_fix + "_";
     }
-    dropout_prob = context.Attr<float>(pre_fix + "prob");
+    dropout_prob = context.Attr<float>(pre_fix + "rate");
     auto& dropout_implementation =
         context.Attr<std::string>(pre_fix + "implementation");
     is_upscale_in_train = (dropout_implementation == "upscale_in_train");

diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
@@ -40,6 +40,9 @@
 // need to manually specify them in this map.
 std::map<std::string, std::set<std::string>> op_ins_map = {
     {"layer_norm", {"X", "Scale", "Bias"}},
+    {"fused_attention",
+     {"X", "LnScale", "LnBias", "QKVW", "QKVBias", "SrcMask", "OutLinearW",
+      "OutLinearBias", "Ln2Scale", "Ln2Bias"}},
     {"instance_norm", {"X", "Scale", "Bias"}},
     {"gru_unit", {"Input", "HiddenPrev", "Weight", "Bias"}},
     {"label_smooth", {"X", "PriorDist"}},
@@ -92,6 +95,11 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
     {"batch_norm",
      {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance",
       "ReserveSpace"}},
+    {"fused_attention",
+     {"LnMean", "LnVariance", "LnOut", "QKVOut", "QKVBiasOut", "TransposeOut2",
+      "QKOut", "QKTVOut", "SoftmaxOut", "AttnDropoutMaskOut", "AttnDropoutOut",
+      "SrcMaskOut", "FMHAOut", "OutLinearOut", "DropoutMaskOut", "Ln2Mean",
+      "Ln2Variance", "BiasDropoutResidualOut", "Y"}},
     {"sync_batch_norm",
      {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance",
       "ReserveSpace"}},

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -97,6 +97,10 @@ foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
   list(REMOVE_ITEM TEST_OPS ${TEST_OP})
 endforeach()
 
+if(NOT WITH_GPU)
+    LIST(REMOVE_ITEM TEST_OPS test_fused_attention_op)
+endif()
+
 if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_c_comm_init_all_op)
     LIST(REMOVE_ITEM TEST_OPS test_c_concat)