[MLU] add mlu activation kernels #41751

Merged: 1 commit, Apr 15, 2022
138 changes: 110 additions & 28 deletions paddle/fluid/operators/activation_op_mlu.cc
@@ -15,12 +15,8 @@ limitations under the Licnse. */
#include <memory>
#include <string>

#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/device/mlu/device_context.h"
#include "paddle/phi/core/ddim.h"

namespace paddle {
namespace operators {
@@ -38,20 +34,39 @@ class ActivationMLUKernel : public framework::OpKernel<T> {
output->mutable_data<T>(ctx.GetPlace());

MLUCnnlActivationDesc act_desc(act_mode, alpha);
MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(input->dtype()));
MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(output->dtype()));

MLUCnnl::Active(ctx, act_desc.get(), input_desc.get(),
reinterpret_cast<const void*>(input->data<T>()),
output_desc.get(),
reinterpret_cast<void*>(output->data<T>()));
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc output_desc(*output);

MLUCnnl::Active(ctx, act_desc.get(), input_desc.get(), GetBasePtr(input),
output_desc.get(), GetBasePtr(output));
}
};

// For gelu, leaky_relu
template <cnnlActivationMode_t act_mode, typename T>
class ActivationGradMLUKernel : public framework::OpKernel<T> {
class ActivationGradMLUKernelV1 : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<Tensor>("X");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;

dx->mutable_data<T>(ctx.GetPlace());

MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlActivationDesc act_desc(act_mode, alpha);
MLUCnnl::ActiveGrad(ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr,
dout_desc.get(), GetBasePtr(dout), x_desc.get(),
GetBasePtr(x), dx_desc.get(), GetBasePtr(dx));
}
};

// For tanh, sigmoid
template <cnnlActivationMode_t act_mode, typename T>
class ActivationGradMLUKernelV2 : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<Tensor>("Out");
@@ -61,18 +76,35 @@ class ActivationGradMLUKernel : public framework::OpKernel<T> {

dx->mutable_data<T>(ctx.GetPlace());

MLUCnnlTensorDesc dout_desc(*dout, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(dout->dtype()));
MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(out->dtype()));
MLUCnnlTensorDesc dx_desc(*dx, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(dx->dtype()));
MLUCnnlTensorDesc out_desc(*out);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlActivationDesc act_desc(act_mode, alpha);
MLUCnnl::ActiveGrad(
ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr,
dout_desc.get(), reinterpret_cast<const void*>(dout->data<T>()),
out_desc.get(), reinterpret_cast<const void*>(out->data<T>()),
dx_desc.get(), reinterpret_cast<void*>(dx->data<T>()));
MLUCnnl::ActiveGrad(ctx, act_desc.get(), nullptr, nullptr, out_desc.get(),
GetBasePtr(out), dout_desc.get(), GetBasePtr(dout),
nullptr, nullptr, dx_desc.get(), GetBasePtr(dx));
}
};

// For relu, relu6
template <cnnlActivationMode_t act_mode, typename T>
class ActivationGradMLUKernelV3 : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;

dx->mutable_data<T>(ctx.GetPlace());

MLUCnnlTensorDesc out_desc(*out);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlActivationDesc act_desc(act_mode, alpha);
MLUCnnl::ActiveGrad(ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr,
dout_desc.get(), GetBasePtr(dout), out_desc.get(),
GetBasePtr(out), dx_desc.get(), GetBasePtr(dx));
}
};

@@ -81,10 +113,60 @@ class ActivationGradMLUKernel : public framework::OpKernel<T> {

namespace ops = paddle::operators;

// relu
REGISTER_OP_MLU_KERNEL(
relu, ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
relu_grad, ops::ActivationGradMLUKernel<CNNL_ACTIVATION_RELU, float>,
ops::ActivationGradMLUKernel<CNNL_ACTIVATION_RELU,
paddle::platform::float16>);
relu_grad, ops::ActivationGradMLUKernelV3<CNNL_ACTIVATION_RELU, float>,
ops::ActivationGradMLUKernelV3<CNNL_ACTIVATION_RELU,
paddle::platform::float16>);

// relu6
REGISTER_OP_MLU_KERNEL(
relu6, ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU6, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU6, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
relu6_grad, ops::ActivationGradMLUKernelV3<CNNL_ACTIVATION_RELU6, float>,
ops::ActivationGradMLUKernelV3<CNNL_ACTIVATION_RELU6,
paddle::platform::float16>);

// sigmoid
REGISTER_OP_MLU_KERNEL(sigmoid,
ops::ActivationMLUKernel<CNNL_ACTIVATION_SIGMOID, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_SIGMOID,
paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
sigmoid_grad,
ops::ActivationGradMLUKernelV2<CNNL_ACTIVATION_SIGMOID, float>,
ops::ActivationGradMLUKernelV2<CNNL_ACTIVATION_SIGMOID,
paddle::platform::float16>);

// tanh
REGISTER_OP_MLU_KERNEL(
tanh, ops::ActivationMLUKernel<CNNL_ACTIVATION_TANH, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_TANH, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
tanh_grad, ops::ActivationGradMLUKernelV2<CNNL_ACTIVATION_TANH, float>,
ops::ActivationGradMLUKernelV2<CNNL_ACTIVATION_TANH,
paddle::platform::float16>);

// gelu
REGISTER_OP_MLU_KERNEL(
gelu, ops::ActivationMLUKernel<CNNL_ACTIVATION_GELU, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_GELU, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
gelu_grad, ops::ActivationGradMLUKernelV1<CNNL_ACTIVATION_GELU, float>,
ops::ActivationGradMLUKernelV1<CNNL_ACTIVATION_GELU,
paddle::platform::float16>);

// leaky_relu
REGISTER_OP_MLU_KERNEL(
leaky_relu, ops::ActivationMLUKernel<CNNL_ACTIVATION_LEAKYRELU, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_LEAKYRELU,
paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
leaky_relu_grad,
ops::ActivationGradMLUKernelV1<CNNL_ACTIVATION_LEAKYRELU, float>,
ops::ActivationGradMLUKernelV1<CNNL_ACTIVATION_LEAKYRELU,
paddle::platform::float16>);
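
Reviewer sketch (not part of the diff): the three grad-kernel variants above differ only in which forward-pass tensor they hand to MLUCnnl::ActiveGrad and in which argument slot it lands; the slot meanings below are inferred from the call sites in this file, not from cnnl documentation.

// V1 (gelu, leaky_relu): uses the forward input X,   passed in the slot after dout.
// V2 (tanh, sigmoid):    uses the forward output Out, passed in the slot before dout.
// V3 (relu, relu6):      uses the forward output Out, passed in the slot after dout.
// Representative V2-style call; the remaining descriptor/data slots stay null:
MLUCnnl::ActiveGrad(ctx, act_desc.get(),
                    nullptr, nullptr,                  // unused in all three variants
                    out_desc.get(), GetBasePtr(out),   // forward output (V2 only)
                    dout_desc.get(), GetBasePtr(dout), // incoming gradient
                    nullptr, nullptr,                  // X (V1) or Out (V3) goes here
                    dx_desc.get(), GetBasePtr(dx));    // computed gradient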
16 changes: 7 additions & 9 deletions paddle/fluid/operators/fill_constant_op_mlu.cc
@@ -51,6 +51,8 @@ class FillConstantMLUKernel : public framework::OpKernel<T> {
}
}
}
const T *value_data = &value;
cnnlPointerMode_t pointer_mode = CNNL_POINTER_MODE_HOST;
if (ctx.HasInput("ValueTensor")) {
auto *value_tensor = ctx.Input<framework::Tensor>("ValueTensor");
PADDLE_ENFORCE_EQ(
@@ -59,22 +61,18 @@ class FillConstantMLUKernel : public framework::OpKernel<T> {
"When use Tensor as value to set Tensor value in fill_cosntant, "
"value input(ValueTensor) size must be 1, but get %d",
value_tensor->numel()));
const T *tensor_data = value_tensor->data<T>();
framework::Tensor mlu_tensor;
value_data = value_tensor->data<T>();
auto tmp_place = value_tensor->place();
if (platform::is_mlu_place(tmp_place)) {
framework::TensorCopySync(*value_tensor, platform::CPUPlace(),
&mlu_tensor);
tensor_data = mlu_tensor.data<T>();
pointer_mode = CNNL_POINTER_MODE_DEVICE;
}
value = tensor_data[0];
}

auto shape = GetShape(ctx);
out_var->mutable_data<T>(shape, ctx.GetPlace());
MLUCnnlTensorDesc output_desc(*out_var, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(out_var->dtype()));
MLUCnnl::Fill(ctx, value, output_desc.get(), GetBasePtr(out_var));
MLUCnnlTensorDesc output_desc(*out_var);
MLUCnnl::Fill(ctx, pointer_mode, value_data, output_desc.get(),
GetBasePtr(out_var));
}
};
} // namespace operators
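
Note on the Fill change above (a minimal sketch, not part of the diff): the scalar is no longer copied back to the host when ValueTensor lives on the MLU; the kernel now forwards either a host pointer or a device pointer together with a matching cnnlPointerMode_t. The two branches reduce to roughly the following, where attr_value stands in for the float attribute:

// Scalar held on the host (plain attribute or CPU-side ValueTensor):
T value = static_cast<T>(attr_value);
MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value, output_desc.get(),
              GetBasePtr(out_var));

// Scalar already resident on the MLU (1-element ValueTensor on the device):
MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_DEVICE, value_tensor->data<T>(),
              output_desc.get(), GetBasePtr(out_var));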
3 changes: 2 additions & 1 deletion paddle/fluid/operators/mean_op_mlu.cc
@@ -95,7 +95,8 @@ class MeanMLUGradKernel : public framework::OpKernel<T> {
MLUCnnlTensorDesc mean_var_desc(mean_var, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(mean_var.dtype()));
auto value = static_cast<T>(1.0 / static_cast<float>(input_grad->numel()));
MLUCnnl::Fill(context, value, mean_var_desc.get(), GetBasePtr(&mean_var));
MLUCnnl::Fill(context, CNNL_POINTER_MODE_HOST, &value, mean_var_desc.get(),
GetBasePtr(&mean_var));

// means mul output_grad
MLUCnnlTensorDesc in_desc(*output_grad, CNNL_LAYOUT_ARRAY,
8 changes: 5 additions & 3 deletions paddle/fluid/operators/metrics/accuracy_op_mlu.cc
@@ -136,15 +136,17 @@ class AccuracyMLUKernel : public framework::OpKernel<T> {
// [total]
total->mutable_data<int>(ctx.GetPlace());
MLUCnnlTensorDesc total_desc(*total);
MLUCnnl::Fill(ctx, num_samples, total_desc.get(), GetBasePtr(total));
MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &num_samples, total_desc.get(),
GetBasePtr(total));

// use `total` of type `float32` for calculating accuracy
Tensor total_fp32(framework::TransToPhiDataType(VT::FP32));
total_fp32.Resize(total->dims());
total_fp32.mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc total_fp32_desc(total_fp32);
MLUCnnl::Fill(ctx, static_cast<float>(num_samples), total_fp32_desc.get(),
GetBasePtr(&total_fp32));
float num_samples_fp32 = static_cast<float>(num_samples);
MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &num_samples_fp32,
total_fp32_desc.get(), GetBasePtr(&total_fp32));

// [accuracy]
accuracy->mutable_data<float>(ctx.GetPlace());
23 changes: 19 additions & 4 deletions paddle/fluid/operators/mlu/mlu_baseop.cc
@@ -208,8 +208,20 @@ MLUCnnlTensorDesc::~MLUCnnlTensorDesc() {
MLUCnnlActivationDesc::MLUCnnlActivationDesc(
const cnnlActivationMode_t act_mode, const float ceof) {
PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateActivationDescriptor(&active_desc_));
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetActivationDescriptor(
active_desc_, act_mode, CNNL_NOT_PROPAGATE_NAN, ceof));
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetActivationDescriptor_v4(
active_desc_, act_mode, CNNL_ACTIVATION_HIGH_PRECISION,
CNNL_NOT_PROPAGATE_NAN, ceof, 1.0f /*sliced_dim*/,
1.67326319217681884765625 /*selu_alpha*/,
1.05070102214813232421875 /*selu_lambda*/));
}

MLUCnnlActivationDesc::MLUCnnlActivationDesc(
const cnnlActivationMode_t act_mode, const float ceof,
const float sliced_dim, const float selu_alpha, const float selu_lambda) {
PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateActivationDescriptor(&active_desc_));
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetActivationDescriptor_v4(
active_desc_, act_mode, CNNL_ACTIVATION_HIGH_PRECISION,
CNNL_NOT_PROPAGATE_NAN, ceof, sliced_dim, selu_alpha, selu_lambda));
}

const cnnlActivationDescriptor_t MLUCnnlActivationDesc::get() const {
@@ -541,12 +553,15 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() {
output_desc, output));
}

/* static */ void MLUCnnl::Fill(const ExecutionContext& ctx, float value,
/* static */ void MLUCnnl::Fill(const ExecutionContext& ctx,
const cnnlPointerMode_t pointer_mode,
const void* value_ptr,
const cnnlTensorDescriptor_t output_desc,
void* output) {
cnnlHandle_t handle = GetHandleFromCTX(ctx);

PADDLE_ENFORCE_MLU_SUCCESS(cnnlFill(handle, value, output_desc, output));
PADDLE_ENFORCE_MLU_SUCCESS(
cnnlFill_v3(handle, pointer_mode, value_ptr, output_desc, output));
}

/* static */ void MLUCnnl::QuantifyOffline(
6 changes: 5 additions & 1 deletion paddle/fluid/operators/mlu/mlu_baseop.h
@@ -218,6 +218,9 @@ class MLUCnnlActivationDesc {
MLUCnnlActivationDesc(const MLUCnnlActivationDesc& desc) = delete;
MLUCnnlActivationDesc& operator=(const MLUCnnlActivationDesc& desc) = delete;
MLUCnnlActivationDesc(const cnnlActivationMode_t act_mode, const float ceof);
MLUCnnlActivationDesc(const cnnlActivationMode_t act_mode, const float ceof,
const float sliced_dim, const float selu_alpha,
const float selu_lambda);

const cnnlActivationDescriptor_t get() const;
~MLUCnnlActivationDesc();
@@ -418,7 +421,8 @@ class MLUCnnl {
const cnnlTensorDescriptor_t in1_desc, const void* in1,
const cnnlTensorDescriptor_t output_desc, void* output);

static void Fill(const ExecutionContext& ctx, float value,
static void Fill(const ExecutionContext& ctx,
const cnnlPointerMode_t pointer_mode, const void* value_ptr,
const cnnlTensorDescriptor_t output_desc, void* output);

static void LRN(const ExecutionContext& ctx, const int local_size,
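
Note on the new MLUCnnlActivationDesc overload (an illustrative sketch; the extra coefficients are presumably only consulted by activation modes that need them, none of which this PR registers): the SELU values mirror the defaults hard-coded in mlu_baseop.cc, and 0.02f is just an arbitrary leaky slope for illustration.

// Building a descriptor through the extended constructor; the SELU
// coefficients are placeholders copied from the defaults in mlu_baseop.cc.
MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_LEAKYRELU, /*ceof=*/0.02f,
                               /*sliced_dim=*/1.0f,
                               /*selu_alpha=*/1.67326319217681884765625f,
                               /*selu_lambda=*/1.05070102214813232421875f);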
5 changes: 3 additions & 2 deletions paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc
@@ -69,7 +69,7 @@ class MLUMergedMomentumOpKernel : public framework::OpKernel<T> {
"the same Tensors."));
}

auto mu = ctx.Attr<float>("mu");
auto mu = static_cast<T>(ctx.Attr<float>("mu"));
auto lrs = ctx.MultiInput<framework::Tensor>("LearningRate");
if (lrs.size() != 1) {
PADDLE_ENFORCE_EQ(
@@ -114,7 +114,8 @@ class MLUMergedMomentumOpKernel : public framework::OpKernel<T> {

Tensor mu_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc mu_tensor_desc(mu_tensor);
MLUCnnl::Fill(ctx, mu, mu_tensor_desc.get(), GetBasePtr(&mu_tensor));
MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &mu, mu_tensor_desc.get(),
GetBasePtr(&mu_tensor));

for (size_t idx = 0; idx < n; ++idx) {
RegularizationType regularization_flag =
3 changes: 2 additions & 1 deletion paddle/fluid/operators/optimizers/momentum_op_mlu.cc
@@ -52,7 +52,8 @@ class MLUMomentumOpKernel : public framework::OpKernel<T> {
Tensor mu_tensor =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc mu_tensor_desc(mu_tensor);
MLUCnnl::Fill(ctx, mu, mu_tensor_desc.get(), GetBasePtr(&mu_tensor));
MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &mu, mu_tensor_desc.get(),
GetBasePtr(&mu_tensor));

Tensor regularized_grad;
MLUCnnlTensorDesc param_desc(*param);
4 changes: 2 additions & 2 deletions paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc
@@ -103,8 +103,8 @@ class ReduceMeanGradMLUKernel : public framework::OpKernel<T> {
ToCnnlDataType(input_grad->dtype()));

auto value = static_cast<T>(1.0 / static_cast<float>(reduce_numel));
MLUCnnl::Fill(context, value, input_grad_desc.get(),
GetBasePtr(input_grad));
MLUCnnl::Fill(context, CNNL_POINTER_MODE_HOST, &value,
input_grad_desc.get(), GetBasePtr(input_grad));

MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN);
10 changes: 6 additions & 4 deletions paddle/fluid/operators/scale_op_mlu.cc
@@ -27,7 +27,7 @@ class ScaleMLUKernel : public framework::OpKernel<T> {
auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);

// cnnl require input, scale, bias with same type. And all in device side.
auto& scale = ctx.Attr<float>("scale");
auto scale = static_cast<T>(ctx.Attr<float>("scale"));
framework::Tensor scale_tensor;
if (ctx.HasInput("ScaleTensor")) {
framework::Tensor float_scale_tensor =
@@ -49,14 +49,16 @@
} else {
scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnl::Fill(ctx, scale, scale_desc.get(), GetBasePtr(&scale_tensor));
MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &scale, scale_desc.get(),
GetBasePtr(&scale_tensor));
}

auto& bias = ctx.Attr<float>("bias");
auto bias = static_cast<T>(ctx.Attr<float>("bias"));
framework::Tensor bias_tensor =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc bias_desc(bias_tensor);
MLUCnnl::Fill(ctx, bias, bias_desc.get(), GetBasePtr(&bias_tensor));
MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &bias, bias_desc.get(),
GetBasePtr(&bias_tensor));

auto* out_var = ctx.OutputVar("Out");
if (in_var->IsType<phi::SelectedRows>() && in_var != out_var) {