Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added oneDNN reduce_op FWD kernel #31816

Merged
merged 30 commits into from
Apr 14, 2021
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
1ecc4cf
added external reorder to profiler
Dec 2, 2020
d4f9ad4
resolved conflicts
jakpiase Mar 8, 2021
f85e7a3
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
jakpiase Mar 9, 2021
5c02f89
added mkldnn reduce op kernel
jakpiase Mar 22, 2021
7c3b736
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
jakpiase Mar 22, 2021
4147b25
refactored reduce op
jakpiase Mar 23, 2021
726846f
reverted old file
jakpiase Mar 23, 2021
6763404
added clang formatting
jakpiase Mar 23, 2021
f2555e5
removed unnecessary imports and comments
jakpiase Mar 23, 2021
8f80eb5
minor change
jakpiase Mar 23, 2021
539fe3c
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
jakpiase Mar 25, 2021
3dfabd9
merged with develop
jakpiase Mar 25, 2021
895f948
Revert "merged with develop"
jakpiase Mar 25, 2021
cd9d2f3
minor change
jakpiase Mar 25, 2021
87fc5a1
fixed misspelling
jakpiase Mar 25, 2021
a75ee12
Minor refactoring
jakpiase Mar 26, 2021
b442889
minor change
jakpiase Mar 26, 2021
27dec3a
imported necessary modules
jakpiase Mar 26, 2021
71089fe
minor change
jakpiase Mar 26, 2021
29097ce
minor formatting change
jakpiase Mar 26, 2021
164043a
excluded cuda from bf test
jakpiase Mar 29, 2021
be36f94
fixed static mode in test_resnet_v2
jakpiase Mar 29, 2021
424083f
added formatting
jakpiase Mar 29, 2021
87b5b38
added support for edge case
jakpiase Apr 7, 2021
e8aac01
removed unnecessary instruction
jakpiase Apr 7, 2021
2eb95dc
added restriction to iterator
jakpiase Apr 13, 2021
4f62cc8
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
jakpiase Apr 13, 2021
f7b3544
switched from int to size_t for iterator to avoid warning
jakpiase Apr 13, 2021
a00ea30
minor fix
jakpiase Apr 13, 2021
83e7d5c
formatting change
jakpiase Apr 13, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
platform::ReductionMKLDNNHandler<T> handler_sum(
dnnl::algorithm::reduction_sum, 0.0f, 0.0f, dev_ctx, onednn_engine,
ctx.GetPlace(), dout, dy,
ctx.InputName(framework::GradVarName("Out")));
ctx.InputName(framework::GradVarName("Out")),
CalculateBroadcastedDims(dout, dy));
auto dy_memory_p = handler_sum.AcquireDstMemory(dy);
auto reduction_p = handler_sum.AcquireForwardPrimitive();
reduction_p->execute(astream, {{DNNL_ARG_SRC, *reorder_src_memory_p},
Expand Down
17 changes: 17 additions & 0 deletions paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,5 +81,22 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
z->set_format(platform::GetMKLDNNFormat(*dst_memory));
}
};

// Extends the (reduced) dims of y with 1s so that its rank matches x's rank,
// as required by oneDNN, which needs src and dst memory descriptors of equal
// dimensionality. E.g. x: [2, 3, 4], y: [2, 4]  ->  returns [2, 1, 4].
// NOTE(review): matching is positional/greedy; it assumes y's dims appear as a
// subsequence of x's dims (the broadcast pattern produced by elementwise
// grad) — TODO confirm for inputs with repeated equal dimensions.
inline std::vector<int64_t> CalculateBroadcastedDims(const Tensor* x,
                                                     const Tensor* y) {
  const auto src_tz = framework::vectorize(x->dims());
  const auto dst_tz = framework::vectorize(y->dims());

  // size_t (not int) to avoid a signed/unsigned comparison warning against
  // dst_tz.size() below.
  size_t j = 0;
  std::vector<int64_t> dst_tz_ex(src_tz.size(), 1);
  for (size_t i = 0; i < src_tz.size(); ++i) {
    dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++];
    // Clamp j once every dst dim has been consumed so the indexing above
    // never reads past the end of dst_tz.
    if (j == dst_tz.size()) {
      j--;
    }
  }

  return dst_tz_ex;
}
} // namespace operators
} // namespace paddle
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,8 @@ class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel<T> {
platform::ReductionMKLDNNHandler<T> handler_sum(
dnnl::algorithm::reduction_sum, 0.0f, 0.0f, dev_ctx, mkldnn_engine,
ctx.GetPlace(), dout, dy,
ctx.InputName(framework::GradVarName("Out")));
ctx.InputName(framework::GradVarName("Out")),
CalculateBroadcastedDims(dout, dy));
auto dy_memory_p = handler_sum.AcquireDstMemory(dy);
auto reduction_p = handler_sum.AcquireForwardPrimitive();
// As source we use mem object with results from binary operation
Expand Down
34 changes: 34 additions & 0 deletions paddle/fluid/operators/reduce_ops/mkldnn/reduce_max_mkldnn_op.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h"

namespace paddle {
namespace operators {

// reduce_max forward kernel: thin adapter that dispatches to the shared
// oneDNN reduction implementation with the reduction_max algorithm.
// All attribute handling and the no-op-reduction edge case live in the base
// ReduceMKLDNNKernel.
template <typename T>
class ReduceMaxMKLDNNKernel : public ReduceMKLDNNKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    this->RunKernel(ctx, dnnl::algorithm::reduction_max);
  }
};

} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_KERNEL(reduce_max, MKLDNN, paddle::platform::CPUPlace,
ops::ReduceMaxMKLDNNKernel<float>,
ops::ReduceMaxMKLDNNKernel<paddle::platform::bfloat16>);
34 changes: 34 additions & 0 deletions paddle/fluid/operators/reduce_ops/mkldnn/reduce_mean_mkldnn_op.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h"

namespace paddle {
namespace operators {

// reduce_mean forward kernel: thin adapter that dispatches to the shared
// oneDNN reduction implementation with the reduction_mean algorithm.
// All attribute handling and the no-op-reduction edge case live in the base
// ReduceMKLDNNKernel.
template <typename T>
class ReduceMeanMKLDNNKernel : public ReduceMKLDNNKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    this->RunKernel(ctx, dnnl::algorithm::reduction_mean);
  }
};

} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_KERNEL(reduce_mean, MKLDNN, paddle::platform::CPUPlace,
ops::ReduceMeanMKLDNNKernel<float>,
ops::ReduceMeanMKLDNNKernel<paddle::platform::bfloat16>);
34 changes: 34 additions & 0 deletions paddle/fluid/operators/reduce_ops/mkldnn/reduce_min_mkldnn_op.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h"

namespace paddle {
namespace operators {

// reduce_min forward kernel: thin adapter that dispatches to the shared
// oneDNN reduction implementation with the reduction_min algorithm.
// All attribute handling and the no-op-reduction edge case live in the base
// ReduceMKLDNNKernel.
template <typename T>
class ReduceMinMKLDNNKernel : public ReduceMKLDNNKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    this->RunKernel(ctx, dnnl::algorithm::reduction_min);
  }
};

} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_KERNEL(reduce_min, MKLDNN, paddle::platform::CPUPlace,
ops::ReduceMinMKLDNNKernel<float>,
ops::ReduceMinMKLDNNKernel<paddle::platform::bfloat16>);
125 changes: 125 additions & 0 deletions paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/mkldnn_reuse.h"

namespace paddle {
namespace operators {

using paddle::framework::LoDTensor;
using paddle::framework::Tensor;
using platform::to_void_cast;

// Shared forward implementation for all oneDNN reduce_* kernels
// (sum/mean/max/min). Concrete kernels call RunKernel with the desired
// dnnl::algorithm.
template <typename T>
class ReduceMKLDNNKernel : public framework::OpKernel<T> {
 public:
  // Executes the reduction described by the op's "dim", "reduce_all" and
  // "keep_dim" attributes on input "X", writing the result to "Out".
  // reduction_type selects the oneDNN algorithm (e.g. reduction_sum).
  void RunKernel(const framework::ExecutionContext& ctx,
                 dnnl::algorithm reduction_type) const {
    auto& dev_ctx =
        ctx.template device_context<platform::MKLDNNDeviceContext>();
    const auto& onednn_engine = dev_ctx.GetEngine();

    const auto* input = ctx.Input<LoDTensor>("X");
    auto* output = ctx.Output<Tensor>("Out");

    auto reduce_dims = ctx.Attr<std::vector<int>>("dim");
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    bool keep_dim = ctx.Attr<bool>("keep_dim");

    // Output dims at full input rank (reduced axes become 1); oneDNN needs
    // src/dst descriptors of equal dimensionality.
    std::vector<int64_t> output_dims =
        CalculateOutputDims(input, output, reduce_dims, reduce_all, keep_dim);

    auto input_dims = framework::vectorize(input->dims());

    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();

    // oneDNN reduce op does not support edge case in which memory is being
    // copied without actual reduction.
    // In that case reorder must be executed to maintain compatibility with
    // PaddlePaddle reduce op
    if (input_dims == output_dims) {
      // Nothing is actually reduced: forward the data via a oneDNN reorder
      // (effectively a copy that keeps layout/format metadata consistent).
      mkldnn::memory::data_type input_type =
          framework::ToMKLDNNDataType(input->type());
      std::string key = platform::CreateKey(
          dev_ctx, input_dims, input->format(), input->format(), input_type);
      platform::ReorderMKLDNNHandler reorder_handler(
          input_dims, input->type(), input_type, dev_ctx, onednn_engine, key);

      auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory(
          input->format(), platform::to_void_cast(input->data<T>()));

      auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory(
          output, input->format(), ctx.GetPlace());

      auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p,
                                                      reorder_dst_memory_p);

      // Record the reorder in the profiler, since it is executed outside of
      // a regular primitive.
      platform::RecordEvent record_reorder("int_reorder",
                                           platform::EventRole::kUniqueOp);

      reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
      astream.wait();

      output->set_layout(framework::DataLayout::kMKLDNN);
      // Reshape the dst descriptor to the op's actual output dims (which may
      // have fewer axes than the full-rank dims used internally).
      output->set_format(
          platform::GetMKLDNNFormat(reorder_dst_memory_p->get_desc().reshape(
              paddle::framework::vectorize<int64_t>(output->dims()))));
    } else {
      // Regular case: run the oneDNN reduction primitive.
      platform::ReductionMKLDNNHandler<T> handler(
          reduction_type, 0.0f, 0.0f, dev_ctx, onednn_engine, ctx.GetPlace(),
          input, output, ctx.InputName("X"), output_dims);

      auto src_memory_p = handler.AcquireSrcMemory(input);
      auto dst_memory_p = handler.AcquireDstMemory(output);

      std::unordered_map<int, dnnl::memory> reduction_args = {
          {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}};

      auto reduction_p = handler.AcquireForwardPrimitive();

      reduction_p->execute(astream, reduction_args);
      astream.wait();
      output->set_layout(framework::DataLayout::kMKLDNN);
      // As above: reshape the descriptor to the externally visible dims.
      output->set_format(
          platform::GetMKLDNNFormat(dst_memory_p->get_desc().reshape(
              paddle::framework::vectorize<int64_t>(output->dims()))));
    }
  }

 private:
  // Returns the output dims expanded to the input's rank, with every reduced
  // axis set to 1. Also normalizes negative entries of reduce_dims in place
  // (callers pass a local copy of the attribute, so this does not leak).
  std::vector<int64_t> CalculateOutputDims(const Tensor* input,
                                           const Tensor* output,
                                           std::vector<int>& reduce_dims,
                                           bool reduce_all,
                                           bool keep_dim) const {
    // keep_dim: output already has full rank with 1s on reduced axes.
    if (keep_dim) return framework::vectorize(output->dims());

    // reduce_all: everything collapses, so all axes become 1.
    if (reduce_all)
      return std::vector<int64_t>(framework::vectorize(input->dims()).size(),
                                  1);

    std::vector<int64_t> output_dims(framework::vectorize(input->dims()));
    for (size_t i = 0; i < reduce_dims.size(); ++i) {
      // Map negative axis indices (Python-style) to their positive form.
      reduce_dims[i] = (reduce_dims[i] >= 0)
                           ? reduce_dims[i]
                           : input->dims().size() + reduce_dims[i];
      output_dims[reduce_dims[i]] = 1;
    }

    return output_dims;
  }
};

} // namespace operators
} // namespace paddle
34 changes: 34 additions & 0 deletions paddle/fluid/operators/reduce_ops/mkldnn/reduce_sum_mkldnn_op.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h"

namespace paddle {
namespace operators {

// reduce_sum forward kernel: thin adapter that dispatches to the shared
// oneDNN reduction implementation with the reduction_sum algorithm.
// All attribute handling and the no-op-reduction edge case live in the base
// ReduceMKLDNNKernel.
template <typename T>
class ReduceSumMKLDNNKernel : public ReduceMKLDNNKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    this->RunKernel(ctx, dnnl::algorithm::reduction_sum);
  }
};

} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_KERNEL(reduce_sum, MKLDNN, paddle::platform::CPUPlace,
ops::ReduceSumMKLDNNKernel<float>,
ops::ReduceSumMKLDNNKernel<paddle::platform::bfloat16>);
27 changes: 27 additions & 0 deletions paddle/fluid/operators/reduce_ops/reduce_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,30 @@ class ReduceOp : public framework::OperatorWithKernel {
}
}
}

  // Selects the kernel to run: prefers the oneDNN (MKLDNN) kernel when it is
  // usable for this input, otherwise falls back to the plain kernel on the
  // current place. FP16 inputs are only valid on GPU.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");

    // oneDNN reductions handle at most 5-D tensors; higher ranks go to the
    // default kernel.
    if (ctx.Input<paddle::framework::LoDTensor>("X")->dims().size() > 5)
      return framework::OpKernelType(input_data_type, ctx.GetPlace());

#ifdef PADDLE_WITH_MKLDNN
    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
                                     framework::DataLayout::kMKLDNN,
                                     framework::LibraryType::kMKLDNN);
    }
#endif

    if (input_data_type == framework::proto::VarType::FP16) {
      PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
                        platform::errors::InvalidArgument(
                            "float16 can only be used on GPU place"));
    }
    return framework::OpKernelType(input_data_type, ctx.GetPlace());
  }
};

class ReduceOpUseInputPlace : public ReduceOp {
Expand Down Expand Up @@ -579,6 +603,9 @@ class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
"(int, default -1)"
"The dtype of output, default value is -1, the dtype is same as intput")
.SetDefault(-1);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(string::Sprintf(R"DOC(
%s Operator.

Expand Down
14 changes: 3 additions & 11 deletions paddle/fluid/platform/mkldnn_reuse.h
Original file line number Diff line number Diff line change
Expand Up @@ -638,7 +638,8 @@ class ReductionMKLDNNHandler
const float eps, const MKLDNNDeviceContext& dev_ctx,
const mkldnn::engine engine, platform::Place cpu_place,
const Tensor* x, const Tensor* y,
const std::string& uniq_name)
const std::string& uniq_name,
std::vector<int64_t> output_dims)
: platform::MKLDNNHandlerT<T, dnnl::reduction>(
dev_ctx, engine, cpu_place,
platform::CreateKey(dev_ctx, framework::vectorize(x->dims()),
Expand All @@ -653,20 +654,11 @@ class ReductionMKLDNNHandler
platform::errors::InvalidArgument("Wrong format set for X tensor."));

const auto src_tz = framework::vectorize(x->dims());
const auto dst_tz = framework::vectorize(y->dims());

// For oneDNN dimensionality should match so we need to
// extend Y tensor dims with values of 1 (before and after pattern)
int j = 0;
std::vector<int64_t> dst_tz_ex(src_tz.size(), 1);
for (size_t i = 0; i < src_tz.size(); ++i) {
dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++];
}

const auto src_md = dnnl::memory::desc(
src_tz, platform::MKLDNNGetDataType<T>(), x->format());
const auto dst_md = memory::desc(
dst_tz_ex, platform::MKLDNNGetDataType<T>(), x->format());
output_dims, platform::MKLDNNGetDataType<T>(), x->format());

this->AcquireForwardPrimitiveDescriptor(algo, src_md, dst_md, p, eps);
}
Expand Down
Loading