From c8db1bd987e63d34bd514cfe9e168b4dbe0e8d33 Mon Sep 17 00:00:00 2001
From: chenfeiyu <chenfeiyu@baidu.com>
Date: Tue, 16 Nov 2021 08:58:37 +0000
Subject: [PATCH 1/4] disable copying of datatype when sharing buffer between
 two tensors.

---
 paddle/fluid/framework/tensor.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 539859c45c907..c53ff6b1bcc6b 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -254,7 +254,10 @@ class Tensor {
   void ShareBufferWith(const Tensor& tensor) {
     holder_ = tensor.holder_;
     offset_ = tensor.offset_;
-    type_ = tensor.type_;
+    // NOTE(chenfeiyu): when sharing buffer, by definition only holder
+    // to the memory allocation and offset should be shared. Shape,
+    // data type, layout, and other metadata associated with a Tensor
+    // should not be copied.
   }
 
   bool IsSharedBufferWith(const Tensor& src) const {

From f7864fc01b161fe66cfad364434e0703a48ac5a6 Mon Sep 17 00:00:00 2001
From: chenfeiyu <chenfeiyu@baidu.com>
Date: Fri, 19 Nov 2021 08:34:31 +0000
Subject: [PATCH 2/4] fix for elementwise_mkldnn_op, mannually set the data
 type when reusing memory

---
 .../mkldnn/elementwise_mkldnn_op.h            | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
index ffcdc079985fa..6881c17646324 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
@@ -62,9 +62,22 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
     // and they share a buffer (of
     // shape x) then this buffer is not big enough to hold result of elementwise
     // operation.
-    auto dst_memory = (x->numel() == z->numel() && x->IsSharedBufferWith(*z))
-                          ? src_x_memory
-                          : handler.AcquireDstMemory(z);
+    const bool reuse_x_memopry =
+        x->numel() == z->numel() && x->IsSharedBufferWith(*z);
+    std::shared_ptr<dnnl::memory> dst_memory = nullptr;
+    if (reuse_x_memopry) {
+      dst_memory = src_x_memory;
+      // NOTE(chenfeiyu): when the output reuses memory from other tensor rather
+      // than allocate its own, it's still need to take care of its data type.
+      // Unfortunately, paddle's operator only infers the output' shape, but not
+      // the data type. mutable_data<T> takes care of allocation and data type
+      // normally, but if the memory is already allocated and there is no need
+      // to re-allocate, it just set the data type. So this it added there to
+      // get the right data type.
+      z->mutable_data<T>(ctx.GetPlace());
+    } else {
+      dst_memory = handler.AcquireDstMemory(z);
+    }
 
     const auto binary_prim = handler.AcquireForwardPrimitive();
 

From c76cd83d85b3ede030acf2d536534025f1c436e5 Mon Sep 17 00:00:00 2001
From: chenfeiyu <chenfeiyu@baidu.com>
Date: Sun, 21 Nov 2021 09:07:33 +0000
Subject: [PATCH 3/4] fix for sum_mkldnn_op, set output datatype manually when
 sharing buffer

---
 paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
index 8208a484b4a32..2760bcecd5b67 100644
--- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
@@ -137,8 +137,13 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       ++input_index;
     }
 
-    auto dst_mem = in_place ? handler.AcquireDstMemory()
-                            : handler.AcquireDstMemory(output);
+    std::shared_ptr<dnnl::memory> dst_mem = nullptr;
+    if (in_place) {
+      dst_mem = handler.AcquireDstMemory();
+      output->mutable_data<T>(ctx.GetPlace());
+    } else {
+      dst_mem = handler.AcquireDstMemory(output);
+    }
 
     auto sum_p = handler.AcquireForwardPrimitive();
 

From 78793c0de7e2991556470f14d7742164ea852b2f Mon Sep 17 00:00:00 2001
From: chenfeiyu <chenfeiyu@baidu.com>
Date: Mon, 22 Nov 2021 04:25:11 +0000
Subject: [PATCH 4/4] =?UTF-8?q?fix=20for=20other=20mkldnn=20kernels=20that?=
 =?UTF-8?q?=20support=20inplace=20computation=EF=BC=88softplus,=20softmax,?=
 =?UTF-8?q?=20scale,=20activation)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../operators/mkldnn/activation_mkldnn_op.cc  |  8 +++++++-
 .../fluid/operators/mkldnn/scale_mkldnn_op.cc |  9 +++++++--
 .../operators/mkldnn/softmax_mkldnn_op.cc     | 10 +++++++---
 .../operators/mkldnn/softplus_mkldnn_op.h     | 20 ++++++++++++-------
 4 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
index 29e18da590ebc..4bde641d2c1af 100644
--- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
@@ -91,7 +91,13 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
                                                ctx.GetPlace(), x);
 
   auto src_memory_p = handler.AcquireSrcMemory(x);
-  auto dst_memory_p = is_inplaced ? src_memory_p : handler.AcquireDstMemory(y);
+  std::shared_ptr<dnnl::memory> dst_memory_p = nullptr;
+  if (is_inplaced) {
+    dst_memory_p = src_memory_p;
+    y->mutable_data<T>(ctx.GetPlace());
+  } else {
+    dst_memory_p = handler.AcquireDstMemory(y);
+  }
   auto activation_p = handler.AcquireForwardPrimitive();
 
   auto &astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
diff --git a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc
index 84ac14d04b85b..b8b735e96d24d 100644
--- a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc
@@ -41,8 +41,13 @@ class ScaleMKLDNNKernel : public framework::OpKernel<T> {
         x);
 
     auto src_memory_p = handler.AcquireSrcMemory(x);
-    auto dst_memory_p =
-        is_inplaced ? src_memory_p : handler.AcquireDstMemory(out);
+    std::shared_ptr<dnnl::memory> dst_memory_p = nullptr;
+    if (is_inplaced) {
+      dst_memory_p = src_memory_p;
+      out->mutable_data<T>(ctx.GetPlace());
+    } else {
+      dst_memory_p = handler.AcquireDstMemory(out);
+    }
     auto activation_p = handler.AcquireForwardPrimitive();
 
     auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
index b0f27719bf9ad..c26c017596d32 100644
--- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
@@ -103,9 +103,13 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
 
     auto softmax_src_memory_p = handler.AcquireSrcMemory(input);
     // For Inplace src and and dst are the same memory object
-    auto softmax_dst_memory_p =
-        is_inplaced ? softmax_src_memory_p : handler.AcquireDstMemory(output);
-
+    std::shared_ptr<dnnl::memory> softmax_dst_memory_p = nullptr;
+    if (is_inplaced) {
+      softmax_dst_memory_p = softmax_src_memory_p;
+      output->mutable_data<T>(ctx.GetPlace());
+    } else {
+      softmax_dst_memory_p = handler.AcquireDstMemory(output);
+    }
     auto softmax_p = handler.AcquireForwardPrimitive();
 
     auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
diff --git a/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h b/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h
index 60ea5136905de..c8c539a956505 100644
--- a/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h
+++ b/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#pragma once
 #include "paddle/fluid/platform/mkldnn_reuse.h"
 
 namespace paddle {
@@ -43,7 +44,7 @@ class SoftplusMKLDNNHandler
                               1.0f / beta, 0.0f);
     }
 
-    AppendFusedActivationIfExists(ctx, post_ops);
+    AppendFusedActivationIfExists(ctx, &post_ops);
 
     dnnl::primitive_attr attrs;
     attrs.set_post_ops(post_ops);
@@ -59,16 +60,16 @@ class SoftplusMKLDNNHandler
 
  private:
   void AppendFusedActivationIfExists(const framework::ExecutionContext& ctx,
-                                     dnnl::post_ops& post_ops) {
+                                     dnnl::post_ops* post_ops) {
     const auto& fused_activation_type =
         algo_map.find(ctx.Attr<std::string>("fuse_activation_type"));
 
     if (fused_activation_type != algo_map.end()) {
       auto scale_out =
           ctx.Attr<float>("fuse_activation_scale");  // for future int8 support
-      post_ops.append_eltwise(scale_out, fused_activation_type->second,
-                              ctx.Attr<float>("fuse_activation_alpha"),
-                              ctx.Attr<float>("fuse_activation_beta"));
+      post_ops->append_eltwise(scale_out, fused_activation_type->second,
+                               ctx.Attr<float>("fuse_activation_alpha"),
+                               ctx.Attr<float>("fuse_activation_beta"));
     }
   }
 
@@ -109,8 +110,13 @@ void custom_softplus_eltwise_forward(const framework::ExecutionContext& ctx) {
   auto src_memory_p = handler.AcquireSrcMemory(x);
 
   auto beta_memory_p = handler.AcquireBetaMemory(&beta);
-  auto dst_memory_p =
-      is_inplaced ? src_memory_p : handler.AcquireDstMemory(out);
+  std::shared_ptr<dnnl::memory> dst_memory_p = nullptr;
+  if (is_inplaced) {
+    dst_memory_p = src_memory_p;
+    out->mutable_data<T>(ctx.GetPlace());
+  } else {
+    dst_memory_p = handler.AcquireDstMemory(out);
+  }
   auto binary_p = handler.AcquireForwardPrimitive();
 
   auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();