From a328ae3b9ba2b4089e491253e985874f2c1cf147 Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Tue, 31 Oct 2017 17:47:25 +0800
Subject: [PATCH 1/2] Use posix_memalign to allocate aligned memory, since some
 SIMD instructions require aligned memory accesses.

---
 paddle/memory/detail/system_allocator.cc | 11 ++++++++++-
 paddle/operators/reshape_op.cc           |  2 +-
 paddle/operators/save_load_op_test.cc    |  6 +++---
 3 files changed, 14 insertions(+), 5 deletions(-)
diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
index 33166d9ce23a4..6b4e46f56a0c9 100644
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -41,7 +41,16 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) {
 
   index = 0;  // unlock memory
 
-  void* p = malloc(size);
+  void* p;
+
+#ifdef PADDLE_USE_MKLDNN
+  // For the memory alignment used with MKL-DNN, refer to
+  // https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
+  PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0);
+#else
+  PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0);
+#endif
+  PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size);
 
   if (p != nullptr) {
     if (FLAGS_use_pinned_memory) {
diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc
index eda8226480a66..9213cc7a85822 100644
--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
@@ -36,7 +36,7 @@ class ReshapeOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty.");
     auto x_dims = ctx->GetInputDim("X");
     // TODO(qiao) change batch_size
-    for (int i = 1; i < shape.size(); ++i) {
+    for (size_t i = 1; i < shape.size(); ++i) {
       PADDLE_ENFORCE(shape[i] > 0,
                      "Each dimension of shape "
                      "must be positiv except the first.");
diff --git a/paddle/operators/save_load_op_test.cc b/paddle/operators/save_load_op_test.cc
index fe2b15ec09c6d..a57466a48d4d6 100644
--- a/paddle/operators/save_load_op_test.cc
+++ b/paddle/operators/save_load_op_test.cc
@@ -34,7 +34,7 @@ TEST(SaveLoadOp, CPU) {
 
   tensor->set_lod(expect_lod);
   int* expect = tensor->mutable_data<int>(place);
-  for (size_t i = 0; i < paddle::framework::product(tensor->dims()); ++i) {
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
     expect[i] = static_cast<int>(i);
   }
   paddle::framework::AttributeMap attrs;
@@ -50,7 +50,7 @@ TEST(SaveLoadOp, CPU) {
       "load", {}, {{"Out", {"out_var"}}}, attrs);
   load_op->Run(scope, ctx);
   int* actual = target->data<int>();
-  for (size_t i = 0; i < paddle::framework::product(tensor->dims()); ++i) {
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
     EXPECT_EQ(expect[i], actual[i]);
   }
   auto& actual_lod = target->lod();
@@ -60,4 +60,4 @@ TEST(SaveLoadOp, CPU) {
       EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
     }
   }
-}
\ No newline at end of file
+}

From e88e1964eb79a2ea14d093ce888c702eab6a85ab Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Tue, 31 Oct 2017 18:10:21 +0800
Subject: [PATCH 2/2] Fix compiler warnings.

---
 paddle/operators/nccl_op_test.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu
index 80c50a28a9e5d..e5927d56ae7cf 100644
--- a/paddle/operators/nccl_op_test.cu
+++ b/paddle/operators/nccl_op_test.cu
@@ -185,7 +185,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
         recv_tensor.numel() * sizeof(float),
         static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
 
-    for (size_t j = 0; j < f::product(kDims); ++j) {
+    for (int64_t j = 0; j < f::product(kDims); ++j) {
       ASSERT_NEAR(ct[j], result, 1e-5);
     }
   }
@@ -234,7 +234,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
       recv_tensor.numel() * sizeof(float),
       static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream());
 
-  for (int j = 0; j < f::product(kDims); ++j) {
+  for (int64_t j = 0; j < f::product(kDims); ++j) {
     ASSERT_NEAR(ct[j], result, 1e-5);
   }
 }
@@ -282,7 +282,7 @@ TEST_F(NCCLTester, ncclBcastOp) {
       recv_tensor.numel() * sizeof(float),
       static_cast<p::CUDADeviceContext *>(dev_ctxs[idx])->stream());
 
-  for (size_t j = 0; j < f::product(kDims); ++j) {
+  for (int64_t j = 0; j < f::product(kDims); ++j) {
     ASSERT_NEAR(ct[j], result, 1e-5);
   }
 }