Commit

Merge pull request PaddlePaddle#37 from jiweibo/shrink_memory

[ShrinkMemory] Update shrink memory multi-thread demo.
jiweibo authored Nov 3, 2020
2 parents 229e40d + 7652cfe commit 09284c0
Showing 4 changed files with 146 additions and 37 deletions.
8 changes: 4 additions & 4 deletions c++/test/shrink_memory/README.md
@@ -7,6 +7,7 @@
### Part 2: **Compiling the samples**

The file `single_thread_test.cc` is a single-threaded inference sample that uses ShrinkMemory to reduce host/GPU memory usage (its inputs are fixed values; if you need to read data via OpenCV or other means, you will have to modify the program accordingly).
The file `thread_local_test.cc` is a multi-threaded inference sample that uses the thread_local allocator together with ShrinkMemory to reduce host/GPU memory usage.
The file `multi_thread_test.cc` is a multi-threaded inference sample that uses ShrinkMemory to reduce host/GPU memory usage.
The file `CMakeLists.txt` is the build script.
The script `run_impl.sh` configures the third-party libraries and the precompiled inference library.
Expand All @@ -18,7 +19,7 @@
Open `run_impl.sh` and modify the following settings:

```shell
# choose single_thread_test or multi_thread_test as needed
# choose single_thread_test, multi_thread_test, or thread_local_test as needed
DEMO_NAME=single_thread_test

# check version.txt in the precompiled library to decide whether to enable the following three flags
@@ -45,13 +46,12 @@ CUDA_LIB=/usr/local/cuda/lib64
cd build
# run the sample
./single_thread_test --model_dir ${YOUR_MODEL_PATH} --use_gpu
# ./multi_thread_test --model_dir ${YOUR_MODEL_PATH} --use_gpu
# ./multi_thread_test --model_dir ${YOUR_MODEL_PATH} --use_gpu --thread_num 2
# ./thread_local_test --model_dir ${YOUR_MODEL_PATH} --use_gpu
```

While the sample is running, watch the GPU memory or CPU memory usage as prompted. When a run uses a very large batch_size, the memory pool grows and the application's GPU/host memory footprint stays high; calling ShrinkMemory explicitly releases the memory pool.

Note: the multi-threaded test sample must call EnableGpuMultiStream() to select the thread_local allocator so that the ShrinkMemory operation is supported.
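
For reference, the call sequence this note describes boils down to the following. This is a minimal single-predictor sketch rather than one of the demo programs; the model path, pool size, and shapes are placeholders taken from the demo defaults.

```cpp
// Minimal sketch (not taken from the demo sources): grow the memory pool with a
// large batch, then release it explicitly.
#include "paddle/include/paddle_inference_api.h"

#include <vector>

int main() {
  paddle_infer::Config config;
  config.SetModel("./mobilenetv1/model", "./mobilenetv1/params");
  config.EnableUseGpu(500, 0);    // initial GPU memory pool (MB), device 0
  config.EnableGpuMultiStream();  // thread_local allocator (required in the multi-thread case)
  auto predictor = paddle_infer::CreatePredictor(config);

  // One pass with a large batch; the memory pool grows to fit it.
  const int batch = 40, c = 3, h = 224, w = 224;
  std::vector<float> input(batch * c * h * w, 0.f);
  auto in = predictor->GetInputHandle(predictor->GetInputNames()[0]);
  in->Reshape({batch, c, h, w});
  in->CopyFromCpu(input.data());
  predictor->Run();

  // Explicitly return the now-oversized memory pool.
  predictor->ShrinkMemory();
  return 0;
}
```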

### More links
- [Paddle Inference Quick Start!]()
- [Paddle Inference Python API usage]()
52 changes: 20 additions & 32 deletions c++/test/shrink_memory/multi_thread_test.cc
@@ -1,4 +1,3 @@
#include "helper.h"
#include "paddle/include/paddle_inference_api.h"
#include <chrono>
#include <gflags/gflags.h>
@@ -9,41 +8,23 @@
#include <vector>

DEFINE_string(model_dir, "./mobilenetv1", "model directory.");
DEFINE_int32(thread_num, 1, "thread num");
DEFINE_bool(use_gpu, false, "use gpu.");
DEFINE_bool(test_leaky, false,
"run 1000 times, and observe whether leaky memory or not.");

const size_t thread_num = 2;
paddle::inference::Timer timer_sum;
paddle::inference::Barrier barrier_init(thread_num);
paddle::inference::Barrier barrier_warmup(thread_num);

namespace paddle_infer {

void PrepareConfig(Config *config) {
config->SetModel(FLAGS_model_dir + "/model", FLAGS_model_dir + "/params");
if (FLAGS_use_gpu) {
config->EnableUseGpu(500, 0);
}
// switch to thread_local allocator.
config->EnableGpuMultiStream();
}

void Run(int thread_id) {
Config config;
PrepareConfig(&config);

// create predictor
static std::mutex mutex;

std::shared_ptr<Predictor> predictor;
{
std::unique_lock<std::mutex> lock(mutex);
predictor = CreatePredictor(config);
}
void Run(std::shared_ptr<Predictor> predictor, int thread_id) {

auto run_one_loop = [&](int batch_size) {
// prepare inputs.
int channels = 3;
int height = 224;
int width = 224;
@@ -73,10 +54,6 @@ void Run(int thread_id) {
<< " mean val: " << mean_val / output_num;
};

barrier_init.Wait();
run_one_loop(1);
barrier_warmup.Wait();

auto pause = [](const std::string &hint) {
if (FLAGS_test_leaky) {
return;
@@ -93,27 +70,38 @@ void Run(int thread_id) {
for (int i = 0; i < run_times; ++i) {
run_one_loop(40);
pause("Pause, you can view the GPU memory usage, please enter any "
"character to continue running.");
"character to continue running. thread_id is " +
std::to_string(thread_id));

// release memory pool.
predictor->ShrinkMemory();
pause("Pause, ShrinkMemory has been called, please observe the changes of "
"GPU memory.");
"GPU memory. thread_idis " +
std::to_string(thread_id));

run_one_loop(1);
pause("Pause, you can view the GPU memory usage, please enter any "
"character to continue running.");
"character to continue running. thread_id is " +
std::to_string(thread_id));
}
}
}

int main(int argc, char **argv) {
google::ParseCommandLineFlags(&argc, &argv, true);

paddle_infer::Config config;
paddle_infer::PrepareConfig(&config);
auto main_predictor = paddle_infer::CreatePredictor(config);
std::vector<decltype(main_predictor)> predictors;
for (int i = 0; i < FLAGS_thread_num; ++i) {
predictors.emplace_back(std::move(main_predictor->Clone()));
}

std::vector<std::thread> threads;
for (size_t i = 0; i < thread_num; ++i) {
threads.emplace_back([&, i]() { paddle_infer::Run(i); });
for (int i = 0; i < FLAGS_thread_num; ++i) {
threads.emplace_back(paddle_infer::Run, predictors[i], i);
}
for (size_t i = 0; i < thread_num; ++i) {
for (int i = 0; i < FLAGS_thread_num; ++i) {
threads[i].join();
}
LOG(INFO) << "Run done";
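
The net effect of this diff: instead of each worker thread building its own `Config` (with `EnableGpuMultiStream()`) and calling `CreatePredictor`, the main thread now creates one predictor and hands each worker a `Clone()` of it. Below is a condensed sketch of the new flow, not part of the commit itself; the demo's batch loop and interactive pauses are elided.

```cpp
// Sketch of the Clone-per-thread pattern introduced by this commit; input
// preparation and pauses from run_one_loop are omitted for brevity.
#include "paddle/include/paddle_inference_api.h"

#include <memory>
#include <thread>
#include <vector>

void Worker(std::shared_ptr<paddle_infer::Predictor> predictor, int thread_id) {
  // ... run batches with this predictor (see run_one_loop in the demo) ...
  predictor->ShrinkMemory();  // release this predictor's memory pool
}

int main() {
  paddle_infer::Config config;
  config.SetModel("./mobilenetv1/model", "./mobilenetv1/params");
  config.EnableUseGpu(500, 0);
  auto main_predictor = paddle_infer::CreatePredictor(config);

  const int thread_num = 2;  // FLAGS_thread_num in the demo
  std::vector<std::shared_ptr<paddle_infer::Predictor>> predictors;
  for (int i = 0; i < thread_num; ++i) {
    predictors.emplace_back(main_predictor->Clone());  // per-thread predictor
  }

  std::vector<std::thread> threads;
  for (int i = 0; i < thread_num; ++i) {
    threads.emplace_back(Worker, predictors[i], i);
  }
  for (auto &t : threads) {
    t.join();
  }
  return 0;
}
```
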
3 changes: 2 additions & 1 deletion c++/test/shrink_memory/run_impl.sh
@@ -4,9 +4,10 @@ mkdir -p build
cd build
rm -rf *

# set DEMO_NAME to single_thread_test or multi_thread_test
# set DEMO_NAME to single_thread_test, multi_thread_test, or thread_local_test
DEMO_NAME=single_thread_test
#DEMO_NAME=multi_thread_test
#DEMO_NAME=thread_local_test

WITH_MKL=ON
WITH_GPU=ON
120 changes: 120 additions & 0 deletions c++/test/shrink_memory/thread_local_test.cc
@@ -0,0 +1,120 @@
#include "helper.h"
#include "paddle/include/paddle_inference_api.h"
#include <chrono>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <iostream>
#include <numeric>
#include <thread>
#include <vector>

DEFINE_string(model_dir, "./mobilenetv1", "model directory.");
DEFINE_bool(use_gpu, false, "use gpu.");
DEFINE_bool(test_leaky, false,
"run 1000 times, and observe whether leaky memory or not.");

const size_t thread_num = 2;
paddle::inference::Timer timer_sum;
paddle::inference::Barrier barrier_init(thread_num);
paddle::inference::Barrier barrier_warmup(thread_num);

namespace paddle_infer {

void PrepareConfig(Config *config) {
config->SetModel(FLAGS_model_dir + "/model", FLAGS_model_dir + "/params");
if (FLAGS_use_gpu) {
config->EnableUseGpu(500, 0);
}
// switch to thread_local allocator.
config->EnableGpuMultiStream();
}

void Run(int thread_id) {
Config config;
PrepareConfig(&config);

// create predictor
static std::mutex mutex;

std::shared_ptr<Predictor> predictor;
{
std::unique_lock<std::mutex> lock(mutex);
predictor = CreatePredictor(config);
}

auto run_one_loop = [&](int batch_size) {
// prepare inputs.
int channels = 3;
int height = 224;
int width = 224;
int input_num = channels * height * width * batch_size;
std::vector<float> in_data(input_num, 0);
for (int i = 0; i < input_num; ++i) {
in_data[i] = i % 255 * 0.1;
}
auto in_names = predictor->GetInputNames();
auto in_handle = predictor->GetInputHandle(in_names[0]);
in_handle->Reshape({batch_size, channels, height, width});
in_handle->CopyFromCpu(in_data.data());
CHECK(predictor->Run());
auto out_names = predictor->GetOutputNames();
auto out_handle = predictor->GetOutputHandle(out_names[0]);
std::vector<float> out_data;
std::vector<int> temp_shape = out_handle->shape();
int output_num = std::accumulate(temp_shape.begin(), temp_shape.end(), 1,
std::multiplies<int>());
out_data.resize(output_num);
out_handle->CopyToCpu(out_data.data());
float mean_val = 0;
for (size_t j = 0; j < output_num; ++j) {
mean_val += out_data[j];
}
LOG(INFO) << "thread_id: " << thread_id << " batch_size: " << batch_size
<< " mean val: " << mean_val / output_num;
};

barrier_init.Wait();
run_one_loop(1);
barrier_warmup.Wait();

auto pause = [](const std::string &hint) {
if (FLAGS_test_leaky) {
return;
}
std::string temp;
LOG(INFO) << hint;
std::getline(std::cin, temp);
};

int run_times = 1;
if (FLAGS_test_leaky) {
run_times = 100;
}
for (int i = 0; i < run_times; ++i) {
run_one_loop(40);
pause("Pause, you can view the GPU memory usage, please enter any "
"character to continue running.");

// release memory pool.
predictor->ShrinkMemory();
pause("Pause, ShrinkMemory has been called, please observe the changes of "
"GPU memory.");

run_one_loop(1);
pause("Pause, you can view the GPU memory usage, please enter any "
"character to continue running.");
}
}
}

int main(int argc, char **argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
std::vector<std::thread> threads;
for (size_t i = 0; i < thread_num; ++i) {
threads.emplace_back([&, i]() { paddle_infer::Run(i); });
}
for (size_t i = 0; i < thread_num; ++i) {
threads[i].join();
}
LOG(INFO) << "Run done";
}
