From 0e4647723d4e7bccf7d569fabd952ba8e7d2217e Mon Sep 17 00:00:00 2001
From: Pedro Larroy <928489+larroy@users.noreply.github.com>
Date: Wed, 31 Jan 2018 21:07:04 +0100
Subject: [PATCH] Minor refactor: prevent string copying, shared_ptr by ref
 (#8872)

---
 include/mxnet/base.h                    | 12 +++---
 src/common/utils.h                      |  4 +-
 src/engine/stream_manager.h             |  6 +--
 src/engine/thread_pool.h                | 20 ++++-----
 src/engine/threaded_engine.cc           | 24 +++++------
 src/engine/threaded_engine.h            | 12 +++---
 src/engine/threaded_engine_perdevice.cc | 54 ++++++++++++-------------
 7 files changed, 66 insertions(+), 66 deletions(-)

diff --git a/include/mxnet/base.h b/include/mxnet/base.h
index 262c348eb88f..f482e4fc0b10 100644
--- a/include/mxnet/base.h
+++ b/include/mxnet/base.h
@@ -20,7 +20,7 @@
 /*!
  * Copyright (c) 2015 by Contributors
  * \file base.h
- * \brief configuation of mxnet as well as basic data structure.
+ * \brief configuration of MXNet as well as basic data structure.
  */
 #ifndef MXNET_BASE_H_
 #define MXNET_BASE_H_
@@ -243,7 +243,7 @@ struct Context {
    * \param str the string pattern
    * \return Context
    */
-  inline static Context FromString(std::string str);
+  inline static Context FromString(const std::string& str);
 };

 /*!
@@ -316,15 +316,15 @@ inline Context Context::GPU(int32_t dev_id) {
   return Create(kGPU, dev_id);
 }

-inline Context Context::FromString(std::string str) {
+inline Context Context::FromString(const std::string& str) {
   Context ret;
   try {
-    std::string::size_type l = str.find('(');
+    const std::string::size_type l = str.find('(');
     CHECK_NE(l, std::string::npos);
-    std::string::size_type r = str.find(')');
+    const std::string::size_type r = str.find(')');
     CHECK_EQ(r, str.length()-1);
-    std::string type = str.substr(0, l);
+    const std::string type = str.substr(0, l);
     int id = std::stoi(str.substr(l+1, r-l-1));
     if (type == "cpu") {
       ret = CPU(id);
diff --git a/src/common/utils.h b/src/common/utils.h
index 73ba6dbe69b1..4f84a54e5038 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -439,7 +439,7 @@ inline void LogStorageFallback(const nnvm::NodeAttrs& attrs,
 }

 // heuristic to dermine number of threads per GPU
-inline int GetNumThreadPerGPU() {
+inline int GetNumThreadsPerGPU() {
   // This is resource efficient option.
   return dmlc::GetEnv("MXNET_GPU_WORKER_NTHREADS", 2);
 }
@@ -449,7 +449,7 @@ inline int GetNumThreadPerGPU() {
 inline int GetExecNumMatchColor() {
   // This is resource efficient option.
   int num_match_color = dmlc::GetEnv("MXNET_EXEC_NUM_TEMP", 1);
-  return std::min(num_match_color, GetNumThreadPerGPU());
+  return std::min(num_match_color, GetNumThreadsPerGPU());
 }

 template<typename T, typename V>
diff --git a/src/engine/stream_manager.h b/src/engine/stream_manager.h
index 432bccf27df4..ddbfde81f055 100644
--- a/src/engine/stream_manager.h
+++ b/src/engine/stream_manager.h
@@ -51,7 +51,7 @@ class StreamManager {
   RunContext GetIORunContext(Context const& ctx);
   void Finalize();
  private:
-  std::mutex m_;
+  std::mutex mutex_;
 #if MXNET_USE_CUDA
   std::array<std::array<mshadow::Stream<gpu>*, kStreams>, kNumGpus>
       gpu_streams_;
@@ -74,7 +74,7 @@ RunContext StreamManager<kNumGpus, kStreams>::GetRunContext(
       std::size_t use_counter;
       CUDA_CALL(cudaSetDevice(ctx.dev_id));
       {
-        std::lock_guard<std::mutex> lock{m_};
+        std::lock_guard<std::mutex> lock{mutex_};
         auto&& counter = gpu_cnt_.at(ctx.dev_id);
         if (counter == -1) {
           for (auto&& i : gpu_streams_.at(ctx.dev_id)) {
@@ -109,7 +109,7 @@ RunContext StreamManager<kNumGpus, kStreams>::GetIORunContext(
 #if MXNET_USE_CUDA
       CUDA_CALL(cudaSetDevice(ctx.dev_id));
       {
-        std::lock_guard<std::mutex> lock{m_};
+        std::lock_guard<std::mutex> lock{mutex_};
         if (gpu_io_streams_.at(ctx.dev_id) == nullptr) {
           gpu_io_streams_.at(ctx.dev_id) = mshadow::NewStream<gpu>(false, false, ctx.dev_id);
         }
diff --git a/src/engine/thread_pool.h b/src/engine/thread_pool.h
index b4dae6bfd41d..d40c51a3bdb8 100644
--- a/src/engine/thread_pool.h
+++ b/src/engine/thread_pool.h
@@ -58,8 +58,8 @@ class ThreadPool {
   /*! \brief Signal event upon destruction, even for exceptions (RAII) */
   struct SetReadyOnDestroy {
-    explicit inline SetReadyOnDestroy(std::shared_ptr<SimpleEvent> *event)
-      : event_(*event) {
+    explicit inline SetReadyOnDestroy(const std::shared_ptr<SimpleEvent>& event)
+      : event_(event) {
     }
     inline ~SetReadyOnDestroy() {
       if (event_) {
@@ -82,18 +82,20 @@ class ThreadPool {
    */
   explicit ThreadPool(size_t size, std::function<void()> func)
       : worker_threads_(size) {
+    CHECK_GT(size, 0);
     for (auto& i : worker_threads_) {
       i = std::thread(func);
     }
   }
   explicit ThreadPool(size_t size,
-                      std::function<void(std::shared_ptr<SimpleEvent> ready)> func,
+                      std::function<void(const std::shared_ptr<SimpleEvent>& ready)> func,
                       const bool wait)
       : worker_threads_(size) {
+    CHECK_GT(size, 0);
+    ready_events_.reserve(size);
     for (auto& i : worker_threads_) {
-      std::shared_ptr<SimpleEvent> ptr = std::make_shared<SimpleEvent>();
-      ready_events_.emplace_back(ptr);
-      i = std::thread(func, ptr);
+      ready_events_.emplace_back(std::make_shared<SimpleEvent>());
+      i = std::thread(func, ready_events_.back());
     }
     if (wait) {
       WaitForReady();
@@ -110,8 +112,8 @@ class ThreadPool {
    * \brief Wait for all started threads to signal that they're ready
    */
   void WaitForReady() {
-    for (std::shared_ptr<SimpleEvent> ptr : ready_events_) {
-      ptr->wait();
+    for (const std::shared_ptr<SimpleEvent>& event : ready_events_) {
+      event->wait();
     }
   }
@@ -122,7 +124,7 @@ class ThreadPool {
   /*!
    * \brief Startup synchronization objects
    */
-  std::list<std::shared_ptr<SimpleEvent>> ready_events_;
+  std::vector<std::shared_ptr<SimpleEvent> > ready_events_;
   /*!
    * \brief Disallow default construction.
   */
diff --git a/src/engine/threaded_engine.cc b/src/engine/threaded_engine.cc
index b17d92863725..2b28a7d602a3 100644
--- a/src/engine/threaded_engine.cc
+++ b/src/engine/threaded_engine.cc
@@ -49,7 +49,7 @@ ThreadedVar::ThreadedVar(VersionedVarBlock* head) : head_{head} {
 }

 inline void ThreadedVar::AppendReadDependency(OprBlock* opr_block) {
-  std::lock_guard<std::mutex> lock{m_};
+  std::lock_guard<std::mutex> lock{mutex_};
   if (pending_write_ == nullptr) {
     // invariant: is_ready_to_read()
     CHECK_GE(num_pending_reads_, 0);
@@ -71,7 +71,7 @@ inline void ThreadedVar::AppendReadDependency(OprBlock* opr_block) {

 inline void ThreadedVar::AppendWriteDependency(OprBlock* opr_block) {
   auto&& new_var_block = VersionedVarBlock::New();
-  std::lock_guard<std::mutex> lock{m_};
+  std::lock_guard<std::mutex> lock{mutex_};
   // invariant.
   assert(head_->next == nullptr);
   assert(head_->trigger == nullptr);
@@ -102,7 +102,7 @@ inline void ThreadedVar::CompleteReadDependency(Dispatcher dispatcher) {
   OprBlock *trigger = nullptr;
   {
     // this is lock scope
-    std::lock_guard<std::mutex> lock{m_};
+    std::lock_guard<std::mutex> lock{mutex_};
     CHECK_GT(num_pending_reads_, 0);

     if (--num_pending_reads_ == 0) {
@@ -124,7 +124,7 @@ inline bool ThreadedVar::CompleteWriteDependency(Dispatcher dispatcher) {
   VersionedVarBlock *old_pending_write, *end_of_read_chain;
   OprBlock* trigger_write = nullptr;
   {
-    std::lock_guard<std::mutex> lock{m_};
+    std::lock_guard<std::mutex> lock{mutex_};
     // invariants
     assert(head_->next == nullptr);
     assert(pending_write_ != nullptr);
@@ -187,12 +187,12 @@ inline bool ThreadedVar::CompleteWriteDependency(Dispatcher dispatcher) {
 }

 inline void ThreadedVar::SetToDelete() {
-  std::lock_guard<std::mutex> lock{m_};
+  std::lock_guard<std::mutex> lock{mutex_};
   to_delete_ = true;
 }

 inline bool ThreadedVar::ready_to_read() {
-  std::lock_guard<std::mutex> lock{m_};
+  std::lock_guard<std::mutex> lock{mutex_};
   return this->is_ready_to_read();
 }
@@ -228,8 +228,8 @@ void ThreadedEngine::CheckDuplicate(std::vector<VarHandle> const& const_vars,
   // Check for duplicates.
   auto use = const_vars;
   auto mutate = mutable_vars;
-  auto use_size = use.size();
-  auto mutate_size = mutate.size();
+  const size_t use_size = use.size();
+  const size_t mutate_size = mutate.size();
   std::sort(use.begin(), use.end());
   std::sort(mutate.begin(), mutate.end());
   for (std::size_t i = 0; i < use_size; ++i) {
@@ -381,7 +381,7 @@ void ThreadedEngine::WaitForVar(VarHandle var) {
     std::unique_lock<std::mutex> lock{finished_m_};
     finished_cv_.wait(lock, [this, &done]() {
         return done.load() || kill_.load();
-      });
+    });
   }
 }
@@ -403,11 +403,11 @@ inline void ThreadedEngine::OnComplete(ThreadedOpr* threaded_opr) {
   }
   // Mark complete for write variables.
   for (auto&& i : threaded_opr->mutable_vars) {
-    bool debug_info = (engine_info_ && debug_wait_var_ == i);
+    const bool debug_info = (engine_info_ && debug_wait_var_ == i);
     if (debug_info) {
       LOG(INFO) << "Complete write dep for " << i;
     }
-    bool to_delete = i->CompleteWriteDependency(
+    const bool to_delete = i->CompleteWriteDependency(
         [this, debug_info](OprBlock* opr) {
           if (debug_info) {
             LOG(INFO) << "PushToExecute " << opr;
@@ -426,7 +426,7 @@ inline void ThreadedEngine::OnComplete(ThreadedOpr* threaded_opr) {
   // could execute right after we mark all vars as complete, so if
   // threaded_opr is not temporary, its value is not reliable
   // anymore start from here.
-  int npending;
+  int npending = 0;
   {
     std::unique_lock<std::mutex> lock{finished_m_};
     npending = --pending_;
diff --git a/src/engine/threaded_engine.h b/src/engine/threaded_engine.h
index d85321c52c9f..1524f257560f 100644
--- a/src/engine/threaded_engine.h
+++ b/src/engine/threaded_engine.h
@@ -85,8 +85,8 @@ struct OprBlock : public common::ObjectPoolAllocatable<OprBlock> {
    * \return the wait counter after the decreasement.
    */
   inline int decr_wait() {
-    // chack invariant, avoid over trigger
-    int ret = --wait;
+    // check invariant, avoid over trigger
+    const int ret = --wait;
     CHECK_GE(ret, 0);
     return ret;
   }
@@ -112,8 +112,8 @@ struct VersionedVarBlock
  * \brief Variable implementation.
  * Each ThreadedVar is a linked list(queue) of operations to be performed.
  */
-class ThreadedVar final : public Var,
-                          public common::ObjectPoolAllocatable<ThreadedVar> {
+class ThreadedVar final
+    : public Var, public common::ObjectPoolAllocatable<ThreadedVar> {
  public:
   /*!
    * \brief constructor
@@ -180,7 +180,7 @@ class ThreadedVar final
   // TODO(hotpxl) change this to spinlock for faster runtime
   // TODO(hotpxl) consider rename head
   /*! \brief inetrnal mutex of the ThreadedVar */
-  std::mutex m_;
+  std::mutex mutex_;
   /*!
    * \brief number of pending reads operation in the variable.
    * will be marked as -1 when there is a already triggered pending write.
@@ -446,7 +446,7 @@ class ThreadedEngine : public Engine {
     if (!bulk_status.count) return;
     bulk_status.count = 0;
     DeduplicateVarHandle(&bulk_status.const_vars, &bulk_status.mutable_vars);
-    auto fn = std::move(bulk_status.fn);
+    SyncFn fn = std::move(bulk_status.fn);
     this->PushAsync([fn](RunContext ctx, CallbackOnComplete on_complete) {
         fn(ctx);
         on_complete();
diff --git a/src/engine/threaded_engine_perdevice.cc b/src/engine/threaded_engine_perdevice.cc
index e7e222f6cbe3..638062fac437 100644
--- a/src/engine/threaded_engine_perdevice.cc
+++ b/src/engine/threaded_engine_perdevice.cc
@@ -89,7 +89,7 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
   void Start() override {
     if (is_worker_) return;
-    gpu_worker_nthreads_ = common::GetNumThreadPerGPU();
+    gpu_worker_nthreads_ = common::GetNumThreadsPerGPU();
     cpu_worker_nthreads_ = dmlc::GetEnv("MXNET_CPU_WORKER_NTHREADS", 1);
     // create CPU task
     int cpu_priority_nthreads = dmlc::GetEnv("MXNET_CPU_PRIORITY_NTHREADS", 4);
@@ -118,17 +118,15 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
       if (opr_block->opr->prop == FnProperty::kCPUPrioritized) {
         cpu_priority_worker_->task_queue.Push(opr_block, opr_block->priority);
       } else {
-        int dev_id = ctx.dev_id;
-        int nthread = cpu_worker_nthreads_;
-        auto ptr =
-          cpu_normal_workers_.Get(dev_id, [this, ctx, nthread]() {
-            auto blk = new ThreadWorkerBlock<kWorkerQueue>();
-            blk->pool.reset(new ThreadPool(nthread,
+        const size_t nthreads = cpu_worker_nthreads_;
+        auto ptr = cpu_normal_workers_.Get(ctx.dev_id, [this, ctx, nthreads]() {
+          auto blk = new ThreadWorkerBlock<kWorkerQueue>();
+          blk->pool.reset(new ThreadPool(nthreads,
             [this, ctx, blk](std::shared_ptr<ThreadPool::SimpleEvent> ready_event) {
               this->CPUWorker(ctx, blk, ready_event);
             }, true));
-            return blk;
-          });
+          return blk;
+        });
         if (ptr) {
           if (opr_block->opr->prop == FnProperty::kDeleteVar) {
             ptr->task_queue.PushFront(opr_block, opr_block->priority);
@@ -140,24 +138,23 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
     } else {
       CHECK_EQ(ctx.dev_mask(), Context::kGPU);
       // GPU execution.
-      FnProperty prop = opr_block->opr->prop;
-      bool is_copy = (prop == FnProperty::kCopyFromGPU ||
-                      prop == FnProperty::kCopyToGPU);
-      int nthread = gpu_worker_nthreads_;
+      const FnProperty prop = opr_block->opr->prop;
+      const bool is_copy = (prop == FnProperty::kCopyFromGPU ||
+                            prop == FnProperty::kCopyToGPU);
+      const size_t nthread = gpu_worker_nthreads_;
       if (is_copy) {
-        auto ptr =
-          gpu_copy_workers_.Get(ctx.dev_id, [this, ctx, is_copy, nthread]() {
+        auto ptr = gpu_copy_workers_.Get(ctx.dev_id, [this, ctx, is_copy, nthread]() {
           // Signify to kernel that GPU is being used, so reserve cores as necessary
           OpenMP::Get()->set_reserve_cores(GetReserveCoreCount(true));
           auto blk = new ThreadWorkerBlock<kCopyQueue>();
-          blk->pool.reset(new ThreadPool(
-            nthread,
-            [this, ctx, is_copy, blk]
-              (std::shared_ptr<ThreadPool::SimpleEvent> ready_event) {
-                this->GPUWorker(ctx, is_copy, blk, ready_event);
-              }, true));
-          return blk;
-        });
+          blk->pool.reset(new ThreadPool(
+              nthread,
+              [this, ctx, is_copy, blk]
+                  (std::shared_ptr<ThreadPool::SimpleEvent> ready_event) {
+                this->GPUWorker(ctx, is_copy, blk, ready_event);
+              }, true));
+          return blk;
+        });
         if (ptr) {
           if (opr_block->opr->prop == FnProperty::kDeleteVar) {
             ptr->task_queue.PushFront(opr_block, opr_block->priority);
@@ -177,7 +174,7 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
                 this->GPUWorker(ctx, is_copy, blk, ready_event);
               }, true));
           return blk;
-          });
+        });
         if (ptr) {
           if (opr_block->opr->prop == FnProperty::kDeleteVar) {
             ptr->task_queue.PushFront(opr_block, opr_block->priority);
@@ -207,9 +204,9 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
   /*! \brief whether this is a worker thread. */
   static MX_THREAD_LOCAL bool is_worker_;
   /*! \brief number of concurrent thread cpu worker uses */
-  int cpu_worker_nthreads_;
+  size_t cpu_worker_nthreads_;
   /*! \brief number of concurrent thread each gpu worker uses */
-  int gpu_worker_nthreads_;
+  size_t gpu_worker_nthreads_;
   // cpu worker
   common::LazyAllocArray<ThreadWorkerBlock<kWorkerQueue> > cpu_normal_workers_;
   // cpu priority worker
@@ -228,12 +225,13 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
   inline void GPUWorker(Context ctx,
                         bool is_copy_worker,
                         ThreadWorkerBlock<type> *block,
-                        std::shared_ptr<ThreadPool::SimpleEvent> ready_event) {
+                        const std::shared_ptr<ThreadPool::SimpleEvent>& ready_event) {
     this->is_worker_ = true;
 #if MXNET_USE_CUDA
+    CHECK(block != nullptr);
     mshadow::Stream<gpu> *stream;
     do {
-      ThreadPool::SimpleEvent::SetReadyOnDestroy setReady(&ready_event);
+      ThreadPool::SimpleEvent::SetReadyOnDestroy setReady(ready_event);
       // allocate stream
       mshadow::SetDevice<gpu>(ctx.dev_id);
       if (is_copy_worker) {
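
The startup pattern this patch converges on, spread here across thread_pool.h and threaded_engine_perdevice.cc, condenses to the standalone sketch below. It is an illustrative stand-in, not MXNet code: SimpleEvent is a minimal event built on std::condition_variable in place of ThreadPool::SimpleEvent, and Worker stands in for CPUWorker/GPUWorker. It demonstrates the three changes the diff makes around worker startup: the ready event travels as const std::shared_ptr& so the call itself touches no reference count, the ready-event vector is reserved before threads are spawned, and SetReadyOnDestroy signals through RAII so the event fires even if the worker body exits early.

// Minimal sketch of the ready-event pattern, assuming only the standard
// library. SimpleEvent and Worker are simplified stand-ins, not the MXNet
// classes of the same names.
#include <condition_variable>
#include <cstddef>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>

class SimpleEvent {
 public:
  void wait() {
    std::unique_lock<std::mutex> lock(mutex_);
    cv_.wait(lock, [this]() { return signaled_; });
  }
  void signal() {
    {
      std::lock_guard<std::mutex> lock(mutex_);
      signaled_ = true;
    }
    cv_.notify_all();
  }
  /*! \brief Signal the event on scope exit, even when unwinding from an exception. */
  struct SetReadyOnDestroy {
    explicit SetReadyOnDestroy(const std::shared_ptr<SimpleEvent>& event)
        : event_(event) {}  // const ref: no refcount bump on the call itself
    ~SetReadyOnDestroy() {
      if (event_) event_->signal();
    }
    std::shared_ptr<SimpleEvent> event_;  // owning copy keeps the event alive
  };

 private:
  std::mutex mutex_;
  std::condition_variable cv_;
  bool signaled_ = false;
};

// Stand-in for CPUWorker/GPUWorker: do per-thread setup, signalling readiness via RAII.
void Worker(const std::shared_ptr<SimpleEvent>& ready) {
  SimpleEvent::SetReadyOnDestroy set_ready(ready);
  // ... per-thread setup (streams, task queues) would run here ...
}

int main() {
  const std::size_t size = 4;
  std::vector<std::shared_ptr<SimpleEvent> > ready_events;
  std::vector<std::thread> workers;
  ready_events.reserve(size);  // mirrors the patch: reserve, then emplace in place
  for (std::size_t i = 0; i < size; ++i) {
    ready_events.emplace_back(std::make_shared<SimpleEvent>());
    workers.emplace_back(Worker, ready_events.back());
  }
  // Equivalent of WaitForReady(): block until every worker has signalled.
  for (const std::shared_ptr<SimpleEvent>& event : ready_events) {
    event->wait();
  }
  for (auto& t : workers) t.join();
  return 0;
}

When the real ThreadPool is constructed with wait = true, the equivalent of the wait loop at the end of main runs inside the constructor, so the pool does not return until every worker has signalled readiness.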