Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Commit

Permalink
Simplify creation of NodeEntry instances and use emplace_back (#14095)
Browse files Browse the repository at this point in the history
* Optimize move semantics of NodeEntry
apache/tvm#2576
Making copies of a shared_ptr is more expensive than moving one.
This PR reduces lock contention by using move semantics in NNVM nodes,
also making it more convenient to construct NodeEntry instances in the code
thanks to the added constructors.

Update NDArray with NodeEntry constructors and refine initializer lists.

Sync gradient.cc with tvm

* Remove additional calls to NodeEntry in emplace_back

* refine patch

* Fix lint
  • Loading branch information
larroy authored and szha committed May 23, 2019
1 parent d4e458e commit 038b9fb
Show file tree
Hide file tree
Showing 35 changed files with 232 additions and 195 deletions.
2 changes: 1 addition & 1 deletion 3rdparty/dmlc-core
Submodule dmlc-core updated 1 files
+7 −4 src/io/filesys.cc
2 changes: 1 addition & 1 deletion 3rdparty/tvm
Submodule tvm updated from 8518c7 to 21935d
2 changes: 1 addition & 1 deletion contrib/clojure-package/test/good-test-ndarray-api.clj
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@
Defined in src/operator/nn/batch_norm.cc:L574
Defined in src/operator/nn/batch_norm.cc:L572
`data`: Input data to batch normalization
`gamma`: gamma array
Expand Down
2 changes: 1 addition & 1 deletion contrib/clojure-package/test/good-test-symbol-api.clj
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@
Defined in src/operator/nn/batch_norm.cc:L574
Defined in src/operator/nn/batch_norm.cc:L572
`data`: Input data to batch normalization (optional)
`gamma`: gamma array (optional)
Expand Down
22 changes: 22 additions & 0 deletions docs/faq/new_op.md
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,28 @@ output or nothing to calculating gradient.
For more complicated patterns, use `MakeGradNode(op_name, n, heads, dict)` to create gradient entries,
where heads are input entries to the backward op, composed from ograds and n->inputs.

When assembling a return vector declared as `std::vector<nnvm::NodeEntry> ret;`, a common pattern is
to either create the nodes in place, as in:

```
ret.emplace_back(MakeNode("zeros_like", n->attrs.name + "_xyz_backward",
                          {n->inputs[1]}, nullptr, &n));
```

Or create the node, modify it, and then move it into NodeEntry's constructor if the node is not to be
used again. This avoids unnecessary copies of the shared_ptr.

```
for (size_t i = 0; i < n->inputs.size(); ++i) {
nnvm::NodePtr node = nnvm::Node::Create();
node->attrs.op = copy_op;
node->inputs = {ograds[0]};
ret.emplace_back(std::move(node));
}
```

The first case benefits from return-value optimization (RVO); the second constructs the NodeEntry in
place from the moved NodePtr.

#### FCompute\<xpu\>

Simple operators can register FCompute<xpu> with `.set_attr<FCompute>("FCompute<cpu>", ...)` and `.set_attr<FCompute>("FCompute<gpu>", ...)` for both CPU and (optionally) GPU computation.
Expand Down
53 changes: 32 additions & 21 deletions include/mxnet/ndarray.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ class MKLDNNMemory;
class NDArray {
public:
/*! \brief default constructor */
NDArray() {
NDArray()
: entry_(nullptr) {
}
/*!
* \brief constructs a new dynamic NDArray
Expand All @@ -94,8 +95,10 @@ class NDArray {
NDArray(const mxnet::TShape &shape, Context ctx,
bool delay_alloc = false, int dtype = mshadow::default_type_flag)
: ptr_(std::make_shared<Chunk>(shape, ctx, delay_alloc, dtype)),
shape_(shape), dtype_(dtype), storage_type_(kDefaultStorage),
entry_({nullptr, 0, 0}) {
shape_(shape),
dtype_(dtype),
storage_type_(kDefaultStorage),
entry_(nullptr) {
}
/*! \brief constructor for NDArray with storage type
*/
Expand All @@ -109,11 +112,12 @@ class NDArray {
* \param ctx context of NDArray
* \param dtype data type of this ndarray
*/
explicit NDArray(Context ctx, int dtype = mshadow::default_type_flag) {
ptr_ = std::make_shared<Chunk>(mxnet::TShape(mshadow::Shape1(0)), ctx, true, dtype);
dtype_ = dtype;
storage_type_ = kDefaultStorage;
entry_ = {nullptr, 0, 0};
explicit NDArray(Context ctx, int dtype = mshadow::default_type_flag)
: ptr_(std::make_shared<Chunk>(mxnet::TShape(mshadow::Shape1(0)), ctx, true, dtype)),
shape_(),
dtype_(dtype),
storage_type_(kDefaultStorage),
entry_(nullptr) {
}
/*!
* \brief constructing a static NDArray that shares data with TBlob
Expand All @@ -123,9 +127,11 @@ class NDArray {
* \param dev_id the device id this tensor sits at
*/
NDArray(const TBlob &data, int dev_id)
: ptr_(std::make_shared<Chunk>(data, dev_id)), shape_(data.shape_),
dtype_(data.type_flag_), storage_type_(kDefaultStorage),
entry_({nullptr, 0, 0}) {
: ptr_(std::make_shared<Chunk>(data, dev_id)),
shape_(data.shape_),
dtype_(data.type_flag_),
storage_type_(kDefaultStorage),
entry_(nullptr) {
}

/*!
Expand All @@ -137,20 +143,22 @@ class NDArray {
* \param deleter the function pointer of custom deleter
*/
NDArray(const TBlob &data, int dev_id, const std::function<void()>& deleter)
: ptr_(new Chunk(data, dev_id),
[deleter](Chunk *p) {
deleter(); // call custom deleter
delete p; // delete Chunk object
: ptr_(new Chunk(data, dev_id), [deleter](Chunk *p) {
deleter(); // call custom deleter
delete p; // delete Chunk object
}),
shape_(data.shape_),
dtype_(data.type_flag_), storage_type_(kDefaultStorage),
entry_({nullptr, 0, 0}) {
entry_(nullptr) {
}

/*! \brief create ndarray from shared memory */
NDArray(int shared_pid, int shared_id, const mxnet::TShape& shape, int dtype)
: ptr_(std::make_shared<Chunk>(shared_pid, shared_id, shape, dtype)), shape_(shape),
dtype_(dtype), storage_type_(kDefaultStorage), entry_({nullptr, 0, 0}) {
: ptr_(std::make_shared<Chunk>(shared_pid, shared_id, shape, dtype)),
shape_(shape),
dtype_(dtype),
storage_type_(kDefaultStorage),
entry_(nullptr) {
}

/*!
Expand All @@ -165,8 +173,11 @@ class NDArray {
*/
NDArray(const NDArrayStorageType stype, const mxnet::TShape &shape,
const TBlob &data, const std::vector<TBlob> &aux_data, int dev_id)
: ptr_(std::make_shared<Chunk>(stype, data, aux_data, dev_id)), shape_(shape),
dtype_(data.type_flag_), storage_type_(stype), entry_({nullptr, 0, 0}) {
: ptr_(std::make_shared<Chunk>(stype, data, aux_data, dev_id)),
shape_(shape),
dtype_(data.type_flag_),
storage_type_(stype),
entry_(nullptr) {
}
/*!
* \brief initialize the NDArray, assuming it is not assigned a meaningful shape before
Expand Down Expand Up @@ -642,7 +653,7 @@ class NDArray {
*/
NDArray Detach() const {
NDArray ret(*this);
ret.entry_ = nnvm::NodeEntry{nullptr, 0, 0};
ret.entry_ = nnvm::NodeEntry(nullptr);
return ret;
}

Expand Down
2 changes: 1 addition & 1 deletion src/c_api/c_api_function.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ std::vector<nnvm::NodeEntry> Gradient(

std::vector<nnvm::NodeEntry> ret;
for (uint32_t i = 0; i < g->num_outputs(); ++i) {
ret.emplace_back(nnvm::NodeEntry{g, i, 0});
ret.emplace_back(g, i, 0);
}

return ret;
Expand Down
11 changes: 6 additions & 5 deletions src/executor/graph_executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -223,11 +223,12 @@ nnvm::NodeEntry AggregateGradient(std::vector<nnvm::NodeEntry>&& v) {
ng->attrs.op = Op::Get("_zeros_without_dtype");
ng->attrs.name = "zeros_without_dtype";
ng->attrs.op->attr_parser(&(ng->attrs));
return nnvm::NodeEntry{ng, 0, 0};
return nnvm::NodeEntry(std::move(ng), 0, 0);
}

// remove zero in the sum. at least keep 1.
auto begin = std::remove_if(v.begin(), v.end(), [](const nnvm::NodeEntry& nodeEntry) {
CHECK(nodeEntry.node);
return nodeEntry.node->op() == zeros_op || nodeEntry.node->op() == zeros_like_op;
});
if (begin == v.begin()) ++begin;
Expand All @@ -244,7 +245,7 @@ nnvm::NodeEntry AggregateGradient(std::vector<nnvm::NodeEntry>&& v) {
sum_node->attrs.dict["num_args"] = std::to_string(v.size());
sum_node->attrs.op->attr_parser(&(sum_node->attrs));
sum_node->inputs = std::move(v);
return nnvm::NodeEntry{sum_node, 0, 0};
return nnvm::NodeEntry(std::move(sum_node), 0, 0);
} else {
// use a stream line of plus instead
nnvm::NodeEntry ret = v[0];
Expand Down Expand Up @@ -274,7 +275,7 @@ nnvm::NodeEntry AggregateGradient(std::vector<nnvm::NodeEntry>&& v) {
x->attrs.op = ewise_plus_op;
x->attrs.name = os.str();
x->inputs = {ret, v[i]};
ret = nnvm::NodeEntry{x, 0, 0};
ret = nnvm::NodeEntry(std::move(x), 0, 0);
}
// identity node is used to avoid exposure of dummy plus node
// when its output get assigned to another space.
Expand Down Expand Up @@ -323,15 +324,15 @@ nnvm::Graph GraphExecutor::InitFullGraph(nnvm::Symbol symbol,
}
if (!need_grad_) return g;
for (size_t i = 0; i < g.outputs.size(); ++i) {
NodeEntry ngrad{nnvm::Node::Create(), 0, 0};
NodeEntry ngrad(nnvm::Node::Create(), 0, 0);
head_grad_entry_.emplace_back(AttrHint(ngrad, g.outputs[i]));
head_grad_map_[ngrad.node.get()] = i;
}
std::vector<NodePtr> args = symbol.ListInputs(nnvm::Symbol::kReadOnlyArgs);
std::vector<NodeEntry> xs;
for (size_t i = 0; i < grad_req_types.size(); ++i) {
if (grad_req_types[i] != kNullOp) {
xs.emplace_back(NodeEntry{args[i], 0, 0});
xs.emplace_back(args[i]);
}
}

Expand Down
4 changes: 3 additions & 1 deletion src/executor/infer_graph_attr_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -628,7 +628,9 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret,
}
if (dispatch_mode_name) {
for (size_t i = node_start; i < node_end; i++) {
if (dispatch_modes[i] == DispatchMode::kUndefined) ++num_unknown;
if (dispatch_modes[i] == DispatchMode::kUndefined) {
++num_unknown;
}
}
}
++i;
Expand Down
69 changes: 37 additions & 32 deletions src/imperative/cached_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ CachedOp::CachedOp(
using namespace nnvm;
using namespace imperative;
static const std::vector<const Op*> zero_ops{Op::Get("zeros_like"), Op::Get("_zeros")};
static const auto _copy = Op::Get("_copy");
static const auto _copy_op = Op::Get("_copy");
config_.Init(flags);

if (config_.static_shape) {
Expand All @@ -107,21 +107,21 @@ CachedOp::CachedOp(

// construct forward graph
{
NodeEntryMap<int> dedup_out;
for (const auto& i : sym.outputs) {
if (dedup_out.count(i)) {
NodeEntryMap<size_t> dedup_out;
for (const NodeEntry& nodeEntry : sym.outputs) {
if (dedup_out.find(nodeEntry) != dedup_out.end()) {
NodePtr copy_node = Node::Create();
copy_node->attrs.op = _copy;
copy_node->attrs.op = _copy_op;
copy_node->attrs.name =
i.node->attrs.name + "_copy" + std::to_string(dedup_out[i]++);
copy_node->inputs.emplace_back(i);
if (_copy->attr_parser != nullptr) {
_copy->attr_parser(&(copy_node->attrs));
nodeEntry.node->attrs.name + "_copy" + std::to_string(dedup_out[nodeEntry]++);
copy_node->inputs.emplace_back(nodeEntry);
if (_copy_op->attr_parser != nullptr) {
_copy_op->attr_parser(&(copy_node->attrs));
}
fwd_graph_.outputs.emplace_back(copy_node, 0, 0);
fwd_graph_.outputs.emplace_back(std::move(copy_node));
} else {
dedup_out.insert({i, 0});
fwd_graph_.outputs.push_back(i);
dedup_out.emplace(nodeEntry, 0);
fwd_graph_.outputs.push_back(nodeEntry);
}
}
const auto& idx = fwd_graph_.indexed_graph();
Expand All @@ -143,14 +143,15 @@ CachedOp::CachedOp(

// Set params
{
const auto& idx = fwd_graph_.indexed_graph();
const auto& indexed_graph = fwd_graph_.indexed_graph();
if (config_.data_indices.ndim() || config_.param_indices.ndim()) {
CHECK_EQ(config_.data_indices.ndim() + config_.param_indices.ndim(),
idx.input_nodes().size());
indexed_graph.input_nodes().size());
} else {
std::vector<uint32_t> tmp;
for (size_t i = 0; i < idx.input_nodes().size(); ++i) {
tmp.push_back(i);
tmp.reserve(indexed_graph.input_nodes().size());
for (size_t i = 0; i < indexed_graph.input_nodes().size(); ++i) {
tmp.emplace_back(i);
}
config_.data_indices.assign(tmp.begin(), tmp.end());
}
Expand All @@ -159,20 +160,20 @@ CachedOp::CachedOp(
// construct backward graph
{
ograd_entries_.reserve(fwd_graph_.outputs.size());
for (size_t i = 0; i < fwd_graph_.outputs.size(); ++i) {
ograd_entries_.emplace_back(NodeEntry{Node::Create(), 0, 0});
}
for (size_t i = 0; i < fwd_graph_.outputs.size(); ++i)
ograd_entries_.emplace_back(Node::Create());

std::vector<NodeEntry> xs;
const auto& idx = fwd_graph_.indexed_graph();
for (size_t i = 0; i < idx.input_nodes().size(); ++i) {
auto nid = idx.input_nodes()[i];
if (idx.mutable_input_nodes().count(nid)) continue;
const IndexedGraph& indexed_graph = fwd_graph_.indexed_graph();
for (size_t i = 0; i < indexed_graph.input_nodes().size(); ++i) {
const uint32_t node_id = indexed_graph.input_nodes()[i];
if (indexed_graph.mutable_input_nodes().count(node_id))
continue;
fwd_input_to_grad_output_[i] = xs.size();
xs.emplace_back(NodeEntry{idx[nid].weak_ref.lock(), 0, 0});
xs.emplace_back(indexed_graph[node_id].weak_ref.lock());
}

CHECK_GT(xs.size(), 0)
CHECK(!xs.empty())
<< "There are no inputs in computation graph that require gradients.";

grad_graph_ = pass::MXGradient(
Expand All @@ -199,7 +200,7 @@ CachedOp::CachedOp(
}

auto full_ref_count = fwd_graph_.GetAttr<std::vector<uint32_t> >("forward_ref_count");
for (size_t i = 0; i < num_forward_entries; ++i) full_ref_count[i] += ref_count[i];
for (size_t i = 0; i < num_forward_entries; ++i) full_ref_count.at(i) += ref_count[i];
fwd_graph_.attrs["full_ref_count"] =
std::make_shared<dmlc::any>(std::move(full_ref_count));

Expand Down Expand Up @@ -238,9 +239,12 @@ std::vector<nnvm::NodeEntry> CachedOp::Gradient(
p->attrs.parsed = node->attrs.parsed;
p->control_deps.push_back(node);
p->inputs.reserve(bwd_ograd_dep_.size() + bwd_in_dep_.size() + bwd_out_dep_.size());
for (auto i : bwd_ograd_dep_) p->inputs.push_back(ograds[i]);
for (auto i : bwd_in_dep_) p->inputs.push_back(node->inputs[i]);
for (auto i : bwd_out_dep_) p->inputs.emplace_back(NodeEntry{node, i, 0});
for (auto i : bwd_ograd_dep_)
p->inputs.push_back(ograds[i]);
for (auto i : bwd_in_dep_)
p->inputs.push_back(node->inputs[i]);
for (auto i : bwd_out_dep_)
p->inputs.emplace_back(node, i, 0);
std::vector<NodeEntry> ret;
ret.reserve(num_inputs());
const auto& auxs = mutable_input_nodes();
Expand All @@ -251,13 +255,14 @@ std::vector<nnvm::NodeEntry> CachedOp::Gradient(
uint32_t k = 0;
for (const auto& i : fwd_graph_.indexed_graph().input_nodes()) {
if (auxs.count(i)) {
ret.emplace_back(NodeEntry{nop, 0, 0});
ret.emplace_back(nop);
} else {
ret.emplace_back(NodeEntry{p, k++, 0});
ret.emplace_back(p, k++, 0);
}
}
} else {
for (uint32_t i = 0; i < num_inputs(); ++i) ret.emplace_back(NodeEntry{p, i, 0});
for (uint32_t i = 0; i < num_inputs(); ++i)
ret.emplace_back(p, i, 0);
}
return ret;
}
Expand Down
4 changes: 2 additions & 2 deletions src/imperative/imperative.cc
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ void Imperative::GetBackwardDependency(
std::vector<nnvm::NodeEntry> ograd_entries;
ograd_entries.reserve(num_outputs);
for (uint32_t i = 0; i < num_outputs; ++i) {
ograd_entries.emplace_back(nnvm::NodeEntry{nullptr, i, 1});
ograd_entries.emplace_back(nullptr, i, 1);
}
auto igrad_entries = fgradient[node->op()](node, ograd_entries);
for (const auto& i : igrad_entries) {
Expand Down Expand Up @@ -363,7 +363,7 @@ std::vector<NDArray*> Imperative::Backward(
auto node = Node::Create();
node->attrs.op = copy_op;
node->inputs.push_back(e);
graph.outputs.emplace_back(node, 0, 0);
graph.outputs.emplace_back(std::move(node));
} else {
graph.outputs.push_back(e);
}
Expand Down
Loading

0 comments on commit 038b9fb

Please sign in to comment.