From f2a96a0a5b97aa0dc382df7a4866a219e4b884ad Mon Sep 17 00:00:00 2001
From: tqchen
Date: Wed, 16 Sep 2015 17:29:27 -0700
Subject: [PATCH] Add graph coloring based temp space allocation.

---
 example/cifar10/cifar10.py   |   1 +
 src/engine/stream_manager.h  |   3 +-
 src/symbol/graph_algorithm.h | 116 +++++++++++++++++++++++++++++++++++
 src/symbol/graph_executor.cc |  51 +++++++++++++--
 src/symbol/graph_executor.h  |   2 +
 5 files changed, 166 insertions(+), 7 deletions(-)
 create mode 100644 src/symbol/graph_algorithm.h

diff --git a/example/cifar10/cifar10.py b/example/cifar10/cifar10.py
index c428beffafca..81ba9e5591f7 100644
--- a/example/cifar10/cifar10.py
+++ b/example/cifar10/cifar10.py
@@ -163,6 +163,7 @@ def RandomInit(narray):
 
 in_data = mx.nd.empty(data_shape, mx.gpu())
 executor = loss.simple_bind(mx.gpu(), data = in_data)
+print executor.debug_str()
 
 out_narray = executor.outputs[0]
 pred = mx.nd.zeros(out_narray.shape, mx.cpu())

diff --git a/src/engine/stream_manager.h b/src/engine/stream_manager.h
index b9303e86f08c..3c668788c20c 100644
--- a/src/engine/stream_manager.h
+++ b/src/engine/stream_manager.h
@@ -43,8 +43,9 @@ template <std::size_t kNumGpus, std::size_t kStreams>
 RunContext StreamManager<kNumGpus, kStreams>::GetRunContext(
     Context const& ctx) {
   RunContext ret;
+  ret.stream = nullptr;
   switch (ctx.dev_mask) {
-    case cpu::kDevMask: ret.stream = nullptr; break;
+    case cpu::kDevMask: break;
     case gpu::kDevMask: {
 #if MXNET_USE_CUDA
       std::size_t use_counter;

diff --git a/src/symbol/graph_algorithm.h b/src/symbol/graph_algorithm.h
new file mode 100644
index 000000000000..021bc0744981
--- /dev/null
+++ b/src/symbol/graph_algorithm.h
@@ -0,0 +1,116 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file graph_algorithm.h
+ * \brief This header contains graph algorithms on StaticGraph.
+ *  It is used to compute information such as whether two
+ *  operations can run in parallel, and it helps allocation.
+*/
+#ifndef MXNET_SYMBOL_GRAPH_ALGORITHM_H_
+#define MXNET_SYMBOL_GRAPH_ALGORITHM_H_
+
+#include <dmlc/logging.h>
+#include <mxnet/base.h>
+#include <mxnet/symbolic.h>
+#include <vector>
+
+namespace mxnet {
+namespace graph {
+/*!
+ * \brief Find the best path in the DAG, with the reward defined
+ *  as the sum of the rewards of the nodes along the path.
+ * \param graph the original static graph.
+ * \param topo_order topo order of the nodes in the graph.
+ * \param node_reward the reward of each node.
+ * \param path the output path of nodes.
+ * \return the total reward of the best path.
+ */
+inline uint32_t FindBestPath(
+    const StaticGraph &graph,
+    const std::vector<uint32_t> &topo_order,
+    const std::vector<uint32_t> &node_reward,
+    std::vector<uint32_t> *path) {
+  const uint32_t num_nodes = static_cast<uint32_t>(graph.nodes.size());
+  CHECK_EQ(graph.nodes.size(), node_reward.size());
+  CHECK_EQ(graph.nodes.size(), topo_order.size());
+
+  std::vector<uint32_t> best_reward(node_reward.size(), 0);
+  std::vector<uint32_t> next_node(node_reward.size(), num_nodes);
+  uint32_t best_solution = 0, best_start_node = 0;
+
+  // traverse in reverse topo order
+  for (auto it = topo_order.rbegin(); it != topo_order.rend(); ++it) {
+    const uint32_t nid = *it;
+    best_reward[nid] += node_reward[nid];
+    if (best_reward[nid] > best_solution) {
+      best_solution = best_reward[nid];
+      best_start_node = nid;
+    }
+    for (const StaticGraph::DataEntry& e : graph.nodes[nid].inputs) {
+      const uint32_t prev = e.source_id;
+      if (best_reward[nid] > best_reward[prev]) {
+        best_reward[prev] = best_reward[nid];
+        next_node[prev] = nid;
+      }
+    }
+  }
+  path->clear();
+  uint32_t reward = 0;
+  for (uint32_t nid = best_start_node; nid < num_nodes; nid = next_node[nid]) {
+    path->push_back(nid); reward += node_reward[nid];
+  }
+  CHECK_EQ(reward, best_solution);
+  return best_solution;
+}
+
+/*!
+ * \brief Color the nodes in the graph into groups.
+ *  The coloring algorithm tries to assign the nodes to groups
+ *  such that nodes in the same group cannot run in parallel.
+ *
+ * \param graph the original static graph.
+ * \param topo_order topo order of the nodes in the graph.
+ * \param node_importance the importance of each node.
+ * \param max_ncolor maximum number of colors allowed.
+ * \param color the color index of each of the nodes.
+ * \return the total number of colors.
+ */
+inline uint32_t ColorNodeGroup(
+    const StaticGraph &graph,
+    const std::vector<uint32_t> &topo_order,
+    std::vector<uint32_t> node_importance,
+    uint32_t max_ncolor,
+    std::vector<uint32_t> *color) {
+  CHECK_NE(max_ncolor, 0);
+  CHECK_EQ(graph.nodes.size(), topo_order.size());
+  CHECK_EQ(graph.nodes.size(), node_importance.size());
+
+  color->clear();
+  color->resize(topo_order.size(), max_ncolor);
+  uint32_t cindex;
+  // greedy algorithm: each iteration finds the path with the best
+  // reward and assigns it a new color.
+  // All the nodes on the path cannot run in parallel.
+  for (cindex = 0; cindex < max_ncolor - 1; ++cindex) {
+    std::vector<uint32_t> path;
+    uint32_t reward = FindBestPath(graph, topo_order, node_importance, &path);
+    if (reward == 0) break;
+    for (uint32_t nid : path) {
+      if (node_importance[nid] != 0) {
+        CHECK_EQ(color->at(nid), max_ncolor);
+        color->at(nid) = cindex;
+        // make the importance 0 after the color is decided.
+        node_importance[nid] = 0;
+      }
+    }
+  }
+  // assign cindex to the rest of the nodes
+  for (size_t i = 0; i < topo_order.size(); ++i) {
+    if (color->at(i) == max_ncolor) {
+      color->at(i) = cindex;
+    }
+  }
+  return cindex + 1;
+}
+}  // namespace graph
+}  // namespace mxnet
+#endif  // MXNET_SYMBOL_GRAPH_ALGORITHM_H_

diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc
index 82071da15425..943a50f63f6a 100644
--- a/src/symbol/graph_executor.cc
+++ b/src/symbol/graph_executor.cc
@@ -8,6 +8,7 @@
 #include
 #include
 #include "./graph_executor.h"
+#include "./graph_algorithm.h"
 
 namespace mxnet {
 /*!
@@ -452,25 +453,62 @@ void GraphExecutor::InitDataEntryMemory() {
   }
 }
 
+// simple unique context index of context
+inline uint32_t UniqueContextIndex(const Context &ctx) {
+  if (ctx.dev_mask == cpu::kDevMask) return 0;
+  return ctx.dev_id + 1;
+}
+
 void GraphExecutor::InitResources() {
-  // Resource allocation
+  // prepare for temp space allocation
+  std::vector<uint32_t> req_temp_cnt(topo_order_.size(), 0);
   for (size_t i = 0; i < topo_order_.size(); ++i) {
     uint32_t nid = topo_order_[i];
     if (!op_nodes_[nid].activated) continue;
     if (graph_.nodes[nid].is_variable()) continue;
+    uint32_t cnt = 0;
+    for (const ResourceRequest& req : GetResource(nid)) {
+      if (req.type == ResourceRequest::kTempSpace) ++cnt;
+    }
+    CHECK_LE(cnt, 1) << "Node can only have one temp space request";
+    req_temp_cnt[nid] = cnt;
+  }
+  uint32_t num_color = 16;
+  std::vector<uint32_t> req_temp_color;
+  // use graph coloring to find nodes that won't run in parallel
+  num_color = graph::ColorNodeGroup(graph_, topo_order_, req_temp_cnt,
+                                    num_color, &req_temp_color);
+
+  // cached temp space resources
+  std::map<uint32_t, std::map<uint32_t, Resource> > cached_temp;
+  total_allocated_temp_ = 0;
+  // Resource allocation
+  for (size_t i = 0; i < topo_order_.size(); ++i) {
+    uint32_t nid = topo_order_[i];
+    if (!op_nodes_[nid].activated) continue;
+    if (graph_.nodes[nid].is_variable()) continue;
     const std::vector<ResourceRequest>& reqs = GetResource(nid);
     auto& requested = op_nodes_[nid].op_ctx.requested;
     requested.clear();
     // Get the resource of temporal space.
     for (const ResourceRequest& req : reqs) {
+      const Context &ctx = op_nodes_[nid].ctx;
       if (req.type == ResourceRequest::kTempSpace) {
-        // TODO(tqchen, bing) more smarter graph aware temp sapce allocation.
-        requested.push_back(ResourceManager::Get()->Request(
-            op_nodes_[nid].ctx, req));
+        uint32_t color = req_temp_color[nid];
+        uint32_t ctx_id = UniqueContextIndex(ctx);
+        // try to reuse the temp space of the same color
+        std::map<uint32_t, Resource> &cmap = cached_temp[ctx_id];
+        if (cmap.count(color) != 0) {
+          requested.push_back(cmap.at(color));
+        } else {
+          Resource r = ResourceManager::Get()->Request(ctx, req);
+          requested.push_back(r);
+          cmap[color] = r;
+          ++total_allocated_temp_;
+        }
       } else if (req.type == ResourceRequest::kRandom) {
-        requested.push_back(ResourceManager::Get()->Request(
-            op_nodes_[nid].ctx, req));
+        requested.push_back(ResourceManager::Get()->Request(ctx, req));
       } else {
         LOG(FATAL) << "resource type not yet supported";
       }
@@ -561,6 +599,7 @@ void GraphExecutor::Print(std::ostream &os) const {
     }
   }
   os << "Total " << (total_allocated_reals_ >> 18UL) <<" MB allocated\n";
+  os << "Total " << total_allocated_temp_ <<" TempSpace resource requested\n";
 }
 
 void GraphExecutor::Forward(bool is_train) {

diff --git a/src/symbol/graph_executor.h b/src/symbol/graph_executor.h
index 2f32e34dc31f..a7dbb90892ee 100644
--- a/src/symbol/graph_executor.h
+++ b/src/symbol/graph_executor.h
@@ -182,6 +182,8 @@ class GraphExecutor : public Executor {
   bool enable_inplace_allocation_;
   // total allocated space in #reals
   size_t total_allocated_reals_;
+  // total allocated temp space
+  size_t total_allocated_temp_;
   // number of forward nodes in the graph
   size_t num_forward_nodes_;
   // head gradient node in the graph, if there is backward pass
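
Note (editor's illustration, not part of the patch): the allocation strategy above has two pieces. FindBestPath runs a reverse-topological DP that extracts the path with the largest total reward, where every node that requests temp space contributes a reward of 1. ColorNodeGroup then greedily gives each extracted path one color; nodes on a common path are ordered by data dependencies and can never run in parallel, so InitResources can safely hand all nodes of one color the same cached TempSpace resource per device context. The sketch below is a minimal standalone reproduction of that greedy loop, assuming a plain adjacency-list DAG instead of mxnet::StaticGraph; the toy graph, the reward values, the file name color_demo.cc, and max_ncolor = 2 are made up for the example.

    // color_demo.cc: standalone sketch of the greedy path-coloring heuristic.
    // Node ids 0..n-1 are assumed to already be in topological order.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      const uint32_t n = 6;
      // inputs[v] lists the producers (incoming edges) of node v.
      std::vector<std::vector<uint32_t> > inputs = {
          {}, {0}, {0}, {1}, {2}, {3, 4}};
      // reward is 1 for nodes that request temp space, 0 otherwise.
      std::vector<uint32_t> reward = {0, 1, 1, 1, 1, 0};
      const uint32_t max_ncolor = 2;  // made-up color budget

      std::vector<uint32_t> color(n, max_ncolor);
      uint32_t cindex = 0;
      for (; cindex < max_ncolor - 1; ++cindex) {
        // Reverse-topological DP: best[v] is the largest total reward of a
        // path starting at v; next[v] is the successor on that path.
        std::vector<uint32_t> best(n, 0), next(n, n);
        uint32_t best_total = 0, start = 0;
        for (int v = static_cast<int>(n) - 1; v >= 0; --v) {
          best[v] += reward[v];
          if (best[v] > best_total) { best_total = best[v]; start = v; }
          for (uint32_t p : inputs[v]) {
            if (best[v] > best[p]) { best[p] = best[v]; next[p] = v; }
          }
        }
        if (best_total == 0) break;  // no temp-space node left uncolored
        // Color the extracted path and zero its reward so later rounds skip it.
        for (uint32_t v = start; v < n; v = next[v]) {
          if (reward[v] != 0) { color[v] = cindex; reward[v] = 0; }
        }
      }
      // Any node still uncolored shares the last color.
      for (uint32_t v = 0; v < n; ++v) {
        if (color[v] == max_ncolor) color[v] = cindex;
      }
      for (uint32_t v = 0; v < n; ++v) {
        std::printf("node %u -> color %u\n", v, color[v]);
      }
      return 0;
    }

With this toy graph the branch 2 -> 4 ends up with color 0 and the branch 1 -> 3 with color 1, so only two TempSpace resources would be requested per context instead of one for each of the four requesting nodes.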