Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Commit

Permalink
Merge pull request #87 from tqchen/master
Browse files Browse the repository at this point in the history
Add graph coloring based temp space allocation.
  • Loading branch information
antinucleon committed Sep 17, 2015
2 parents 8c87c35 + f2a96a0 commit 9514226
Show file tree
Hide file tree
Showing 5 changed files with 166 additions and 7 deletions.
1 change: 1 addition & 0 deletions example/cifar10/cifar10.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ def RandomInit(narray):

in_data = mx.nd.empty(data_shape, mx.gpu())
executor = loss.simple_bind(mx.gpu(), data = in_data)
print executor.debug_str()

out_narray = executor.outputs[0]
pred = mx.nd.zeros(out_narray.shape, mx.cpu())
Expand Down
3 changes: 2 additions & 1 deletion src/engine/stream_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,9 @@ template <std::size_t kNumGpus, std::size_t kStreams>
RunContext StreamManager<kNumGpus, kStreams>::GetRunContext(
Context const& ctx) {
RunContext ret;
ret.stream = nullptr;
switch (ctx.dev_mask) {
case cpu::kDevMask: ret.stream = nullptr; break;
case cpu::kDevMask: break;
case gpu::kDevMask: {
#if MXNET_USE_CUDA
std::size_t use_counter;
Expand Down
116 changes: 116 additions & 0 deletions src/symbol/graph_algorithm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/*!
* Copyright (c) 2015 by Contributors
* \file graph_algorithm.h
* \brief This header contains graph algorithms on StaticGraph.
* It is used to compute information such as whether two
* operations can run in parallel, which helps allocation.
*/
#ifndef MXNET_SYMBOL_GRAPH_ALGORITHM_H_
#define MXNET_SYMBOL_GRAPH_ALGORITHM_H_

#include <mxnet/base.h>
#include <dmlc/logging.h>
#include <mxnet/symbolic.h>
#include <vector>

namespace mxnet {
namespace graph {
/*!
 * \brief Search the DAG for the path whose nodes carry the largest
 *  summed reward.
 *
 *  Nodes are visited from the back of topo_order to the front. For each
 *  node the best reward of any path starting at it is accumulated and then
 *  offered to every one of its inputs; an input adopts the offer only when
 *  it is strictly better than what it already holds.
 *
 *  If no node has a positive reward and the graph is non-empty, the output
 *  path contains only node 0 and the returned reward is 0.
 *
 * \param graph the original static graph.
 * \param topo_order topo order of the nodes in the graph; must have one
 *  entry per graph node.
 * \param node_reward the reward of each node, indexed by node id; must have
 *  one entry per graph node.
 * \param path output, overwritten with the node ids along the best path,
 *  listed from the starting node onwards.
 * \return the total reward of the best path.
 */
inline uint32_t FindBestPath(
    const StaticGraph &graph,
    const std::vector<uint32_t> &topo_order,
    const std::vector<uint32_t> &node_reward,
    std::vector<uint32_t> *path) {
  CHECK_EQ(graph.nodes.size(), node_reward.size());
  CHECK_EQ(graph.nodes.size(), topo_order.size());
  // The node count is also used as the "no successor" marker below.
  const uint32_t sentinel = static_cast<uint32_t>(graph.nodes.size());

  // acc[v]  : best reward collected by a path that starts at v.
  // succ[v] : node following v on that path, or sentinel when it ends at v.
  std::vector<uint32_t> acc(node_reward.size(), 0);
  std::vector<uint32_t> succ(node_reward.size(), sentinel);
  uint32_t top_reward = 0;
  uint32_t top_head = 0;

  // Walk topo_order back to front.
  for (size_t k = topo_order.size(); k != 0; --k) {
    const uint32_t cur = topo_order[k - 1];
    acc[cur] += node_reward[cur];
    const uint32_t cur_total = acc[cur];
    if (top_reward < cur_total) {
      top_reward = cur_total;
      top_head = cur;
    }
    // Offer the accumulated reward to each input; strict comparison keeps
    // the first best successor that was found.
    for (const StaticGraph::DataEntry& in : graph.nodes[cur].inputs) {
      const uint32_t src = in.source_id;
      if (acc[src] < cur_total) {
        acc[src] = cur_total;
        succ[src] = cur;
      }
    }
  }

  // Follow the successor links from the best head to rebuild the path.
  path->clear();
  uint32_t collected = 0;
  uint32_t cur = top_head;
  while (cur < sentinel) {
    path->push_back(cur);
    collected += node_reward[cur];
    cur = succ[cur];
  }
  // The rebuilt path must account for exactly the reward that was found.
  CHECK_EQ(collected, top_reward);
  return top_reward;
}

/*!
 * \brief Assign a color index to every node of the graph.
 *  The coloring tries to put nodes that cannot run in parallel
 *  into the same group.
 *
 *  Greedy scheme: repeatedly take the path with the best remaining
 *  importance (see FindBestPath), give its still-important nodes a fresh
 *  color and zero their importance. This stops when no importance is left
 *  or when max_ncolor - 1 colors have been handed out. Every node left
 *  uncolored then shares one final color.
 *
 * \param graph the original static graph.
 * \param topo_order topo order of the nodes in the graph; must have one
 *  entry per graph node.
 * \param node_importance The importance of each node, indexed by node id.
 *  Taken by value because it is consumed while coloring.
 * \param max_ncolor maximum number of colors allowed; must not be 0.
 * \param color output, overwritten with the color index of each node.
 * \return the total number of colors.
 */
inline uint32_t ColorNodeGroup(
    const StaticGraph &graph,
    const std::vector<uint32_t> &topo_order,
    std::vector<uint32_t> node_importance,
    uint32_t max_ncolor,
    std::vector<uint32_t> *color) {
  CHECK_NE(max_ncolor, 0);
  CHECK_EQ(graph.nodes.size(), topo_order.size());
  CHECK_EQ(graph.nodes.size(), node_importance.size());

  // max_ncolor is never a valid color, so it marks "not colored yet".
  color->assign(topo_order.size(), max_ncolor);

  uint32_t next_color = 0;
  std::vector<uint32_t> chain;
  // Each round colors one best path; all nodes on a path cannot run in
  // parallel. The last color is kept back for the leftover nodes.
  while (next_color < max_ncolor - 1) {
    if (FindBestPath(graph, topo_order, node_importance, &chain) == 0) break;
    for (const uint32_t nid : chain) {
      if (node_importance[nid] == 0) continue;
      CHECK_EQ(color->at(nid), max_ncolor);
      color->at(nid) = next_color;
      // Importance drops to 0 once the color is decided, so later rounds
      // no longer count this node.
      node_importance[nid] = 0;
    }
    ++next_color;
  }

  // Everything still unmarked shares the remaining color.
  for (uint32_t &assigned : *color) {
    if (assigned == max_ncolor) {
      assigned = next_color;
    }
  }
  return next_color + 1;
}
} // namespace graph
} // namespace mxnet
#endif // MXNET_SYMBOL_GRAPH_ALGORITHM_H_
51 changes: 45 additions & 6 deletions src/symbol/graph_executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <mxnet/symbolic.h>
#include <memory>
#include "./graph_executor.h"
#include "./graph_algorithm.h"

namespace mxnet {
/*!
Expand Down Expand Up @@ -452,25 +453,62 @@ void GraphExecutor::InitDataEntryMemory() {
}
}

// Map a context to a simple unique index:
// index 0 when the device mask is the cpu mask, dev_id + 1 otherwise.
inline uint32_t UniqueContextIndex(const Context &ctx) {
  const bool on_cpu = (ctx.dev_mask == cpu::kDevMask);
  if (!on_cpu) {
    return ctx.dev_id + 1;
  }
  return 0;
}

void GraphExecutor::InitResources() {
// Resource allocation
// prepare for temp space allocation
std::vector<uint32_t> req_temp_cnt(topo_order_.size(), 0);
for (size_t i = 0; i < topo_order_.size(); ++i) {
uint32_t nid = topo_order_[i];
if (!op_nodes_[nid].activated) continue;
if (graph_.nodes[nid].is_variable()) continue;
uint32_t cnt = 0;
for (const ResourceRequest& req : GetResource(nid)) {
if (req.type == ResourceRequest::kTempSpace) ++cnt;
}
CHECK_LE(cnt, 1) << "Node can only have one temp space request";
req_temp_cnt[nid] = cnt;
}
uint32_t num_color = 16;
std::vector<uint32_t> req_temp_color;
// use graph coloring to find node that won't run in parallel
num_color = graph::ColorNodeGroup(graph_, topo_order_, req_temp_cnt,
num_color, &req_temp_color);

// cached resources temp space
std::map<uint32_t, std::map<uint32_t, Resource> > cached_temp;
total_allocated_temp_ = 0;

// Resource allocation
for (size_t i = 0; i < topo_order_.size(); ++i) {
uint32_t nid = topo_order_[i];
if (!op_nodes_[nid].activated) continue;
if (graph_.nodes[nid].is_variable()) continue;
const std::vector<ResourceRequest>& reqs = GetResource(nid);
auto& requested = op_nodes_[nid].op_ctx.requested;
requested.clear();
// Get the resource of temporal space.
for (const ResourceRequest& req : reqs) {
const Context &ctx = op_nodes_[nid].ctx;
if (req.type == ResourceRequest::kTempSpace) {
// TODO(tqchen, bing) more smarter graph aware temp sapce allocation.
requested.push_back(ResourceManager::Get()->Request(
op_nodes_[nid].ctx, req));
uint32_t color = req_temp_color[nid];
uint32_t ctx_id = UniqueContextIndex(ctx);
// try to reuse graph in same color
std::map<uint32_t, Resource> &cmap = cached_temp[ctx_id];
if (cmap.count(color) != 0) {
requested.push_back(cmap.at(color));
} else {
Resource r = ResourceManager::Get()->Request(ctx, req);
requested.push_back(r);
cmap[color] = r;
++total_allocated_temp_;
}
} else if (req.type == ResourceRequest::kRandom) {
requested.push_back(ResourceManager::Get()->Request(
op_nodes_[nid].ctx, req));
requested.push_back(ResourceManager::Get()->Request(ctx, req));
} else {
LOG(FATAL) << "resource type not yet supported";
}
Expand Down Expand Up @@ -561,6 +599,7 @@ void GraphExecutor::Print(std::ostream &os) const {
}
}
os << "Total " << (total_allocated_reals_ >> 18UL) <<" MB allocated\n";
os << "Total " << total_allocated_temp_ <<" TempSpace resource requested\n";
}

void GraphExecutor::Forward(bool is_train) {
Expand Down
2 changes: 2 additions & 0 deletions src/symbol/graph_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,8 @@ class GraphExecutor : public Executor {
bool enable_inplace_allocation_;
// total allocated space in #reals
size_t total_allocated_reals_;
// total allocated temp space
size_t total_allocated_temp_;
// number of forward nodes in the graph
size_t num_forward_nodes_;
// head gradient node in the graph, if there is backward pass
Expand Down

0 comments on commit 9514226

Please sign in to comment.