Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Add graph coloring based temp space allocation. #87

Merged
merged 1 commit into from
Sep 17, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions example/cifar10/cifar10.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ def RandomInit(narray):

in_data = mx.nd.empty(data_shape, mx.gpu())
executor = loss.simple_bind(mx.gpu(), data = in_data)
print executor.debug_str()

out_narray = executor.outputs[0]
pred = mx.nd.zeros(out_narray.shape, mx.cpu())
Expand Down
3 changes: 2 additions & 1 deletion src/engine/stream_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,9 @@ template <std::size_t kNumGpus, std::size_t kStreams>
RunContext StreamManager<kNumGpus, kStreams>::GetRunContext(
Context const& ctx) {
RunContext ret;
ret.stream = nullptr;
switch (ctx.dev_mask) {
case cpu::kDevMask: ret.stream = nullptr; break;
case cpu::kDevMask: break;
case gpu::kDevMask: {
#if MXNET_USE_CUDA
std::size_t use_counter;
Expand Down
116 changes: 116 additions & 0 deletions src/symbol/graph_algorithm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/*!
* Copyright (c) 2015 by Contributors
* \file graph_algorithm.h
* \brief This header contains graph algorithms on StaticGraph.
* It is used to compute information such as whether two
* operations can run in parallel, which helps allocation.
*/
#ifndef MXNET_SYMBOL_GRAPH_ALGORITHM_H_
#define MXNET_SYMBOL_GRAPH_ALGORITHM_H_

#include <mxnet/base.h>
#include <dmlc/logging.h>
#include <mxnet/symbolic.h>
#include <vector>

namespace mxnet {
namespace graph {
/*!
 * \brief Search the DAG for the path whose nodes have the largest summed reward.
 *
 * Nodes are processed in reverse topological order, so by the time a node
 * is visited every node that lists it as an input has already pushed its
 * own best score back to it. The winning path is then rebuilt by following
 * the recorded successor links from the best starting node.
 *
 * \param graph the static graph to search.
 * \param topo_order topological order of the node ids of the graph.
 * \param node_reward reward of each node, indexed by node id.
 * \param path output, cleared first and then filled with the node ids of the
 *        best path, from its start node onwards.
 * \return the summed reward of the best path.
 */
inline uint32_t FindBestPath(
    const StaticGraph &graph,
    const std::vector<uint32_t> &topo_order,
    const std::vector<uint32_t> &node_reward,
    std::vector<uint32_t> *path) {
  CHECK_EQ(graph.nodes.size(), node_reward.size());
  CHECK_EQ(graph.nodes.size(), topo_order.size());

  // The node count doubles as the "no successor" sentinel.
  const uint32_t end_marker = static_cast<uint32_t>(graph.nodes.size());
  // score[v]: best reward reachable from v; succ[v]: next node on that path.
  std::vector<uint32_t> score(node_reward.size(), 0);
  std::vector<uint32_t> succ(node_reward.size(), end_marker);
  uint32_t best_solution = 0;
  uint32_t head = 0;

  // Walk the topological order from back to front.
  for (size_t k = topo_order.size(); k != 0; --k) {
    const uint32_t cur = topo_order[k - 1];
    score[cur] += node_reward[cur];
    const uint32_t cur_score = score[cur];
    if (cur_score > best_solution) {
      best_solution = cur_score;
      head = cur;
    }
    // Offer this node's score to each of its inputs; only a strictly
    // better score replaces what the input already has.
    for (const StaticGraph::DataEntry& in : graph.nodes[cur].inputs) {
      const uint32_t src = in.source_id;
      if (score[src] < cur_score) {
        score[src] = cur_score;
        succ[src] = cur;
      }
    }
  }

  // Follow successor links from the best head until the sentinel is hit.
  path->clear();
  uint32_t reward = 0;
  uint32_t walker = head;
  while (walker < end_marker) {
    path->push_back(walker);
    reward += node_reward[walker];
    walker = succ[walker];
  }
  // The rebuilt path must add up to the score found during the sweep.
  CHECK_EQ(reward, best_solution);
  return best_solution;
}

/*!
 * \brief Assign a color index to every node of the graph.
 *  Colors are handed out greedily: each round takes the path with the
 *  highest remaining importance (see FindBestPath) and gives the still
 *  important nodes on it a fresh color. Nodes on one path cannot run in
 *  parallel, so nodes of the same color cannot either.
 *
 * \param graph the static graph to color.
 * \param topo_order topological order of the node ids of the graph.
 * \param node_importance importance of each node, indexed by node id;
 *        taken by value because it is zeroed out as nodes get colored.
 * \param max_ncolor maximum number of colors allowed, must be non-zero.
 * \param color output, resized to one color index per node.
 * \return the total number of colors used.
 */
inline uint32_t ColorNodeGroup(
    const StaticGraph &graph,
    const std::vector<uint32_t> &topo_order,
    std::vector<uint32_t> node_importance,
    uint32_t max_ncolor,
    std::vector<uint32_t> *color) {
  CHECK_NE(max_ncolor, 0);
  CHECK_EQ(graph.nodes.size(), topo_order.size());
  CHECK_EQ(graph.nodes.size(), node_importance.size());

  // max_ncolor itself marks "not colored yet".
  color->assign(topo_order.size(), max_ncolor);

  uint32_t next_color = 0;
  std::vector<uint32_t> chain;
  // The last color is reserved for the leftover nodes, so at most
  // max_ncolor - 1 paths receive a color of their own.
  while (next_color + 1 < max_ncolor) {
    if (FindBestPath(graph, topo_order, node_importance, &chain) == 0) break;
    for (const uint32_t nid : chain) {
      if (node_importance[nid] == 0) continue;
      CHECK_EQ(color->at(nid), max_ncolor);
      color->at(nid) = next_color;
      // A colored node must not attract later paths.
      node_importance[nid] = 0;
    }
    ++next_color;
  }

  // Every node that is still uncolored shares the final color.
  for (uint32_t &slot : *color) {
    if (slot == max_ncolor) slot = next_color;
  }
  return next_color + 1;
}
} // namespace graph
} // namespace mxnet
#endif // MXNET_SYMBOL_GRAPH_ALGORITHM_H_
51 changes: 45 additions & 6 deletions src/symbol/graph_executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <mxnet/symbolic.h>
#include <memory>
#include "./graph_executor.h"
#include "./graph_algorithm.h"

namespace mxnet {
/*!
Expand Down Expand Up @@ -452,25 +453,62 @@ void GraphExecutor::InitDataEntryMemory() {
}
}

// Map a context to a small integer key:
// 0 when the device mask is the CPU mask, dev_id + 1 otherwise.
inline uint32_t UniqueContextIndex(const Context &ctx) {
  const bool on_cpu = (ctx.dev_mask == cpu::kDevMask);
  return on_cpu ? 0 : ctx.dev_id + 1;
}

void GraphExecutor::InitResources() {
// Resource allocation
// prepare for temp space allocation
std::vector<uint32_t> req_temp_cnt(topo_order_.size(), 0);
for (size_t i = 0; i < topo_order_.size(); ++i) {
uint32_t nid = topo_order_[i];
if (!op_nodes_[nid].activated) continue;
if (graph_.nodes[nid].is_variable()) continue;
uint32_t cnt = 0;
for (const ResourceRequest& req : GetResource(nid)) {
if (req.type == ResourceRequest::kTempSpace) ++cnt;
}
CHECK_LE(cnt, 1) << "Node can only have one temp space request";
req_temp_cnt[nid] = cnt;
}
uint32_t num_color = 16;
std::vector<uint32_t> req_temp_color;
// use graph coloring to find node that won't run in parallel
num_color = graph::ColorNodeGroup(graph_, topo_order_, req_temp_cnt,
num_color, &req_temp_color);

// cached resources temp space
std::map<uint32_t, std::map<uint32_t, Resource> > cached_temp;
total_allocated_temp_ = 0;

// Resource allocation
for (size_t i = 0; i < topo_order_.size(); ++i) {
uint32_t nid = topo_order_[i];
if (!op_nodes_[nid].activated) continue;
if (graph_.nodes[nid].is_variable()) continue;
const std::vector<ResourceRequest>& reqs = GetResource(nid);
auto& requested = op_nodes_[nid].op_ctx.requested;
requested.clear();
// Get the resource of temporal space.
for (const ResourceRequest& req : reqs) {
const Context &ctx = op_nodes_[nid].ctx;
if (req.type == ResourceRequest::kTempSpace) {
// TODO(tqchen, bing) smarter graph-aware temp space allocation.
requested.push_back(ResourceManager::Get()->Request(
op_nodes_[nid].ctx, req));
uint32_t color = req_temp_color[nid];
uint32_t ctx_id = UniqueContextIndex(ctx);
// try to reuse the cached temp space resource of the same color
std::map<uint32_t, Resource> &cmap = cached_temp[ctx_id];
if (cmap.count(color) != 0) {
requested.push_back(cmap.at(color));
} else {
Resource r = ResourceManager::Get()->Request(ctx, req);
requested.push_back(r);
cmap[color] = r;
++total_allocated_temp_;
}
} else if (req.type == ResourceRequest::kRandom) {
requested.push_back(ResourceManager::Get()->Request(
op_nodes_[nid].ctx, req));
requested.push_back(ResourceManager::Get()->Request(ctx, req));
} else {
LOG(FATAL) << "resource type not yet supported";
}
Expand Down Expand Up @@ -561,6 +599,7 @@ void GraphExecutor::Print(std::ostream &os) const {
}
}
os << "Total " << (total_allocated_reals_ >> 18UL) <<" MB allocated\n";
os << "Total " << total_allocated_temp_ <<" TempSpace resource requested\n";
}

void GraphExecutor::Forward(bool is_train) {
Expand Down
2 changes: 2 additions & 0 deletions src/symbol/graph_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,8 @@ class GraphExecutor : public Executor {
bool enable_inplace_allocation_;
// total allocated space in #reals
size_t total_allocated_reals_;
// total allocated temp space
size_t total_allocated_temp_;
// number of forward nodes in the graph
size_t num_forward_nodes_;
// head gradient node in the graph, if there is backward pass
Expand Down