diff --git a/example/cifar10/cifar10.py b/example/cifar10/cifar10.py
index c428beffafca..81ba9e5591f7 100644
--- a/example/cifar10/cifar10.py
+++ b/example/cifar10/cifar10.py
@@ -163,6 +163,7 @@ def RandomInit(narray):
 
 in_data = mx.nd.empty(data_shape, mx.gpu())
 executor = loss.simple_bind(mx.gpu(), data = in_data)
+print executor.debug_str()
 
 out_narray = executor.outputs[0]
 pred = mx.nd.zeros(out_narray.shape, mx.cpu())
diff --git a/src/engine/stream_manager.h b/src/engine/stream_manager.h
index b9303e86f08c..3c668788c20c 100644
--- a/src/engine/stream_manager.h
+++ b/src/engine/stream_manager.h
@@ -43,8 +43,9 @@ template <std::size_t kNumGpus, std::size_t kStreams>
 RunContext StreamManager<kNumGpus, kStreams>::GetRunContext(
     Context const& ctx) {
   RunContext ret;
+  ret.stream = nullptr;
   switch (ctx.dev_mask) {
-    case cpu::kDevMask: ret.stream = nullptr; break;
+    case cpu::kDevMask: break;
     case gpu::kDevMask: {
 #if MXNET_USE_CUDA
       std::size_t use_counter;
diff --git a/src/symbol/graph_algorithm.h b/src/symbol/graph_algorithm.h
new file mode 100644
index 000000000000..021bc0744981
--- /dev/null
+++ b/src/symbol/graph_algorithm.h
@@ -0,0 +1,116 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file graph_algorithm.h
+ * \brief This header contains graph algorithms on StaticGraph.
+ *  It is used to compute information such as whether two
+ *  operations can run in parallel, which helps allocation.
+*/
+#ifndef MXNET_SYMBOL_GRAPH_ALGORITHM_H_
+#define MXNET_SYMBOL_GRAPH_ALGORITHM_H_
+
+#include <mxnet/base.h>
+#include <dmlc/logging.h>
+#include <mxnet/symbolic.h>
+#include <vector>
+
+namespace mxnet {
+namespace graph {
+/*!
+ * \brief Find the path in the DAG whose nodes have the largest
+ *  total reward (the sum of the reward of each node on the path).
+ * \param graph the original static graph.
+ * \param topo_order topological order of the nodes in the graph.
+ * \param node_reward the reward of each node, indexed by node id.
+ * \param path output: node ids along the best path (cleared first).
+ * \return the total reward of the best path.
+ */
+inline uint32_t FindBestPath(
+    const StaticGraph &graph,
+    const std::vector<uint32_t> &topo_order,
+    const std::vector<uint32_t> &node_reward,
+    std::vector<uint32_t> *path) {
+  const uint32_t end_mark = static_cast<uint32_t>(graph.nodes.size());
+  CHECK_EQ(graph.nodes.size(), node_reward.size());
+  CHECK_EQ(graph.nodes.size(), topo_order.size());
+  // score[i]: best reward seen for node i; succ[i]: next hop, end_mark if none
+  std::vector<uint32_t> score(node_reward.size(), 0);
+  std::vector<uint32_t> succ(node_reward.size(), end_mark);
+  uint32_t best_solution = 0;
+  uint32_t head = 0;
+  // walk topo_order back to front, pushing each total onto the node's inputs
+  for (size_t k = topo_order.size(); k != 0; --k) {
+    const uint32_t id = topo_order[k - 1];
+    const uint32_t total = (score[id] += node_reward[id]);
+    if (total > best_solution) {
+      best_solution = total;
+      head = id;
+    }
+    for (const StaticGraph::DataEntry& in : graph.nodes[id].inputs) {
+      if (score[in.source_id] < total) {
+        score[in.source_id] = total;
+        succ[in.source_id] = id;
+      }
+    }
+  }
+  path->clear();
+  uint32_t reward = 0;
+  for (uint32_t cur = head; cur < end_mark; cur = succ[cur]) {
+    path->push_back(cur);
+    reward += node_reward[cur];
+  }
+  CHECK_EQ(reward, best_solution);
+  return best_solution;
+}
+
+/*!
+ * \brief Assign a color index to every node in the graph.
+ *  The coloring tries to group nodes such that nodes in the
+ *  same group cannot run in parallel.
+ *
+ * \param graph the original static graph.
+ * \param topo_order topological order of the nodes in the graph.
+ * \param node_importance the importance of each node (taken by value).
+ * \param max_ncolor maximum number of colors allowed, must be nonzero.
+ * \param color output: the color index of each node.
+ * \return the total number of colors.
+ */
+inline uint32_t ColorNodeGroup(
+    const StaticGraph &graph,
+    const std::vector<uint32_t> &topo_order,
+    std::vector<uint32_t> node_importance,
+    uint32_t max_ncolor,
+    std::vector<uint32_t> *color) {
+  CHECK_NE(max_ncolor, 0);
+  CHECK_EQ(graph.nodes.size(), topo_order.size());
+  CHECK_EQ(graph.nodes.size(), node_importance.size());
+
+  // max_ncolor doubles as the "not colored yet" marker
+  color->assign(topo_order.size(), max_ncolor);
+  uint32_t next_color = 0;
+  std::vector<uint32_t> chain;
+  // Greedy: repeatedly take the best-reward path and give it a fresh color;
+  // the last color is kept for whatever is left over.
+  // node_importance is a by-value copy, so zeroing entries below is local.
+  while (next_color < max_ncolor - 1) {
+    // zero reward means every node's importance is already zero
+    const uint32_t gain = FindBestPath(graph, topo_order, node_importance, &chain);
+    if (gain == 0) break;
+    for (const uint32_t nid : chain) {
+      if (node_importance[nid] == 0) continue;
+      CHECK_EQ(color->at(nid), max_ncolor);
+      color->at(nid) = next_color;
+      // zero the importance so later paths do not count this node again
+      node_importance[nid] = 0;
+    }
+    ++next_color;
+  }
+  // every node that is still unmarked shares the final color
+  for (uint32_t &c : *color) {
+    if (c == max_ncolor) c = next_color;
+  }
+  return next_color + 1;
+}
+}  // namespace graph
+}  // namespace mxnet
+#endif  // MXNET_SYMBOL_GRAPH_ALGORITHM_H_
diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc
index 82071da15425..943a50f63f6a 100644
--- a/src/symbol/graph_executor.cc
+++ b/src/symbol/graph_executor.cc
@@ -8,6 +8,7 @@
 #include <mxnet/symbolic.h>
 #include <memory>
 #include "./graph_executor.h"
+#include "./graph_algorithm.h"
 
 namespace mxnet {
 /*!
@@ -452,25 +453,62 @@ void GraphExecutor::InitDataEntryMemory() {
   }
 }
 
+// map a context to a small index: 0 for CPU, dev_id + 1 for anything else
+inline uint32_t UniqueContextIndex(const Context &ctx) {
+  const bool on_cpu = (ctx.dev_mask == cpu::kDevMask);
+  return on_cpu ? 0 : ctx.dev_id + 1;
+}
+
 void GraphExecutor::InitResources() {
-  // Resource allocation
+  // prepare for temp space allocation
+  std::vector<uint32_t> req_temp_cnt(topo_order_.size(), 0);
   for (size_t i = 0; i < topo_order_.size(); ++i) {
     uint32_t nid = topo_order_[i];
     if (!op_nodes_[nid].activated) continue;
     if (graph_.nodes[nid].is_variable()) continue;
+    uint32_t cnt = 0;
+    for (const ResourceRequest& req : GetResource(nid)) {
+      if (req.type == ResourceRequest::kTempSpace) ++cnt;
+    }
+    CHECK_LE(cnt, 1) << "Node can only have one temp space request";
+    req_temp_cnt[nid] = cnt;
+  }
+  uint32_t num_color = 16;
+  std::vector<uint32_t> req_temp_color;
+  // use graph coloring to find node that won't run in parallel
+  num_color = graph::ColorNodeGroup(graph_, topo_order_, req_temp_cnt,
+                                    num_color, &req_temp_color);
+
+  // cached resources temp space
+  std::map<uint32_t, std::map<uint32_t, Resource> > cached_temp;
+  total_allocated_temp_ = 0;
 
+  // Resource allocation
+  for (size_t i = 0; i < topo_order_.size(); ++i) {
+    uint32_t nid = topo_order_[i];
+    if (!op_nodes_[nid].activated) continue;
+    if (graph_.nodes[nid].is_variable()) continue;
     const std::vector<ResourceRequest>& reqs = GetResource(nid);
     auto& requested = op_nodes_[nid].op_ctx.requested;
     requested.clear();
     // Get the resource of temporal space.
     for (const ResourceRequest& req : reqs) {
+      const Context &ctx = op_nodes_[nid].ctx;
       if (req.type == ResourceRequest::kTempSpace) {
-        // TODO(tqchen, bing) more smarter graph aware temp sapce allocation.
-        requested.push_back(ResourceManager::Get()->Request(
-            op_nodes_[nid].ctx, req));
+        uint32_t color = req_temp_color[nid];
+        uint32_t ctx_id = UniqueContextIndex(ctx);
+        // try to reuse graph in same color
+        std::map<uint32_t, Resource> &cmap = cached_temp[ctx_id];
+        if (cmap.count(color) != 0) {
+          requested.push_back(cmap.at(color));
+        } else {
+          Resource r = ResourceManager::Get()->Request(ctx, req);
+          requested.push_back(r);
+          cmap[color] = r;
+          ++total_allocated_temp_;
+        }
       } else if (req.type == ResourceRequest::kRandom) {
-        requested.push_back(ResourceManager::Get()->Request(
-            op_nodes_[nid].ctx, req));
+        requested.push_back(ResourceManager::Get()->Request(ctx, req));
       } else {
         LOG(FATAL) << "resource type not yet supported";
       }
@@ -561,6 +599,7 @@ void GraphExecutor::Print(std::ostream &os) const {
     }
   }
   os << "Total " << (total_allocated_reals_ >> 18UL) <<" MB allocated\n";
+  os << "Total " << total_allocated_temp_ <<" TempSpace resource requested\n";
 }
 
 void GraphExecutor::Forward(bool is_train) {
diff --git a/src/symbol/graph_executor.h b/src/symbol/graph_executor.h
index 2f32e34dc31f..a7dbb90892ee 100644
--- a/src/symbol/graph_executor.h
+++ b/src/symbol/graph_executor.h
@@ -182,6 +182,8 @@ class GraphExecutor : public Executor {
   bool enable_inplace_allocation_;
   // total allocated space in #reals
   size_t total_allocated_reals_;
+  // total number of TempSpace resources requested
+  size_t total_allocated_temp_;
   // number of forward nodes in the graph
   size_t num_forward_nodes_;
   // head gradient node in the graph, if there is backward pass