From fcb45cbb2038364735ca5a27a4dae4e3604016bc Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sat, 15 Aug 2015 16:20:44 -0600 Subject: [PATCH 01/11] Add activation op --- Makefile | 7 +- src/operator/activation-inl.h | 141 ++++++++++++++++++++++++++++++++++ src/operator/activation.cc | 29 +++++++ src/operator/activation.cu | 21 +++++ 4 files changed, 195 insertions(+), 3 deletions(-) create mode 100644 src/operator/activation-inl.h create mode 100644 src/operator/activation.cc create mode 100644 src/operator/activation.cu diff --git a/Makefile b/Makefile index 581674c784a2..da029f77ef27 100644 --- a/Makefile +++ b/Makefile @@ -58,14 +58,14 @@ endif BIN = test/api_registry_test OBJ = storage.o narray_op_cpu.o # add threaded engine after it is done -OBJCXX11 = engine.o narray.o c_api.o registry.o symbol.o fully_connected_cpu.o static_graph.o +OBJCXX11 = engine.o narray.o c_api.o registry.o symbol.o fully_connected_cpu.o static_graph.o activation_cpu.o CUOBJ = SLIB = lib/libmxnet.so ALIB = lib/libmxnet.a LIB_DEP = $(DMLC_CORE)/libdmlc.a ifeq ($(USE_CUDA), 1) - CUOBJ += narray_op_gpu.o fully_connected_gpu.o + CUOBJ += narray_op_gpu.o fully_connected_gpu.o activation_gpu.o endif .PHONY: clean all test lint doc @@ -87,7 +87,8 @@ c_api.o: src/c_api.cc operator.o: src/operator/static_operator_wrapper.cc fully_connected_cpu.o: src/operator/fully_connected.cc fully_connected_gpu.o: src/operator/fully_connected.cu - +activation_cpu.o: src/operator/activation.cc +activation_gpu.o: src/operator/activation.cu lib/libmxnet.a: $(OBJ) $(OBJCXX11) $(CUOBJ) lib/libmxnet.so: $(OBJ) $(OBJCXX11) $(CUOBJ) diff --git a/src/operator/activation-inl.h b/src/operator/activation-inl.h new file mode 100644 index 000000000000..221e69ce948f --- /dev/null +++ b/src/operator/activation-inl.h @@ -0,0 +1,141 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file activation-inl.h + * \brief + * \author Bing Xu +*/ +#ifndef MXNET_OPERATOR_ACTIVATION_INL_H_ +#define MXNET_OPERATOR_ACTIVATION_INL_H_ +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" + +namespace mxnet { +namespace op { +// Declare enumeration of input order to make code more intuitive. +// // These enums are only visible within this header +enum ActivationOpInputs {kData}; +enum ActivationOpOutputs {kOut}; +enum ActivationOpType {kReLU}; +/** + * \brief This is the implementation of activation operator. + * \tparam xpu The device that the op will be executed on. 
+ */ +template +class ActivationOp : public Operator { + public: + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(req[kOut], kWriteTo); + CHECK_EQ(in_data.size(), 1); + CHECK_EQ(out_data.size(), 1); + Stream *s = static_cast *>(ctx.stream); + Tensor data = in_data[kData].FlatTo2D(s); + Tensor out = out_data[kOut].FlatTo2D(s); + out = F(data); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1); + CHECK(in_data.size() == 1 && in_grad.size() == 1); + CHECK_EQ(req.size(), 1); + Stream *s = static_cast *>(ctx.stream); + Tensor out_gradient = out_grad[kData].FlatTo2D(s); + Tensor data = in_data[kData].FlatTo2D(s); + Tensor grad = out_grad[kOut].FlatTo2D(s); + Assign(grad, req[kData], F(out_gradient * F(data))); + } +}; // class ActivationOp + +// Decalre Factory function, used for dispatch specialization +template +Operator* CreateActivationOp(ActivationOpType type); + +#if DMLC_USE_CXX11 +class ActivationProp : public OperatorProperty { + public: + virtual void SetParam(const char *name, const char *val) { + if (!strcmp(name, "type")) { + if (!strcmp(val, "relu")) { + type_ = kReLU; + } + } + // TODO(bing): check optype valid + } + virtual bool InferShape(std::vector *in_shape, + std::vector *out_shape) const { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; + const TShape &dshape = in_shape->at(0); + out_shape->clear(); + out_shape->push_back(dshape); + return true; + } + + virtual OperatorProperty* Copy() const { + return new ActivationProp(); + } + + virtual std::string TypeString() const { + switch (type_) { + case kReLU: return "Activation : ReLU"; + default: return "Invalid Activation"; + } + } + + // decalre dependency and inplace optimization options + virtual std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const { + return {out_grad[kOut], in_data[kData]}; + } + + virtual std::vector > BackwardInplaceOption( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &in_grad) const { + return {}; + } + + Operator* CreateOperator(Context ctx) const; + + private: + ActivationOpType type_; +}; +#endif // DMLC_USE_CXX11 + +namespace act { +/*! \brief Rectified Linear Operation */ +struct relu { + MSHADOW_XINLINE static real_t Map(real_t a) { + return a > 0.0f ? a : 0.0f; + } +}; +struct relu_grad { + MSHADOW_XINLINE static real_t Map(real_t a) { + return a > 0.0f ? 1.0f : 0.0f; + } +}; + +} // namespace act +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_ACTIVATION_INL_H_ + diff --git a/src/operator/activation.cc b/src/operator/activation.cc new file mode 100644 index 000000000000..b26c1e24dc53 --- /dev/null +++ b/src/operator/activation.cc @@ -0,0 +1,29 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file activation.cc + * \brief + * \author Bing Xu +*/ + +#include +#include "./activation-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator *CreateActivationOp(ActivationOpType type) { + switch (type) { + case kReLU: return new ActivationOp(); + default: return NULL; + } +} + +// DO_BIND_DISPATCH comes from operator_common.h +Operator *ActivationProp::CreateOperator(Context ctx) const { + DO_BIND_DISPATCH(CreateActivationOp, type_); +} + +REGISTER_OP_PROPERTY(Activation, ActivationProp); +} // namespace op +} // namespace mxnet + diff --git a/src/operator/activation.cu b/src/operator/activation.cu new file mode 100644 index 000000000000..b6a523c003ec --- /dev/null +++ b/src/operator/activation.cu @@ -0,0 +1,21 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file activation.cu + * \brief + * \author Bing Xu +*/ + +#include "./activation-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator *CreateActivationOp(ActivationOpType type) { + switch(type) { + case kReLU: return new ActivationOp(); + default: return NULL; + } +} +} // op +} // namespace mxnet + From ee2d7f1951c82804120984bf3df2a4dfd4eb5eb5 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sat, 15 Aug 2015 19:44:52 -0600 Subject: [PATCH 02/11] infer shape --- include/mxnet/c_api.h | 58 ++++---- include/mxnet/context.h | 2 + include/mxnet/operator.h | 30 +++-- include/mxnet/symbolic.h | 68 ++++++++-- python/mxnet/narray.py | 2 +- python/mxnet/symbol.py | 75 ++++++++++- python/mxnet/symbol_creator.py | 2 +- python/test_infer_shape.py | 19 +++ src/c_api.cc | 209 +++++++++++++++++++---------- src/operator/activation-inl.h | 4 +- src/operator/fully_connected-inl.h | 27 ++-- src/operator/operator_common.h | 39 ++++-- src/operator/param.h | 12 +- src/symbol/static_graph.cc | 92 +++++++++++-- src/symbol/symbol.cc | 96 ++++++++----- 15 files changed, 541 insertions(+), 194 deletions(-) create mode 100644 python/test_infer_shape.py diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index a9a15c4a8007..fe035b21bc7f 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -49,10 +49,9 @@ typedef void *DataIterHandle; * \return error info */ MXNET_DLL const char *MXGetLastError(); - -//-------------------------------- +//------------------------------------- // Part 1: NArray creation and deletion -//-------------------------------- +//------------------------------------- /*! * \brief create a NArray handle that is not initialized * can be used to pass in as mutate variables @@ -189,7 +188,6 @@ MXNET_DLL int MXFuncDescribe(FunctionHandle fun, mx_uint *num_scalars, mx_uint *num_mutate_vars, int *type_mask); - /*! * \brief invoke a function, the array size of passed in arguments * must match the values in the @@ -301,8 +299,8 @@ MXNET_DLL int MXSymbolListArguments(SymbolHandle symbol, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXSymbolListReturns(SymbolHandle symbol, - mx_uint *out_size, - const char ***out_str_array); + mx_uint *out_size, + const char ***out_str_array); /*! * \brief Compose the symbol on other symbols. * @@ -322,6 +320,36 @@ MXNET_DLL int MXSymbolCompose(SymbolHandle sym, mx_uint num_args, const char** keys, SymbolHandle* args); +/*! + * \brief infer shape of unknown input shapes given the known one. + * The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data + * The call will be treated as a kwargs call if key != nullptr or num_args==0, otherwise it is positional. 
+ * + * \param num_args numbe of input arguments. + * \param keys the key of keyword args (optional) + * \param arg_ind_ptr the head pointer of the rows in CSR + * \param arg_shape_data the content of the CSR + * \param in_shape_size sizeof the returning array of in_shapes + * \param in_shape_ndim returning array of shape dimensions of eachs input shape. + * \param in_shape_data returning array of pointers to head of the input shape. + * \param out_shape_size sizeof the returning array of out_shapes + * \param out_shape_ndim returning array of shape dimensions of eachs input shape. + * \param out_shape_data returning array of pointers to head of the input shape. + * \param complete whether infer shape completes or more information is needed. + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXSymbolInferShape(SymbolHandle sym, + mx_uint num_args, + const char** keys, + const mx_uint *arg_ind_ptr, + const mx_uint *arg_shape_data, + mx_uint *in_shape_size, + const mx_uint **in_shape_ndim, + const mx_uint ***in_shape_data, + mx_uint *out_shape_size, + const mx_uint **out_shape_ndim, + const mx_uint ***out_shape_data, + int *complete); //-------------------------------------------- // Part 4: operator interface on NArray //-------------------------------------------- @@ -352,24 +380,6 @@ MXNET_DLL int MXOpFree(OperatorHandle op); */ MXNET_DLL int MXOpDescribeArgs(mx_uint *out_size, int **out_array); -/*! - * \brief infer shape of unknown input shapes given the known one - * this function do not return the shape of output - * the shapes are packed into a CSR matrix represened by ind_ptr and shape_array - * - * When the function returns, it return a new CSR matrix by updating ind_ptr, - * and return the content in the return value - * - * \param ind_ptr the head pointer of the rows in CSR - * \param shape_array the content of the CSR - * \param out_nout number of output arguments of this operation - * \param out_array another content of CSR with infered shape - * \return 0 when success, -1 when failure happens - */ -MXNET_DLL int MXOpInferShape(mx_uint *ind_ptr, - mx_uint *shape_array, - mx_uint *out_nout, - mx_uint *out_array); /*! * \brief call forward on the operator * \param op the operator handle diff --git a/include/mxnet/context.h b/include/mxnet/context.h index 262ba2e787d4..8dfa618ca180 100644 --- a/include/mxnet/context.h +++ b/include/mxnet/context.h @@ -6,6 +6,8 @@ #ifndef MXNET_CONTEXT_H_ #define MXNET_CONTEXT_H_ +#include "./base.h" + namespace mxnet { /*! \brief Context information about the execution enviroment */ diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h index 0299ef2bf167..65d6e3e92637 100644 --- a/include/mxnet/operator.h +++ b/include/mxnet/operator.h @@ -40,16 +40,18 @@ enum OpReqType { struct OpContext { /*! \brief whether it is training phase */ int is_train; - /*! \brief Stream we are running on */ - void *stream; + /*! \brief RunContext related resources */ + RunContext run_ctx; /*! \brief Resources requested by the operator */ std::vector requested; /*! - * \brief set the RunContext related parts - * \param ctx the context + * \brief get mshadow stream from Context + * \return the mshadow stream + * \tparam xpu the device type of the stream */ - inline void SetRunContext(const RunContext &ctx) { - stream = ctx.stream; + template + inline mshadow::Stream* get_stream() const { + return static_cast*>(run_ctx.stream); } }; @@ -84,13 +86,22 @@ class Operator { const std::vector &out_data) = 0; /*! 
* \brief Perform a Backward Operation, write gradient to the in_grad. + * + * Convention: + * out_grad.size() == OperatorProperty.NumVisibleReturns() + * out_data.size() == OperatorProperty.NumReturns() + * out_data can contain additional invisible returns that remembers the + * state carried from the Forward pass. For example mask in the dropout. + * + * The gradients are passed from visible returns in this function. + * * \param ctx runtime context available to this call - * \param out_grad the gradient value we get from output of the Operator + * \param out_grad the gradient value we get from of the Operator. * \param in_data the array of input data. * \param out_data the array of output data. * \param req request types of the saving operation, can be all types. * \param in_grad the array of gradient we need to write to. - * \sa OpReqType, OpContext + * \sa OpReqType, OpContext, OperatorProperty */ virtual void Backward(const OpContext &ctx, const std::vector &out_grad, @@ -166,7 +177,8 @@ class OperatorProperty { * * \param out_shape the shape of outputs of the operator * InferShape will modify the vector to fill output TShape - * \return if the shape inference is successful, return true, else return false. + * \return true if the shape inference is successful, false if there is not enough information. + * \throws dmlc::Error if the known arg_shapes are inconsistent. */ virtual bool InferShape(std::vector *in_shape, std::vector *out_shape) const = 0; diff --git a/include/mxnet/symbolic.h b/include/mxnet/symbolic.h index dc00f5a33fb6..e04f82b4f30f 100644 --- a/include/mxnet/symbolic.h +++ b/include/mxnet/symbolic.h @@ -38,7 +38,18 @@ class StaticGraph { /*! \brief index of output from the source. */ uint32_t index; }; - /*! \brief Operation Node in static graph */ + /*! + * \brief Operation Node in static graphs. + * There are two types of node, Forward and Backward Node. + * + * - Forward node corresponds to the op.Forward + * - Backward node corresponds to the Backward pass, + * where the corresponding forward node is indicated by backward_source_id. + * The op field in Backward node is nullptr + * + * The reason we explicit support Backward node is to allow special treatment + * such as shape inference and state sharing with Forward pass. + */ struct Node { /*! \brief wrapped operator property */ std::unique_ptr op; @@ -46,13 +57,36 @@ class StaticGraph { std::string name; /*! \brief inputs (node_id, index) for of the nodes*/ std::vector inputs; + /*! + * \brief If this field is nonnegative, this indicates this + * Node is corresponds to a Backward Operation of Operator. + * backward_source_id will points to the corresponding Forward Node. + * + * For normal node, this field is -1. + * When the node is a Backward node, the op field will be nullptr + */ + int32_t backward_source_id; + /*! \brief default constructor */ + Node() : backward_source_id(-1) {} + /*! \return whether the node is forward op node */ + inline bool is_forward() const { + return op != nullptr; + } + /*! \return whether the node is backward op node */ + inline bool is_backward() const { + return backward_source_id != -1; + } + /*! \return whether the node is variable node */ + inline bool is_variable() const { + return op == nullptr && !is_backward(); + } }; /*! \brief all nodes in the graph */ std::vector nodes; - /*! \brief index is nodes that correspods to arguments */ + /*! \brief index of nodes that correspods to arguments */ std::vector arg_nodes; - /*! 
\brief outputs(heads) of the graph */ - std::vector outputs; + /*! \brief heads outputs of the graph */ + std::vector heads; // funtions to help inference in static graph /*! * \brief Perform a topological sort on the graph @@ -85,8 +119,8 @@ class StaticGraph { * InferShape will modify the vector to fill output TShape * \return if the shape inference is successful, return true, else return false. */ - bool InferShape(std::vector *in_shape, - std::vector *out_shape) const; + bool InferShape(std::vector* in_shape, + std::vector* out_shape) const; }; /*! @@ -174,7 +208,7 @@ class Symbol { const std::string& name) const; /*! * \brief infer the shapes of outputs and unknown input arguments - * \param in_shape the shape of input arguments of the operator + * \param arg_shapes the shape of input arguments of the operator * this should be of same length as the vector returned by ListArguments * in_shape allows unknown elements, which are checked by shape.ndim() == 0. * For unknown shapes, InferShape will try to fill in the correct Shape in in_shape @@ -182,11 +216,23 @@ class Symbol { * * common practice: set the shape of data input, and usually weight's shape can be infered * - * \param out_shape the shape of outputs of the operator - * InferShape will modify the vector to fill output TShape - * \return if the shape inference is successful, return true, else return false. + * \param out_shapes Use to store the infered shapes of outputs. + * \return true if the shape inference is successful, false if there is not enough information. + * \throws dmlc::Error if the known arg_shapes are inconsistent. + */ + bool InferShape(std::vector *arg_shapes, + std::vector *out_shapes) const; + /*! + * \brief infer the shapes by providing shapes of known arguments. + * \param known_arg_shapes map of argument name to shape of arguments with known shapes. + * \param arg_shapes used to store infered shapes of arguments. + * \param out_shapes used to store infered shapes of outputs. + * \return true if the shape inference is successful, false if there is not enough information. + * \throws dmlc::Error if the known arg_shapes are inconsistent. */ - bool InferShape(std::vector *in_shape, std::vector *out_shape) const; + bool InferShape(const std::unordered_map &known_arg_shapes, + std::vector *arg_shapes, + std::vector *out_shapes) const; /*! * \brief get number of outputs of this symbol * \return number of outputs diff --git a/python/mxnet/narray.py b/python/mxnet/narray.py index 26a2198bd765..61839ecc0a60 100644 --- a/python/mxnet/narray.py +++ b/python/mxnet/narray.py @@ -134,7 +134,7 @@ def shape(self): pdata = ctypes.POINTER(mx_uint)() check_call(_LIB.MXNArrayGetShape( self.handle, ctypes.byref(ndim), ctypes.byref(pdata))) - return tuple(pdata[i] for i in range(ndim.value)) + return tuple(pdata[:ndim.value]) @property def context(self): diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py index 031b18ab862f..0caa4b6a0a90 100644 --- a/python/mxnet/symbol.py +++ b/python/mxnet/symbol.py @@ -5,7 +5,7 @@ import ctypes from .base import _LIB -from .base import c_array, c_str +from .base import c_array, c_str, mx_uint from .base import SymbolHandle from .base import check_call @@ -136,6 +136,77 @@ def list_returns(self): self.handle, ctypes.byref(size), ctypes.byref(sarr))) return [sarr[i] for i in range(size.value)] + def infer_shape(self, *args, **kwargs): + """Infer the shape of outputs and arguments of given known shapes of arguments. 
+ + User can either pass in the known shapes in positional way or keyword argument way. + Pair of Nones is returned if there is not enough information passed in. + An error will be raised if there is inconsistency found in the known shapes passed in. + + Parameters + ---------- + *args : + Provide shape of arguments in a positional way. + Unknown shape can be marked as None + + **kwargs : + Provide keyword arguments of known shapes. + + Returns + ------- + arg_shapes : list of tuple or None + List of shapes of arguments. + The order is in the same order as list_arguments() + out_shapes : list of tuple or None + List of shapes of outputs. + The order is in the same order as list_returns() + """ + if len(args) != 0 and len(kwargs) != 0: + raise ValueError('Can only specify known argument shapes either by positional or kwargs way.') + sdata = [] + indptr = [0] + if len(args) != 0: + keys = None + for s in args: + if s is not None: + if not isinstance(s, tuple): + raise TypeError('Argument need to be shapes(tuple)') + sdata.extend(s) + indptr.append(len(sdata)) + else: + keys = [] + for k, v in kwargs.items(): + keys.append(c_str(k)) + if not isinstance(v, tuple): + raise TypeError('Argument need to be shapes(tuple)') + sdata.extend(v) + indptr.append(len(sdata)) + arg_shape_size = mx_uint() + arg_shape_ndim = ctypes.POINTER(mx_uint)() + arg_shape_data = ctypes.POINTER(ctypes.POINTER(mx_uint))() + out_shape_size = mx_uint() + out_shape_ndim = ctypes.POINTER(mx_uint)() + out_shape_data = ctypes.POINTER(ctypes.POINTER(mx_uint))() + complete = ctypes.c_int() + check_call(_LIB.MXSymbolInferShape( + self.handle, len(indptr) - 1, + c_array(ctypes.c_char_p, keys), + c_array(mx_uint, indptr), + c_array(mx_uint, sdata), + ctypes.byref(arg_shape_size), + ctypes.byref(arg_shape_ndim), + ctypes.byref(arg_shape_data), + ctypes.byref(out_shape_size), + ctypes.byref(out_shape_ndim), + ctypes.byref(out_shape_data), + ctypes.byref(complete))) + if complete.value != 0: + arg_shapes = [tuple(arg_shape_data[i][:arg_shape_ndim[i]]) for i in range(arg_shape_size.value)] + out_shapes = [tuple(out_shape_data[i][:out_shape_ndim[i]]) for i in range(out_shape_size.value)] + return (arg_shapes, out_shapes) + else: + return (None, None) + def debug_str(self): """Get a debug string. @@ -145,6 +216,6 @@ def debug_str(self): Debug string of the symbol. 
""" debug_str = ctypes.c_char_p() - check_call(_LIB.MXSymbolPrint( \ + check_call(_LIB.MXSymbolPrint( self.handle, ctypes.byref(debug_str))) return debug_str.value diff --git a/python/mxnet/symbol_creator.py b/python/mxnet/symbol_creator.py index c81deebaef11..d507a9c2871a 100644 --- a/python/mxnet/symbol_creator.py +++ b/python/mxnet/symbol_creator.py @@ -54,7 +54,7 @@ def __call__(self, *args, **kwargs): if isinstance(v, Symbol): symbol_kwargs[k] = v else: - param_keys.append(k) + param_keys.append(c_str(k)) param_vals.append(c_str(str(v))) # create atomic symbol diff --git a/python/test_infer_shape.py b/python/test_infer_shape.py new file mode 100644 index 000000000000..b94388e5546d --- /dev/null +++ b/python/test_infer_shape.py @@ -0,0 +1,19 @@ +# pylint: skip-file +import mxnet as mx + +data = mx.sym.Variable('data') + +fc1 = mx.sym.FullyConnected(data=data, name='fc1', num_hidden=1000) +fc2 = mx.sym.FullyConnected(data=fc1, name='fc2', num_hidden=10) +fc3 = mx.sym.FullyConnected( name='fc2', num_hidden=10) + +print fc2.list_arguments() + +data_shape = (100, 100) +arg_shapes, out_shapes = fc2.infer_shape(data=data_shape) +print dict(zip(fc2.list_arguments(), arg_shapes)) +print dict(zip(fc2.list_returns(), out_shapes)) + +weight_shape= (1, 100) +data_shape = (100, 100) +arg_shapes, out_shapes = fc2.infer_shape(data=data_shape, fc1_weight=weight_shape) diff --git a/src/c_api.cc b/src/c_api.cc index d5a1a67d70c6..896e0b5e5532 100644 --- a/src/c_api.cc +++ b/src/c_api.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -27,61 +28,76 @@ #message("Warning: Threadlocal is not enabled"); #endif -/*! \brief symbol wrapper to easily hold returning information */ -struct MXAPISymbolWrapper { - /*! \brief the actual symbol */ - mxnet::Symbol sym; +using namespace mxnet; + +/*! \brief entry to to easily hold returning information */ +struct MXAPIThreadLocalEntry { + /*! \brief holds last error message */ + std::string last_error; /*! \brief result holder for returning string */ std::string ret_str; /*! \brief result holder for returning strings */ std::vector ret_vec_str; /*! \brief result holder for returning string pointers */ std::vector ret_vec_charp; + /*! \brief result holder for returning shapes */ + std::vector arg_shapes, out_shapes; + /*! \brief result holder for returning shape dimensions */ + std::vector arg_shape_ndim, out_shape_ndim; + /*! \brief result holder for returning shape pointer */ + std::vector arg_shape_data, out_shape_data; + // helper function to setup return value of shape array + inline static void SetupShapeArrayReturn( + const std::vector &shapes, + std::vector *ndim, + std::vector *data) { + ndim->resize(shapes.size()); + data->resize(shapes.size()); + for (size_t i = 0; i < shapes.size(); ++i) { + ndim->at(i) = shapes[i].ndim(); + data->at(i) = shapes[i].data(); + } + } }; /*! - * \brief helper to store error message in threadlocal storage + * \brief A threadlocal store to store threadlocal variables. + * Will return a thread local singleton of type T + * \tparam T the type we like to store */ -class MXAPIErrorMessageHelper { +class MXAPIThreadLocalStore { public: - /*! \brief get a single instance out from */ - static MXAPIErrorMessageHelper *Get() { - static MXAPIErrorMessageHelper inst; - return &inst; - } - /*! 
- * \brief a helper function for error handling - * will set the last error to be str_set when it is not NULL - * \param str_set the error to set - * \return a pointer message to last error - */ - static const char *SetGetLastError(const char *str_set) { - // use last_error to record last error - static MX_TREAD_LOCAL std::string *last_error = NULL; - if (last_error == NULL) { - last_error = new std::string(); - Get()->RegisterDelete(last_error); + /*! \brief store return entry */ + typedef MXAPIThreadLocalEntry T; + /*! \return get a thread local singleton */ + static T* Get() { + static MX_TREAD_LOCAL T* ptr = nullptr; + if (ptr == nullptr) { + ptr = new T(); + Singleton()->RegisterDelete(ptr); } - if (str_set != NULL) { - *last_error = str_set; - } - return last_error->c_str(); + return ptr; } private: /*! \brief constructor */ - MXAPIErrorMessageHelper() {} + MXAPIThreadLocalStore() {} /*! \brief destructor */ - ~MXAPIErrorMessageHelper() { + ~MXAPIThreadLocalStore() { for (size_t i = 0; i < data_.size(); ++i) { delete data_[i]; } } + /*! \return singleton of the store */ + static MXAPIThreadLocalStore *Singleton() { + static MXAPIThreadLocalStore inst; + return &inst; + } /*! * \brief register str for internal deletion * \param str the string pointer */ - void RegisterDelete(std::string *str) { + void RegisterDelete(T *str) { std::unique_lock lock(mutex_); data_.push_back(str); lock.unlock(); @@ -89,13 +105,12 @@ class MXAPIErrorMessageHelper { /*! \brief internal mutex */ std::mutex mutex_; /*!\brief internal data */ - std::vector data_; + std::vector data_; }; // NOTE: all functions return 0 upon success // consider add try/catch block for user error // handling in the future -using namespace mxnet; /*! \brief macro to guard beginning and end section of all functions */ #define API_BEGIN() try { @@ -111,7 +126,7 @@ using namespace mxnet; /*! \brief return str message of the last error */ const char *MXGetLastError() { - return MXAPIErrorMessageHelper::SetGetLastError(NULL); + return MXAPIThreadLocalStore::Get()->last_error.c_str(); } /*! 
@@ -120,7 +135,7 @@ const char *MXGetLastError() { * \return the return value of API after exception is handled */ int MXHandleException(const dmlc::Error &e) { - MXAPIErrorMessageHelper::SetGetLastError(e.what()); + MXAPIThreadLocalStore::Get()->last_error = e.what(); return -1; } @@ -295,7 +310,7 @@ int MXSymbolCreateAtomicSymbol(AtomicSymbolCreator creator, const char **keys, const char **vals, SymbolHandle *out) { - MXAPISymbolWrapper *s = new MXAPISymbolWrapper(); + Symbol *s = new Symbol(); OperatorProperty *op = nullptr; API_BEGIN(); @@ -304,15 +319,15 @@ int MXSymbolCreateAtomicSymbol(AtomicSymbolCreator creator, for (int i = 0; i < num_param; ++i) { op->SetParam(keys[i], vals[i]); } - s->sym = Symbol::Create(op); + *s = Symbol::Create(op); *out = s; API_END_HANDLE_ERROR(delete s; delete op); } int MXSymbolCreateVariable(const char *name, SymbolHandle *out) { - MXAPISymbolWrapper *s = new MXAPISymbolWrapper(); + Symbol *s = new Symbol(); API_BEGIN(); - s->sym = Symbol::CreateVariable(name); + *s = Symbol::CreateVariable(name); *out = s; API_END_HANDLE_ERROR(delete s); } @@ -320,71 +335,72 @@ int MXSymbolCreateVariable(const char *name, SymbolHandle *out) { int MXSymbolCreateGroup(mx_uint num_symbols, SymbolHandle *symbols, SymbolHandle *out) { - MXAPISymbolWrapper *s = new MXAPISymbolWrapper(); - MXAPISymbolWrapper **sym_arr = (MXAPISymbolWrapper**)symbols; // NOLINT(*) + Symbol *s = new Symbol(); + Symbol **sym_arr = (Symbol**)symbols; // NOLINT(*) API_BEGIN(); std::vector syms; for (mx_uint i = 0; i < num_symbols; ++i) { - syms.push_back(sym_arr[i]->sym); + syms.push_back(*sym_arr[i]); } - s->sym = Symbol::CreateGroup(syms); + *s = Symbol::CreateGroup(syms); *out = s; API_END_HANDLE_ERROR(delete s); } int MXSymbolFree(SymbolHandle symbol) { API_BEGIN(); - delete static_cast(symbol); + delete static_cast(symbol); API_END(); } int MXSymbolCopy(SymbolHandle symbol, SymbolHandle *out) { - MXAPISymbolWrapper *s = new MXAPISymbolWrapper(); - + Symbol *s = new Symbol(); API_BEGIN(); - s->sym = (static_cast(symbol)->sym).Copy(); + *s = static_cast(symbol)->Copy(); *out = s; API_END_HANDLE_ERROR(delete s); } int MXSymbolPrint(SymbolHandle symbol, const char **out_str) { - MXAPISymbolWrapper *s = static_cast(symbol); - + Symbol *s = static_cast(symbol); + MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); API_BEGIN(); std::ostringstream os; - (s->sym).Print(os); - s->ret_str = os.str(); - *out_str = (s->ret_str).c_str(); + s->Print(os); + ret->ret_str = os.str(); + *out_str = (ret->ret_str).c_str(); API_END(); } int MXSymbolListArguments(SymbolHandle symbol, mx_uint *out_size, const char ***out_str_array) { - MXAPISymbolWrapper *s = static_cast(symbol); + Symbol *s = static_cast(symbol); + MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); API_BEGIN(); - s->ret_vec_str = std::move((s->sym).ListArguments()); - s->ret_vec_charp.clear(); - for (size_t i = 0; i < s->ret_vec_str.size(); ++i) { - s->ret_vec_charp.push_back(s->ret_vec_str[i].c_str()); + ret->ret_vec_str = std::move(s->ListArguments()); + ret->ret_vec_charp.clear(); + for (size_t i = 0; i < ret->ret_vec_str.size(); ++i) { + ret->ret_vec_charp.push_back(ret->ret_vec_str[i].c_str()); } - *out_size = static_cast(s->ret_vec_charp.size()); - *out_str_array = dmlc::BeginPtr(s->ret_vec_charp); + *out_size = static_cast(ret->ret_vec_charp.size()); + *out_str_array = dmlc::BeginPtr(ret->ret_vec_charp); API_END(); } int MXSymbolListReturns(SymbolHandle symbol, - mx_uint *out_size, - const char ***out_str_array) { - 
MXAPISymbolWrapper *s = static_cast(symbol); + mx_uint *out_size, + const char ***out_str_array) { + Symbol *s = static_cast(symbol); + MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); API_BEGIN(); - s->ret_vec_str = std::move((s->sym).ListReturns()); - s->ret_vec_charp.clear(); - for (size_t i = 0; i < s->ret_vec_str.size(); ++i) { - s->ret_vec_charp.push_back(s->ret_vec_str[i].c_str()); + ret->ret_vec_str = std::move(s->ListReturns()); + ret->ret_vec_charp.clear(); + for (size_t i = 0; i < ret->ret_vec_str.size(); ++i) { + ret->ret_vec_charp.push_back(ret->ret_vec_str[i].c_str()); } - *out_size = static_cast(s->ret_vec_charp.size()); - *out_str_array = dmlc::BeginPtr(s->ret_vec_charp); + *out_size = static_cast(ret->ret_vec_charp.size()); + *out_str_array = dmlc::BeginPtr(ret->ret_vec_charp); API_END(); } @@ -397,19 +413,68 @@ int MXSymbolCompose(SymbolHandle sym, std::string s_name; if (name != nullptr) s_name = name; - MXAPISymbolWrapper* s = static_cast(sym); + Symbol* s = static_cast(sym); if (keys == nullptr && num_args != 0) { std::vector pos_args; for (mx_uint i = 0; i < num_args; ++i) { - pos_args.push_back(((MXAPISymbolWrapper*)(args[i]))->sym); // NOLINT(*) + pos_args.push_back(*((Symbol*)args[i])); // NOLINT(*) } - (s->sym).Compose(pos_args, s_name); + s->Compose(pos_args, s_name); } else { std::unordered_map kwargs; for (mx_uint i = 0; i < num_args; ++i) { - kwargs[keys[i]] = ((MXAPISymbolWrapper*)(args[i]))->sym; // NOLINT(*) + kwargs[keys[i]] = *((Symbol*)args[i]); // NOLINT(*) + } + s->Compose(kwargs, s_name); + } + API_END(); +} + +int MXSymbolInferShape(SymbolHandle sym, + mx_uint num_args, + const char** keys, + const mx_uint *arg_ind_ptr, + const mx_uint *arg_shape_data, + mx_uint *in_shape_size, + const mx_uint **in_shape_ndim, + const mx_uint ***in_shape_data, + mx_uint *out_shape_size, + const mx_uint **out_shape_ndim, + const mx_uint ***out_shape_data, + int *complete) { + Symbol *s = static_cast(sym); + MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); + bool succ; + API_BEGIN(); + if (keys == nullptr && num_args != 0) { + ret->arg_shapes.clear(); + for (mx_uint i = 0; i < num_args; ++i) { + ret->arg_shapes.push_back(TShape(arg_shape_data + arg_ind_ptr[i], + arg_shape_data + arg_ind_ptr[i+1])); } - (s->sym).Compose(kwargs, s_name); + succ = s->InferShape(&(ret->arg_shapes), &(ret->out_shapes)); + } else { + std::unordered_map kwargs; + for (mx_uint i = 0; i < num_args; ++i) { + kwargs[keys[i]] = TShape(arg_shape_data + arg_ind_ptr[i], + arg_shape_data + arg_ind_ptr[i+1]); + } + succ = s->InferShape(kwargs, &(ret->arg_shapes), &(ret->out_shapes)); + } + if (succ) { + MXAPIThreadLocalEntry::SetupShapeArrayReturn( + ret->arg_shapes, &(ret->arg_shape_ndim), &(ret->arg_shape_data)); + MXAPIThreadLocalEntry::SetupShapeArrayReturn( + ret->out_shapes, &(ret->out_shape_ndim), &(ret->out_shape_data)); + *in_shape_size = static_cast(ret->arg_shapes.size()); + *in_shape_ndim = dmlc::BeginPtr(ret->arg_shape_ndim); + *in_shape_data = dmlc::BeginPtr(ret->arg_shape_data); + *out_shape_size = static_cast(ret->out_shapes.size()); + *out_shape_ndim = dmlc::BeginPtr(ret->out_shape_ndim); + *out_shape_data = dmlc::BeginPtr(ret->out_shape_data); + *complete = 1; + } else { + *complete = 0; } API_END(); } diff --git a/src/operator/activation-inl.h b/src/operator/activation-inl.h index 221e69ce948f..564f81d47833 100644 --- a/src/operator/activation-inl.h +++ b/src/operator/activation-inl.h @@ -37,7 +37,7 @@ class ActivationOp : public Operator { CHECK_EQ(req[kOut], 
kWriteTo); CHECK_EQ(in_data.size(), 1); CHECK_EQ(out_data.size(), 1); - Stream *s = static_cast *>(ctx.stream); + Stream *s = ctx.get_stream(); Tensor data = in_data[kData].FlatTo2D(s); Tensor out = out_data[kOut].FlatTo2D(s); out = F(data); @@ -54,7 +54,7 @@ class ActivationOp : public Operator { CHECK_EQ(out_grad.size(), 1); CHECK(in_data.size() == 1 && in_grad.size() == 1); CHECK_EQ(req.size(), 1); - Stream *s = static_cast *>(ctx.stream); + Stream *s = ctx.get_stream(); Tensor out_gradient = out_grad[kData].FlatTo2D(s); Tensor data = in_data[kData].FlatTo2D(s); Tensor grad = out_grad[kOut].FlatTo2D(s); diff --git a/src/operator/fully_connected-inl.h b/src/operator/fully_connected-inl.h index 5c54d37220ee..e2913e65aba8 100644 --- a/src/operator/fully_connected-inl.h +++ b/src/operator/fully_connected-inl.h @@ -45,7 +45,7 @@ class FullyConnectedOp : public Operator { CHECK_EQ(out_data.size(), 1); // TODO(bing): check the BLAS Handle, be careful // maybe need blas handle from context - Stream *s = static_cast *>(ctx.stream); + Stream *s = ctx.get_stream(); Tensor data = in_data[kData].FlatTo2D(s); Tensor wmat = in_data[kWeight].get(s); Tensor out = out_data[kOut].FlatTo2D(s); @@ -70,7 +70,7 @@ class FullyConnectedOp : public Operator { CHECK_EQ(req.size(), expected); // TODO(bing): check the BLAS Handle, be careful // maybe need blas handle from context - Stream *s = static_cast *>(ctx.stream); + Stream *s = ctx.get_stream(); Tensor data = in_data[kData].FlatTo2D(s); Tensor wmat = in_data[kWeight].get(s); Tensor grad = out_grad[kOut].FlatTo2D(s); @@ -123,16 +123,25 @@ class FullyConnectedProp : public OperatorProperty { } CHECK_GT(param_.num_hidden, 0); const TShape &dshape = (*in_shape)[0]; - CHECK_EQ(dshape.ndim(), 4) << \ - "Input data should be 4D in batch-1-1-hidden"; - CHECK_NE(dshape.ndim(), 0) << "Require data shape to be known"; - ShapeAssignCheck((*in_shape)[kWeight], Shape2(param_.num_hidden, dshape[3])); + // require data to be known + if (dshape.ndim() == 0) return false; + + index_t num_input; + if (dshape.ndim() == 4) { + // TODO(bing) consider deprecate 4D input + CHECK(dshape[1] == 1 && dshape[2] == 1); + num_input = dshape[3]; + } else { + CHECK_EQ(dshape.ndim(), 2) + << "FullyConnecteded: Input data should be 2D in (batch, num_hidden)"; + num_input = dshape[1]; + } + SHAPE_ASSIGN_CHECK(*in_shape, kWeight, Shape2(param_.num_hidden, num_input)); if (param_.no_bias == 0) { - ShapeAssignCheck((*in_shape)[kBias], Shape1(param_.num_hidden)); + SHAPE_ASSIGN_CHECK(*in_shape, kBias, Shape1(param_.num_hidden)); } out_shape->clear(); - out_shape->push_back(dshape); - (*out_shape)[0][3] = param_.num_hidden; + out_shape->push_back(Shape2(dshape[0], param_.num_hidden)); return true; } diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h index 87b581f28278..7ffa3d1456d2 100644 --- a/src/operator/operator_common.h +++ b/src/operator/operator_common.h @@ -34,20 +34,39 @@ inline void Assign(OType &out, // NOLINT(*) default: LOG(FATAL) << "not reached"; } } + +/*! \brief exception throwed by InferShape error */ +struct InferShapeError { + /*! \brief analyze message */ + std::string msg; + /*! \brief corresponding input index */ + int index; + // constructor + InferShapeError(std::string msg, int index) + : msg(msg), index(index) {} +}; + /*! 
- * \brief assign shape to out if out is unknown - * otherwise check consistency - * \param out the output shape to be stored + * \brief macro assign shape to out if out is unknown otherwise check consistency + * Use macro so we can see the error file more clearly + * \param shape_array the shape array to store the result + * \param index the index of in the array * \param shape the infered shape */ -template -inline void ShapeAssignCheck(TShape &out, const TS &shape) { // NOLINT(*) - if (out.ndim() == 0) { - out = shape; - } else { - CHECK(out == shape) << "InferShape:: shape inconsistent"; +#define SHAPE_ASSIGN_CHECK(shape_array, index, shape) \ + { \ + auto &out = (shape_array)[index]; \ + if (out.ndim() == 0) { \ + out = shape; \ + } else { \ + if (out != shape) { \ + std::ostringstream os; \ + os << "Shape inconsistent, Provided " << '='<< out << ',' \ + << " inferred shape=" << shape; \ + throw ::mxnet::op::InferShapeError(os.str(), index); \ + } \ + } \ } -} // helper macro to implement bind dispatch #if MXNET_USE_CUDA diff --git a/src/operator/param.h b/src/operator/param.h index e1f6b4ee58d8..f0ce5886e2fb 100644 --- a/src/operator/param.h +++ b/src/operator/param.h @@ -35,10 +35,6 @@ struct Param { int no_bias; /*! \brief maximum temp_col_size allowed in each layer */ int temp_col_max; - /*! \brief number of input channels */ - int num_input_channel; - /*! \brief number of input hidden nodes, used by fullc */ - int num_input_node; /*! \brief reserved fields, for future compatibility */ int reserved[64]; @@ -48,11 +44,9 @@ struct Param { } inline void SetParam(const char *name, const char* val) { - if (!strcmp(name, "nhidden")) num_hidden = atoi(val); - if (!strcmp(name, "num_input_node")) num_input_node = atoi(val); - if (!strcmp(name, "num_input_channel")) num_input_channel = atoi(val); - if (!strcmp(name, "nchannel")) num_channel = atoi(val); - if (!strcmp(name, "ngroup")) num_group = atoi(val); + if (!strcmp(name, "num_hidden")) num_hidden = atoi(val); + if (!strcmp(name, "num_channel")) num_channel = atoi(val); + if (!strcmp(name, "num_group")) num_group = atoi(val); if (!strcmp(name, "kernel_size")) { kernel_y = kernel_x = atoi(val); } diff --git a/src/symbol/static_graph.cc b/src/symbol/static_graph.cc index 5419e26afe86..62de7963638a 100644 --- a/src/symbol/static_graph.cc +++ b/src/symbol/static_graph.cc @@ -7,14 +7,18 @@ #include #include #include +#include "../operator/operator_common.h" namespace mxnet { std::vector StaticGraph::TopoSort() const { std::vector out_degree(nodes.size(), 0); - for (const Node &n : nodes) { - for (const DataEntry &e : n.inputs) { + for (const Node& n : nodes) { + for (const DataEntry& e : n.inputs) { ++out_degree[e.source_id]; } + if (n.is_backward()) { + ++out_degree[n.backward_source_id]; + } } std::vector ret(nodes.size()); auto result = ret.rbegin(); @@ -29,12 +33,17 @@ std::vector StaticGraph::TopoSort() const { queue.pop(); *result = node_id; ++result; - for (const DataEntry &e : nodes[node_id].inputs) { - out_degree[e.source_id] -= 1; - if (out_degree[e.source_id] == 0) { + const Node& n = nodes[node_id]; + for (const DataEntry& e : n.inputs) { + if (--out_degree[e.source_id] == 0) { queue.push(e.source_id); } } + if (n.is_backward()) { + if (--out_degree[n.backward_source_id] == 0) { + queue.push(n.backward_source_id); + } + } } return std::move(ret); } @@ -42,19 +51,73 @@ std::vector StaticGraph::TopoSort() const { bool StaticGraph::InferNodeShapes(const std::vector &topo_order, std::vector > *node_out_shapes) const { for (uint32_t 
nid : topo_order) { - const Node &node = nodes[nid]; - if (node.op != nullptr) { + const Node& node = nodes[nid]; + if (node.is_forward()) { std::vector in_shape; - for (const DataEntry &e : node.inputs) { + for (const DataEntry& e : node.inputs) { in_shape.push_back((*node_out_shapes)[e.source_id][e.index]); } - if (!node.op->InferShape(&in_shape, &(*node_out_shapes)[nid])) return false; + try { + if (!node.op->InferShape(&in_shape, &(*node_out_shapes)[nid])) return false; + } catch (const op::InferShapeError &err) { + // error handling + const std::string &op_name = node.name; + std::string arg_name = node.op->ListArguments()[err.index]; + std::ostringstream os; + os << "InferShape Error in " + << op_name << "\'s" << ' ' << arg_name << " argument\n"; + auto &source = nodes[node.inputs[err.index].source_id]; + if (source.is_variable()) { + os << "Corresponding keyword of symbol: " << source.name << '\n' << err.msg; + } + throw dmlc::Error(os.str()); + } for (size_t i = 0; i < node.inputs.size(); ++i) { - const DataEntry &e = node.inputs[i]; + const DataEntry& e = node.inputs[i]; (*node_out_shapes)[e.source_id][e.index] = in_shape[i]; } + } else if (node.is_backward()) { + // simply use shapes from forward pass to assign backward shape + const Node& forward = nodes[node.backward_source_id]; + CHECK(forward.is_forward()); + std::vector& in_grad_shapes = (*node_out_shapes)[nid]; + CHECK(in_grad_shapes.size() == forward.inputs.size()); + // assign the input shape to output gradients + for (size_t i = 0; i < forward.inputs.size(); ++i) { + const DataEntry &e = forward.inputs[i]; + try { + SHAPE_ASSIGN_CHECK(in_grad_shapes, i, (*node_out_shapes)[e.source_id][e.index]); + } catch (const op::InferShapeError &err) { + const std::string &op_name = forward.name; + std::string arg_name = forward.op->ListArguments()[e.index]; + std::ostringstream os; + os << "InferShape Error in " + << op_name << "\'s" << ' ' << arg_name << " gradient argument\n" + << err.msg; + throw dmlc::Error(os.str()); + } + } + // consistent check for input shapes + auto& out_data_shapes = (*node_out_shapes)[node.backward_source_id]; + // use BackwardInputs to select entries corresponding to node.inputs + auto in_shape = forward.op->BackwardInputs( + out_data_shapes, in_grad_shapes, out_data_shapes); + for (size_t i = 0; i < node.inputs.size(); ++i) { + const DataEntry& e = node.inputs[i]; + try { + SHAPE_ASSIGN_CHECK((*node_out_shapes)[e.source_id], e.index, in_shape[i]); + } catch (const op::InferShapeError &err) { + const std::string &op_name = nodes[e.source_id].name; + std::ostringstream os; + os << "InferShape Error in " + << op_name << "\'s" << " gradient values\n" + << err.msg; + throw dmlc::Error(os.str()); + } + } } } + // TODO(bing) assign shape for head gradient return true; } @@ -63,8 +126,10 @@ bool StaticGraph::InferShape(std::vector *in_shape, std::vector > node_out_shapes(nodes.size()); for (size_t i = 0; i < nodes.size(); ++i) { int nout = 1; - if (nodes[i].op != nullptr) { + if (nodes[i].is_forward()) { nout = nodes[i].op->NumReturns(); + } else if (nodes[i].is_backward()) { + nout = static_cast(nodes[nodes[i].backward_source_id].inputs.size()); } node_out_shapes[i].resize(nout); } @@ -78,8 +143,9 @@ bool StaticGraph::InferShape(std::vector *in_shape, for (size_t i = 0; i < arg_nodes.size(); ++i) { (*in_shape)[i] = node_out_shapes[arg_nodes[i]][0]; } - for (size_t i = 0; i < outputs.size(); ++i) { - DataEntry e = outputs[i]; + out_shape->resize(heads.size()); + for (size_t i = 0; i < heads.size(); ++i) { + 
const DataEntry &e = heads[i]; (*out_shape)[i] = node_out_shapes[e.source_id][e.index]; } return true; diff --git a/src/symbol/symbol.cc b/src/symbol/symbol.cc index 86cf54feabfa..54a5fe9422b2 100644 --- a/src/symbol/symbol.cc +++ b/src/symbol/symbol.cc @@ -1,7 +1,7 @@ /*! * Copyright (c) 2015 by Contributors - * \file symbol.cc - * \brief symbol of mxnet + *\file symbol.cc + *\brief symbol of mxnet */ #include #include @@ -12,13 +12,13 @@ namespace mxnet { /*! - * \brief Node is represents node of an operator in the symbolic graph. + *\brief Node is represents node of an operator in the symbolic graph. * - * It stores connection to the inputs to function represented by OperatorProperty - * NOTE on data structure: there are three types of node: - * - Normal node: contains all the necessary elements of a graph. - * - OperatorProperty: the inputs_ is empty, represents an OperatorProperty that has not been applied. - * - Variable: the sym_ is nullptr, represents an named Variable of tensors that can be composed. + *It stores connection to the inputs to function represented by OperatorProperty + *NOTE on data structure: there are three types of node: + *- Normal node: contains all the necessary elements of a graph. + *- OperatorProperty: the inputs_ is empty, represents an OperatorProperty that has not been applied. + *- Variable: the sym_ is nullptr, represents an named Variable of tensors that can be composed. */ struct Symbol::Node { /*! \brief Operator of this node */ @@ -28,11 +28,11 @@ struct Symbol::Node { /*! \brief inputs to this node */ std::vector inputs; /*! - * \brief constructor - * \param op the OperatorProperty to construct the Node - * \param name the name of the symbol + *\brief constructor + *\param op the OperatorProperty to construct the Node + *\param name the name of the symbol */ - explicit Node(OperatorProperty* op = nullptr, const std::string& name = "") + explicit Node(OperatorProperty *op = nullptr, const std::string& name = "") : op(op), name(name) { } /*! \return Whether the symbol is atomic */ @@ -63,7 +63,7 @@ inline void Symbol::DFSVisit(FVisit fvisit) const { } } while (!stack.empty()) { - Node* back = stack.back(); + Node *back = stack.back(); stack.pop_back(); fvisit(back); for (auto it = back->inputs.rbegin(); it != back->inputs.rend(); ++it) { @@ -76,6 +76,28 @@ inline void Symbol::DFSVisit(FVisit fvisit) const { } } +// helper function to handle keyword argument mismatch +// throw approperiate messages +template +inline void KeywordArgumentMismatch(const char *source, + const TMap &kwargs, + const std::vector args) { + std::unordered_set keys(args.begin(), args.end()); + std::ostringstream head, msg; + msg << "\nCandidate arguments:\n"; + for (size_t i = 0; i < args.size(); ++i) { + msg << "\t[" << i << ']' << args[i] << '\n'; + } + + for (const auto& kv : kwargs) { + if (keys.count(kv.first) == 0) { + LOG(FATAL) << source + << "Keyword argument name " << kv.first << " not found." 
+ << msg.str(); + } + } +} + int Symbol::FindDuplicateArgs(std::unordered_map *out) const { out->clear(); int max_dup = 1; @@ -328,19 +350,8 @@ void Symbol::Compose(const std::unordered_map& kwargs, } } if (nmatched != kwargs.size()) { - // Error message handling - std::vector req_args = this->ListArguments(); - std::unordered_set keys(req_args.begin(), req_args.end()); - std::ostringstream msg; - msg << "\nCandidate arguments:\n"; - for (size_t i = 0; i < req_args.size(); ++i) { - msg << "\t[" << i << ']' << req_args[i] << '\n'; - } - for (const auto& kv : kwargs) { - CHECK_NE(keys.count(kv.first), 0) - << "Keyword Argument " << kv.first << " not found in arguments." - << msg.str(); - } + KeywordArgumentMismatch( + "Symbol.Compose", kwargs, ListArguments()); } } @@ -358,11 +369,34 @@ Symbol Symbol::operator () (const std::unordered_map& kwarg return s; } -bool Symbol::InferShape(std::vector *in_shape, - std::vector *out_shape) const { +bool Symbol::InferShape(std::vector *arg_shapes, + std::vector *out_shapes) const { + StaticGraph g; + this->ToStaticGraph(&g); + return g.InferShape(arg_shapes, out_shapes); +} + +bool Symbol::InferShape(const std::unordered_map& known_arg_shapes, + std::vector *arg_shapes, + std::vector *out_shapes) const { StaticGraph g; this->ToStaticGraph(&g); - return g.InferShape(in_shape, out_shape); + arg_shapes->clear(); + arg_shapes->resize(g.arg_nodes.size(), TShape()); + size_t nmatched = 0; + for (size_t i = 0; i < g.arg_nodes.size(); ++i) { + const std::string& name = g.nodes[g.arg_nodes[i]].name; + auto it = known_arg_shapes.find(name); + if (it != known_arg_shapes.end()) { + arg_shapes->at(i) = it->second; + ++nmatched; + } + } + if (nmatched != known_arg_shapes.size()) { + KeywordArgumentMismatch( + "Symbol.InterShape", known_arg_shapes, ListArguments()); + } + return g.InferShape(arg_shapes, out_shapes); } Symbol Symbol::Create(OperatorProperty *op) { @@ -424,12 +458,12 @@ void Symbol::ToStaticGraph(StaticGraph *out_graph) const { } } // setup heads - out_graph->outputs.clear(); + out_graph->heads.clear(); for (auto &head : heads_) { StaticGraph::DataEntry e; e.source_id = node_index[head.source.get()]; e.index = head.index; - out_graph->outputs.push_back(e); + out_graph->heads.push_back(e); } } } // namespace mxnet From 8fe878e1746338c3c67dcdc2dfc0961b0d9ef6f2 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sat, 15 Aug 2015 21:09:40 -0600 Subject: [PATCH 03/11] minor change in act --- src/operator/activation-inl.h | 52 +++++++--------- src/operator/activation.cc | 5 +- src/operator/activation.cu | 5 +- .../{static_operator => }/mshadow_op.h | 19 +++--- src/operator/operator_common.h | 1 + .../static_operator/activation_op-inl.h | 61 ------------------- 6 files changed, 40 insertions(+), 103 deletions(-) rename src/operator/{static_operator => }/mshadow_op.h (87%) delete mode 100644 src/operator/static_operator/activation_op-inl.h diff --git a/src/operator/activation-inl.h b/src/operator/activation-inl.h index 564f81d47833..27bde578a3b1 100644 --- a/src/operator/activation-inl.h +++ b/src/operator/activation-inl.h @@ -20,7 +20,7 @@ namespace op { // // These enums are only visible within this header enum ActivationOpInputs {kData}; enum ActivationOpOutputs {kOut}; -enum ActivationOpType {kReLU}; +enum ActivationOpType {kUnknown, kReLU, kSigmoid, kTanh}; /** * \brief This is the implementation of activation operator. * \tparam xpu The device that the op will be executed on. 
@@ -34,7 +34,6 @@ class ActivationOp : public Operator { const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(req[kOut], kWriteTo); CHECK_EQ(in_data.size(), 1); CHECK_EQ(out_data.size(), 1); Stream *s = ctx.get_stream(); @@ -56,9 +55,9 @@ class ActivationOp : public Operator { CHECK_EQ(req.size(), 1); Stream *s = ctx.get_stream(); Tensor out_gradient = out_grad[kData].FlatTo2D(s); - Tensor data = in_data[kData].FlatTo2D(s); + Tensor output = out_data[kData].FlatTo2D(s); Tensor grad = out_grad[kOut].FlatTo2D(s); - Assign(grad, req[kData], F(out_gradient * F(data))); + Assign(grad, req[kData], F(out_gradient * output)); } }; // class ActivationOp @@ -69,33 +68,35 @@ Operator* CreateActivationOp(ActivationOpType type); #if DMLC_USE_CXX11 class ActivationProp : public OperatorProperty { public: + ActivationProp() : type_(kUnknown) {} + virtual void SetParam(const char *name, const char *val) { if (!strcmp(name, "type")) { - if (!strcmp(val, "relu")) { - type_ = kReLU; - } + if (!strcmp(val, "relu")) type_ = kReLU; + if (!strcmp(val, "sigmoid")) type_ = kSigmoid; + if (!strcmp(val, "tanh")) type_ = kTanh; } - // TODO(bing): check optype valid + CHECK(type_ >= kReLU && type_ <= kTanh) << "Invalid activation type"; } virtual bool InferShape(std::vector *in_shape, std::vector *out_shape) const { using namespace mshadow; CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; const TShape &dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; out_shape->clear(); out_shape->push_back(dshape); return true; } virtual OperatorProperty* Copy() const { - return new ActivationProp(); + auto ptr = new ActivationProp(); + ptr->type_ = this->type_; + return ptr; } virtual std::string TypeString() const { - switch (type_) { - case kReLU: return "Activation : ReLU"; - default: return "Invalid Activation"; - } + return "Activation"; } // decalre dependency and inplace optimization options @@ -103,7 +104,7 @@ class ActivationProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const { - return {out_grad[kOut], in_data[kData]}; + return {out_grad[kOut], out_data[kData]}; } virtual std::vector > BackwardInplaceOption( @@ -111,7 +112,13 @@ class ActivationProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const { - return {}; + return {{out_grad[kData], in_grad[kData]}}; + } + + virtual std::vector > ForwardInplaceOption( + const std::vector &in_data, + const std::vector &out_data) const { + return {{in_data[kData], out_data[kData]}}; } Operator* CreateOperator(Context ctx) const; @@ -120,21 +127,6 @@ class ActivationProp : public OperatorProperty { ActivationOpType type_; }; #endif // DMLC_USE_CXX11 - -namespace act { -/*! \brief Rectified Linear Operation */ -struct relu { - MSHADOW_XINLINE static real_t Map(real_t a) { - return a > 0.0f ? a : 0.0f; - } -}; -struct relu_grad { - MSHADOW_XINLINE static real_t Map(real_t a) { - return a > 0.0f ? 
1.0f : 0.0f; - } -}; - -} // namespace act } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_ACTIVATION_INL_H_ diff --git a/src/operator/activation.cc b/src/operator/activation.cc index b26c1e24dc53..6b822a68c8e5 100644 --- a/src/operator/activation.cc +++ b/src/operator/activation.cc @@ -7,13 +7,16 @@ #include #include "./activation-inl.h" +#include "./mshadow_op.h" namespace mxnet { namespace op { template<> Operator *CreateActivationOp(ActivationOpType type) { switch (type) { - case kReLU: return new ActivationOp(); + case kReLU: return new ActivationOp(); + case kSigmoid: return new ActivationOp(); + case kTanh: return new ActivationOp(); default: return NULL; } } diff --git a/src/operator/activation.cu b/src/operator/activation.cu index b6a523c003ec..b7c771d653f3 100644 --- a/src/operator/activation.cu +++ b/src/operator/activation.cu @@ -6,13 +6,16 @@ */ #include "./activation-inl.h" +#include "./mshadow_op.h" namespace mxnet { namespace op { template<> Operator *CreateActivationOp(ActivationOpType type) { switch(type) { - case kReLU: return new ActivationOp(); + case kReLU: return new ActivationOp(); + case kSigmoid: return new ActivationOp(); + case kTanh: return new ActivationOp(); default: return NULL; } } diff --git a/src/operator/static_operator/mshadow_op.h b/src/operator/mshadow_op.h similarity index 87% rename from src/operator/static_operator/mshadow_op.h rename to src/operator/mshadow_op.h index bb33471f168a..010cf0ce7cc9 100644 --- a/src/operator/static_operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -1,17 +1,18 @@ /*! * Copyright (c) 2015 by Contributors * \file mshadow_op.h - * \brief extra mshadow operation for mxnet + * \brief * \author Bing Xu - */ -#ifndef MXNET_OPERATOR_STATIC_OPERATOR_MSHADOW_OP_H_ -#define MXNET_OPERATOR_STATIC_OPERATOR_MSHADOW_OP_H_ +*/ +#ifndef MXNET_OPERATOR_MSHADOW_OP_H_ +#define MXNET_OPERATOR_MSHADOW_OP_H_ + #include -#include namespace mxnet { -/*! \brief operations for ActivationLayer */ namespace op { +namespace mshadow_op { +/*! \brief identity Operation */ struct identity { MSHADOW_XINLINE static real_t Map(real_t a) { return a; @@ -98,9 +99,7 @@ struct square_root { return sqrt(a); } }; - +} // namespace mshadow_op } // namespace op } // namespace mxnet - -#endif // MXNET_OPERATOR_STATIC_OPERATOR_MSHADOW_OP_H_ - +#endif // MXNET_OPERATOR_MSHADOW_OP_H_ diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h index 7ffa3d1456d2..eea731c8fbe6 100644 --- a/src/operator/operator_common.h +++ b/src/operator/operator_common.h @@ -11,6 +11,7 @@ #include #include #include +#include namespace mxnet { namespace op { diff --git a/src/operator/static_operator/activation_op-inl.h b/src/operator/static_operator/activation_op-inl.h deleted file mode 100644 index cfb0b7cec8b5..000000000000 --- a/src/operator/static_operator/activation_op-inl.h +++ /dev/null @@ -1,61 +0,0 @@ -/*! 
- * Copyright (c) 2015 by Contributors - * \file activation_op-inl.h - * \brief activation operator of mxnet - */ - -#ifndef MXNET_OPERATOR_STATIC_OPERATOR_ACTIVATION_OP_INL_H_ -#define MXNET_OPERATOR_STATIC_OPERATOR_ACTIVATION_OP_INL_H_ - -#include -#include -#include -#include "./static_operator_common.h" - -namespace mxnet { -namespace op { -template -class ActivationOp : public StaticOperator { - public: - virtual void InferShape(std::vector *in_shape, - std::vector *out_shape) { - CHECK_EQ(in_shape->size(), 1) << "Only 1 input is allowed"; - CHECK_NE((*in_shape)[0].ndim(), 0) << "Require data shape to be known"; - out_shape->clear(); - out_shape->push_back((*in_shape)[0]); - } - virtual void Forward(Option opt, - RunContext ctx, - const std::vector &in_data, - const std::vector &out_data) { - CHECK_EQ(out_data.size(), 1); - CHECK_EQ(in_data.size(), 1); - mshadow::Stream *stream = \ - static_cast *>(ctx.stream); - mshadow::Tensor in = in_data[0].FlatTo2D(stream); - mshadow::Tensor out = out_data[0].FlatTo2D(stream); - out = mshadow::expr::F(in); - } - virtual void Backward(RunContext ctx, - const std::vector &grad_next, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &out_grad, - const std::vector &req) { - CHECK_EQ(grad_next.size(), 1); - CHECK_EQ(in_data.size(), 1); - CHECK_EQ(out_grad.size(), 1); - CHECK_EQ(req.size(), 1); - mshadow::Stream *stream = \ - static_cast *>(ctx.stream); - mshadow::Tensor grad = grad_next[0].FlatTo2D(stream); - mshadow::Tensor data = in_data[0].FlatTo2D(stream); - mshadow::Tensor out = out_grad[0].FlatTo2D(stream); - Assign(out, req[0], mshadow::expr::F( - mshadow::expr::F(data)) * grad); - } -}; // class ActivationOp -} // namespace op -} // namespace mxnet - -#endif // MXNET_OPERATOR_STATIC_OPERATOR_ACTIVATION_OP_INL_H_ From a3cde4ac8136ef2e1ff3f1b85a6dd6ac6740b315 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sat, 15 Aug 2015 23:21:17 -0600 Subject: [PATCH 04/11] tmp save --- src/operator/activation-inl.h | 9 +- src/operator/elem_plus-inl.h | 202 ++++++++++++++++++++++++++++++++++ 2 files changed, 207 insertions(+), 4 deletions(-) create mode 100644 src/operator/elem_plus-inl.h diff --git a/src/operator/activation-inl.h b/src/operator/activation-inl.h index 27bde578a3b1..15ded9761b27 100644 --- a/src/operator/activation-inl.h +++ b/src/operator/activation-inl.h @@ -39,7 +39,7 @@ class ActivationOp : public Operator { Stream *s = ctx.get_stream(); Tensor data = in_data[kData].FlatTo2D(s); Tensor out = out_data[kOut].FlatTo2D(s); - out = F(data); + Assign(out, req[kOut], F(data)); } virtual void Backward(const OpContext &ctx, @@ -68,7 +68,9 @@ Operator* CreateActivationOp(ActivationOpType type); #if DMLC_USE_CXX11 class ActivationProp : public OperatorProperty { public: - ActivationProp() : type_(kUnknown) {} + explicit ActivationProp() : type_(kUnknown) {} + + explicit ActivationProp(ActivationOpType type) : type_(type) {} virtual void SetParam(const char *name, const char *val) { if (!strcmp(name, "type")) { @@ -90,8 +92,7 @@ class ActivationProp : public OperatorProperty { } virtual OperatorProperty* Copy() const { - auto ptr = new ActivationProp(); - ptr->type_ = this->type_; + auto ptr = new ActivationProp(type_); return ptr; } diff --git a/src/operator/elem_plus-inl.h b/src/operator/elem_plus-inl.h new file mode 100644 index 000000000000..df2f50fbb508 --- /dev/null +++ b/src/operator/elem_plus-inl.h @@ -0,0 +1,202 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file elem_plus-inl.h + * \brief + * \author Bing Xu +*/ +#ifndef MXNET_OPERATOR_ELEM_PLUS_INL_H_ +#define MXNET_OPERATOR_ELEM_PLUS_INL_H_ +namespace mxnet { +namespace op { +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" + +enum ElemsPlusOpInputs {kData0, kData1, kData2, kData3}; +enum ElemsPlusOpOutputs {kOut}; + +template +class ElemPlusOp : public Operator { + public: + explicit ElemPlusOp(uint32_t size) : size_(size) {} + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), cnt_) << "Invalid Input TBlobs"; + CHECK_EQ(out_data.size(), 1); + Stream *s = ctx.get_stream(); + Tensor out = out_data[kOut].FlatTo2D(s); + switch (size_) { + case 1: { + Tensor in_0 = in_data[kData0].FlatTo2D(s); + Assign(out, req[kOut], in_0); + break; + } + case 2: { + Tensor in_0 = in_data[kData0].FlatTo2D(s); + Tensor in_1 = in_data[kData1].FlatTo2D(s); + Assign(out, req[kOut], in_0 + in_1); + break; + } + case 3: { + Tensor in_0 = in_data[kData0].FlatTo2D(s); + Tensor in_1 = in_data[kData1].FlatTo2D(s); + Tensor in_2 = in_data[kData2].FlatTo2D(s); + Assign(out, req[kOut], in_0 + in_1 + in_3); + break; + } + case 4: { + Tensor in_0 = in_data[kData0].FlatTo2D(s); + Tensor in_1 = in_data[kData1].FlatTo2D(s); + Tensor in_2 = in_data[kData2].FlatTo2D(s); + Tensor in_3 = in_data[kData3].FlatTo2D(s); + Assign(out, req[kOut], in_0 + in_1 + in_3 + in_4); + break; + } + default: { + LOG_FATAL; + } + } + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), size_); + CHECK_EQ(out_data.size(), size_); + switch (size_) { + case 1: { + Tensor in_0 = in_data[kData0].FlatTo2D(s); + Tensor out_0 = out_data[kData0].FlatTo2D(s); + Assign(out, req[kData0], F(in_0)); + break; + } + case 2: { + Tensor in_0 = in_data[kData0].FlatTo2D(s); + Tensor out_0 = out_data[kData0].FlatTo2D(s); + Assign(out_0, req[kData0], F(in_0)); + Tensor in_1 = in_data[kData1].FlatTo2D(s); + Tensor out_1 = out_data[kData1].FlatTo2D(s); + Assign(out_1, req[kData1], F(in_1)); + break; + } + case 3: { + Tensor in_0 = in_data[kData0].FlatTo2D(s); + Tensor out_0 = out_data[kData0].FlatTo2D(s); + Assign(out_0, req[kData0], F(in_0)); + Tensor in_1 = in_data[kData1].FlatTo2D(s); + Tensor out_1 = out_data[kData1].FlatTo2D(s); + Assign(out_1, req[kData1], F(in_1)); + Tensor in_2 = in_data[kData2].FlatTo2D(s); + Tensor out_2 = out_data[kData2].FlatTo2D(s); + Assign(out_2, req[kData2], F(in_2)); + break; + } + case 4: { + Tensor in_0 = in_data[kData0].FlatTo2D(s); + Tensor out_0 = out_data[kData0].FlatTo2D(s); + Assign(out_0, req[kData0], F(in_0)); + Tensor in_1 = in_data[kData1].FlatTo2D(s); + Tensor out_1 = out_data[kData1].FlatTo2D(s); + Assign(out_1, req[kData1], F(in_1)); + Tensor in_2 = in_data[kData2].FlatTo2D(s); + Tensor out_2 = out_data[kData2].FlatTo2D(s); + Assign(out_2, req[kData2], F(in_2)); + Tensor in_3 = in_data[kData3].FlatTo2D(s); + Tensor out_3 = out_data[kData3].FlatTo2D(s); + Assign(out_3, req[kData3], F(in_3)); + break; + } + default: { + LOG_FATAL; + } + } + } + + private: + uint32_t size_; +}; // class ElemPlusOp + +template +Operator* CreateElemPlusOp(uint32_t 
size); + +#if DMLC_USE_CXX11 +class ElemPlusProp : public OperatorProperty { + public: + explicit ElemPlusProp() : size_(0) {} + + explicit ElemPlusProp(uint32_t sz) : size_(sz) {} + + virtual void SetParam(const char *name, const char *val) { + if (!strcmp(name, "size")) size_ = static_cast(atoi(val)); + CHECK_GE(size_, 0); + } + + virtual bool InferShape(std::vector *in_shape, + std::vector *out_shape) const { + using namespace mshadow; + CHECK_GE(size_, 0); + CHECK_EQ(in_shape->size(), size_) << "Input should be: " << size_ << \ + "(Given: " << in_shape->size() << ")"; + const TShape &dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + for (auto i : size_) { + CHECK_EQ(dshape, in_shape->at(i)) << "Input at " << i << " has different shape"; + } + out_shape->clear(); + out_shape->push_back(dshape); + return true; + } + + virtual OperatorProperty* Copy() const { + auto ptr = new ElemPlusProp(size_); + return ptr; + } + + virtual std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const { + std::vector ret(size_); + for (auto i : size_) { + ret[i] = in_data[i]; + } + return ret; + } + + virtual std::vector > BackwardInplaceOption( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &in_grad) const { + std::vector > ret; + for (auto i : size_) { + ret.emplace_back(in_data[i], in_grad[i]); + } + return ret; + } + + Operator* CreateOperator(Context ctx) const; + + private: + uint32_t size_; +}; // class ElemPlusProp + +#endif // DMLC_USE_CXX11 + +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_ELEM_PLUS_INL_H_ From a2a6795ed72597eac20ec7275919ef8f9798a109 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sun, 16 Aug 2015 18:29:01 -0600 Subject: [PATCH 05/11] adapt things to parameter --- Makefile | 6 +- include/mxnet/operator.h | 12 +- include/mxnet/symbolic.h | 14 ++ src/c_api.cc | 4 +- src/operator/activation-inl.h | 53 ++++---- src/operator/activation.cc | 14 +- src/operator/activation.cu | 9 +- src/operator/elem_plus-inl.h | 202 ----------------------------- src/operator/elementwise_sum-inl.h | 172 ++++++++++++++++++++++++ src/operator/elementwise_sum.cc | 24 ++++ src/operator/elementwise_sum.cu | 14 ++ src/operator/fully_connected-inl.h | 41 ++++-- src/operator/fully_connected.cc | 6 +- src/operator/fully_connected.cu | 2 +- src/registry.cc | 10 +- 15 files changed, 321 insertions(+), 262 deletions(-) delete mode 100644 src/operator/elem_plus-inl.h create mode 100644 src/operator/elementwise_sum-inl.h create mode 100644 src/operator/elementwise_sum.cc create mode 100644 src/operator/elementwise_sum.cu diff --git a/Makefile b/Makefile index da029f77ef27..e8a0a60844a1 100644 --- a/Makefile +++ b/Makefile @@ -58,14 +58,14 @@ endif BIN = test/api_registry_test OBJ = storage.o narray_op_cpu.o # add threaded engine after it is done -OBJCXX11 = engine.o narray.o c_api.o registry.o symbol.o fully_connected_cpu.o static_graph.o activation_cpu.o +OBJCXX11 = engine.o narray.o c_api.o registry.o symbol.o fully_connected_cpu.o static_graph.o activation_cpu.o elementwise_sum_cpu.o CUOBJ = SLIB = lib/libmxnet.so ALIB = lib/libmxnet.a LIB_DEP = $(DMLC_CORE)/libdmlc.a ifeq ($(USE_CUDA), 1) - CUOBJ += narray_op_gpu.o fully_connected_gpu.o activation_gpu.o + CUOBJ += narray_op_gpu.o fully_connected_gpu.o activation_gpu.o elementwise_sum_gpu.o endif .PHONY: clean all test lint doc @@ -89,6 +89,8 @@ fully_connected_cpu.o: 
src/operator/fully_connected.cc fully_connected_gpu.o: src/operator/fully_connected.cu activation_cpu.o: src/operator/activation.cc activation_gpu.o: src/operator/activation.cu +elementwise_sum_cpu.o: src/operator/elementwise_sum.cc +elementwise_sum_gpu.o: src/operator/elementwise_sum.cu lib/libmxnet.a: $(OBJ) $(OBJCXX11) $(CUOBJ) lib/libmxnet.so: $(OBJ) $(OBJCXX11) $(CUOBJ) diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h index 65d6e3e92637..1c6bd860ab27 100644 --- a/include/mxnet/operator.h +++ b/include/mxnet/operator.h @@ -126,6 +126,12 @@ class OperatorProperty { * \brief virtual destructor */ virtual ~OperatorProperty() {} + /*! + * \brief Initialize the Operator by setting the parameters + * This function need to be called before all other functions. + * \param kwargs the keyword arguments parameters + */ + virtual void Init(const std::vector >& kwargs) = 0; /*! * \brief Get input arguments of the Operator. * \return vector of arguments. @@ -159,12 +165,6 @@ class OperatorProperty { virtual int NumVisibleReturns() const { return NumReturns(); } - /*! - * \brief Set the parameters of the Operator. - * \param name parameter name - * \param val string for the configuration - */ - virtual void SetParam(const char *name, const char *val) {} /*! * \brief infer the shapes of outputs and unknown input arguments * \param in_shape the shape of input arguments of the operator diff --git a/include/mxnet/symbolic.h b/include/mxnet/symbolic.h index e04f82b4f30f..106fd31923c7 100644 --- a/include/mxnet/symbolic.h +++ b/include/mxnet/symbolic.h @@ -121,6 +121,20 @@ class StaticGraph { */ bool InferShape(std::vector* in_shape, std::vector* out_shape) const; + /*! + * \brief Add a full backward pass in the static graph. + * This function will add gradient nodes for each heads, + * and add the backward pass to backprop the gradients all + * the way to the arguments. + * + * This will change the nodes field in the StaticGraph, but will not change other fields. + * The head and input of Backward pass will be returned by head_grad_nodes and arg_grads. + * + * \param head_grad_nodes used to store the created head gradient inputs for backward pass. + * \param arg_grads used to store the gradient nodes + */ + void MakeBackwardPass(std::vector *head_grad_nodes, + std::vector *arg_grads) const; }; /*! diff --git a/src/c_api.cc b/src/c_api.cc index 896e0b5e5532..ed5446fc816a 100644 --- a/src/c_api.cc +++ b/src/c_api.cc @@ -316,9 +316,11 @@ int MXSymbolCreateAtomicSymbol(AtomicSymbolCreator creator, API_BEGIN(); OperatorPropertyEntry *e = static_cast(creator); op = (*e)(); + std::vector > kwargs; for (int i = 0; i < num_param; ++i) { - op->SetParam(keys[i], vals[i]); + kwargs.push_back({std::string(keys[i]), std::string(vals[i])}); } + op->Init(kwargs); *s = Symbol::Create(op); *out = s; API_END_HANDLE_ERROR(delete s; delete op); diff --git a/src/operator/activation-inl.h b/src/operator/activation-inl.h index 15ded9761b27..6374d02cc53b 100644 --- a/src/operator/activation-inl.h +++ b/src/operator/activation-inl.h @@ -1,12 +1,14 @@ /*! 
* Copyright (c) 2015 by Contributors * \file activation-inl.h - * \brief + * \brief Activation operator * \author Bing Xu */ #ifndef MXNET_OPERATOR_ACTIVATION_INL_H_ #define MXNET_OPERATOR_ACTIVATION_INL_H_ + #include +#include #include #include #include @@ -20,7 +22,17 @@ namespace op { // // These enums are only visible within this header enum ActivationOpInputs {kData}; enum ActivationOpOutputs {kOut}; -enum ActivationOpType {kUnknown, kReLU, kSigmoid, kTanh}; +enum ActivationOpType {kReLU, kSigmoid, kTanh}; + +struct ActivationParam : public dmlc::Parameter { + // use int for enumeration + int type; + DMLC_DECLARE_PARAMETER(ActivationParam) { + // TODO(bing) support enum, str->int mapping + DMLC_DECLARE_FIELD(type).set_default(kReLU); + } +}; + /** * \brief This is the implementation of activation operator. * \tparam xpu The device that the op will be executed on. @@ -54,32 +66,26 @@ class ActivationOp : public Operator { CHECK(in_data.size() == 1 && in_grad.size() == 1); CHECK_EQ(req.size(), 1); Stream *s = ctx.get_stream(); - Tensor out_gradient = out_grad[kData].FlatTo2D(s); - Tensor output = out_data[kData].FlatTo2D(s); - Tensor grad = out_grad[kOut].FlatTo2D(s); - Assign(grad, req[kData], F(out_gradient * output)); + Tensor m_out_grad = out_grad[kOut].FlatTo2D(s); + Tensor m_out_data = out_data[kOut].FlatTo2D(s); + Tensor m_in_grad = in_grad[kData].FlatTo2D(s); + Assign(m_in_grad, req[kData], F(m_out_data) * m_out_grad); } }; // class ActivationOp // Decalre Factory function, used for dispatch specialization template -Operator* CreateActivationOp(ActivationOpType type); +Operator* CreateOp(ActivationParam type); #if DMLC_USE_CXX11 class ActivationProp : public OperatorProperty { public: - explicit ActivationProp() : type_(kUnknown) {} - - explicit ActivationProp(ActivationOpType type) : type_(type) {} - - virtual void SetParam(const char *name, const char *val) { - if (!strcmp(name, "type")) { - if (!strcmp(val, "relu")) type_ = kReLU; - if (!strcmp(val, "sigmoid")) type_ = kSigmoid; - if (!strcmp(val, "tanh")) type_ = kTanh; - } - CHECK(type_ >= kReLU && type_ <= kTanh) << "Invalid activation type"; + virtual void Init(const std::vector >& kwargs) { + // TODO(bing) change directly to vector of pairs begin end + std::map kmap(kwargs.begin(), kwargs.end()); + param_.Init(kmap); } + virtual bool InferShape(std::vector *in_shape, std::vector *out_shape) const { using namespace mshadow; @@ -92,7 +98,8 @@ class ActivationProp : public OperatorProperty { } virtual OperatorProperty* Copy() const { - auto ptr = new ActivationProp(type_); + auto ptr = new ActivationProp(); + ptr->param_ = param_; return ptr; } @@ -105,7 +112,7 @@ class ActivationProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const { - return {out_grad[kOut], out_data[kData]}; + return {out_grad[kOut], out_data[kOut]}; } virtual std::vector > BackwardInplaceOption( @@ -113,19 +120,19 @@ class ActivationProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const { - return {{out_grad[kData], in_grad[kData]}}; + return {{out_grad[kOut], in_grad[kData]}}; } virtual std::vector > ForwardInplaceOption( const std::vector &in_data, const std::vector &out_data) const { - return {{in_data[kData], out_data[kData]}}; + return {{in_data[kData], out_data[kOut]}}; } Operator* CreateOperator(Context ctx) const; private: - ActivationOpType type_; + ActivationParam param_; }; #endif // DMLC_USE_CXX11 } // 
namespace op diff --git a/src/operator/activation.cc b/src/operator/activation.cc index 6b822a68c8e5..275588e099af 100644 --- a/src/operator/activation.cc +++ b/src/operator/activation.cc @@ -1,7 +1,7 @@ /*! * Copyright (c) 2015 by Contributors * \file activation.cc - * \brief + * \brief activation op * \author Bing Xu */ @@ -12,20 +12,24 @@ namespace mxnet { namespace op { template<> -Operator *CreateActivationOp(ActivationOpType type) { - switch (type) { +Operator *CreateOp(ActivationParam param) { + switch (param.type) { case kReLU: return new ActivationOp(); case kSigmoid: return new ActivationOp(); case kTanh: return new ActivationOp(); - default: return NULL; + default: + LOG(FATAL) << "unknown activation type"; + return NULL; } } // DO_BIND_DISPATCH comes from operator_common.h Operator *ActivationProp::CreateOperator(Context ctx) const { - DO_BIND_DISPATCH(CreateActivationOp, type_); + DO_BIND_DISPATCH(CreateOp, param_); } +DMLC_REGISTER_PARAMETER(ActivationParam); + REGISTER_OP_PROPERTY(Activation, ActivationProp); } // namespace op } // namespace mxnet diff --git a/src/operator/activation.cu b/src/operator/activation.cu index b7c771d653f3..5b7b576e59d7 100644 --- a/src/operator/activation.cu +++ b/src/operator/activation.cu @@ -4,19 +4,20 @@ * \brief * \author Bing Xu */ - #include "./activation-inl.h" #include "./mshadow_op.h" namespace mxnet { namespace op { template<> -Operator *CreateActivationOp(ActivationOpType type) { - switch(type) { +Operator *CreateOp(ActivationParam param) { + switch(param.type) { case kReLU: return new ActivationOp(); case kSigmoid: return new ActivationOp(); case kTanh: return new ActivationOp(); - default: return NULL; + default: + LOG(FATAL) << "unknown activation"; + return NULL; } } } // op diff --git a/src/operator/elem_plus-inl.h b/src/operator/elem_plus-inl.h deleted file mode 100644 index df2f50fbb508..000000000000 --- a/src/operator/elem_plus-inl.h +++ /dev/null @@ -1,202 +0,0 @@ -/*! 
- * Copyright (c) 2015 by Contributors - * \file elem_plus-inl.h - * \brief - * \author Bing Xu -*/ -#ifndef MXNET_OPERATOR_ELEM_PLUS_INL_H_ -#define MXNET_OPERATOR_ELEM_PLUS_INL_H_ -namespace mxnet { -namespace op { -#include -#include -#include -#include -#include -#include -#include "./operator_common.h" - -enum ElemsPlusOpInputs {kData0, kData1, kData2, kData3}; -enum ElemsPlusOpOutputs {kOut}; - -template -class ElemPlusOp : public Operator { - public: - explicit ElemPlusOp(uint32_t size) : size_(size) {} - - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), cnt_) << "Invalid Input TBlobs"; - CHECK_EQ(out_data.size(), 1); - Stream *s = ctx.get_stream(); - Tensor out = out_data[kOut].FlatTo2D(s); - switch (size_) { - case 1: { - Tensor in_0 = in_data[kData0].FlatTo2D(s); - Assign(out, req[kOut], in_0); - break; - } - case 2: { - Tensor in_0 = in_data[kData0].FlatTo2D(s); - Tensor in_1 = in_data[kData1].FlatTo2D(s); - Assign(out, req[kOut], in_0 + in_1); - break; - } - case 3: { - Tensor in_0 = in_data[kData0].FlatTo2D(s); - Tensor in_1 = in_data[kData1].FlatTo2D(s); - Tensor in_2 = in_data[kData2].FlatTo2D(s); - Assign(out, req[kOut], in_0 + in_1 + in_3); - break; - } - case 4: { - Tensor in_0 = in_data[kData0].FlatTo2D(s); - Tensor in_1 = in_data[kData1].FlatTo2D(s); - Tensor in_2 = in_data[kData2].FlatTo2D(s); - Tensor in_3 = in_data[kData3].FlatTo2D(s); - Assign(out, req[kOut], in_0 + in_1 + in_3 + in_4); - break; - } - default: { - LOG_FATAL; - } - } - } - - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), size_); - CHECK_EQ(out_data.size(), size_); - switch (size_) { - case 1: { - Tensor in_0 = in_data[kData0].FlatTo2D(s); - Tensor out_0 = out_data[kData0].FlatTo2D(s); - Assign(out, req[kData0], F(in_0)); - break; - } - case 2: { - Tensor in_0 = in_data[kData0].FlatTo2D(s); - Tensor out_0 = out_data[kData0].FlatTo2D(s); - Assign(out_0, req[kData0], F(in_0)); - Tensor in_1 = in_data[kData1].FlatTo2D(s); - Tensor out_1 = out_data[kData1].FlatTo2D(s); - Assign(out_1, req[kData1], F(in_1)); - break; - } - case 3: { - Tensor in_0 = in_data[kData0].FlatTo2D(s); - Tensor out_0 = out_data[kData0].FlatTo2D(s); - Assign(out_0, req[kData0], F(in_0)); - Tensor in_1 = in_data[kData1].FlatTo2D(s); - Tensor out_1 = out_data[kData1].FlatTo2D(s); - Assign(out_1, req[kData1], F(in_1)); - Tensor in_2 = in_data[kData2].FlatTo2D(s); - Tensor out_2 = out_data[kData2].FlatTo2D(s); - Assign(out_2, req[kData2], F(in_2)); - break; - } - case 4: { - Tensor in_0 = in_data[kData0].FlatTo2D(s); - Tensor out_0 = out_data[kData0].FlatTo2D(s); - Assign(out_0, req[kData0], F(in_0)); - Tensor in_1 = in_data[kData1].FlatTo2D(s); - Tensor out_1 = out_data[kData1].FlatTo2D(s); - Assign(out_1, req[kData1], F(in_1)); - Tensor in_2 = in_data[kData2].FlatTo2D(s); - Tensor out_2 = out_data[kData2].FlatTo2D(s); - Assign(out_2, req[kData2], F(in_2)); - Tensor in_3 = in_data[kData3].FlatTo2D(s); - Tensor out_3 = out_data[kData3].FlatTo2D(s); - Assign(out_3, req[kData3], F(in_3)); - break; - } - default: { - LOG_FATAL; - } - } - } - - private: - uint32_t size_; -}; // class ElemPlusOp - -template -Operator* CreateElemPlusOp(uint32_t 
size); - -#if DMLC_USE_CXX11 -class ElemPlusProp : public OperatorProperty { - public: - explicit ElemPlusProp() : size_(0) {} - - explicit ElemPlusProp(uint32_t sz) : size_(sz) {} - - virtual void SetParam(const char *name, const char *val) { - if (!strcmp(name, "size")) size_ = static_cast(atoi(val)); - CHECK_GE(size_, 0); - } - - virtual bool InferShape(std::vector *in_shape, - std::vector *out_shape) const { - using namespace mshadow; - CHECK_GE(size_, 0); - CHECK_EQ(in_shape->size(), size_) << "Input should be: " << size_ << \ - "(Given: " << in_shape->size() << ")"; - const TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; - for (auto i : size_) { - CHECK_EQ(dshape, in_shape->at(i)) << "Input at " << i << " has different shape"; - } - out_shape->clear(); - out_shape->push_back(dshape); - return true; - } - - virtual OperatorProperty* Copy() const { - auto ptr = new ElemPlusProp(size_); - return ptr; - } - - virtual std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const { - std::vector ret(size_); - for (auto i : size_) { - ret[i] = in_data[i]; - } - return ret; - } - - virtual std::vector > BackwardInplaceOption( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &in_grad) const { - std::vector > ret; - for (auto i : size_) { - ret.emplace_back(in_data[i], in_grad[i]); - } - return ret; - } - - Operator* CreateOperator(Context ctx) const; - - private: - uint32_t size_; -}; // class ElemPlusProp - -#endif // DMLC_USE_CXX11 - -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_ELEM_PLUS_INL_H_ diff --git a/src/operator/elementwise_sum-inl.h b/src/operator/elementwise_sum-inl.h new file mode 100644 index 000000000000..f0a558b3b0cc --- /dev/null +++ b/src/operator/elementwise_sum-inl.h @@ -0,0 +1,172 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file elemementwise_sum-inl.h + * \brief elementwise sum + * \author Bing Xu +*/ +#ifndef MXNET_OPERATOR_ELEMENTWISE_SUM_INL_H_ +#define MXNET_OPERATOR_ELEMENTWISE_SUM_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" + +namespace mxnet { +namespace op { + +enum ElementWiseSumOpInputs {kData0, kData1, kData2, kData3}; +enum ElementWiseSumOpOutputs {kOut}; + +struct ElementWiseSumParam : public dmlc::Parameter { + int size; + DMLC_DECLARE_PARAMETER(ElementWiseSumParam) { + DMLC_DECLARE_FIELD(size).set_range(1, 100); + } +}; + +template +class ElementWiseSumOp : public Operator { + public: + explicit ElementWiseSumOp(ElementWiseSumParam param) + : size_(param.size) {} + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(static_cast(in_data.size()), size_); + CHECK_EQ(out_data.size(), 1); + if (req[kOut] == kNullOp) return; + + Stream *s = ctx.get_stream(); + Tensor out = out_data[kOut].FlatTo2D(s); + switch (size_) { + case 2: { + Tensor in_0 = in_data[kData0].FlatTo2D(s); + Tensor in_1 = in_data[kData1].FlatTo2D(s); + Assign(out, req[kOut], in_0 + in_1); + break; + } + case 3: { + Tensor in_0 = in_data[kData0].FlatTo2D(s); + Tensor in_1 = in_data[kData1].FlatTo2D(s); + Tensor in_2 = in_data[kData2].FlatTo2D(s); + Assign(out, req[kOut], in_0 + in_1 + in_2); + break; + } + case 4: { + Tensor in_0 = in_data[kData0].FlatTo2D(s); + Tensor in_1 = in_data[kData1].FlatTo2D(s); + Tensor in_2 = in_data[kData2].FlatTo2D(s); + Tensor in_3 = in_data[kData3].FlatTo2D(s); + Assign(out, req[kOut], in_0 + in_1 + in_2 + in_3); + break; + } + default: { + Tensor in_0 = in_data[kData0].FlatTo2D(s); + Assign(out, req[kOut], in_0); + for (int i = 0; i < size_; ++i) { + out += in_data[i].FlatTo2D(s); + } + } + } + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), static_cast(size_)); + Stream *s = ctx.get_stream(); + Tensor ograd = out_grad[kOut].FlatTo2D(s); + + for (int i = 0; i < size_; ++i) { + if (req[i] == kNullOp || req[i] == kWriteInplace) continue; + Tensor igrad = in_grad[i].FlatTo2D(s); + Assign(igrad, req[i], ograd); + } + } + + private: + int size_; +}; // class ElementWiseSumOp + +template +Operator* CreateOp(ElementWiseSumParam param); + +#if DMLC_USE_CXX11 +class ElementWiseSumProp : public OperatorProperty { + public: + virtual void Init(const std::vector >& kwargs) { + // TODO(bing) change directly to vector of pairs begin end + std::map kmap(kwargs.begin(), kwargs.end()); + param_.Init(kmap); + } + + virtual bool InferShape(std::vector *in_shape, + std::vector *out_shape) const { + using namespace mshadow; + CHECK_EQ(in_shape->size(), static_cast(param_.size)); + const TShape &dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + for (int i = 1; i < param_.size; ++i) { + SHAPE_ASSIGN_CHECK(*in_shape, i, dshape); + } + out_shape->clear(); + out_shape->push_back(dshape); + return true; + } + + virtual OperatorProperty* Copy() const { + auto ptr = new ElementWiseSumProp(); + ptr->param_ = param_; + return ptr; + } + + virtual std::string TypeString() const { + return "ElementWiseSum"; + } + + 
virtual std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const { + return out_grad; + } + + virtual std::vector > BackwardInplaceOption( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &in_grad) const { + return {{out_grad[0], in_grad[0]}}; + } + + virtual std::vector > ForwardInplaceOption( + const std::vector &in_data, + const std::vector &out_data) const { + return {{in_data[0], out_data[0]}}; + } + + Operator* CreateOperator(Context ctx) const; + + private: + ElementWiseSumParam param_; +}; // class ElementWiseSumProp + +#endif // DMLC_USE_CXX11 + +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_ELEMENTWISE_SUM_INL_H_ diff --git a/src/operator/elementwise_sum.cc b/src/operator/elementwise_sum.cc new file mode 100644 index 000000000000..38e29141c7b3 --- /dev/null +++ b/src/operator/elementwise_sum.cc @@ -0,0 +1,24 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file elementwise_sum.cc + * \brief elementwise sum operator +*/ +#include +#include "./elementwise_sum-inl.h" +namespace mxnet { +namespace op { +template<> +Operator* CreateOp(ElementWiseSumParam param) { + return new ElementWiseSumOp(param); +} + +// DO_BIND_DISPATCH comes from static_operator_common.h +Operator* ElementWiseSumProp::CreateOperator(Context ctx) const { + DO_BIND_DISPATCH(CreateOp, param_); +} + +DMLC_REGISTER_PARAMETER(ElementWiseSumParam); + +REGISTER_OP_PROPERTY(ElementWiseSum, ElementWiseSumProp); +} // namespace op +} // namespace mxnet diff --git a/src/operator/elementwise_sum.cu b/src/operator/elementwise_sum.cu new file mode 100644 index 000000000000..7a9b443dad82 --- /dev/null +++ b/src/operator/elementwise_sum.cu @@ -0,0 +1,14 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file elementwise_sum.cu + * \brief elementwise sum operator +*/ +#include "./elementwise_sum-inl.h" +namespace mxnet { +namespace op { +template<> +Operator* CreateOp(ElementWiseSumParam param) { + return new ElementWiseSumOp(param); +} +} // namespace op +} // namespace mxnet diff --git a/src/operator/fully_connected-inl.h b/src/operator/fully_connected-inl.h index e2913e65aba8..9dbb9bda8649 100644 --- a/src/operator/fully_connected-inl.h +++ b/src/operator/fully_connected-inl.h @@ -7,6 +7,7 @@ #define MXNET_OPERATOR_FULLY_CONNECTED_INL_H_ #include +#include #include #include #include @@ -22,6 +23,17 @@ namespace op { enum FullyConnectedOpInputs {kData, kWeight, kBias}; enum FullyConnectedOpOutputs {kOut}; +struct FullyConnectedParam : public dmlc::Parameter { + int num_hidden; + bool no_bias; + DMLC_DECLARE_PARAMETER(FullyConnectedParam) { + // TODO(bing) change to only set lower bound + // add support for boolean + DMLC_DECLARE_FIELD(num_hidden).set_range(1, 100000); + DMLC_DECLARE_FIELD(no_bias).set_default(false); + } +}; + /** * \brief This is the implementation of fully connected operator. * \tparam xpu The device that the op will be executed on. @@ -29,7 +41,7 @@ enum FullyConnectedOpOutputs {kOut}; template class FullyConnectedOp : public Operator { public: - explicit FullyConnectedOp(Param p) { + explicit FullyConnectedOp(FullyConnectedParam p) { this->param_ = p; } @@ -40,7 +52,7 @@ class FullyConnectedOp : public Operator { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(req[kOut], kWriteTo); - size_t expected = param_.no_bias == 0 ? 3 : 2; + size_t expected = param_.no_bias ? 
2 : 3; CHECK_EQ(in_data.size(), expected); CHECK_EQ(out_data.size(), 1); // TODO(bing): check the BLAS Handle, be careful @@ -50,7 +62,7 @@ class FullyConnectedOp : public Operator { Tensor wmat = in_data[kWeight].get(s); Tensor out = out_data[kOut].FlatTo2D(s); out = dot(data, wmat.T()); - if (param_.no_bias == 0) { + if (!param_.no_bias) { Tensor bias = in_data[kBias].get(s); out += repmat(bias, data.size(0)); } @@ -65,7 +77,7 @@ class FullyConnectedOp : public Operator { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(out_grad.size(), 1); - size_t expected = param_.no_bias == 0 ? 3 : 2; + size_t expected = param_.no_bias ? 2 : 3; CHECK(in_data.size() == expected && in_grad.size() == expected); CHECK_EQ(req.size(), expected); // TODO(bing): check the BLAS Handle, be careful @@ -80,7 +92,7 @@ class FullyConnectedOp : public Operator { Tensor gwmat = in_grad[kWeight].get(s); Assign(gwmat, req[kWeight], dot(grad.T(), data)); // gradient of bias - if (param_.no_bias == 0) { + if (!param_.no_bias) { Tensor gbias = in_grad[kBias].get(s); Assign(gbias, req[kBias], sum_rows(grad)); } @@ -90,33 +102,34 @@ class FullyConnectedOp : public Operator { } private: - /** The param of the fully connected layer.*/ - Param param_; + FullyConnectedParam param_; }; // class FullyConnectedOp // Decalre Factory function, used for dispatch specialization template -Operator* CreateFullyConnectedOp(Param param); +Operator* CreateOp(FullyConnectedParam param); #if DMLC_USE_CXX11 class FullyConnectedProp : public OperatorProperty { public: virtual std::vector ListArguments() const { - if (param_.no_bias == 0) { + if (!param_.no_bias) { return {"data", "weight", "bias"}; } else { return {"data", "weight"}; } } - virtual void SetParam(const char *name, const char *val) { - param_.SetParam(name, val); + virtual void Init(const std::vector >& kwargs) { + // TODO(bing) change directly to vector of pairs begin end + std::map kmap(kwargs.begin(), kwargs.end()); + param_.Init(kmap); } virtual bool InferShape(std::vector *in_shape, std::vector *out_shape) const { using namespace mshadow; - if (param_.no_bias == 0) { + if (!param_.no_bias) { CHECK_EQ(in_shape->size(), 3) << "Input:[data, weight, bias]"; } else { CHECK_EQ(in_shape->size(), 2) << "Input:[data, weight]"; @@ -137,7 +150,7 @@ class FullyConnectedProp : public OperatorProperty { num_input = dshape[1]; } SHAPE_ASSIGN_CHECK(*in_shape, kWeight, Shape2(param_.num_hidden, num_input)); - if (param_.no_bias == 0) { + if (!param_.no_bias) { SHAPE_ASSIGN_CHECK(*in_shape, kBias, Shape1(param_.num_hidden)); } out_shape->clear(); @@ -173,7 +186,7 @@ class FullyConnectedProp : public OperatorProperty { Operator* CreateOperator(Context ctx) const; private: - Param param_; + FullyConnectedParam param_; }; // class FullyConnectedSymbol #endif } // namespace op diff --git a/src/operator/fully_connected.cc b/src/operator/fully_connected.cc index 362d3c5698aa..7d529cb3ed64 100644 --- a/src/operator/fully_connected.cc +++ b/src/operator/fully_connected.cc @@ -8,15 +8,17 @@ namespace mxnet { namespace op { template<> -Operator* CreateFullyConnectedOp(Param param) { +Operator* CreateOp(FullyConnectedParam param) { return new FullyConnectedOp(param); } // DO_BIND_DISPATCH comes from static_operator_common.h Operator* FullyConnectedProp::CreateOperator(Context ctx) const { - DO_BIND_DISPATCH(CreateFullyConnectedOp, param_); + DO_BIND_DISPATCH(CreateOp, param_); } +DMLC_REGISTER_PARAMETER(FullyConnectedParam); + REGISTER_OP_PROPERTY(FullyConnected, FullyConnectedProp); } 
// namespace op } // namespace mxnet diff --git a/src/operator/fully_connected.cu b/src/operator/fully_connected.cu index 223ef5166cc9..b97df8afb44c 100644 --- a/src/operator/fully_connected.cu +++ b/src/operator/fully_connected.cu @@ -7,7 +7,7 @@ namespace mxnet { namespace op { template<> -Operator* CreateFullyConnectedOp(Param param) { +Operator* CreateOp(FullyConnectedParam param) { return new FullyConnectedOp(param); } } // namespace op diff --git a/src/registry.cc b/src/registry.cc index 42fef1df3423..f64980d8bacc 100644 --- a/src/registry.cc +++ b/src/registry.cc @@ -25,12 +25,18 @@ Registry *Registry::Get() { return &instance; } -#if DMLC_USE_CXX11 + template NArrayFunctionEntry &Registry::Register(const std::string& name); template Registry *Registry::Get(); -#endif template OperatorPropertyEntry &Registry::Register(const std::string& name); template Registry *Registry::Get(); +// implementation of all factory functions +OperatorProperty *OperatorProperty::Create(const char* type_name) { + auto *creator = Registry::Find(type_name); + CHECK_NE(creator, nullptr) + << "Cannot find Operator " << type_name << " in registry"; + return (*creator)(); +} } // namespace mxnet From c78775a753736e4de7efd6dbc5dc4cecf84780e2 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sun, 16 Aug 2015 23:17:53 -0600 Subject: [PATCH 06/11] implement backward --- include/mxnet/symbolic.h | 32 +++++++++++- src/symbol/static_graph.cc | 102 ++++++++++++++++++++++++++++++++++++- 2 files changed, 131 insertions(+), 3 deletions(-) diff --git a/include/mxnet/symbolic.h b/include/mxnet/symbolic.h index 106fd31923c7..e24c03a0cd0b 100644 --- a/include/mxnet/symbolic.h +++ b/include/mxnet/symbolic.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include "./base.h" @@ -37,6 +38,32 @@ class StaticGraph { uint32_t source_id; /*! \brief index of output from the source. */ uint32_t index; + /*! \brief default constructor */ + DataEntry() {} + /*! + * \brief constructor with source and index + * \param source_id source id + * \param index node index + */ + DataEntry(uint32_t source_id, uint32_t index) + : source_id(source_id), index(index) {} + /*! + * \brief compare equality + * \param other the other entry to compare + * \return whether two entries equals to each other + */ + inline bool operator==(const DataEntry &other) const { + return source_id == other.source_id && index == other.index; + } + /*! + * \brief comparator, allows to use map + * \param other the other entry to compare + * \return whether two entries is smaller than the other + */ + inline bool operator<(const DataEntry &other) const { + if (source_id == other.source_id) return index < other.index; + return source_id < other.source_id; + } }; /*! * \brief Operation Node in static graphs. @@ -131,10 +158,11 @@ class StaticGraph { * The head and input of Backward pass will be returned by head_grad_nodes and arg_grads. * * \param head_grad_nodes used to store the created head gradient inputs for backward pass. - * \param arg_grads used to store the gradient nodes +<<<<<<< HEAD + * \param arg_grads used to store gradients to args, can be multiple one if an argument is used by operator */ void MakeBackwardPass(std::vector *head_grad_nodes, - std::vector *arg_grads) const; + std::vector > *arg_grads); }; /*! 
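As a rough orientation for how the MakeBackwardPass entry point declared above is meant to be consumed, the sketch below calls it on an already-built graph and inspects the per-argument gradient lists. Everything except the MakeBackwardPass call itself is illustrative: the function name SketchBackward, the graph pointer g, and the logging are placeholders, and the element types (uint32_t node ids, lists of StaticGraph::DataEntry) are inferred from the implementation in static_graph.cc that follows, so treat this as a sketch rather than code from the patch series.

#include <vector>
#include <dmlc/logging.h>
#include <mxnet/symbolic.h>

// Illustrative only: `g` is assumed to already hold the forward nodes.
void SketchBackward(mxnet::StaticGraph *g) {
  std::vector<uint32_t> head_grad_nodes;  // ids of the created head-gradient variable nodes
  std::vector<std::vector<mxnet::StaticGraph::DataEntry> > arg_grads;  // gradient entries per argument
  g->MakeBackwardPass(&head_grad_nodes, &arg_grads);
  for (size_t i = 0; i < arg_grads.size(); ++i) {
    // An argument that feeds more than one operator may get several entries here.
    LOG(INFO) << "argument " << i << " has "
              << arg_grads[i].size() << " gradient source(s)";
  }
}

Inside MakeBackwardPass itself, when one output feeds several consumers, the implementation that follows aggregates the incoming gradients by inserting an ElementWiseSum node before building the backward node.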
diff --git a/src/symbol/static_graph.cc b/src/symbol/static_graph.cc index 62de7963638a..3bec3427fbb3 100644 --- a/src/symbol/static_graph.cc +++ b/src/symbol/static_graph.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include "../operator/operator_common.h" namespace mxnet { @@ -76,7 +77,7 @@ bool StaticGraph::InferNodeShapes(const std::vector &topo_order, const DataEntry& e = node.inputs[i]; (*node_out_shapes)[e.source_id][e.index] = in_shape[i]; } - } else if (node.is_backward()) { + } else if (nodes[nid].is_backward()) { // simply use shapes from forward pass to assign backward shape const Node& forward = nodes[node.backward_source_id]; CHECK(forward.is_forward()); @@ -150,4 +151,103 @@ bool StaticGraph::InferShape(std::vector *in_shape, } return true; } + +void StaticGraph::MakeBackwardPass(std::vector *head_grad_nodes, + std::vector > *arg_grads) { + arg_grads->clear(); + head_grad_nodes->clear(); + // get topo order of nodes, before new nodes are added + std::vector topo_order = TopoSort(); + // map out_data entry to out_grad + std::map > grad_map; + // allocate head gradient nodes + for (DataEntry head : heads) { + uint32_t nid = static_cast(nodes.size()); + // create a variable node for gradient input + nodes.push_back(Node()); + Node &node = nodes[nid]; + std::ostringstream os; + os << nodes[head.source_id].name << '_' << head.index << "_grad"; + // TODO(bing): add index to name + node.name = os.str(); + DataEntry igrad(nid, 0); + head_grad_nodes->push_back(nid); + // update gradient map + auto it = grad_map.find(head); + if (it == grad_map.end()) { + grad_map[head] = {igrad}; + } else { + it->second.push_back(igrad); + } + } + // do backward pass traverse + for (auto it = topo_order.rbegin(); it != topo_order.rend(); ++it) { + uint32_t nid = *it; + // skip variables + if (nodes[nid].is_variable()) continue; + CHECK(nodes[nid].is_forward()) << "Do not support Backward of Backward"; + // get out_grad and out_data entry + std::vector out_grad, out_data; + // nvisible is out_grad.size() + int nvisible = nodes[nid].op->NumVisibleReturns(); + // ntotal is out_data.size() + int ntotal = nodes[nid].op->NumReturns(); + // check all outpus + for (int i = 0; i < ntotal; ++i) { + DataEntry odata(nid, static_cast(i)); + out_data.push_back(odata); + if (i >= nvisible) continue; + // get out_grad + auto it = grad_map.find(odata); + CHECK(it != grad_map.end()) << "bad graph"; + std::vector &gnodes = it->second; + if (gnodes.size() == 1) { + out_grad.push_back(gnodes[0]); + } else { + // find multiple gradients, need aggregate + std::ostringstream os_size, os_name; + uint32_t agg_node_id = static_cast(nodes.size()); + nodes.push_back(Node()); + Node &agg_node = nodes[agg_node_id]; + agg_node.op.reset(OperatorProperty::Create("ElementWiseSum")); + os_size << gnodes.size(); + agg_node.op->Init({{"size", os_size.str()}}); + os_name << nodes[nid].name << '_' << i << "_out_grad_agg"; + agg_node.name = os_name.str(); + agg_node.inputs = gnodes; + out_grad.push_back(DataEntry(agg_node_id, 0)); + } + } + // Create a gradient backward node + nodes.push_back(Node()); + uint32_t grad_node_id = static_cast(nodes.size()); + Node &grad_node = nodes[grad_node_id]; + // Point to the corresponding source + grad_node.backward_source_id = nid; + // select out the dependent inputs + grad_node.inputs = nodes[nid].op->BackwardInputs( + out_grad, nodes[nid].inputs, out_data); + grad_node.name = nodes[nid].name + "_backward"; + + // update gradient map + for (size_t i = 0; i < nodes[nid].inputs.size(); ++i) { 
+ DataEntry idata = nodes[nid].inputs[i]; + DataEntry igrad(grad_node_id, static_cast(i)); + auto it = grad_map.find(idata); + if (it == grad_map.end()) { + grad_map[idata] = {igrad}; + } else { + it->second.push_back(igrad); + } + } + } + // create return values of arg_grads + arg_grads->resize(arg_nodes.size()); + for (size_t i = 0; i < arg_nodes.size(); ++i) { + DataEntry odata(arg_nodes[i], 0); + auto it = grad_map.find(odata); + CHECK(it != grad_map.end()) << "bad graph"; + arg_grads->at(i) = it->second; + } +} } // namespace mxnet From d9748f11d5a0edc57a3fd22fb7e441c3a69f00c6 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Mon, 17 Aug 2015 00:33:40 -0600 Subject: [PATCH 07/11] add enum to activation --- Makefile | 2 +- include/mxnet/c_api.h | 67 ++++++++++++- include/mxnet/symbolic.h | 2 +- python/mxnet/base.py | 2 +- python/mxnet/symbol.py | 132 +++++++++++++++++++++---- python/test_mnist.py | 131 +++++++++++++++++++++++++ src/c_api.cc | 67 +++++++++++++ src/operator/activation-inl.h | 5 +- src/operator/elementwise_sum-inl.h | 1 + src/operator/fully_connected-inl.h | 1 + windows/mxnet.sln | 28 ------ windows/mxnet.vcxproj | 148 ----------------------------- windows/mxnet.vcxproj.filters | 48 ---------- windows/mxnet.vcxproj.user | 4 - 14 files changed, 386 insertions(+), 252 deletions(-) create mode 100644 python/test_mnist.py delete mode 100755 windows/mxnet.sln delete mode 100755 windows/mxnet.vcxproj delete mode 100755 windows/mxnet.vcxproj.filters delete mode 100755 windows/mxnet.vcxproj.user diff --git a/Makefile b/Makefile index d13688e6f0dc..b763a406da23 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,7 @@ endif BIN = test/api_registry_test test/test_storage OBJ = narray_op_cpu.o # add threaded engine after it is done -OBJCXX11 = engine.o narray.o c_api.o registry.o symbol.o fully_connected_cpu.o static_graph.o activation_cpu.o elementwise_sum_cpu.o +OBJCXX11 = engine.o narray.o c_api.o registry.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o elementwise_sum_cpu.o CUOBJ = SLIB = lib/libmxnet.so ALIB = lib/libmxnet.a diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index fe035b21bc7f..cd0b6b2206c1 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -38,8 +38,9 @@ typedef void *AtomicSymbolHandle; typedef void *OperatorHandle; /*! \brief handle to a DataIterator */ typedef void *DataIterHandle; - -/*! +/*! \brief handle to an Executor */ +typedef void *ExecutorHandle; +/* * \brief return str message of the last error * all function in this file will return 0 when success * and -1 when an error occured, @@ -325,6 +326,7 @@ MXNET_DLL int MXSymbolCompose(SymbolHandle sym, * The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data * The call will be treated as a kwargs call if key != nullptr or num_args==0, otherwise it is positional. * + * \param sym symbol handle * \param num_args numbe of input arguments. * \param keys the key of keyword args (optional) * \param arg_ind_ptr the head pointer of the rows in CSR @@ -458,4 +460,65 @@ MXNET_DLL int MXIOGetData(DataIterHandle handle, MXNET_DLL int MXIOGetLabel(DataIterHandle handle, NArrayHandle *out); +//-------------------------------------------- +// Part 56: Executor +//-------------------------------------------- +/*! 
+ * \brief Executor forward method + * + * \param handle executor handle + * \param len length of narray handles + * \param input input NArray handles + * + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXExecutorForward(ExecutorHandle handle, + mx_uint len, + NArrayHandle *input); + +/** + * \brief Excecutor run backward + * + * \param handle execute handle + * \param len lenth + * \param head_grads NArray handle for heads' gradient + * + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXExecutorBackward(ExecutorHandle handle, + mx_uint len, + NArrayHandle *head_grads); + +/** + * \brief Get executor's head NArray + * + * \param handle executor handle + * \param out_size output narray vector size + * \param out out put narray handles + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXExecutorHeads(ExecutorHandle handle, + mx_uint *out_size, + NArrayHandle **out); + +/** + * \brief Generate Executor from symbol + * + * \param handle executor hanlde (to be generated) + * \param symbol_handle symbol handle + * \param len length + * \param in_args in args array + * \param arg_grad_store arg grads handle array + * \param grad_req_type grad req array + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXExecutorBind(ExecutorHandle handle, + SymbolHandle symbol_handle, + int dev_mask, + int dev_id, + mx_uint len, + NArrayHandle *in_args, + NArrayHandle *arg_grad_store, + mx_uint *grad_req_type); + #endif // MXNET_C_API_H_ diff --git a/include/mxnet/symbolic.h b/include/mxnet/symbolic.h index e24c03a0cd0b..bee0e921117a 100644 --- a/include/mxnet/symbolic.h +++ b/include/mxnet/symbolic.h @@ -365,7 +365,7 @@ class Executor { * \brief Perform a Forward operation of Operator * After this operation, user can get the result by using function head. */ - virtual void Forward() = 0; + virtual void Forward(const std::vector &inputs) = 0; /*! * \brief Perform a Backward operation of the Operator. * This must be called after Forward. 
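To make the intended call order of the executor C API additions above concrete, here is a minimal hypothetical driver. The symbol handle, NArray handle arrays, counts, and the dev_mask/dev_id values are placeholders assumed to have been prepared through the existing symbol/NArray C API, error codes are ignored, and since MXExecutorBind as declared in this commit takes the handle by value rather than through an out parameter, the sketch only illustrates the sequence of calls, not a finished workflow.

#include <mxnet/c_api.h>

// Illustrative only: every argument is assumed to be supplied by the caller.
void SketchExecutorRoundTrip(SymbolHandle sym,
                             mx_uint num_args, NArrayHandle *in_args,
                             NArrayHandle *arg_grads, mx_uint *grad_req,
                             mx_uint num_inputs, NArrayHandle *inputs,
                             mx_uint num_heads, NArrayHandle *head_grads) {
  ExecutorHandle exec = NULL;
  // dev_mask = 1, dev_id = 0 are taken here to mean "CPU 0" (an assumption).
  MXExecutorBind(exec, sym, 1, 0, num_args, in_args, arg_grads, grad_req);
  MXExecutorForward(exec, num_inputs, inputs);      // run the forward pass
  MXExecutorBackward(exec, num_heads, head_grads);  // backprop the head gradients
  mx_uint num_out = 0;
  NArrayHandle *outs = NULL;
  MXExecutorHeads(exec, &num_out, &outs);           // fetch the head output handles
}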
diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 8cb698aa8219..e30c77d382a3 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -69,7 +69,7 @@ def _load_lib(): FunctionHandle = ctypes.c_void_p SymbolCreatorHandle = ctypes.c_void_p SymbolHandle = ctypes.c_void_p - +ExecutorHandle = ctypes.c_void_p #---------------------------- # helper function definition #---------------------------- diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py index 0caa4b6a0a90..c491eacb1ac4 100644 --- a/python/mxnet/symbol.py +++ b/python/mxnet/symbol.py @@ -1,13 +1,14 @@ # coding: utf-8 -# pylint: disable=invalid-name, protected-access +# pylint: disable=invalid-name, protected-access, too-many-locals """Symbol support of mxnet""" from __future__ import absolute_import import ctypes from .base import _LIB -from .base import c_array, c_str, mx_uint +from .base import c_array, c_str, mx_uint, NArrayHandle, ExecutorHandle from .base import SymbolHandle from .base import check_call +from .narray import NArray class Symbol(object): """SymbolCreator is a function that takes Param and return symbol""" @@ -162,7 +163,8 @@ def infer_shape(self, *args, **kwargs): The order is in the same order as list_returns() """ if len(args) != 0 and len(kwargs) != 0: - raise ValueError('Can only specify known argument shapes either by positional or kwargs way.') + raise ValueError('Can only specify known argument \ + shapes either by positional or kwargs way.') sdata = [] indptr = [0] if len(args) != 0: @@ -188,21 +190,23 @@ def infer_shape(self, *args, **kwargs): out_shape_ndim = ctypes.POINTER(mx_uint)() out_shape_data = ctypes.POINTER(ctypes.POINTER(mx_uint))() complete = ctypes.c_int() - check_call(_LIB.MXSymbolInferShape( - self.handle, len(indptr) - 1, - c_array(ctypes.c_char_p, keys), - c_array(mx_uint, indptr), - c_array(mx_uint, sdata), - ctypes.byref(arg_shape_size), - ctypes.byref(arg_shape_ndim), - ctypes.byref(arg_shape_data), - ctypes.byref(out_shape_size), - ctypes.byref(out_shape_ndim), - ctypes.byref(out_shape_data), + check_call(_LIB.MXSymbolInferShape( \ + self.handle, len(indptr) - 1, \ + c_array(ctypes.c_char_p, keys), \ + c_array(mx_uint, indptr), \ + c_array(mx_uint, sdata), \ + ctypes.byref(arg_shape_size), \ + ctypes.byref(arg_shape_ndim), \ + ctypes.byref(arg_shape_data), \ + ctypes.byref(out_shape_size), \ + ctypes.byref(out_shape_ndim), \ + ctypes.byref(out_shape_data), \ ctypes.byref(complete))) if complete.value != 0: - arg_shapes = [tuple(arg_shape_data[i][:arg_shape_ndim[i]]) for i in range(arg_shape_size.value)] - out_shapes = [tuple(out_shape_data[i][:out_shape_ndim[i]]) for i in range(out_shape_size.value)] + arg_shapes = [tuple(arg_shape_data[i][:arg_shape_ndim[i]]) \ + for i in range(arg_shape_size.value)] + out_shapes = [tuple(out_shape_data[i][:out_shape_ndim[i]]) \ + for i in range(out_shape_size.value)] return (arg_shapes, out_shapes) else: return (None, None) @@ -216,6 +220,100 @@ def debug_str(self): Debug string of the symbol. 
""" debug_str = ctypes.c_char_p() - check_call(_LIB.MXSymbolPrint( + check_call(_LIB.MXSymbolPrint( \ self.handle, ctypes.byref(debug_str))) return debug_str.value + +class Executor(object): + """handle of executor""" + handle = None + def __init__(self, handle): + """Init an executor from handle + + Parameters + ---------- + handle: ExecutorHandle + ExecutorHandle generated by calling Bind + """ + if not isinstance(ExecutorHandle): + raise TypeError("Handle type error") + self.handle = handle + + def forward(self, inputs): + """do forward on inputs data + + Parameters + ---------- + inputs: Array of NArray + inputs narray to executor + """ + if self.handle == None: + raise Exception("Bind symbol before use executor") + for obj in inputs: + if not isinstance(obj, NArray): + raise TypeError("inputs must be NArray") + narray = c_array([item.handle for item in inputs]) + check_call(_LIB.MXExecutorForward (self.hanlde, mx_uint(len(inputs), narray)) + + def backward(self, grads): + """do backward on heads' grads + + Parameters + ---------- + grads: Array of NArray + heads' gradient + """ + if self.handle == None: + raise Exception("Bind symbol before use executor") + for obj in grads: + if not isinstance(obj, NArray): + raise TypeError("inputs must be NArray") + narray = c_array(NArrayHandle, [item.handle for item in grads]) + check_call(_LIB.MXExecutorForward (self.hanlde, mx_uint(len(grads), narray)) + + def heads(self): + """list all heads' output narray + + Returns + ------- + a list of narray binded to the heads of executor + """ + if self.handle == None: + raise Exception("Bind symbol before use executor") + out_size = mx_uint() + handles = ctypes.POINTER(ctypes.POINTER(NArrayHandle))() + check_call(_LIB.MXExecutorHeads(self.handle, ctypes.byref(out_szie), narrays)) + return [NArray(handle[i]) for i in xrange(out_size)] + + +def Bind(sym, ctx, args, args_grad, reqs): + """Bind a symbol to get an executor + + Parameters + ---------- + sym: Symbol + symbol to be binded + ctx: Context + context executor to run on + args: Array of NArray + input args to the symbol + args_grad: Array of NArray + input args' gradient + reqs: Array of enum + graident requirements + """ + """gradient requirements enum""" + enum = {"null" : 0, "write_to" : 1, "in_place":2, "add_to" : 3} + + if not isinstance(sym, Symbol): + raise TypeError("Symbol type error") + if not isinstance(ctx, Context): + raise TypeError("Context type error") + args_handle = c_array(NArrayHandle, [item.handle for item in args]) + args_grad_handle = c_array(NArrayHandle, [item.handle for item in args_grad]) + reqs_array = c_array(mx_uint, mx_uint(enum[item]) for item in req) + handle = ExecutorHandle() + check_call(_LIB.MXExecutorBind(handle, sym.handle, \ + mx_uint(ctx.device_mask), mx_uint(ctx.device_id), \ + args_handle, args_grad_handle, reqs_array) + return Executor(handle); \ No newline at end of file diff --git a/python/test_mnist.py b/python/test_mnist.py new file mode 100644 index 000000000000..71d79dd607e6 --- /dev/null +++ b/python/test_mnist.py @@ -0,0 +1,131 @@ +# pylint: skip-file +import mxnet as mx +import numpy as np +import os, cPickle, gzip + +def Softmax(x): + maxes = np.max(x, axis=1) + x -= maxes.reshape(maxes.shape[0], 1) + e = np.exp(x) + return e / np.sum(e, axis=1) + +def CalAcc(out, label): + pred = np.argmax(out, axis=1) + return np.sum(pred == label) * 1.0 / out.shape[0] + +def SetGradient(out_grad, label): + assert(out_grad.shape[0] == label.shape[0]) + for i in xrange(label.shape[0]): + k = label[i] + 
out_grad[i][k] -= 1.0 + +# load data +class MNISTIter(object): + def __init__(self, which_set, batch_size=100): + if not os.path.exists('mnist.pkl.gz'): + os.system("wget http://deeplearning.net/data/mnist/mnist.pkl.gz") + f = gzip.open('mnist.pkl.gz', 'rb') + train_set, valid_set, test_set = cPickle.load(f) + f.close() + if which_set == 'train': + self.data = train_set[0] + self.label = np.asarray(train_set[1]) + elif which_set == 'valid': + self.data = valid_set[0] + self.label = np.asarray(valid_set[1]) + else: + self.data = test_set[0] + self.data = np.asarray(test_set[1]) + self.batch_size = batch_size + self.nbatch = self.data.shape[0] / batch_size + assert(self.data.shape[0] % batch_size == 0) # I am lazy + self.now_idx = -1 + def BeforeFirst(self): + self.now_idx = -1 + def Next(self): + self.now_idx += 1 + if self.now_idx == self.nbatch: + return False + return True + def Get(self): + if self.now_idx < 0: + raise Exception("Iterator is at head") + elif self.now_idx >= self.nbatch: + raise Exception("Iterator is at end") + start = self.now_idx * self.batch_size + end = (self.now_idx + 1) * self.batch_size + return (self.data[start:end, :], self.label[start:end]) + + + +# symbol net +batch_size = 100 +data = mx.sym.Variable('data') +fc1 = mx.sym.FullyConnected(data=data, name='fc1', num_hidden=160) +act1 = mx.sym.Activation(data = fc1, name='relu1', type="relu") +fc2 = mx.sym.FullyConnected(data=act1, name='fc2', num_hidden=10) +args_list = fc2.list_arguments() + +# infer shape +data_shape = (batch_size, 784) +arg_shapes, out_shapes = fc2.infer_shape(data=data_shape) +arg_narrays = [mx.narray.create(shape) for shape in arg_shapes] +grad_narrays = [mx.narray.create(shape) for shape in arg_shapes] +mom_narrays = [mx.narray.create(shape) for shape in arg_shapes] +out_narray = mx.narray.create(out_shapes[0]) +inputs = dict(zip(args_list, arg_narrays)) + +# set random weight +for name, narray in inputs.items(): + if "weight" in name: + narray.numpy[:, :] = np.random.uniform(-0.01, 0.01, narray.numpy.shape) + + +# bind executer +# exec = bind(fc2, args_narray, grad_narray, req) +# update + +epoch = 10 +momentum = 0.9 +lr = 0.01 +wd = 0.0004 + +def Update(mom, grad, weight): + if len(mom.numpy.shape) == 1: + mom.numpy[:] = mom.numpy * momentum - lr * (grad.numpy + wd * weight.numpy) + else: + mom.numpy[:, :] = mom.numpy * momentum - lr * (grad.numpy + wd * weight.numpy) + weight += mom + +block = zip(mom_narrays, grad_narrays, arg_narrays) + + +train = MNISTIter("train") +valid = MNISTIter("valid") +for i in xrange(epoch): + # train + print "Epoch %d" % i + train_acc = 0.0 + val_acc = 0.0 + while train.Next(): + data, label = train.Get() + inputs["data"].numpy[:,:] = data + # exec.Forward(args_narray) + train_acc += CalAcc(out_narray.numpy, label) + SetGradient(out_narray.numpy, label) + # exec.Backward(out_narray) + for mom, grad, weight in block: + Update(mom, grad, weight) + # evaluate + while valid.Next(): + data, label = valid.Get() + inputs["data"].numpy[:,:] = data + # exec.Forward([ inputs["data"] ]) + val_acc += CalAcc(out_narray.numpy, label) + print "Train Acc: ", train_acc / train.nbatch + print "Valid Acc: ", val_acc / valid.nbatch + train.BeforeFirst() + valid.BeforeFirst() + + + diff --git a/src/c_api.cc b/src/c_api.cc index ed5446fc816a..2e97b916af9b 100644 --- a/src/c_api.cc +++ b/src/c_api.cc @@ -480,3 +480,70 @@ int MXSymbolInferShape(SymbolHandle sym, } API_END(); } + +MXNET_DLL int MXExecutorForward(ExecutorHandle handle, + mx_uint len, + NArrayHandle *args) { + 
API_BEGIN(); + Executor *exec = static_cast(handle); + NArray **args_ptr = reinterpret_cast(args); + std::vector narrays; + for (mx_uint i = 0; i < len; ++i) { + narrays.emplace_back(*(args_ptr[i])); + } + exec->Forward(narrays); + API_END(); +} + + +MXNET_DLL int MXExecutorBackward(ExecutorHandle handle, + mx_uint len, + NArrayHandle *head_grads) { + API_BEGIN(); + Executor *exec = static_cast(handle); + std::vector narrays; + NArray **args_ptr = reinterpret_cast(head_grads); + for (mx_uint i = 0; i < len; ++i) { + narrays.push_back(*(args_ptr[i])); + } + exec->Backward(narrays); + API_END(); +} + + +MXNET_DLL int MXExecutorHeads(ExecutorHandle handle, + mx_uint *out_size, + NArrayHandle **out) { + API_BEGIN(); + Executor *exec = static_cast(handle); + std::vector ret = exec->heads(); + + API_END(); +} + +MXNET_DLL int MXExecutorBind(ExecutorHandle handle, + SymbolHandle symbol_handle, + int dev_mask, + int dev_id, + mx_uint len, + NArrayHandle *in_args, + NArrayHandle *arg_grad_store, + mx_uint *grad_req_type) { + API_BEGIN(); + Executor *exec = static_cast(handle); + Symbol *symb = static_cast(symbol_handle); + Context ctx = Context(dev_mask, dev_id); + NArray **in_args_ptr = reinterpret_cast(in_args); + NArray **arg_grad_ptr = reinterpret_cast(arg_grad_store); + std::vector in_args_vec; + std::vector arg_grad_vec; + std::vector grad_req_vec; + for (mx_uint i = 0; i < len; ++i) { + in_args_vec.push_back(*(in_args_ptr[i])); + arg_grad_vec.push_back(*(arg_grad_ptr[i])); + grad_req_vec.push_back(static_cast(grad_req_type[i])); + } + handle = exec->Bind(*symb, ctx, in_args_vec, arg_grad_vec, grad_req_vec); + API_END(); +} + diff --git a/src/operator/activation-inl.h b/src/operator/activation-inl.h index 6374d02cc53b..fd643a6405da 100644 --- a/src/operator/activation-inl.h +++ b/src/operator/activation-inl.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -28,8 +29,8 @@ struct ActivationParam : public dmlc::Parameter { // use int for enumeration int type; DMLC_DECLARE_PARAMETER(ActivationParam) { - // TODO(bing) support enum, str->int mapping - DMLC_DECLARE_FIELD(type).set_default(kReLU); + DMLC_DECLARE_FIELD(type).set_default(kReLU).add_enum("relu", kReLU).\ + add_enum("sigmoid", kSigmoid).add_enum("tanh", kTanh); } }; diff --git a/src/operator/elementwise_sum-inl.h b/src/operator/elementwise_sum-inl.h index f0a558b3b0cc..65a6ba1d5c99 100644 --- a/src/operator/elementwise_sum-inl.h +++ b/src/operator/elementwise_sum-inl.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/src/operator/fully_connected-inl.h b/src/operator/fully_connected-inl.h index 9dbb9bda8649..e92c9f1f66dd 100644 --- a/src/operator/fully_connected-inl.h +++ b/src/operator/fully_connected-inl.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/windows/mxnet.sln b/windows/mxnet.sln deleted file mode 100755 index 16f82f6b6fb1..000000000000 --- a/windows/mxnet.sln +++ /dev/null @@ -1,28 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 2013 -VisualStudioVersion = 12.0.21005.1 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mxnet", "mxnet.vcxproj", "{2DA41CBC-B8B2-4696-86CD-9AFBAB029661}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|Win32 = Debug|Win32 - Debug|x64 = Debug|x64 - Release|Win32 = Release|Win32 - Release|x64 = Release|x64 - EndGlobalSection - 
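Each of these executor entry points takes a length plus a flat array of `NArrayHandle`s, which the Python wrapper builds with `c_array`. A self-contained sketch of that packing pattern with plain ctypes, shown here on the gradient-request enum from `Bind` (the `c_array` helper below is a stand-in with the same `(ctype, values)` shape as the one used in the Python wrapper):

```python
import ctypes

def c_array(ctype, values):
    """Pack a Python list into a contiguous C array suitable for ctypes calls."""
    return (ctype * len(values))(*values)

grad_req = {"null": 0, "write_to": 1, "in_place": 2, "add_to": 3}
reqs = c_array(ctypes.c_uint, [grad_req[r] for r in ("write_to", "write_to", "null")])
print(len(reqs), list(reqs))  # 3 [1, 1, 0]
```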
GlobalSection(ProjectConfigurationPlatforms) = postSolution - {2DA41CBC-B8B2-4696-86CD-9AFBAB029661}.Debug|Win32.ActiveCfg = Debug|Win32 - {2DA41CBC-B8B2-4696-86CD-9AFBAB029661}.Debug|Win32.Build.0 = Debug|Win32 - {2DA41CBC-B8B2-4696-86CD-9AFBAB029661}.Debug|x64.ActiveCfg = Debug|x64 - {2DA41CBC-B8B2-4696-86CD-9AFBAB029661}.Debug|x64.Build.0 = Debug|x64 - {2DA41CBC-B8B2-4696-86CD-9AFBAB029661}.Release|Win32.ActiveCfg = Release|Win32 - {2DA41CBC-B8B2-4696-86CD-9AFBAB029661}.Release|Win32.Build.0 = Release|Win32 - {2DA41CBC-B8B2-4696-86CD-9AFBAB029661}.Release|x64.ActiveCfg = Release|x64 - {2DA41CBC-B8B2-4696-86CD-9AFBAB029661}.Release|x64.Build.0 = Release|x64 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection -EndGlobal diff --git a/windows/mxnet.vcxproj b/windows/mxnet.vcxproj deleted file mode 100755 index 2823478cc51f..000000000000 --- a/windows/mxnet.vcxproj +++ /dev/null @@ -1,148 +0,0 @@ - - - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - - - {2DA41CBC-B8B2-4696-86CD-9AFBAB029661} - Win32Proj - - - - Application - true - v120 - - - Application - true - v120 - - - Application - false - v120 - - - Application - false - v120 - - - - - - - - - - - - - - - - - - - true - - - true - - - true - - - true - - - - WIN32;_DEBUG;_WINDOWS;%(PreprocessorDefinitions) - MultiThreadedDebugDLL - Level3 - ProgramDatabase - Disabled - - - MachineX86 - true - Windows - - - - - WIN32;_DEBUG;_WINDOWS;%(PreprocessorDefinitions) - MultiThreadedDebugDLL - Level3 - ProgramDatabase - Disabled - $(solutionDir)\..\src - - - true - Console - - - - - WIN32;NDEBUG;_WINDOWS;%(PreprocessorDefinitions) - MultiThreadedDLL - Level3 - ProgramDatabase - - - MachineX86 - true - Windows - true - true - - - - - WIN32;NDEBUG;_WINDOWS;%(PreprocessorDefinitions) - MultiThreadedDLL - Level3 - ProgramDatabase - $(solutionDir)\..\src - - - true - Console - true - true - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/windows/mxnet.vcxproj.filters b/windows/mxnet.vcxproj.filters deleted file mode 100755 index 1ff068b088be..000000000000 --- a/windows/mxnet.vcxproj.filters +++ /dev/null @@ -1,48 +0,0 @@ - - - - - {4FC737F1-C7A5-4376-A066-2A32D752A2FF} - cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx - - - {93995380-89BD-4b04-88EB-625FBE52EBFB} - h;hh;hpp;hxx;hm;inl;inc;xsd - - - {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} - rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav - - - - - Source Files - - - Source Files - - - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - \ No newline at end of file diff --git a/windows/mxnet.vcxproj.user b/windows/mxnet.vcxproj.user deleted file mode 100755 index ef5ff2a1fae6..000000000000 --- a/windows/mxnet.vcxproj.user +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file From f5c6d6793d8a976df212f5f5b878e0c381be6a05 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Tue, 18 Aug 2015 18:56:03 -0600 Subject: [PATCH 08/11] add pooling op --- Makefile | 7 +- include/mxnet/narray.h | 6 +- src/operator/activation-inl.h | 4 +- src/operator/fully_connected-inl.h | 8 +- src/operator/pooling-inl.h | 201 ++++++++++++++++++ src/operator/pooling.cc | 34 +++ src/operator/pooling.cu | 26 +++ src/operator/static_operator/pooling_op-inl.h | 153 ------------- 8 files changed, 275 insertions(+), 164 deletions(-) create mode 100644 src/operator/pooling-inl.h create mode 100644 
src/operator/pooling.cc create mode 100644 src/operator/pooling.cu delete mode 100644 src/operator/static_operator/pooling_op-inl.h diff --git a/Makefile b/Makefile index b763a406da23..8be91fb4886b 100644 --- a/Makefile +++ b/Makefile @@ -58,14 +58,14 @@ endif BIN = test/api_registry_test test/test_storage OBJ = narray_op_cpu.o # add threaded engine after it is done -OBJCXX11 = engine.o narray.o c_api.o registry.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o elementwise_sum_cpu.o +OBJCXX11 = engine.o narray.o c_api.o registry.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o elementwise_sum_cpu.o pooling_cpu.o CUOBJ = SLIB = lib/libmxnet.so ALIB = lib/libmxnet.a LIB_DEP = $(DMLC_CORE)/libdmlc.a ifeq ($(USE_CUDA), 1) - CUOBJ += narray_op_gpu.o fully_connected_gpu.o activation_gpu.o elementwise_sum_gpu.o + CUOBJ += narray_op_gpu.o fully_connected_gpu.o activation_gpu.o elementwise_sum_gpu.o pooling_gpu.o endif .PHONY: clean all test lint doc @@ -91,6 +91,9 @@ activation_cpu.o: src/operator/activation.cc activation_gpu.o: src/operator/activation.cu elementwise_sum_cpu.o: src/operator/elementwise_sum.cc elementwise_sum_gpu.o: src/operator/elementwise_sum.cu +pooling_cpu.o: src/operator/pooling.cc +pooling_gpu.o: src/operator/pooling.cu + lib/libmxnet.a: $(OBJ) $(OBJCXX11) $(CUOBJ) lib/libmxnet.so: $(OBJ) $(OBJCXX11) $(CUOBJ) diff --git a/include/mxnet/narray.h b/include/mxnet/narray.h index 92257b3f0269..798b71627378 100644 --- a/include/mxnet/narray.h +++ b/include/mxnet/narray.h @@ -25,6 +25,7 @@ namespace mxnet { */ class NArray { public: + typedef std::pair ChunkSkin; /*! \brief default cosntructor */ NArray() {} /*! @@ -35,7 +36,8 @@ class NArray { */ NArray(const TShape &shape, Context ctx, bool delay_alloc = false) - : ptr_(new Chunk(shape, ctx, delay_alloc)) { + : ptr_(std::make_shared(shape, ctx, delay_alloc)) { + // Change to std::make_shared } /*! * \brief constructing a static NArray that shares data with TBlob @@ -45,7 +47,7 @@ class NArray { * \param dev_id the device id this tensor sits at */ NArray(const TBlob &data, int dev_id) - : ptr_(new Chunk(data, dev_id)) { + : ptr_(std::make_shared(data, dev_id)) { } /*! 
* \return the shape of current NArray diff --git a/src/operator/activation-inl.h b/src/operator/activation-inl.h index fd643a6405da..e78eecfbeddc 100644 --- a/src/operator/activation-inl.h +++ b/src/operator/activation-inl.h @@ -29,8 +29,8 @@ struct ActivationParam : public dmlc::Parameter { // use int for enumeration int type; DMLC_DECLARE_PARAMETER(ActivationParam) { - DMLC_DECLARE_FIELD(type).set_default(kReLU).add_enum("relu", kReLU).\ - add_enum("sigmoid", kSigmoid).add_enum("tanh", kTanh); + DMLC_DECLARE_FIELD(type).set_default(kReLU).add_enum("relu", kReLU)\ + .add_enum("sigmoid", kSigmoid).add_enum("tanh", kTanh); } }; diff --git a/src/operator/fully_connected-inl.h b/src/operator/fully_connected-inl.h index e92c9f1f66dd..f129a27b228d 100644 --- a/src/operator/fully_connected-inl.h +++ b/src/operator/fully_connected-inl.h @@ -14,7 +14,7 @@ #include #include #include "./operator_common.h" -#include "./param.h" + namespace mxnet { namespace op { @@ -122,9 +122,7 @@ class FullyConnectedProp : public OperatorProperty { } virtual void Init(const std::vector >& kwargs) { - // TODO(bing) change directly to vector of pairs begin end - std::map kmap(kwargs.begin(), kwargs.end()); - param_.Init(kmap); + param_.Init(kwargs); } virtual bool InferShape(std::vector *in_shape, @@ -181,7 +179,7 @@ class FullyConnectedProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const { - return {{in_grad[kData], in_data[kData]}}; + return {{in_data[kData], in_grad[kData]}}; } Operator* CreateOperator(Context ctx) const; diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h new file mode 100644 index 000000000000..8b223e2476a2 --- /dev/null +++ b/src/operator/pooling-inl.h @@ -0,0 +1,201 @@ +/*! 
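The `type` field now maps the strings "relu", "sigmoid" and "tanh" onto the activation enum. All three activations have gradients that can be written purely in terms of the activation output, which is what lets the activation backward pass depend only on `out_grad` and `out_data`; an illustrative NumPy check:

```python
import numpy as np

x = np.linspace(-2, 2, 5)
y_relu, y_sig, y_tanh = np.maximum(x, 0), 1.0 / (1.0 + np.exp(-x)), np.tanh(x)

# gradients expressed through the outputs y, not the inputs x
g_relu = (y_relu > 0).astype(x.dtype)   # 1 where the output is positive
g_sig  = y_sig * (1.0 - y_sig)
g_tanh = 1.0 - y_tanh ** 2
print(g_relu, g_sig, g_tanh)
```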
+ * Copyright (c) 2015 by Contributors + * \file pooling-inl.h + * \brief + * \author Bing Xu +*/ + +#ifndef MXNET_OPERATOR_POOLING_INL_H_ +#define MXNET_OPERATOR_POOLING_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" + +namespace mxnet { +namespace op { +enum PoolingOpInputs {kData}; +enum PoolingOpOutputs {kOut}; +enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling}; + +struct PoolingParam : public dmlc::Parameter { + int kernel_x; + int kernel_y; + int stride_x; + int stride_y; + int pad_x; + int pad_y; + int type; + DMLC_DECLARE_PARAMETER(PoolingParam) { + // TODO(bing) change to only set lower bound + DMLC_DECLARE_FIELD(kernel_x).set_range(1, 10000); + DMLC_DECLARE_FIELD(kernel_y).set_range(1, 10000); + DMLC_DECLARE_FIELD(stride_x).set_range(1, 10000); + DMLC_DECLARE_FIELD(stride_y).set_range(1, 10000); + DMLC_DECLARE_FIELD(pad_x).set_default(0).set_range(0, 10000); + DMLC_DECLARE_FIELD(pad_y).set_default(0).set_range(0, 10000); + DMLC_DECLARE_FIELD(type).set_default(kMaxPooling)\ + .add_enum("max", kMaxPooling).add_enum("avg", kAvgPooling)\ + .add_enum("sum", kSumPooling); + } +}; + +template +class PoolingOp : public Operator { + public: + explicit PoolingOp(PoolingParam p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(req[kOut], kWriteTo); + CHECK_EQ(in_data.size(), 1); + CHECK_EQ(out_data.size(), 1); + Stream *s = ctx.get_stream(); + Tensor data = in_data[kData].get(s); + Tensor out = out_data[kOut].get(s); + mshadow::Shape<2> out_shape = Shape2(out.shape_[2], out.shape_[3]); + // TODO(bing): dual stride in mshadow + if (param_.type == kMaxPooling || param_.type == kSumPooling) { + out = pool(pad(data, param_.pad_y, param_.pad_x), + out_shape, + param_.kernel_y, + param_.kernel_x, + param_.kernel_y); + } else if (param_.type == kAvgPooling) { + out = (1.0f / (param_.kernel_y * param_.kernel_x)) * \ + pool(pad(data, param_.pad_y, param_.pad_x), + out_shape, + param_.kernel_y, + param_.kernel_x, + param_.kernel_y); + } + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1); + CHECK_EQ(in_data.size(), 1); + CHECK_EQ(out_data.size(), 1); + CHECK_EQ(req.size(), 1); + CHECK_EQ(in_grad.size(), 1); + // TODO(bing): remove pad (0,0) + Stream *s = ctx.get_stream(); + Tensor grad = out_grad[kOut].get(s); + Tensor data = in_data[kData].get(s); + Tensor output_data = out_data[kOut].get(s); + Tensor input_grad = in_grad[kData].get(s); + + mshadow::Shape<2> in_shape = Shape2(data.shape_[2], data.shape_[3]); + + if (param_.type == kMaxPooling || param_.type == kSumPooling) { + Assign(input_grad, req[kData], + crop(unpool(pad(data, param_.pad_y, param_.pad_x), + pad(output_data, 0, 0), + pad(grad, 0, 0), + param_.kernel_y, + param_.kernel_x, + param_.stride_y), + in_shape, + param_.pad_y, + param_.pad_x)); + } else if (param_.type == kAvgPooling) { + Assign(input_grad, req[kData], + (1.0f / param_.kernel_y / param_.kernel_x) *\ + crop(unpool(pad(data, param_.pad_y, param_.pad_x), + pad(output_data, 0, 0), + pad(grad, 0, 0), + param_.kernel_y, + param_.kernel_x, + param_.stride_y), + in_shape, + 
param_.pad_y, + param_.pad_x)); + } + } + + private: + PoolingParam param_; +}; // class PoolingOp + +template +Operator* CreateOp(PoolingParam param); + + +#if DMLC_USE_CXX11 +class PoolingProp : public OperatorProperty { + public: + virtual void Init(const std::vector >& kwargs) { + param_.Init(kwargs); + } + + virtual bool InferShape(std::vector *in_shape, + std::vector *out_shape) const { + CHECK_EQ(in_shape->size(), 1); + const TShape &dshape = (*in_shape)[0]; + CHECK_EQ(dshape.ndim(), 4) << \ + "Pooling: Input data should be 4D in (batch, channel, y, x)"; + TShape oshape = dshape; + if (dshape.ndim() == 0) return false; + oshape[2] = std::min(dshape[2] + 2 * param_.pad_y - param_.kernel_y + param_.stride_y - 1, + dshape[2] + 2 * param_.pad_y - 1) / param_.stride_y + 1; + oshape[3] = std::min(dshape[3] + 2 * param_.pad_x - param_.kernel_x + param_.stride_x - 1, + dshape[3] + 2 * param_.pad_x - 1) / param_.stride_x + 1; + CHECK(oshape[2] > 0 && oshape[3] > 0) << "Pooling: kernel size exceed input"; + out_shape->clear(); + out_shape->push_back(oshape); + return true; + } + + virtual OperatorProperty* Copy() const { + PoolingProp *prop_sym = new PoolingProp(); + prop_sym->param_ = this->param_; + return prop_sym; + } + + virtual std::string TypeString() const { + return "Pooling"; + } + + virtual std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const { + return {out_grad[kOut], in_data[kData], out_data[kOut]}; + } + + virtual std::vector > BackwardInplaceOption( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &in_grad) const { + return {{in_data[kData], in_grad[kData]}}; + } + + Operator* CreateOperator(Context ctx) const; + + private: + PoolingParam param_; +}; // class PoolingProp +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_POOLING_INL_H_ diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc new file mode 100644 index 000000000000..a6ebc91e0873 --- /dev/null +++ b/src/operator/pooling.cc @@ -0,0 +1,34 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file pooling.cc + * \brief + * \author Bing Xu +*/ + +#include +#include "./pooling-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator *CreateOp(PoolingParam param) { + switch (param.type) { + case kMaxPooling: return new PoolingOp(param); + case kAvgPooling: return new PoolingOp(param); + case kSumPooling: return new PoolingOp(param); + default: + LOG(FATAL) << "unknown activation type"; + return NULL; + } +} + +Operator* PoolingProp::CreateOperator(Context ctx) const { + DO_BIND_DISPATCH(CreateOp, param_); +} + +DMLC_REGISTER_PARAMETER(PoolingParam); + +REGISTER_OP_PROPERTY(Pooling, PoolingProp); +} // namespace op +} // namespace mxnet + diff --git a/src/operator/pooling.cu b/src/operator/pooling.cu new file mode 100644 index 000000000000..2db6d9ea549a --- /dev/null +++ b/src/operator/pooling.cu @@ -0,0 +1,26 @@ +/*! 
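`InferShape` clamps the pooled extent so that a window hanging over the input border still contributes one output row or column. The same integer arithmetic in Python, assuming truncating division as in the C++ code; `pooled_extent` is a hypothetical helper name:

```python
def pooled_extent(in_size, kernel, stride, pad):
    # mirrors: min(in + 2*pad - kernel + stride - 1, in + 2*pad - 1) / stride + 1
    return min(in_size + 2 * pad - kernel + stride - 1,
               in_size + 2 * pad - 1) // stride + 1

print(pooled_extent(28, kernel=2, stride=2, pad=0))  # 14
print(pooled_extent(5,  kernel=3, stride=2, pad=0))  # 2
```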
+ * Copyright (c) 2015 by Contributors + * \file pooling.cu + * \brief + * \author Bing Xu +*/ + +#include "./pooling-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator *CreateOp(PoolingParam param) { + switch (param.type) { + case kMaxPooling: return new PoolingOp(param); + case kAvgPooling: return new PoolingOp(param); + case kSumPooling: return new PoolingOp(param); + default: + LOG(FATAL) << "unknown activation type"; + return NULL; + } +} + +} // namespace op +} // namespace mxnet + diff --git a/src/operator/static_operator/pooling_op-inl.h b/src/operator/static_operator/pooling_op-inl.h deleted file mode 100644 index 8c6014a8c2cf..000000000000 --- a/src/operator/static_operator/pooling_op-inl.h +++ /dev/null @@ -1,153 +0,0 @@ -/*! - * Copyright (c) 2015 by Contributors - * \file pooling_op-inl.h - * \brief pooling operator - * \author Bing Xu -*/ -#ifndef MXNET_OPERATOR_STATIC_OPERATOR_POOLING_OP_INL_H_ -#define MXNET_OPERATOR_STATIC_OPERATOR_POOLING_OP_INL_H_ - -#include -#include -#include -#include "./param.h" -#include "./static_operator_common.h" - - -namespace mxnet { -namespace op { -template -class PoolingOp : public StaticOperator { - public: - virtual void SetParam(const char *name, const char *val) { - param_.SetParam(name, val); - } - virtual void InferShape(std::vector *in_shape, - std::vector *out_shape) { - CHECK_EQ(in_shape->size(), 1) << "Input: [data]"; - CHECK_GT(param_.kernel_y, 0); - CHECK_GT(param_.kernel_x, 0); - const int ksize_y = static_cast(param_.kernel_y); - const int ksize_x = static_cast(param_.kernel_x); - const int pad_y = static_cast(param_.pad_y); - const int pad_x = static_cast(param_.pad_x); - // TODO(bing): dual stride - const int kstride = static_cast(param_.stride_y); - mshadow::Shape<4> ishape = (*in_shape)[0].get<4>(); - oshape_ = ishape; - fea_shape_ = mshadow::Shape2(ishape[2], ishape[3]); - oshape_[2] = std::min(ishape[2] + 2 * pad_y - ksize_y + kstride - 1, - ishape[2] + 2 * pad_y - 1) / kstride + 1; - oshape_[3] = std::min(ishape[3] + 2 * pad_x - ksize_x + kstride - 1, - ishape[3] + 2 * pad_x - 1) / kstride + 1; - CHECK(oshape_[2] > 0 && oshape_[3] > 0) << "kernel size exceed input"; - out_shape->clear(); - out_shape->push_back((*in_shape)[0]); - (*out_shape)[0][2] = oshape_[2]; - (*out_shape)[0][3] = oshape_[3]; - } - virtual void Forward(Option opt, - RunContext ctx, - const std::vector &in_data, - const std::vector &out_data) { - CHECK_EQ(in_data.size(), 1); - CHECK_EQ(out_data.size(), 0); - if (!(temp_.shape_ == oshape_)) { - temp_.Resize(oshape_); - } - const int ksize_y = param_.kernel_y; - const int ksize_x = param_.kernel_x; - const int pad_y = param_.pad_y; - const int pad_x = param_.pad_x; - // TODO(bing): dual stride - const int kstride = param_.stride_y; - using namespace mshadow; - using namespace mshadow::expr; - Stream *s = static_cast *>(ctx.stream); - Tensor data = in_data[0].get(s); - Tensor out = out_data[0].get(s); - mshadow::Shape<2> pshape = Shape2(out.shape_[2], out.shape_[3]); - if (mode == kMaxPooling || mode == kSumPooling) { - temp_ = pool(pad(data, pad_y, pad_x), - pshape, - ksize_y, - ksize_x, - kstride); - } else if (mode == kAvgPooling) { - temp_ = (1.0f / (ksize_y * ksize_x)) * \ - pool(pad(data, pad_y, pad_x), - pshape, - ksize_y, - ksize_x, - kstride); - } else { - LOG(FATAL) << "Unknown pooling mode"; - } - Copy(out, temp_, s); - } - virtual void Backward(RunContext ctx, - const std::vector &grad_next, - const std::vector &in_data, - const std::vector &out_data, - const std::vector 
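Both `CreateOp` specializations dispatch on the pooling type, and average pooling is implemented as sum pooling scaled by `1/(kernel_y*kernel_x)`. A toy NumPy sketch of non-overlapping 2x2 pooling that mirrors this structure (single 2-D map, no stride or padding; the real operator works on 4-D batches through mshadow expressions and `pool2d` is a hypothetical helper):

```python
import numpy as np

def pool2d(x, k, reducer):
    """Non-overlapping k x k pooling of a 2-D array with the given reducer."""
    h, w = x.shape[0] // k, x.shape[1] // k
    blocks = x.reshape(h, k, w, k).transpose(0, 2, 1, 3).reshape(h, w, k * k)
    return reducer(blocks, axis=-1)

x = np.arange(16.0).reshape(4, 4)
print(pool2d(x, 2, np.max))            # max pooling
print(pool2d(x, 2, np.sum) / (2 * 2))  # avg pooling as scaled sum pooling
```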
&out_grad, - const std::vector &req) { - CHECK_EQ(grad_next.size(), 1); - CHECK_EQ(in_data.size(), 1); - CHECK_EQ(out_grad.size(), 1); - CHECK_EQ(req.size(), 1); - const int ksize_y = param_.kernel_y; - const int ksize_x = param_.kernel_x; - const int pad_y = param_.pad_y; - const int pad_x = param_.pad_x; - // TODO(bing): dual stride - const int kstride = param_.stride_y; - using namespace mshadow; - using namespace mshadow::expr; - Stream *s = static_cast *>(ctx.stream); - Tensor grad = grad_next[0].get(s); - Tensor data = in_data[0].get(s); - Tensor out = out_grad[0].get(s); - if (mode == kMaxPooling || mode == kSumPooling) { - Assign(out, - req[0], - crop(unpool(pad(data, pad_y, pad_x), - pad(temp_, 0, 0), - pad(grad, 0, 0), - ksize_y, - ksize_x, - kstride), - fea_shape_, - pad_y, - pad_x)); - } else if (mode == kAvgPooling) { - Assign(out, - req[0], - (1.0f / (ksize_y * ksize_x)) * \ - crop(unpool(pad(data, pad_y, pad_x), - pad(temp_, 0, 0), - pad(grad, 0, 0), - ksize_y, - ksize_x, - kstride), - fea_shape_, - pad_y, - pad_x)); - } else { - LOG(FATAL) << "Unknown pooling mode"; - } - } - - private: - /*! \brief parameters that potentially be useful */ - Param param_; - /*! \brief temp space to save pooled result */ - mshadow::TensorContainer temp_; - /*! \brief pooled output shape */ - mshadow::Shape<4> oshape_; - /*! \brief input feature map shape */ - mshadow::Shape<2> fea_shape_; -}; // class PoolingOp - -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_STATIC_OPERATOR_POOLING_OP_INL_H_ From 0448d6743f2cced56f1a8cee8c1129fed68ed37e Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Tue, 18 Aug 2015 23:47:46 -0600 Subject: [PATCH 09/11] graph exec as discussed --- Makefile | 3 +- include/mxnet/context.h | 8 + include/mxnet/narray.h | 6 +- include/mxnet/operator.h | 83 +++-- include/mxnet/symbolic.h | 10 +- src/c_api.cc | 10 +- src/operator/activation-inl.h | 12 +- src/operator/elementwise_sum-inl.h | 8 +- src/operator/fully_connected-inl.h | 4 +- src/symbol/graph_executor.cc | 473 ++++++++++++++++++++++++++++ src/symbol/graph_executor.h | 179 +++++++++++ src/symbol/graph_memory_allocator.h | 145 +++++++++ src/symbol/static_graph.cc | 41 ++- 13 files changed, 906 insertions(+), 76 deletions(-) create mode 100644 src/symbol/graph_executor.cc create mode 100644 src/symbol/graph_executor.h create mode 100644 src/symbol/graph_memory_allocator.h diff --git a/Makefile b/Makefile index 8be91fb4886b..50e9a21c50e8 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,7 @@ endif BIN = test/api_registry_test test/test_storage OBJ = narray_op_cpu.o # add threaded engine after it is done -OBJCXX11 = engine.o narray.o c_api.o registry.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o elementwise_sum_cpu.o pooling_cpu.o +OBJCXX11 = engine.o narray.o c_api.o registry.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o elementwise_sum_cpu.o graph_executor.o pooling_cpu.o CUOBJ = SLIB = lib/libmxnet.so ALIB = lib/libmxnet.a @@ -81,6 +81,7 @@ narray.o: src/narray/narray.cc narray_op_cpu.o: src/narray/narray_op_cpu.cc src/narray/narray_op-inl.h narray_op_gpu.o: src/narray/narray_op_gpu.cu src/narray/narray_op-inl.h symbol.o: src/symbol/symbol.cc +graph_executor.o: src/symbol/graph_executor.cc static_graph.o : src/symbol/static_graph.cc registry.o: src/registry.cc c_api.o: src/c_api.cc diff --git a/include/mxnet/context.h b/include/mxnet/context.h index 8dfa618ca180..700bb36f0abb 100644 --- a/include/mxnet/context.h +++ b/include/mxnet/context.h @@ 
-33,6 +33,14 @@ struct Context { inline bool operator==(const Context &b) const { return dev_mask == b.dev_mask && dev_id == b.dev_id; } + /*! + * \brief check if current context not equals another one + * \param b another context to compare + * \return whether they are not the same + */ + inline bool operator!=(const Context &b) const { + return !(*this == b); + } }; /*! diff --git a/include/mxnet/narray.h b/include/mxnet/narray.h index 798b71627378..92257b3f0269 100644 --- a/include/mxnet/narray.h +++ b/include/mxnet/narray.h @@ -25,7 +25,6 @@ namespace mxnet { */ class NArray { public: - typedef std::pair ChunkSkin; /*! \brief default cosntructor */ NArray() {} /*! @@ -36,8 +35,7 @@ class NArray { */ NArray(const TShape &shape, Context ctx, bool delay_alloc = false) - : ptr_(std::make_shared(shape, ctx, delay_alloc)) { - // Change to std::make_shared + : ptr_(new Chunk(shape, ctx, delay_alloc)) { } /*! * \brief constructing a static NArray that shares data with TBlob @@ -47,7 +45,7 @@ class NArray { * \param dev_id the device id this tensor sits at */ NArray(const TBlob &data, int dev_id) - : ptr_(std::make_shared(data, dev_id)) { + : ptr_(new Chunk(data, dev_id)) { } /*! * \return the shape of current NArray diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h index 0fa1fb6a0571..bc1d79b20b38 100644 --- a/include/mxnet/operator.h +++ b/include/mxnet/operator.h @@ -8,6 +8,7 @@ #define MXNET_OPERATOR_H_ #include +#include #include #include #include @@ -108,7 +109,9 @@ class Operator { const std::vector &in_data, const std::vector &out_data, const std::vector &req, - const std::vector &in_grad) = 0; + const std::vector &in_grad) { + LOG(FATAL) << "Backward is not implemented"; + } }; #if DMLC_USE_CXX11 @@ -255,28 +258,36 @@ class OperatorProperty { * This function enables optimization to reuse memory of inputs in output. * Only override when necessary, by default in-place is disabled. * + * The reason for void* type in the out_data is to distinguish the order + * of mappings between the two, compiler will report error when + * in_data and out_data's order in the pair get reversed. + * * \code * // The following code says out_data[0] can share data with in_data[0] - * vector > ForwardInplaceOption(const vector &in_data, - * const vector &out_data) const { - * return {{out_data[0], in_data[0]}}; + * vector > ForwardInplaceOption(const vector &in_data, + * const vector &out_data) const { + * return {{in_data[0], out_data[0]}}; * } * \endcode * \param in_data The input data in forward pass. * \param out_data The output data in forward pass. - * \return list of pair of integers taken from the inputs vector, + * \return list of pair of that maps input->output, * indicating possible in place operations. */ - virtual std::vector > ForwardInplaceOption( + virtual std::vector > ForwardInplaceOption( const std::vector &in_data, - const std::vector &out_data) const { - return std::vector >(); + const std::vector &out_data) const { + return std::vector >(); } /*! * \brief Get possible backward inplace options. * This function enables optimization to reuse memory of inputs in output. * Only override when necessary, by default in-place is disabled. * + * The reason for void* type in the in_grad is to distinguish the order + * of mappings between the two, compiler will report error when + * in_data and out_data's order in the pair get reversed. 
+ * * \code * // The following code says in_grad[0] can share data with in_data[0] * vector > BackwardInplaceOption( @@ -284,22 +295,22 @@ class OperatorProperty { * const std::vector &in_data, * const std::vector &out_data, * const std::vector &in_grad) const { - * return {in_grad[0], in_data[0]}}; + * return {in_data[0], in_grad[0]}}; * } * \endcode * \param in_data The input data in forward pass. * \param out_data The output data in forward pass. * \param in_grad Gradient of inputs in backward pass. * \param out_grad Gradient of outputs in backward pass. - * \return list of pair of integers taken from the inputs vector, + * \return list of pair of that maps input->output, * indicating possible in place operations. */ - virtual std::vector > BackwardInplaceOption( + virtual std::vector > BackwardInplaceOption( const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data, - const std::vector &in_grad) const { - return std::vector >(); + const std::vector &in_grad) const { + return std::vector >(); } /*! * \brief Get Backward Input Dependency for generic types of data. @@ -314,31 +325,35 @@ class OperatorProperty { * \sa DeclareBackwardDependency */ template - inline std::vector BackwardInputs(const std::vector &in_data, - const std::vector &out_data, - const std::vector &out_grad) const { - int cnt = 0; - std::vector all_vec; - std::vector in_data_idx, out_data_idx, out_grad_idx; - for (size_t i = 0; i < in_data.size(); ++i) { - in_data_idx.push_back(cnt++); - all_vec.push_back(in_data[i]); + inline std::vector BackwardInputs(const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const { + int counter = 0; + std::vector out_grad_index(out_grad.size()); + std::vector in_data_index(out_data.size()); + std::vector out_data_index(out_data.size()); + for (size_t i = 0; i < out_grad_index.size(); ++i) { + out_grad_index[i] = counter++; } - for (size_t i = 0; i < out_data.size(); ++i) { - out_data_idx.push_back(cnt++); - all_vec.push_back(out_data[i]); + for (size_t i = 0; i < in_data_index.size(); ++i) { + in_data_index[i] = counter++; } - for (size_t i = 0; i < out_grad.size(); ++i) { - out_grad_idx.push_back(cnt++); - all_vec.push_back(out_data[i]); + for (size_t i = 0; i < out_data_index.size(); ++i) { + out_data_index[i] = counter++; } - std::vector ret_idx = this->DeclareBackwardDependency( - in_data_idx, out_data_idx, out_grad_idx); - std::vector ret; - for (size_t i = 0; i < ret_idx.size(); ++i) { - ret.push_back(all_vec[ret_idx[i]]); + std::vector all_data; + all_data.insert(all_data.end(), out_grad.begin(), out_grad.end()); + all_data.insert(all_data.end(), in_data.begin(), in_data.end()); + all_data.insert(all_data.end(), out_data.begin(), out_data.end()); + + std::vector ret_index = this->DeclareBackwardDependency( + out_grad_index, in_data_index, out_data_index); + + std::vector ret(ret_index.size()); + for (size_t i = 0; i < ret_index.size(); ++i) { + ret[i] = all_data[ret_index[i]]; } - return ret; + return std::move(ret); } /*! * \brief create OperatorProperty diff --git a/include/mxnet/symbolic.h b/include/mxnet/symbolic.h index bee0e921117a..df06c4913de8 100644 --- a/include/mxnet/symbolic.h +++ b/include/mxnet/symbolic.h @@ -158,11 +158,17 @@ class StaticGraph { * The head and input of Backward pass will be returned by head_grad_nodes and arg_grads. * * \param head_grad_nodes used to store the created head gradient inputs for backward pass. 
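The reworked `BackwardInputs` concatenates `out_grad`, `in_data` and `out_data` in that fixed order, asks `DeclareBackwardDependency` for the indices it needs, and returns only those entries. A small Python model of the bookkeeping; `backward_inputs` and `fc_dep` are stand-ins, not the real operator interface, and the fully-connected dependency list mirrors the one declared later in this patch:

```python
def backward_inputs(out_grad, in_data, out_data, declare_dependency):
    all_data = list(out_grad) + list(in_data) + list(out_data)
    out_grad_idx = list(range(0, len(out_grad)))
    in_data_idx = list(range(len(out_grad), len(out_grad) + len(in_data)))
    out_data_idx = list(range(len(out_grad) + len(in_data), len(all_data)))
    need = declare_dependency(out_grad_idx, in_data_idx, out_data_idx)
    return [all_data[i] for i in need]

# e.g. a fully-connected layer needs out_grad, data and weight, but not bias or outputs
fc_dep = lambda og, ind, outd: [og[0], ind[0], ind[1]]
print(backward_inputs(["dy"], ["x", "w", "b"], ["y"], fc_dep))  # ['dy', 'x', 'w']
```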
-<<<<<<< HEAD * \param arg_grads used to store gradients to args, can be multiple one if an argument is used by operator */ void MakeBackwardPass(std::vector *head_grad_nodes, std::vector > *arg_grads); + + /*! + * \brief create a sum node that aggregates gradient together + * \param grad_source the source of the inputs. + * \return a created ElementWiseSum node + */ + static Node CreateSumNode(const std::vector &grad_source); }; /*! @@ -365,7 +371,7 @@ class Executor { * \brief Perform a Forward operation of Operator * After this operation, user can get the result by using function head. */ - virtual void Forward(const std::vector &inputs) = 0; + virtual void Forward() = 0; /*! * \brief Perform a Backward operation of the Operator. * This must be called after Forward. diff --git a/src/c_api.cc b/src/c_api.cc index 2e97b916af9b..ab7899767555 100644 --- a/src/c_api.cc +++ b/src/c_api.cc @@ -486,12 +486,10 @@ MXNET_DLL int MXExecutorForward(ExecutorHandle handle, NArrayHandle *args) { API_BEGIN(); Executor *exec = static_cast(handle); - NArray **args_ptr = reinterpret_cast(args); - std::vector narrays; - for (mx_uint i = 0; i < len; ++i) { - narrays.emplace_back(*(args_ptr[i])); - } - exec->Forward(narrays); + CHECK_EQ(len, 0) + << "forward do not take narray for now"; + // TODO(bing): remove args for now + exec->Forward(); API_END(); } diff --git a/src/operator/activation-inl.h b/src/operator/activation-inl.h index e78eecfbeddc..3d57d6a88102 100644 --- a/src/operator/activation-inl.h +++ b/src/operator/activation-inl.h @@ -29,8 +29,8 @@ struct ActivationParam : public dmlc::Parameter { // use int for enumeration int type; DMLC_DECLARE_PARAMETER(ActivationParam) { - DMLC_DECLARE_FIELD(type).set_default(kReLU).add_enum("relu", kReLU)\ - .add_enum("sigmoid", kSigmoid).add_enum("tanh", kTanh); + DMLC_DECLARE_FIELD(type).set_default(kReLU).add_enum("relu", kReLU).\ + add_enum("sigmoid", kSigmoid).add_enum("tanh", kTanh); } }; @@ -116,17 +116,17 @@ class ActivationProp : public OperatorProperty { return {out_grad[kOut], out_data[kOut]}; } - virtual std::vector > BackwardInplaceOption( + virtual std::vector > BackwardInplaceOption( const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data, - const std::vector &in_grad) const { + const std::vector &in_grad) const { return {{out_grad[kOut], in_grad[kData]}}; } - virtual std::vector > ForwardInplaceOption( + virtual std::vector > ForwardInplaceOption( const std::vector &in_data, - const std::vector &out_data) const { + const std::vector &out_data) const { return {{in_data[kData], out_data[kOut]}}; } diff --git a/src/operator/elementwise_sum-inl.h b/src/operator/elementwise_sum-inl.h index 65a6ba1d5c99..4a0d6e3fdd57 100644 --- a/src/operator/elementwise_sum-inl.h +++ b/src/operator/elementwise_sum-inl.h @@ -146,17 +146,17 @@ class ElementWiseSumProp : public OperatorProperty { return out_grad; } - virtual std::vector > BackwardInplaceOption( + virtual std::vector > BackwardInplaceOption( const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data, - const std::vector &in_grad) const { + const std::vector &in_grad) const { return {{out_grad[0], in_grad[0]}}; } - virtual std::vector > ForwardInplaceOption( + virtual std::vector > ForwardInplaceOption( const std::vector &in_data, - const std::vector &out_data) const { + const std::vector &out_data) const { return {{in_data[0], out_data[0]}}; } diff --git a/src/operator/fully_connected-inl.h b/src/operator/fully_connected-inl.h index 
f129a27b228d..b49e5c422739 100644 --- a/src/operator/fully_connected-inl.h +++ b/src/operator/fully_connected-inl.h @@ -174,11 +174,11 @@ class FullyConnectedProp : public OperatorProperty { return {out_grad[kOut], in_data[kData], in_data[kWeight]}; } - virtual std::vector > BackwardInplaceOption( + virtual std::vector > BackwardInplaceOption( const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data, - const std::vector &in_grad) const { + const std::vector &in_grad) const { return {{in_data[kData], in_grad[kData]}}; } diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc new file mode 100644 index 000000000000..8cf50541959e --- /dev/null +++ b/src/symbol/graph_executor.cc @@ -0,0 +1,473 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file graph_executor.cc + * \brief Executor to execute the Graph. +*/ +#include +#include +#include +#include "./graph_executor.h" + +namespace mxnet { +/*! + * \brief wrapper class that wraps Backward operation as Forward. + */ +class GraphExecutor::BackwardOpWrapper : public Operator { + public: + /*! + * \brief create a backward Operator wrapper given forward op. + * \param prop pointer to the property of forward wrapper + * \param forward_op the shared ptr to Forward operator + * \return the created wrapper. + */ + explicit BackwardOpWrapper(const OperatorProperty *prop, + std::shared_ptr forward_op) + : op_(forward_op) { + out_grad_.resize(prop->NumReturns()); + in_data_.resize(prop->ListArguments().size()); + out_data_.resize(prop->NumVisibleReturns()); + + std::vector out_grad_ptr(out_grad_.size()); + for (size_t i = 0; i < out_grad_.size(); ++i) { + out_grad_ptr[i] = &out_grad_[i]; + } + std::vector in_data_ptr(in_data_.size()); + for (size_t i = 0; i < in_data_.size(); ++i) { + in_data_ptr[i] = &in_data_[i]; + } + std::vector out_data_ptr(out_data_.size()); + for (size_t i = 0; i < out_data_.size(); ++i) { + out_data_ptr[i] = &out_data_[i]; + } + arg_data_ptr_ = prop->BackwardInputs( + out_grad_ptr, out_data_ptr, in_data_ptr); + } + // implement forward + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + // set things correctly + CHECK(arg_data_ptr_.size() == in_data.size()); + for (size_t i = 0; i < in_data.size(); ++i) { + *(arg_data_ptr_[i]) = in_data[i]; + } + // redirect internally + op_->Backward(ctx, out_grad_, in_data_, out_data_, req, out_data); + } + + private: + /*! \brief internal forward operator */ + std::shared_ptr op_; + /*! \brief internal space for out_grad */ + std::vector out_grad_; + /*! \brief internal space for in_data */ + std::vector in_data_; + /*! \brief internal space for out_data */ + std::vector out_data_; + /*! + * \brief pointer to places in the internal space. + * arg_data_ptr_ maps in_data in Forward to the internal space. 
+ */ + std::vector arg_data_ptr_; +}; + +// get resource +inline std::vector +GraphExecutor::GetResource(uint32_t node_id) const { + const StaticGraph::Node &node = graph_.nodes[node_id]; + if (node.is_forward()) { + return node.op->ForwardResource(); + } else { + CHECK(node.is_backward()); + return graph_.nodes[node.backward_source_id].op->BackwardResource(); + } +} + +inline int GraphExecutor::GetNumOutputs(uint32_t node_id) const { + const StaticGraph::Node &node = graph_.nodes[node_id]; + if (node.is_forward()) { + return node.op->NumReturns(); + } else if (node.is_backward()) { + return static_cast( + graph_.nodes[node.backward_source_id].op->ListArguments().size()); + } else { + CHECK(node.is_variable()); + return 1; + } +} + +// implement get input option +template +inline std::vector > GraphExecutor::GetInplaceOption( + uint32_t node_id, + const std::vector &in_data, + const std::vector &out_data) const { + // get the node + const StaticGraph::Node &node = graph_.nodes[node_id]; + + if (node.is_forward()) { + std::vector in_data_index(in_data.size()); + for (size_t i = 0; i < in_data.size(); ++i) { + in_data_index[i] = static_cast(i); + } + std::vector out_data_ptr(out_data.size()); + for (size_t i = 0; i < out_data.size(); ++i) { + out_data_ptr[i] = (void*)&out_data[i]; // NOLINT(*) + } + auto rmap_index = node.op->ForwardInplaceOption(in_data_index, out_data_ptr); + std::vector > remap(rmap_index.size()); + for (size_t i = 0; i < remap.size(); ++i) { + remap[i].first = in_data[rmap_index[i].first]; + remap[i].second = *static_cast(rmap_index[i].second); + } + return std::move(remap); + } else { + CHECK(node.is_backward()); + // forward property + const OperatorProperty *fwd = graph_.nodes[node.backward_source_id].op.get(); + + std::vector out_grad_index(fwd->NumReturns()); + std::vector out_data_index(fwd->NumVisibleReturns()); + std::vector in_data_index(fwd->ListArguments().size()); + CHECK_EQ(in_data_index.size(), out_data.size()); + int counter = 0; + for (size_t i = 0; i < out_grad_index.size(); ++i) { + out_grad_index[i] = counter++; + } + for (size_t i = 0; i < in_data_index.size(); ++i) { + in_data_index[i] = counter++; + } + for (size_t i = 0; i < out_data_index.size(); ++i) { + out_data_index[i] = counter++; + } + auto args_index = fwd->DeclareBackwardDependency( + out_grad_index, in_data_index, out_data_index); + std::vector args_array(counter, nullptr); + CHECK_EQ(args_index.size(), in_data.size()); + for (size_t i = 0; i < in_data.size(); ++i) { + args_array[args_index[i]] = &in_data[i]; + } + std::vector in_grad_ptr(out_data.size()); + for (size_t i = 0; i < in_grad_ptr.size(); ++i) { + in_grad_ptr[i] = (void*)&out_data[i]; // NOLINT(*) + } + auto remap_index = fwd->BackwardInplaceOption( + out_grad_index, in_data_index, out_data_index, in_grad_ptr); + std::vector > remap(remap_index.size()); + for (size_t i = 0; i < remap_index.size(); ++i) { + CHECK_NE(args_array[remap_index[i].first], nullptr) + << "BackwardInplaceOption uses input that is returned by DeclareBackwardDependency"; + remap[i].first = *args_array[remap_index[i].first]; + remap[i].second = *static_cast(remap_index[i].second); + } + return std::move(remap); + } +} + +inline GraphExecutor::OpExecEntry +GraphExecutor::GetOpExecEntry(uint32_t nid) { + OpNode& op_node = op_nodes_[nid]; + Operator *op = op_node.op.get(); + std::vector req; + std::vector in_data, out_data; + in_data.reserve(graph_.nodes[nid].inputs.size()); + out_data.reserve(op_node.outputs.size()); + req.reserve(op_node.outputs.size()); + 
+ OpExecEntry exec; + for (const DataEntryInfo& out : op_node.outputs) { + out_data.push_back(out.data.data()); + exec.mutate_vars.push_back(out.data.var()); + req.push_back(out.op_req); + } + + for (StaticGraph::DataEntry e : graph_.nodes[nid].inputs) { + const DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index]; + in_data.push_back(info.data.data()); + // skip inplace since they already appear in mutate vars + if (info.inplace_op_id != static_cast(nid)) { + exec.use_vars.push_back(info.data.var()); + } + } + + OpContext* op_ctx_ptr = &op_node.op_ctx; + exec.exec_fun = [op, op_ctx_ptr, in_data, req, out_data] (RunContext ctx) { + op_ctx_ptr->run_ctx = ctx; + op->Forward(*op_ctx_ptr, in_data, req, out_data); + }; + return std::move(exec); +} + +void GraphExecutor::InitGraph(Symbol symbol, Context ctx, bool need_backward) { + // initialize all internal daa structures + symbol.ToStaticGraph(&graph_); + num_forward_nodes_ = graph_.nodes.size(); + if (need_backward) { + graph_.MakeBackwardPass(&head_grad_nodes_, &arg_grads_); + } + // reorganize so backward node always follow forward + // note that this may not be the case, because existence of head_grad_nodes + std::vector topo = graph_.TopoSort(); + std::vector backward; + for (uint32_t nid : topo) { + if (nid < num_forward_nodes_) { + topo_order_.push_back(nid); + } else { + backward.push_back(nid); + } + } + topo_order_.insert(topo_order_.end(), backward.begin(), backward.end()); + // setup all the operator nodes data structure + op_nodes_.resize(graph_.nodes.size()); + for (size_t i = 0; i < graph_.nodes.size(); ++i) { + op_nodes_[i].ctx = ctx; + op_nodes_[i].outputs.resize(GetNumOutputs(i)); + } +} + +void GraphExecutor::InitDataEntryInfo(const std::vector &in_args, + const std::vector &arg_grad_store, + const std::vector &grad_req_type) { + CHECK_EQ(arg_grad_store.size(), grad_req_type.size()); + CHECK_EQ(in_args.size(), graph_.arg_nodes.size()); + // bind inputs + for (size_t i = 0; i < graph_.arg_nodes.size(); ++i) { + DataEntryInfo &info = op_nodes_[graph_.arg_nodes[i]].outputs[0]; + info.type = kBindByExternal; + info.data = in_args[i]; + } + // setup ref for head nodes + for (StaticGraph::DataEntry e : graph_.heads) { + DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index]; + ++info.ref_count; + op_nodes_[e.source_id].activated = true; + } + // need Backward pass + if (arg_grads_.size() != 0) { + CHECK_EQ(arg_grads_.size(), arg_grad_store.size()); + CHECK_EQ(arg_grads_.size(), grad_req_type.size()); + // setup gradient placeholders + for (size_t i = 0; i < arg_grads_.size(); ++i) { + if (grad_req_type[i] == kNullOp) continue; + CHECK_NE(grad_req_type[i], kWriteInplace) + << "Gradient request can only be nullop, add, write"; + std::vector &grad_source = arg_grads_[i]; + CHECK_GE(grad_source.size(), 1); + // TODO(bing) add a aggregation node here + if (grad_source.size() > 1) { + CHECK_EQ(grad_req_type[i], kAddTo) + << "The gradient contains multiple variables,"; + } + for (StaticGraph::DataEntry e : grad_source) { + DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index]; + info.type = kBindByExternal; + info.op_req = grad_req_type[i]; + info.data = arg_grad_store[i]; + ++info.ref_count; + op_nodes_[e.source_id].activated = true; + } + } + // setup head gradient + for (uint32_t nid : head_grad_nodes_) { + DataEntryInfo &info = op_nodes_[nid].outputs[0]; + info.type = kTobeBindByExternal; + } + } + // update ref counters for all other nodes, in reverse topo order + for (auto it = topo_order_.rbegin(); it != 
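`InitGraph` takes one topological order of the combined graph and then stably moves every backward node (id at or above the number of forward nodes) behind all forward nodes. A sketch of just that reordering step, with made-up node ids:

```python
def reorder(topo_order, num_forward_nodes):
    forward = [nid for nid in topo_order if nid < num_forward_nodes]
    backward = [nid for nid in topo_order if nid >= num_forward_nodes]
    return forward + backward

print(reorder([0, 1, 5, 2, 4, 3], num_forward_nodes=4))  # [0, 1, 2, 3, 5, 4]
```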
topo_order_.rend(); ++it) { + uint32_t nid = *it; + if (op_nodes_[nid].activated) { + for (StaticGraph::DataEntry e : graph_.nodes[nid].inputs) { + DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index]; + ++info.ref_count; + op_nodes_[e.source_id].activated = true; + } + } + } + + // shape inference + std::vector > out_shapes(op_nodes_.size()); + for (size_t i = 0; i < out_shapes.size(); ++i) { + out_shapes[i].resize(op_nodes_[i].outputs.size()); + } + for (size_t i = 0; i < graph_.arg_nodes.size(); ++i) { + out_shapes[graph_.arg_nodes[i]][0] = in_args[i].shape(); + } + CHECK(graph_.InferNodeShapes(topo_order_, &out_shapes)) + << "Shape inference cannot be complete in bind"; + for (size_t i = 0; i < out_shapes.size(); ++i) { + for (size_t j = 0; j < out_shapes[i].size(); ++j) { + op_nodes_[i].outputs[j].shape = out_shapes[i][j]; + } + } +} + +void GraphExecutor::InitDataEntryMemory() { + // use allocator to allocate memory. + GraphStorageAllocator allocator(&graph_); + + for (size_t i = 0; i < topo_order_.size(); ++i) { + uint32_t nid = topo_order_[i]; + if (!op_nodes_[nid].activated) continue; + if (graph_.nodes[nid].is_variable()) continue; + + // check inplace option + std::vector in_data; + in_data.reserve(graph_.nodes[nid].inputs.size()); + // check inputs are ready. + for (StaticGraph::DataEntry e : graph_.nodes[nid].inputs) { + DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index]; + CHECK_NE(info.type, kNotInitialized); + CHECK_NE(info.ref_count, 0); + in_data.push_back(&info); + } + std::vector out_data(op_nodes_[nid].outputs.size()); + for (size_t i = 0; i < op_nodes_[nid].outputs.size(); ++i) { + out_data[i] = &op_nodes_[nid].outputs[i]; + CHECK_NE(out_data[i]->type, kInternalAllocated); + } + auto inplace = GetInplaceOption(nid, in_data, out_data); + for (std::pair kv : inplace) { + DataEntryInfo* in = kv.first; + DataEntryInfo* out = kv.second; + if (in->ref_count == 1 && + in->type == kInternalAllocated && + out->type == kNotInitialized) { + // we can only do inplace if we are last user of in + // and out is not initialized. 
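The allocation pass applies a simple in-place rule: an output may take over its input's storage only when the current node is the last reader of that input (`ref_count == 1`) and the output has no storage yet. A condensed Python sketch of the decision, using dicts as stand-ins for `DataEntryInfo`:

```python
def try_inplace(node_id, inp, out):
    """inp/out are dicts with 'ref_count', 'type' and 'storage_id' keys."""
    if inp["ref_count"] == 1 and inp["type"] == "allocated" and out["type"] == "uninit":
        out.update(type="allocated", storage_id=inp["storage_id"], op_req="write_inplace")
        inp["ref_count"] = 0          # consumed: no later node may read it
        inp["inplace_op_id"] = node_id
        return True
    return False

a = {"ref_count": 1, "type": "allocated", "storage_id": 7}
b = {"ref_count": 2, "type": "uninit", "storage_id": None}
print(try_inplace(3, a, b), b["storage_id"])  # True 7
```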
+ out->type = kInternalAllocated; + out->op_req = kWriteInplace; + out->storage_id = in->storage_id; + // set inplace op id + in->ref_count = 0; + in->inplace_op_id = static_cast(nid); + } + } + // allocate output, + for (DataEntryInfo *out : out_data) { + if (out->op_req == kNullOp && out->ref_count != 0) { + out->op_req = kWriteTo; + } + if (out->type == kNotInitialized) { + out->storage_id = allocator.Request( + op_nodes_[nid].ctx, out->shape, nid); + out->type = kInternalAllocated; + } + } + // then free inputs + for (DataEntryInfo *in : in_data) { + // ref_count == 0 means it is taken by inplace op + if (in->ref_count == 0) { + CHECK_EQ(in->inplace_op_id, static_cast(nid)); + continue; + } + // if we decrease it to zero, means we are ready to relase + if (--in->ref_count == 0) { + allocator.Release(in->storage_id, nid); + } + } + // check out again, if there is ref_count == 0, release it + for (DataEntryInfo *out : out_data) { + if (out->ref_count == 0) { + allocator.Release(out->storage_id, nid); + } + } + } + // one pass complete, allocate real memory + allocator.InitStorages(); + // get the real data NArray into the DataEntryInfo + for (size_t i = 0; i < topo_order_.size(); ++i) { + uint32_t nid = topo_order_[i]; + if (!op_nodes_[nid].activated) continue; + for (DataEntryInfo &out : op_nodes_[nid].outputs) { + CHECK_NE(out.type, kNotInitialized); + if (out.type == kInternalAllocated) { + out.data = allocator.Get(out.storage_id, out.shape); + } + } + } + for (StaticGraph::DataEntry e : graph_.heads) { + DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index]; + CHECK_EQ(info.type, kInternalAllocated); + heads_narray_.push_back(info.data); + } +} + +void GraphExecutor::InitOpNodes() { + for (size_t i = 0; i < topo_order_.size(); ++i) { + uint32_t nid = topo_order_[i]; + if (!op_nodes_[nid].activated) continue; + if (graph_.nodes[nid].is_variable()) continue; + OpNode& op_node = op_nodes_[nid]; + if (graph_.nodes[nid].is_forward()) { + op_node.op.reset(graph_.nodes[nid].op->CreateOperator(op_node.ctx)); + } else { + CHECK(graph_.nodes[nid].is_backward()); + op_node.op.reset(new BackwardOpWrapper( + graph_.nodes[graph_.nodes[nid].backward_source_id].op.get(), + op_nodes_[graph_.nodes[nid].backward_source_id].op)); + } + bool allow_cache = true; + for (StaticGraph::DataEntry e : graph_.nodes[nid].inputs) { + DataEntryInfo& info = op_nodes_[e.source_id].outputs[e.index]; + if (info.type == kTobeBindByExternal) allow_cache = false; + } + for (DataEntryInfo& info : op_node.outputs) { + if (info.type == kTobeBindByExternal) allow_cache = false; + } + if (allow_cache) { + op_node.cached_exec = GetOpExecEntry(nid); + } + } +} + +void GraphExecutor::RunOps(size_t topo_start, size_t topo_end) { + for (size_t i = topo_start; i < topo_end; ++i) { + uint32_t nid = topo_order_[i]; + if (!op_nodes_[nid].activated) continue; + if (graph_.nodes[nid].is_variable()) continue; + OpNode& opnode = op_nodes_[nid]; + if (opnode.cached_exec.exec_fun != nullptr) { + DAGEngine::Get()->Push( + opnode.cached_exec.exec_fun, + opnode.ctx, + opnode.cached_exec.use_vars, + opnode.cached_exec.mutate_vars); + } else { + auto exec = GetOpExecEntry(nid); + DAGEngine::Get()->Push( + exec.exec_fun, + opnode.ctx, + exec.use_vars, + exec.mutate_vars); + } + } +} + +void GraphExecutor::Forward() { + RunOps(0, num_forward_nodes_); +} + +void GraphExecutor::Backward(const std::vector &head_grads) { + CHECK_EQ(head_grad_nodes_.size(), head_grads.size()); + for (size_t i = 0; i < head_grad_nodes_.size(); ++i) { + uint32_t 
nid = head_grad_nodes_[i]; + CHECK(graph_.nodes[nid].is_variable()); + DataEntryInfo &info = op_nodes_[nid].outputs[0]; + CHECK_EQ(info.type, kTobeBindByExternal); + info.data = head_grads[i]; + } + RunOps(num_forward_nodes_, topo_order_.size()); +} + +Executor *Executor::Bind(Symbol symbol, + Context ctx, + const std::vector &in_args, + const std::vector &arg_grad_store, + const std::vector &grad_req_type) { + GraphExecutor *exec = new GraphExecutor(); + exec->Init(symbol, ctx, in_args, arg_grad_store, grad_req_type); + return exec; +} +} // namespace mxnet diff --git a/src/symbol/graph_executor.h b/src/symbol/graph_executor.h new file mode 100644 index 000000000000..ccc4e64a904f --- /dev/null +++ b/src/symbol/graph_executor.h @@ -0,0 +1,179 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file graph_executor.h + * \brief Executor to execute the Forward and Backward on Composition Graph. +*/ +#ifndef MXNET_SYMBOL_GRAPH_EXECUTOR_H_ +#define MXNET_SYMBOL_GRAPH_EXECUTOR_H_ + +#include +#include +#include +#include +#include "./graph_memory_allocator.h" + +namespace mxnet { +/*! + * \brief Executor of a computation graph. + */ +class GraphExecutor : public Executor { + public: + virtual ~GraphExecutor() {} + virtual void Forward(); + virtual void Backward(const std::vector &head_grads); + virtual const std::vector &heads() const { + return heads_narray_; + } + // implement Executor::Bind, only call it once. + inline void Init(Symbol symbol, + Context ctx, + const std::vector &in_args, + const std::vector &arg_grad_store, + const std::vector &grad_req_type) { + CHECK_EQ(grad_req_type.size(), arg_grad_store.size()); + bool need_backward = false; + for (auto req : grad_req_type) { + if (req != kNullOp) need_backward = true; + } + this->InitGraph(symbol, ctx, need_backward); + this->InitDataEntryInfo(in_args, arg_grad_store, grad_req_type); + this->InitDataEntryMemory(); + this->InitOpNodes(); + } + + protected: + // internal class of wrapping BackwardOp as ForwardOp + class BackwardOpWrapper; + // type of data entry + enum DataEntryType { + // memory is binded by external NArray in Bind + kBindByExternal, + // to be binded by external NArray in Forward and Backward + kTobeBindByExternal, + // internal memory, allocated + kInternalAllocated, + // internal memory, to be allocated + kNotInitialized + }; + // Additional information about each data entry + struct DataEntryInfo { + // the actual data for the entry + NArray data; + // write request to this entry + OpReqType op_req; + // the operatio node that will take + // this DataEntry as inplace input + int inplace_op_id; + // data entry type + DataEntryType type; + // shape of this entry + TShape shape; + // storage id from allocator if it is internal allocation. + GraphStorageAllocator::StorageID storage_id; + // reference count on how many times this entry is being used. + // That is how many operators and heads need this DataEntry + // this is a temporal variable that is used during initialization. 
+ uint32_t ref_count; + // constructor + DataEntryInfo() + : op_req(kNullOp), + inplace_op_id(-1), + type(kNotInitialized), + ref_count(0) {} + }; + // all the information needed to push the op to engine + struct OpExecEntry { + // execution function for + DAGEngine::Op exec_fun; + // variables to read from + std::vector use_vars; + // variables to mutate + std::vector mutate_vars; + // constructor + OpExecEntry() : exec_fun(nullptr) {} + }; + // Information about operational node + struct OpNode { + // whether this op node is activated + bool activated; + // the context of the node + Context ctx; + // data entry information about outputs of op + std::vector outputs; + // The following parts are constructed in InitOpNodes + // the real operator + std::shared_ptr op; + // op context, that is defined for this op. + OpContext op_ctx; + // executor, this is only allocated for nodes + // whose inputs, outputs are pre-defined. + // otherwise cached_exec.exec_fun == nullptr + OpExecEntry cached_exec; + // constructor + OpNode() : activated(false) {} + }; + /*! + * \brief Get input option of a node. + * This function is overriden for both Forward and Backward node. + * + * \param node_id node index of node in StaticGraph + * \param in_data the input data entry to the node + * \param out_data the output data entry in the graph + * \return the paired inplace option. + */ + template + inline std::vector > GetInplaceOption( + uint32_t node_id, + const std::vector &in_data, + const std::vector &out_data) const; + /*! + * \brief Get resource requirement of a node. + * This function is overriden for both Forward and Backward node. + * \param node_id node index of node in StaticGraph + * \return the desired resource request. + */ + inline std::vector GetResource(uint32_t node_id) const; + /*! + * \brief Get number of outputs of a node. + * This function is overriden for both Forward and Backward node. + * \param node_id node index of node in StaticGraph + * \return the number of outputs of the node. + */ + inline int GetNumOutputs(uint32_t node_id) const; + /*! + * \brief get execution entry for an OpNode. + * This function can only be called after initialization is done. + * \param node_id the id of operational node. + * \return the execution entry. 
+ */ + inline OpExecEntry GetOpExecEntry(uint32_t node_id); + // initialize the internal graph structure + void InitGraph(Symbol symbol, Context ctx, bool need_backward); + // initialize internal DataEntryInfo, reference counting + void InitDataEntryInfo(const std::vector &in_args, + const std::vector &arg_grad_store, + const std::vector &grad_req_type); + // initialize internal data entries NArray + void InitDataEntryMemory(); + // initialize OpNode data structure + void InitOpNodes(); + // run ops from topo order start to end + void RunOps(size_t topo_start, size_t topo_end); + // internal computational graph + StaticGraph graph_; + // topological order of nodes in computation graph + // backward nodes always follow forward nodes + std::vector topo_order_; + // number of forward nodes in the graph + size_t num_forward_nodes_; + // head gradient node in the graph, if there is backward pass + std::vector head_grad_nodes_; + // argument node in the graph, if there is backward pass + std::vector > arg_grads_; + // operational nodes + std::vector op_nodes_; + // head NArrays + std::vector heads_narray_; +}; // class GraphExecutor +} // namespace mxnet +#endif // MXNET_SYMBOL_GRAPH_EXECUTOR_H_ diff --git a/src/symbol/graph_memory_allocator.h b/src/symbol/graph_memory_allocator.h new file mode 100644 index 000000000000..4c047040a041 --- /dev/null +++ b/src/symbol/graph_memory_allocator.h @@ -0,0 +1,145 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file graph_memory_allocator.h + * \brief Memory allocator for graph executor. +*/ +#ifndef MXNET_SYMBOL_GRAPH_MEMORY_ALLOCATOR_H_ +#define MXNET_SYMBOL_GRAPH_MEMORY_ALLOCATOR_H_ + +#include +#include +#include +#include + +namespace mxnet { +/*! + * \brief Memory allocators for the GraphExecutor. + * This class is intended to be used by GraphExecutor + * to allocate the memory for each DataEntryInfo. + * + * The class algorithm works in two phase: + * (1) Planning Phase: GraphExecutor call Request and Release + * to request and release resources according to dependency. + * - Each call to Request will get a ResourceID that is used to + * identify the memory block assigned to each DataEntryInfo. + * (2) Allocating phase: GraphExecutor call InitMemory. + * - Then each DataEntry will call Get to get the real NArray. + * (3) All the memory will be freed up when reference to all the related NArray ends. + */ +class GraphStorageAllocator { + public: + /*! \brief resource index */ + typedef uint64_t StorageID; + /*! \brief constructor to the graph memory allocator */ + explicit GraphStorageAllocator(StaticGraph *graph); + /*! + * \brief Request a memory. + * \param ctx the context of the graph + * \param shape shape of the NArray we want + * \param node_id the node that is requesting the memory, used as hint. + */ + StorageID Request(Context ctx, TShape shape, uint32_t node_id); + /*! + * \brief Release a memory. + * \param id the storage ID of the memory. + * \param node_id the node id in the graph that is releasing the memory. + */ + void Release(StorageID id, uint32_t node_id); + /*! \brief Initialize all the memories requested */ + void InitStorages(); + /*! + * \brief Get the the memory allocated in planning phase. + * \param id the storage id allocated in planning phase. + * \param shape the shape of the NArray requested. + */ + NArray Get(StorageID id, TShape shape); + + private: + /*! \brief internal storage entry */ + struct StorageEntry { + /*! \brief id of the storage */ + StorageID id; + /*! 
\brief the context of the storage */ + Context ctx; + /*! \brief maximum size of the storage that is requested */ + size_t max_size; + /*! \brief the actual NArray to hold the data */ + NArray data; + /*! \brief constructor */ + StorageEntry() : max_size(0) {} + }; + /*! + * \brief Allocate a StorageID when Request cannot found existing ones. + * \param ctx the context of the graph + * \param shape shape of the NArray we want + */ + StorageID Alloc(Context ctx, size_t size); + + /*! \brief reference to the computation graph */ + StaticGraph *graph_; + /*! \brief all the resources available */ + std::vector > data_; + /*! + * \brief free list of storage entries, maps size to free list + */ + std::multimap free_; +}; + +// put implementation in header files for now +GraphStorageAllocator::GraphStorageAllocator(StaticGraph *graph) + : graph_(graph) {} + +GraphStorageAllocator::StorageID +GraphStorageAllocator::Alloc(Context ctx, size_t size) { + StorageID id = static_cast(data_.size()); + std::unique_ptr ptr(new StorageEntry()); + ptr->id = id; + ptr->ctx = ctx; + ptr->max_size = size; + data_.push_back(std::move(ptr)); + return id; +} + +GraphStorageAllocator::StorageID +GraphStorageAllocator::Request(Context ctx, TShape shape, uint32_t node_id) { + size_t size = shape.Size(); + if (free_.count(size) != 0) { + auto begin = free_.lower_bound(size); + auto end = free_.upper_bound(size); + // vector of possible candidates + for (auto it = begin; it != end; ++it) { + StorageEntry *e = it->second; + if (e->ctx != ctx) continue; + // Use exect matching strategy + // TODO(bing): think of other strategies, for example, rough match. + if (e->max_size != size) continue; + // find a exact match, erase from map and return + free_.erase(it); + return e->id; + } + } + // cannot find anything return a new one. + return this->Alloc(ctx, size); +} + +void GraphStorageAllocator::Release(StorageID id, uint32_t node_id) { + StorageEntry *e = data_[id].get(); + free_.insert({e->max_size, e}); +} + +void GraphStorageAllocator::InitStorages() { + for (size_t i = 0; i < data_.size(); ++i) { + StorageEntry *e = data_[i].get(); + TShape shape = mshadow::Shape1(e->max_size); + e->data = NArray(shape, e->ctx); + } +} + +NArray GraphStorageAllocator::Get(StorageID id, TShape shape) { + StorageEntry *e = data_[id].get(); + // TODO(bing): change to return e->data.Slice(0, shape.Size()).Reshape(shape); + // once we are able to get NArray that shares memory from a big chunk. 
+ return NArray(shape, e->ctx); +} +} // namespace mxnet +#endif // MXNET_SYMBOL_GRAPH_MEMORY_ALLOCATOR_H_ diff --git a/src/symbol/static_graph.cc b/src/symbol/static_graph.cc index 3bec3427fbb3..5eb0ad14a282 100644 --- a/src/symbol/static_graph.cc +++ b/src/symbol/static_graph.cc @@ -152,6 +152,18 @@ bool StaticGraph::InferShape(std::vector *in_shape, return true; } +StaticGraph::Node StaticGraph::CreateSumNode( + const std::vector &grad_source) { + // find multiple gradients, need aggregate + std::ostringstream os_size; + Node agg_node; + agg_node.op.reset(OperatorProperty::Create("ElementWiseSum")); + os_size << grad_source.size(); + agg_node.op->Init({{"size", os_size.str()}}); + agg_node.inputs = grad_source; + return std::move(agg_node); +} + void StaticGraph::MakeBackwardPass(std::vector *head_grad_nodes, std::vector > *arg_grads) { arg_grads->clear(); @@ -162,14 +174,15 @@ void StaticGraph::MakeBackwardPass(std::vector *head_grad_nodes, std::map > grad_map; // allocate head gradient nodes for (DataEntry head : heads) { - uint32_t nid = static_cast(nodes.size()); - // create a variable node for gradient input - nodes.push_back(Node()); - Node &node = nodes[nid]; + Node node; std::ostringstream os; os << nodes[head.source_id].name << '_' << head.index << "_grad"; // TODO(bing): add index to name node.name = os.str(); + // node id + uint32_t nid = static_cast(nodes.size()); + nodes.push_back(std::move(node)); + // create a variable node for gradient input DataEntry igrad(nid, 0); head_grad_nodes->push_back(nid); // update gradient map @@ -204,31 +217,25 @@ void StaticGraph::MakeBackwardPass(std::vector *head_grad_nodes, if (gnodes.size() == 1) { out_grad.push_back(gnodes[0]); } else { - // find multiple gradients, need aggregate - std::ostringstream os_size, os_name; - uint32_t agg_node_id = static_cast(nodes.size()); - nodes.push_back(Node()); - Node &agg_node = nodes[agg_node_id]; - agg_node.op.reset(OperatorProperty::Create("ElementWiseSum")); - os_size << gnodes.size(); - agg_node.op->Init({{"size", os_size.str()}}); + std::ostringstream os_name; + Node agg_node = StaticGraph::CreateSumNode(gnodes); os_name << nodes[nid].name << '_' << i << "_out_grad_agg"; agg_node.name = os_name.str(); - agg_node.inputs = gnodes; + uint32_t agg_node_id = static_cast(nodes.size()); + nodes.push_back(std::move(agg_node)); out_grad.push_back(DataEntry(agg_node_id, 0)); } } // Create a gradient backward node - nodes.push_back(Node()); - uint32_t grad_node_id = static_cast(nodes.size()); - Node &grad_node = nodes[grad_node_id]; + Node grad_node; // Point to the corresponding source grad_node.backward_source_id = nid; // select out the dependent inputs grad_node.inputs = nodes[nid].op->BackwardInputs( out_grad, nodes[nid].inputs, out_data); grad_node.name = nodes[nid].name + "_backward"; - + uint32_t grad_node_id = static_cast(nodes.size()); + nodes.push_back(std::move(grad_node)); // update gradient map for (size_t i = 0; i < nodes[nid].inputs.size(); ++i) { DataEntry idata = nodes[nid].inputs[i]; From 7a1832cb719fa412c85f9c3bc5de22a7f2634c51 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Wed, 19 Aug 2015 20:35:55 -0600 Subject: [PATCH 10/11] update NArray --- include/mxnet/narray.h | 66 +++++++++++++++++++++++++++++++++--------- python/mxnet/symbol.py | 10 +++---- src/c_api.cc | 3 +- src/narray/narray.cc | 18 ++++++++---- 4 files changed, 71 insertions(+), 26 deletions(-) diff --git a/include/mxnet/narray.h b/include/mxnet/narray.h index 92257b3f0269..cc5d2cf1b4a2 100644 --- a/include/mxnet/narray.h 
+++ b/include/mxnet/narray.h @@ -35,7 +35,7 @@ class NArray { */ NArray(const TShape &shape, Context ctx, bool delay_alloc = false) - : ptr_(new Chunk(shape, ctx, delay_alloc)) { + : ptr_(new Chunk(shape.Size(), ctx, delay_alloc)), shape_(shape), offset_(0) { } /*! * \brief constructing a static NArray that shares data with TBlob @@ -45,19 +45,20 @@ class NArray { * \param dev_id the device id this tensor sits at */ NArray(const TBlob &data, int dev_id) - : ptr_(new Chunk(data, dev_id)) { + : ptr_(new Chunk(data, dev_id)), shape_(data.shape_), offset_(0) { } /*! * \return the shape of current NArray */ inline const TShape &shape() const { - return ptr_->data.shape_; + return shape_; } /*! * \return the data TBlob */ - inline const TBlob &data() const { - return ptr_->data; + inline TBlob data() const { + return TBlob(static_cast(ptr_->shandle.dptr) + offset_, \ + shape_, ptr_->shandle.ctx.dev_mask); } /*! * \return the context of NArray, this function is only valid when the NArray is not empty @@ -123,6 +124,42 @@ class NArray { * \return the new copy */ NArray Copy(Context ctx) const; + /*! + * \brief Slice a NArray + * + * \param begin begin index in first dim + * \param end end index in first dim + * + * \return sliced NArray + */ + NArray Slice(index_t begin, index_t end) { + NArray ret = *this; + CHECK_GE(shape_.ndim(), 0) << "NArray not initialized"; + CHECK_GE(shape_[0], end) << "Chunk is smaller than required"; + size_t length = 1; + if (shape_.ndim() == 1) { + ret.offset_= begin; + } else { + for (index_t i = 1; i < shape_.ndim(); ++i) { + length *= shape_[i]; + } + ret.offset_ = begin * length; + } + return ret; + } + /*! + * \brief Reshape current NArray + * + * \param shape new shape + * \return NArray in new shape + */ + NArray Reshape(const TShape &shape) { + CHECK_GE(shape_.Size(), shape.Size()) \ + << "required shape is larger than chunk"; + NArray ret = *this; + ret.shape_ = shape; + return ret; + } private: /*! \brief the real data chunk that backs NArray */ @@ -131,8 +168,6 @@ class NArray { Storage::Handle shandle; /*! \brief variable from DAG engine */ DAGEngine::Variable var; - /*! \brief holds the data content */ - TBlob data; /*! * \brief if this is true, this means the data do not come * from Storage, and do not need to be freed @@ -146,25 +181,25 @@ class NArray { } /*! \brief construct from static data */ Chunk(const TBlob &data, int dev_id) - : data(data), - static_data(true), + : static_data(true), delay_alloc(false) { var = DAGEngine::Get()->NewVar(); shandle.ctx = Context(data.dev_mask_, dev_id); + shandle.dptr = data.dptr_; + shandle.size = data.shape_.Size() * sizeof(real_t); } /*! \brief construct a new chunk */ - Chunk(const TShape &shape, Context ctx, bool delay_alloc_) + Chunk(uint64_t size, Context ctx, bool delay_alloc_) : static_data(false), delay_alloc(true) { var = DAGEngine::Get()->NewVar(); - data.shape_ = shape; + shandle.size = size * sizeof(real_t); shandle.ctx = ctx; if (!delay_alloc_) this->CheckAndAlloc(); } /*! \brief check if delay alloc is on, do alloc if not yet done */ inline void CheckAndAlloc(void) { if (delay_alloc) { - shandle = Storage::Get()->Alloc(data.shape_.Size() * sizeof(real_t), shandle.ctx); - data = TBlob(static_cast(shandle.dptr), data.shape_, shandle.ctx.dev_mask); + shandle = Storage::Get()->Alloc(shandle.size, shandle.ctx); delay_alloc = false; } } @@ -183,6 +218,11 @@ class NArray { }; /*! \brief internal data of NArray */ std::shared_ptr ptr_; + /*! \brief shape of current NArray */ + TShape shape_; + /*! 
\brief offset in chunk */ + size_t offset_; + // add friend to helper functions friend void CopyFromTo(const NArray &from, NArray *to); template diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py index c491eacb1ac4..dbe60ddb7a78 100644 --- a/python/mxnet/symbol.py +++ b/python/mxnet/symbol.py @@ -253,7 +253,7 @@ def forward(self, inputs): if not isinstance(obj, NArray): raise TypeError("inputs must be NArray") narray = c_array([item.handle for item in inputs]) - check_call(_LIB.MXExecutorForward (self.hanlde, mx_uint(len(inputs), narray)) + check_call(_LIB.MXExecutorForward (self.hanlde, mx_uint(len(inputs), narray))) def backward(self, grads): """do backward on heads' grads @@ -269,7 +269,7 @@ def backward(self, grads): if not isinstance(obj, NArray): raise TypeError("inputs must be NArray") narray = c_array(NArrayHandle, [item.handle for item in grads]) - check_call(_LIB.MXExecutorForward (self.hanlde, mx_uint(len(grads), narray)) + check_call(_LIB.MXExecutorForward (self.hanlde, mx_uint(len(grads), narray))) def heads(self): """list all heads' output narray @@ -311,9 +311,9 @@ def Bind(sym, ctx, args, args_grad, reqs): raise TypeError("Context type error") args_handle = c_array(NArrayHandle, [item.handle for item in args]) args_grad_handle = c_array(NArrayHandle, [item.handle for item in args_grad]) - reqs_array = c_array(mx_uint, mx_uint(enum[item]) for item in req) + reqs_array = c_array(mx_uint, [mx_uint(enum[item]) for item in req]) handle = ExecutorHandle() check_call(_LIB.MXExecutorBind(handle, sym.handle, \ mx_uint(ctx.device_mask), mx_uint(ctx.device_id), \ - args_handle, args_grad_handle, reqs_array) - return Executor(handle); \ No newline at end of file + mx_uint(len(args), args_handle, args_grad_handle, reqs_array))) + return Executor(handle) diff --git a/src/c_api.cc b/src/c_api.cc index ab7899767555..d861ac00fc41 100644 --- a/src/c_api.cc +++ b/src/c_api.cc @@ -528,7 +528,6 @@ MXNET_DLL int MXExecutorBind(ExecutorHandle handle, NArrayHandle *arg_grad_store, mx_uint *grad_req_type) { API_BEGIN(); - Executor *exec = static_cast(handle); Symbol *symb = static_cast(symbol_handle); Context ctx = Context(dev_mask, dev_id); NArray **in_args_ptr = reinterpret_cast(in_args); @@ -541,7 +540,7 @@ MXNET_DLL int MXExecutorBind(ExecutorHandle handle, arg_grad_vec.push_back(*(arg_grad_ptr[i])); grad_req_vec.push_back(static_cast(grad_req_type[i])); } - handle = exec->Bind(*symb, ctx, in_args_vec, arg_grad_vec, grad_req_vec); + handle = Executor::Bind(*symb, ctx, in_args_vec, arg_grad_vec, grad_req_vec); API_END(); } diff --git a/src/narray/narray.cc b/src/narray/narray.cc index 831041bd1496..3618a38c9d59 100644 --- a/src/narray/narray.cc +++ b/src/narray/narray.cc @@ -37,14 +37,16 @@ inline void BinaryOp(const NArray &lhs, case cpu::kDevMask: DAGEngine::Get()->Push([lhs, rhs, ret](RunContext ctx) { ret.ptr_->CheckAndAlloc(); - narray::Eval(lhs.ptr_->data, rhs.ptr_->data, &ret.ptr_->data, ctx); + TBlob tmp = ret.data(); + narray::Eval(lhs.data(), rhs.data(), &tmp, ctx); }, lhs.ctx(), {lhs.ptr_->var, rhs.ptr_->var}, {ret.ptr_->var}); break; #if MXNET_USE_CUDA case gpu::kDevMask: DAGEngine::Get()->Push([lhs, rhs, ret](RunContext ctx) { ret.ptr_->CheckAndAlloc(); - narray::Eval(lhs.ptr_->data, rhs.ptr_->data, &ret.ptr_->data, ctx); + TBlob tmp = ret.data(); + narray::Eval(lhs.data(), rhs.data(), &tmp, ctx); }, lhs.ctx(), {lhs.ptr_->var, rhs.ptr_->var}, {ret.ptr_->var}); break; #endif @@ -64,14 +66,16 @@ void CopyFromTo(const NArray &from, NArray *to) { if (a == cpu::kDevMask && b == 
cpu::kDevMask) { DAGEngine::Get()->Push([from, ret](RunContext ctx) { ret.ptr_->CheckAndAlloc(); - narray::Copy(from.ptr_->data, &ret.ptr_->data, + TBlob tmp = ret.data(); + narray::Copy(from.data(), &tmp, from.ctx(), ret.ctx(), ctx); }, from.ctx(), {from.ptr_->var}, {ret.ptr_->var}); } else if (a == cpu::kDevMask && b == gpu::kDevMask) { #if MXNET_USE_CUDA DAGEngine::Get()->Push([from, ret](RunContext ctx) { ret.ptr_->CheckAndAlloc(); - narray::Copy(from.ptr_->data, &ret.ptr_->data, + TBlob tmp = ret.data(); + narray::Copy(from.data(), &tmp, from.ctx(), ret.ctx(), ctx); }, ret.ctx(), {from.ptr_->var}, {ret.ptr_->var}); #else @@ -81,7 +85,8 @@ void CopyFromTo(const NArray &from, NArray *to) { #if MXNET_USE_CUDA DAGEngine::Get()->Push([from, ret](RunContext ctx) { ret.ptr_->CheckAndAlloc(); - narray::Copy(from.ptr_->data, &ret.ptr_->data, + TBlob tmp = ret.data(); + narray::Copy(from.data(), &tmp, from.ctx(), ret.ctx(), ctx); }, from.ctx(), {from.ptr_->var}, {ret.ptr_->var}); #else @@ -91,7 +96,8 @@ void CopyFromTo(const NArray &from, NArray *to) { #if MXNET_USE_CUDA DAGEngine::Get()->Push([from, ret](RunContext ctx) { ret.ptr_->CheckAndAlloc(); - narray::Copy(from.ptr_->data, &ret.ptr_->data, + TBlob tmp = ret.data(); + narray::Copy(from.data(), &tmp, from.ctx(), ret.ctx(), ctx); }, from.ctx(), {from.ptr_->var}, {ret.ptr_->var}); #else From 910738d84b9906673711af35c522cbdf778f856f Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Thu, 20 Aug 2015 00:14:58 -0600 Subject: [PATCH 11/11] MNIST is OK --- include/mxnet/c_api.h | 151 ++++++++-------------------- include/mxnet/narray.h | 11 +- include/mxnet/operator.h | 2 +- python/mxnet/executor.py | 57 +++++++++++ python/mxnet/symbol.py | 125 ++++++----------------- python/test_mnist.py | 55 +++++----- src/c_api.cc | 55 +++++----- src/symbol/graph_executor.cc | 39 +++++-- src/symbol/graph_executor.h | 7 ++ src/symbol/graph_memory_allocator.h | 36 +++---- 10 files changed, 256 insertions(+), 282 deletions(-) create mode 100644 python/mxnet/executor.py diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index cd0b6b2206c1..38132cb169a5 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -34,12 +34,10 @@ typedef void *AtomicSymbolCreator; typedef void *SymbolHandle; /*! \brief handle to a AtomicSymbol */ typedef void *AtomicSymbolHandle; -/*! \brief handle to a NArrayOperator */ -typedef void *OperatorHandle; -/*! \brief handle to a DataIterator */ -typedef void *DataIterHandle; /*! \brief handle to an Executor */ typedef void *ExecutorHandle; +/*! \brief handle to a DataIterator */ +typedef void *DataIterHandle; /* * \brief return str message of the last error * all function in this file will return 0 when success @@ -353,63 +351,59 @@ MXNET_DLL int MXSymbolInferShape(SymbolHandle sym, const mx_uint ***out_shape_data, int *complete); //-------------------------------------------- -// Part 4: operator interface on NArray +// Part 4: Executor interface //-------------------------------------------- /*! - * \brief create operator from symbol - * \param sym the symbol to create operator from - * \param dev_mask device mask to indicate the device type - * \param dev_id the device id we want to bind the symbol to - * \param out the corresponding function handle - * \return 0 when success, -1 when failure happens - */ -MXNET_DLL int MXOpCreate(SymbolHandle sym, - int dev_mask, - int dev_id, - OperatorHandle *out); -/*! 
- * \brief free the operator handle - * \param op the handle to be freed + * \brief Executor forward method + * + * \param handle executor handle * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXOpFree(OperatorHandle op); +MXNET_DLL int MXExecutorForward(ExecutorHandle handle); /*! - * \brief return an array to describe the arguments - * of this operator - * \param out_size the size of output array - * \param out_array the array of parameter requirments + * \brief Excecutor run backward + * + * \param handle execute handle + * \param len lenth + * \param head_grads NArray handle for heads' gradient + * * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXOpDescribeArgs(mx_uint *out_size, - int **out_array); +MXNET_DLL int MXExecutorBackward(ExecutorHandle handle, + mx_uint len, + NArrayHandle *head_grads); + /*! - * \brief call forward on the operator - * \param op the operator handle - * \param in_data array of input narray to the operator - * \param out_data array of output NArray to hold the result + * \brief Get executor's head NArray + * + * \param handle executor handle + * \param out_size output narray vector size + * \param out out put narray handles * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXOpForward(OperatorHandle op, - NArrayHandle *in_data, - NArrayHandle *out_data); +MXNET_DLL int MXExecutorHeads(ExecutorHandle handle, + mx_uint *out_size, + NArrayHandle **out); + /*! - * \brief call backward on the operator - * \param op the operator handle - * \param grad_next array of output gradients - * \param in_data array of input narray to the operator - * \param out_data array of output narray to the operator - * \param out_grad array to holds the gradient on these input - * can be NULL if that position request is kNullOp - * \param reqs gradient request type + * \brief Generate Executor from symbol + * + * \param symbol_handle symbol handle + * \param len length + * \param in_args in args array + * \param arg_grad_store arg grads handle array + * \param grad_req_type grad req array + * \param out output executor handle * \return 0 when success, -1 when failure happens - * \sa mxnet::Operator::GradReqType */ -MXNET_DLL int MXOpBackward(OperatorHandle op, - NArrayHandle *grad_next, - NArrayHandle *in_data, - NArrayHandle *out_data, - NArrayHandle *out_grad, - mx_uint *reqs); +MXNET_DLL int MXExecutorBind(SymbolHandle symbol_handle, + int dev_mask, + int dev_id, + mx_uint len, + NArrayHandle *in_args, + NArrayHandle *arg_grad_store, + mx_uint *grad_req_type, + ExecutorHandle *out); //-------------------------------------------- // Part 5: IO Interface @@ -460,65 +454,4 @@ MXNET_DLL int MXIOGetData(DataIterHandle handle, MXNET_DLL int MXIOGetLabel(DataIterHandle handle, NArrayHandle *out); -//-------------------------------------------- -// Part 56: Executor -//-------------------------------------------- -/*! 
- * \brief Executor forward method - * - * \param handle executor handle - * \param len length of narray handles - * \param input input NArray handles - * - * \return 0 when success, -1 when failure happens - */ -MXNET_DLL int MXExecutorForward(ExecutorHandle handle, - mx_uint len, - NArrayHandle *input); - -/** - * \brief Excecutor run backward - * - * \param handle execute handle - * \param len lenth - * \param head_grads NArray handle for heads' gradient - * - * \return 0 when success, -1 when failure happens - */ -MXNET_DLL int MXExecutorBackward(ExecutorHandle handle, - mx_uint len, - NArrayHandle *head_grads); - -/** - * \brief Get executor's head NArray - * - * \param handle executor handle - * \param out_size output narray vector size - * \param out out put narray handles - * \return 0 when success, -1 when failure happens - */ -MXNET_DLL int MXExecutorHeads(ExecutorHandle handle, - mx_uint *out_size, - NArrayHandle **out); - -/** - * \brief Generate Executor from symbol - * - * \param handle executor hanlde (to be generated) - * \param symbol_handle symbol handle - * \param len length - * \param in_args in args array - * \param arg_grad_store arg grads handle array - * \param grad_req_type grad req array - * \return 0 when success, -1 when failure happens - */ -MXNET_DLL int MXExecutorBind(ExecutorHandle handle, - SymbolHandle symbol_handle, - int dev_mask, - int dev_id, - mx_uint len, - NArrayHandle *in_args, - NArrayHandle *arg_grad_store, - mx_uint *grad_req_type); - #endif // MXNET_C_API_H_ diff --git a/include/mxnet/narray.h b/include/mxnet/narray.h index cc5d2cf1b4a2..ed2b72bc4cc5 100644 --- a/include/mxnet/narray.h +++ b/include/mxnet/narray.h @@ -126,13 +126,13 @@ class NArray { NArray Copy(Context ctx) const; /*! * \brief Slice a NArray - * + * * \param begin begin index in first dim * \param end end index in first dim - * + * * \return sliced NArray */ - NArray Slice(index_t begin, index_t end) { + inline NArray Slice(index_t begin, index_t end) const { NArray ret = *this; CHECK_GE(shape_.ndim(), 0) << "NArray not initialized"; CHECK_GE(shape_[0], end) << "Chunk is smaller than required"; @@ -145,15 +145,16 @@ class NArray { } ret.offset_ = begin * length; } + ret.shape_[0] = end - begin; return ret; } /*! * \brief Reshape current NArray - * + * * \param shape new shape * \return NArray in new shape */ - NArray Reshape(const TShape &shape) { + inline NArray Reshape(const TShape &shape) const { CHECK_GE(shape_.Size(), shape.Size()) \ << "required shape is larger than chunk"; NArray ret = *this; diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h index bc1d79b20b38..e60afe6948a7 100644 --- a/include/mxnet/operator.h +++ b/include/mxnet/operator.h @@ -330,7 +330,7 @@ class OperatorProperty { const std::vector &out_data) const { int counter = 0; std::vector out_grad_index(out_grad.size()); - std::vector in_data_index(out_data.size()); + std::vector in_data_index(in_data.size()); std::vector out_data_index(out_data.size()); for (size_t i = 0; i < out_grad_index.size(); ++i) { out_grad_index[i] = counter++; diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py new file mode 100644 index 000000000000..7352bfe2f289 --- /dev/null +++ b/python/mxnet/executor.py @@ -0,0 +1,57 @@ +# coding: utf-8 +""" code for executor. 
""" +from __future__ import absolute_import + +import ctypes +from .base import _LIB +from .base import c_array, c_str, mx_uint, NArrayHandle, ExecutorHandle +from .base import check_call +from .narray import NArray + +class Executor(object): + """ Executor is the actual executing object of MXNet.""" + def __init__(self, handle): + """Init an executor from handle + + Parameters + ---------- + handle: ExecutorHandle + ExecutorHandle generated by calling Bind + """ + if not isinstance(handle, ExecutorHandle): + raise TypeError("Handle type error") + self.handle = handle + + def forward(self): + """Do forward.""" + check_call(_LIB.MXExecutorForward(self.handle)) + + def backward(self, grads): + """Do backward on heads' gradient. + + Parameters + ---------- + grads: Array of NArray + heads' gradient + """ + for obj in grads: + if not isinstance(obj, NArray): + raise TypeError("inputs must be NArray") + narray = c_array(NArrayHandle, [item.handle for item in grads]) + check_call(_LIB.MXExecutorBackward(self.handle, len(grads), narray)) + + def heads(self): + """list all heads' output narray + + Returns + ------- + A list of narray binded to the heads of executor. + """ + # TODO: think of access, make heads read only. + # (consider support read only NArray(NArrayView)) + # Otherwise some of the internal might depends on out_data + # if user set the content of the head, the backward behavior can be incorrect. + out_size = mx_uint() + handles = ctypes.POINTER(NArrayHandle)() + check_call(_LIB.MXExecutorHeads(self.handle, ctypes.byref(out_size), ctypes.byref(handles))) + return [NArray(NArrayHandle(handles[i])) for i in range(out_size.value)] diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py index dbe60ddb7a78..6c72442cb3f9 100644 --- a/python/mxnet/symbol.py +++ b/python/mxnet/symbol.py @@ -5,13 +5,14 @@ import ctypes from .base import _LIB -from .base import c_array, c_str, mx_uint, NArrayHandle, ExecutorHandle -from .base import SymbolHandle +from .base import c_array, c_str, mx_uint, NArrayHandle, ExecutorHandle, SymbolHandle from .base import check_call from .narray import NArray +from .context import Context +from .executor import Executor class Symbol(object): - """SymbolCreator is a function that takes Param and return symbol""" + """Symbol is symbolic graph of the mxnet.""" _registry = None @staticmethod @@ -224,96 +225,36 @@ def debug_str(self): self.handle, ctypes.byref(debug_str))) return debug_str.value -class Executor(object): - """handle of executor""" - handle = None - def __init__(self, handle): - """Init an executor from handle + def bind(self, ctx, args, args_grad, reqs): + """bind current symbol to get an executor. 
Parameters ---------- - handle: ExecutorHandle - ExecutorHandle generated by calling Bind - """ - if not isinstance(ExecutorHandle): - raise TypeError("Handle type error") - self.handle = handle - - def forward(self, inputs): - """do forward on inputs data - - Parameters - ---------- - inputs: Array of NArray - inputs narray to executor - """ - if self.handle == None: - raise Exception("Bind symbol before use executor") - for obj in inputs: - if not isinstance(obj, NArray): - raise TypeError("inputs must be NArray") - narray = c_array([item.handle for item in inputs]) - check_call(_LIB.MXExecutorForward (self.hanlde, mx_uint(len(inputs), narray))) - - def backward(self, grads): - """do backward on heads' grads - - Parameters - ---------- - grads: Array of NArray - heads' gradient - """ - if self.handle == None: - raise Exception("Bind symbol before use executor") - for obj in grads: - if not isinstance(obj, NArray): - raise TypeError("inputs must be NArray") - narray = c_array(NArrayHandle, [item.handle for item in grads]) - check_call(_LIB.MXExecutorForward (self.hanlde, mx_uint(len(grads), narray))) - - def heads(self): - """list all heads' output narray - - Returns - ------- - a list of narray binded to the heads of executor + ctx: Context + context executor to run on + args: Array of NArray + input args to the symbol + args_grad: Array of NArray + input args' gradient + reqs: Array of enum + graident requirements """ - if self.handle == None: - raise Exception("Bind symbol before use executor") - out_size = mx_uint() - handles = ctypes.POINTER(ctypes.POINTER(NArrayHandle))() - check_call(_LIB.MXExecutorHeads(self.handle, ctypes.byref(out_szie), narrays)) - return [NArray(handle[i]) for i in xrange(out_size)] - - -def Bind(sym, ctx, args, args_grad, reqs): - """Bind a symbol to get an executor - - Parameters - ---------- - sym: Symbol - symbol to be binded - ctx: Context - context executor to run on - args: Array of NArray - input args to the symbol - args_grad: Array of NArray - input args' gradient - reqs: Array of enum - graident requirements - """ - """gradient requirements enum""" - enum = {"null" : 0, "write_to" : 1, "in_place":2, "add_to" : 3} - - if not isinstance(sym, Symbol): - raise TypeError("Symbol type error") - if not isinstance(ctx, Context): - raise TypeError("Context type error") - args_handle = c_array(NArrayHandle, [item.handle for item in args]) - args_grad_handle = c_array(NArrayHandle, [item.handle for item in args_grad]) - reqs_array = c_array(mx_uint, [mx_uint(enum[item]) for item in req]) - handle = ExecutorHandle() - check_call(_LIB.MXExecutorBind(handle, sym.handle, \ - mx_uint(ctx.device_mask), mx_uint(ctx.device_id), \ - mx_uint(len(args), args_handle, args_grad_handle, reqs_array))) - return Executor(handle) + # TODO(bing): consider a more friendly interface + # For example, pass in args_grad by dict + + enum = {"null" : 0, "write_to" : 1, "in_place":2, "add_to" : 3} + if not isinstance(ctx, Context): + raise TypeError("Context type error") + args_handle = c_array(NArrayHandle, [item.handle for item in args]) + args_grad_handle = c_array(NArrayHandle, [item.handle for item in args_grad]) + reqs_array = c_array(mx_uint, [mx_uint(enum[item]) for item in reqs]) + handle = ExecutorHandle() + check_call(_LIB.MXExecutorBind(self.handle, + mx_uint(ctx.device_mask), + mx_uint(ctx.device_id), + len(args), + args_handle, + args_grad_handle, + reqs_array, + ctypes.byref(handle))) + return Executor(handle) diff --git a/python/test_mnist.py b/python/test_mnist.py index 
71d79dd607e6..f9f37d2e82e3 100644 --- a/python/test_mnist.py +++ b/python/test_mnist.py @@ -4,10 +4,13 @@ import os, cPickle, gzip def Softmax(x): + batch, nidden = x.shape maxes = np.max(x, axis=1) - x -= maxes.reshape(maxes.shape[0], 1) - e = np.exp(x) - return e / np.sum(e, axis=1) + x -= maxes.reshape(batch, 1) + x = np.exp(x) + norm = np.sum(x, axis=1) + prob = x / norm.reshape((batch, 1)) + return prob def CalAcc(out, label): pred = np.argmax(out, axis=1) @@ -63,45 +66,47 @@ def Get(self): data = mx.sym.Variable('data') fc1 = mx.sym.FullyConnected(data=data, name='fc1', num_hidden=160) act1 = mx.sym.Activation(data = fc1, name='relu1', type="relu") -fc2 = mx.sym.FullyConnected(data=act1, name='fc2', num_hidden=10) +fc2 = mx.sym.FullyConnected(data = act1, name='fc2', num_hidden=10) args_list = fc2.list_arguments() - # infer shape data_shape = (batch_size, 784) arg_shapes, out_shapes = fc2.infer_shape(data=data_shape) arg_narrays = [mx.narray.create(shape) for shape in arg_shapes] grad_narrays = [mx.narray.create(shape) for shape in arg_shapes] mom_narrays = [mx.narray.create(shape) for shape in arg_shapes] -out_narray = mx.narray.create(out_shapes[0]) inputs = dict(zip(args_list, arg_narrays)) +np.random.seed(0) # set random weight for name, narray in inputs.items(): if "weight" in name: - narray.numpy[:, :] = np.random.uniform(-0.01, 0.01, narray.numpy.shape) - + narray.numpy[:, :] = np.random.uniform(-0.001, 0.001, narray.numpy.shape) + if "bias" in name: + narray.numpy[:] = 0.0 +req = ['write_to' for i in range(len(arg_narrays))] # bind executer -# exec = bind(fc2, args_narray, grad_narray, req) +# TODO(bing): think of a better bind interface +executor = fc2.bind(mx.Context('cpu'), arg_narrays, grad_narrays, req) # update +out_narray = executor.heads()[0] +grad_narray = mx.narray.create(out_narray.shape) + epoch = 10 momentum = 0.9 -lr = 0.01 +lr = 0.001 wd = 0.0004 def Update(mom, grad, weight): - if len(mom.numpy.shape) == 1: - mom.numpy[:] = mom.numpy * momentum - lr * (grad.numpy + wd * weight.numpy) - else: - mom.numpy[:, :] = mom.numpy * momentum - lr * (grad.numpy + wd * weight.numpy) - weight += mom + weight.numpy[:] -= lr * grad.numpy[:] block = zip(mom_narrays, grad_narrays, arg_narrays) -train = MNISTIter("train") -valid = MNISTIter("valid") +train = MNISTIter("train", batch_size) +valid = MNISTIter("valid", batch_size) + for i in xrange(epoch): # train print "Epoch %d" % i @@ -109,18 +114,22 @@ def Update(mom, grad, weight): val_acc = 0.0 while train.Next(): data, label = train.Get() - inputs["data"].numpy[:,:] = data - # exec.Forward(args_narray) + inputs["data"].numpy[:] = data + executor.forward() + out_narray.numpy[:] = Softmax(out_narray.numpy) train_acc += CalAcc(out_narray.numpy, label) - SetGradient(out_narray.numpy, label) - # exec.Backward(out_narray) + grad_narray.numpy[:] = out_narray.numpy + SetGradient(grad_narray.numpy, label) + executor.backward([grad_narray]) + for mom, grad, weight in block: Update(mom, grad, weight) + # evaluate while valid.Next(): data, label = valid.Get() - inputs["data"].numpy[:,:] = data - # exec.Forward([ inputs["data"] ]) + inputs["data"].numpy[:] = data + executor.forward() val_acc += CalAcc(out_narray.numpy, label) print "Train Acc: ", train_acc / train.nbatch print "Valid Acc: ", val_acc / valid.nbatch diff --git a/src/c_api.cc b/src/c_api.cc index d861ac00fc41..3d5e03cc0748 100644 --- a/src/c_api.cc +++ b/src/c_api.cc @@ -40,6 +40,8 @@ struct MXAPIThreadLocalEntry { std::vector ret_vec_str; /*! 
\brief result holder for returning string pointers */ std::vector ret_vec_charp; + /*! \brief result holder for returning handles */ + std::vector ret_handles; /*! \brief result holder for returning shapes */ std::vector arg_shapes, out_shapes; /*! \brief result holder for returning shape dimensions */ @@ -481,52 +483,53 @@ int MXSymbolInferShape(SymbolHandle sym, API_END(); } -MXNET_DLL int MXExecutorForward(ExecutorHandle handle, - mx_uint len, - NArrayHandle *args) { +int MXExecutorForward(ExecutorHandle handle) { API_BEGIN(); Executor *exec = static_cast(handle); - CHECK_EQ(len, 0) - << "forward do not take narray for now"; - // TODO(bing): remove args for now exec->Forward(); API_END(); } - -MXNET_DLL int MXExecutorBackward(ExecutorHandle handle, - mx_uint len, - NArrayHandle *head_grads) { +int MXExecutorBackward(ExecutorHandle handle, + mx_uint len, + NArrayHandle *head_grads) { API_BEGIN(); Executor *exec = static_cast(handle); std::vector narrays; NArray **args_ptr = reinterpret_cast(head_grads); for (mx_uint i = 0; i < len; ++i) { - narrays.push_back(*(args_ptr[i])); + narrays.push_back(*args_ptr[i]); } exec->Backward(narrays); API_END(); } - -MXNET_DLL int MXExecutorHeads(ExecutorHandle handle, - mx_uint *out_size, - NArrayHandle **out) { +int MXExecutorHeads(ExecutorHandle handle, + mx_uint *out_size, + NArrayHandle **out) { + MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); API_BEGIN(); Executor *exec = static_cast(handle); - std::vector ret = exec->heads(); - + std::vector heads = exec->heads(); + ret->ret_handles.resize(heads.size()); + for (size_t i = 0; i < heads.size(); ++i) { + NArray *ptr = new NArray(); + *ptr = heads[i]; + ret->ret_handles[i] = ptr; + } + *out_size = heads.size(); + *out = dmlc::BeginPtr(ret->ret_handles); API_END(); } -MXNET_DLL int MXExecutorBind(ExecutorHandle handle, - SymbolHandle symbol_handle, - int dev_mask, - int dev_id, - mx_uint len, - NArrayHandle *in_args, - NArrayHandle *arg_grad_store, - mx_uint *grad_req_type) { +int MXExecutorBind(SymbolHandle symbol_handle, + int dev_mask, + int dev_id, + mx_uint len, + NArrayHandle *in_args, + NArrayHandle *arg_grad_store, + mx_uint *grad_req_type, + ExecutorHandle *out) { API_BEGIN(); Symbol *symb = static_cast(symbol_handle); Context ctx = Context(dev_mask, dev_id); @@ -540,7 +543,7 @@ MXNET_DLL int MXExecutorBind(ExecutorHandle handle, arg_grad_vec.push_back(*(arg_grad_ptr[i])); grad_req_vec.push_back(static_cast(grad_req_type[i])); } - handle = Executor::Bind(*symb, ctx, in_args_vec, arg_grad_vec, grad_req_vec); + *out = Executor::Bind(*symb, ctx, in_args_vec, arg_grad_vec, grad_req_vec); API_END(); } diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc index 8cf50541959e..a434f22a2fc6 100644 --- a/src/symbol/graph_executor.cc +++ b/src/symbol/graph_executor.cc @@ -23,9 +23,9 @@ class GraphExecutor::BackwardOpWrapper : public Operator { explicit BackwardOpWrapper(const OperatorProperty *prop, std::shared_ptr forward_op) : op_(forward_op) { - out_grad_.resize(prop->NumReturns()); + out_grad_.resize(prop->NumVisibleReturns()); in_data_.resize(prop->ListArguments().size()); - out_data_.resize(prop->NumVisibleReturns()); + out_data_.resize(prop->NumReturns()); std::vector out_grad_ptr(out_grad_.size()); for (size_t i = 0; i < out_grad_.size(); ++i) { @@ -40,7 +40,7 @@ class GraphExecutor::BackwardOpWrapper : public Operator { out_data_ptr[i] = &out_data_[i]; } arg_data_ptr_ = prop->BackwardInputs( - out_grad_ptr, out_data_ptr, in_data_ptr); + out_grad_ptr, in_data_ptr, 
out_data_ptr); } // implement forward virtual void Forward(const OpContext &ctx, @@ -127,9 +127,9 @@ inline std::vector > GraphExecutor::GetInplaceOption( // forward property const OperatorProperty *fwd = graph_.nodes[node.backward_source_id].op.get(); - std::vector out_grad_index(fwd->NumReturns()); - std::vector out_data_index(fwd->NumVisibleReturns()); + std::vector out_grad_index(fwd->NumVisibleReturns()); std::vector in_data_index(fwd->ListArguments().size()); + std::vector out_data_index(fwd->NumReturns()); CHECK_EQ(in_data_index.size(), out_data.size()); int counter = 0; for (size_t i = 0; i < out_grad_index.size(); ++i) { @@ -306,7 +306,6 @@ void GraphExecutor::InitDataEntryInfo(const std::vector &in_args, void GraphExecutor::InitDataEntryMemory() { // use allocator to allocate memory. GraphStorageAllocator allocator(&graph_); - for (size_t i = 0; i < topo_order_.size(); ++i) { uint32_t nid = topo_order_[i]; if (!op_nodes_[nid].activated) continue; @@ -328,6 +327,7 @@ void GraphExecutor::InitDataEntryMemory() { CHECK_NE(out_data[i]->type, kInternalAllocated); } auto inplace = GetInplaceOption(nid, in_data, out_data); + for (std::pair kv : inplace) { DataEntryInfo* in = kv.first; DataEntryInfo* out = kv.second; @@ -363,13 +363,14 @@ void GraphExecutor::InitDataEntryMemory() { continue; } // if we decrease it to zero, means we are ready to relase - if (--in->ref_count == 0) { + --in->ref_count; + if (in->ref_count == 0 && in->type == kInternalAllocated) { allocator.Release(in->storage_id, nid); } } // check out again, if there is ref_count == 0, release it for (DataEntryInfo *out : out_data) { - if (out->ref_count == 0) { + if (out->ref_count == 0 && out->type == kInternalAllocated) { allocator.Release(out->storage_id, nid); } } @@ -445,6 +446,28 @@ void GraphExecutor::RunOps(size_t topo_start, size_t topo_end) { } } +std::string GraphExecutor::DebugStr() const { + std::ostringstream os; + os << "num_forward_nodes=" << num_forward_nodes_ << '\n'; + for (size_t i = 0; i < topo_order_.size(); ++i) { + uint32_t nid = topo_order_[i]; + if (!op_nodes_[nid].activated) continue; + os << "Op " << i << ":" << graph_.nodes[nid].name << '\n'; + for (size_t j = 0; j < op_nodes_[nid].outputs.size(); ++j) { + const DataEntryInfo &info = op_nodes_[nid].outputs[j]; + os << "\toutput[" << j << "]: shape=" << info.shape; + if (info.storage_id != GraphStorageAllocator::kBadStorageID) { + os << ", storage_id=" << info.storage_id; + } + if (info.inplace_op_id != -1) { + os << ", inplace_consumer=" << graph_.nodes[info.inplace_op_id].name; + } + os << '\n'; + } + } + return os.str(); +} + void GraphExecutor::Forward() { RunOps(0, num_forward_nodes_); } diff --git a/src/symbol/graph_executor.h b/src/symbol/graph_executor.h index ccc4e64a904f..a072eee69b68 100644 --- a/src/symbol/graph_executor.h +++ b/src/symbol/graph_executor.h @@ -39,6 +39,10 @@ class GraphExecutor : public Executor { this->InitDataEntryInfo(in_args, arg_grad_store, grad_req_type); this->InitDataEntryMemory(); this->InitOpNodes(); + // TODO(bing): remove me when things are OK + LOG(INFO) << "-----Execution memory plan-----\n" + << DebugStr() << '\n' + << "------------------------------\n"; } protected: @@ -79,6 +83,7 @@ class GraphExecutor : public Executor { : op_req(kNullOp), inplace_op_id(-1), type(kNotInitialized), + storage_id(GraphStorageAllocator::kBadStorageID), ref_count(0) {} }; // all the information needed to push the op to engine @@ -159,6 +164,8 @@ class GraphExecutor : public Executor { void InitOpNodes(); // run ops from 
topo order start to end void RunOps(size_t topo_start, size_t topo_end); + // get debug string + std::string DebugStr() const; // internal computational graph StaticGraph graph_; // topological order of nodes in computation graph diff --git a/src/symbol/graph_memory_allocator.h b/src/symbol/graph_memory_allocator.h index 4c047040a041..b7bd2db2081e 100644 --- a/src/symbol/graph_memory_allocator.h +++ b/src/symbol/graph_memory_allocator.h @@ -29,7 +29,9 @@ namespace mxnet { class GraphStorageAllocator { public: /*! \brief resource index */ - typedef uint64_t StorageID; + typedef int64_t StorageID; + /*! \brief bad storage id */ + static const StorageID kBadStorageID = -1; /*! \brief constructor to the graph memory allocator */ explicit GraphStorageAllocator(StaticGraph *graph); /*! @@ -103,26 +105,25 @@ GraphStorageAllocator::Alloc(Context ctx, size_t size) { GraphStorageAllocator::StorageID GraphStorageAllocator::Request(Context ctx, TShape shape, uint32_t node_id) { size_t size = shape.Size(); - if (free_.count(size) != 0) { - auto begin = free_.lower_bound(size); - auto end = free_.upper_bound(size); - // vector of possible candidates - for (auto it = begin; it != end; ++it) { - StorageEntry *e = it->second; - if (e->ctx != ctx) continue; - // Use exect matching strategy - // TODO(bing): think of other strategies, for example, rough match. - if (e->max_size != size) continue; - // find a exact match, erase from map and return - free_.erase(it); - return e->id; - } + auto begin = free_.lower_bound(size); + auto end = free_.upper_bound(size); + // vector of possible candidates + for (auto it = begin; it != end; ++it) { + StorageEntry *e = it->second; + if (e->ctx != ctx) continue; + // Use exect matching strategy + // TODO(bing): think of other strategies, for example, rough match. + if (e->max_size != size) continue; + // find a exact match, erase from map and return + free_.erase(it); + return e->id; } // cannot find anything return a new one. return this->Alloc(ctx, size); } void GraphStorageAllocator::Release(StorageID id, uint32_t node_id) { + CHECK_NE(id, kBadStorageID); StorageEntry *e = data_[id].get(); free_.insert({e->max_size, e}); } @@ -136,10 +137,9 @@ void GraphStorageAllocator::InitStorages() { } NArray GraphStorageAllocator::Get(StorageID id, TShape shape) { + CHECK_NE(id, kBadStorageID); StorageEntry *e = data_[id].get(); - // TODO(bing): change to return e->data.Slice(0, shape.Size()).Reshape(shape); - // once we are able to get NArray that shares memory from a big chunk. - return NArray(shape, e->ctx); + return e->data.Slice(0, shape.Size()).Reshape(shape); } } // namespace mxnet #endif // MXNET_SYMBOL_GRAPH_MEMORY_ALLOCATOR_H_
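
The planning phase of GraphStorageAllocator above (Request, Release, and the exact-size free list) boils down to a few lines of bookkeeping. The following is a simplified, purely illustrative Python model of that strategy, keyed by (context, size) instead of a size-keyed multimap; the class and variable names are invented for the sketch and are not part of the patch.

from collections import defaultdict

class PlanOnlyAllocator(object):
    """Toy model of the planning phase: reuse a released block only on an
    exact size match in the same context, otherwise hand out a new id."""
    def __init__(self):
        self.sizes = []                   # storage id -> size in elements
        self.free = defaultdict(list)     # (ctx, size) -> free storage ids

    def request(self, ctx, size):
        ids = self.free[(ctx, size)]
        if ids:
            return ids.pop()              # exact match found, reuse it
        self.sizes.append(size)           # otherwise allocate a fresh id
        return len(self.sizes) - 1

    def release(self, ctx, storage_id):
        self.free[(ctx, self.sizes[storage_id])].append(storage_id)

alloc = PlanOnlyAllocator()
a = alloc.request('cpu', 100 * 160)       # e.g. the fc1 output in the MNIST test
alloc.release('cpu', a)                   # its ref count dropped to zero
b = alloc.request('cpu', 100 * 160)       # same size, same context -> same block
assert a == b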
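
GraphStorageAllocator::Get now hands out e->data.Slice(0, shape.Size()).Reshape(shape), so the NArray Slice/Reshape pair introduced earlier in the series is just offset-and-shape arithmetic over a flat chunk. A plain-Python model of that arithmetic (illustrative only, not MXNet code):

from functools import reduce
from operator import mul

def slice_view(shape, begin, end):
    """Model of NArray::Slice on a base chunk: a view [begin, end) along dim 0."""
    assert len(shape) >= 1 and shape[0] >= end
    row = reduce(mul, shape[1:], 1)            # elements per index of the first dim
    return (end - begin,) + tuple(shape[1:]), begin * row

def reshape_view(shape, offset, new_shape):
    """Model of NArray::Reshape: same data and offset, a new (not larger) shape."""
    assert reduce(mul, new_shape, 1) <= reduce(mul, shape, 1)
    return tuple(new_shape), offset

# a (3, 4) request served from a 128-element storage entry:
shape, offset = slice_view((128,), 0, 12)      # -> ((12,), 0)
shape, offset = reshape_view(shape, offset, (3, 4))
print(shape, offset)                           # (3, 4) 0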
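
CreateSumNode in static_graph.cc exists because a data entry that feeds several consumers receives one gradient per consumer, and MakeBackwardPass has to add them up before the producer's backward node runs. A two-consumer toy example in plain NumPy (not MXNet code):

import numpy as np

# gradients flowing back from two consumers of the same output entry
g_from_consumer_a = np.array([1.0, 2.0, 3.0])
g_from_consumer_b = np.array([0.5, -1.0, 2.0])

# what the inserted ElementWiseSum node computes: [1.5, 1.0, 5.0]
aggregated = g_from_consumer_a + g_from_consumer_b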
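
Taken together, the executor pieces in this series (python/mxnet/executor.py, Symbol.bind in python/mxnet/symbol.py, and the MXExecutor* C API) are exercised end to end by python/test_mnist.py. The sketch below condenses that flow into a minimal snippet; it assumes the package built from these patches imports as mxnet, and the net, batch size, and variable names are copied from the test or invented for illustration only.

import numpy as np
import mxnet as mx

# a small symbolic net, as in test_mnist.py
data = mx.sym.Variable('data')
fc1  = mx.sym.FullyConnected(data=data, name='fc1', num_hidden=160)
act1 = mx.sym.Activation(data=fc1, name='relu1', type="relu")
net  = mx.sym.FullyConnected(data=act1, name='fc2', num_hidden=10)

# infer shapes from the data shape, then allocate argument/gradient NArrays
batch_size = 100
arg_shapes, out_shapes = net.infer_shape(data=(batch_size, 784))
arg_narrays  = [mx.narray.create(shape) for shape in arg_shapes]
grad_narrays = [mx.narray.create(shape) for shape in arg_shapes]
reqs = ['write_to' for _ in arg_narrays]

# bind returns an Executor; heads() exposes the graph's output NArrays
executor = net.bind(mx.Context('cpu'), arg_narrays, grad_narrays, reqs)
out = executor.heads()[0]

inputs = dict(zip(net.list_arguments(), arg_narrays))
inputs['data'].numpy[:] = np.random.uniform(0, 1, (batch_size, 784))

executor.forward()                        # writes the network output into out
head_grad = mx.narray.create(out.shape)   # gradient w.r.t. the head output
head_grad.numpy[:] = out.numpy            # e.g. softmax-minus-label would go here
executor.backward([head_grad])            # gradients land in grad_narrays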