diff --git a/Makefile b/Makefile
index 74944012df8e..50e9a21c50e8 100644
--- a/Makefile
+++ b/Makefile
@@ -58,14 +58,14 @@ endif
 BIN = test/api_registry_test test/test_storage
 OBJ = narray_op_cpu.o
 # add threaded engine after it is done
-OBJCXX11 = engine.o narray.o c_api.o registry.o symbol.o storage.o fully_connected_cpu.o static_graph.o
+OBJCXX11 = engine.o narray.o c_api.o registry.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o elementwise_sum_cpu.o graph_executor.o pooling_cpu.o
 CUOBJ =
 SLIB = lib/libmxnet.so
 ALIB = lib/libmxnet.a
 LIB_DEP = $(DMLC_CORE)/libdmlc.a
 
 ifeq ($(USE_CUDA), 1)
-	CUOBJ += narray_op_gpu.o fully_connected_gpu.o
+	CUOBJ += narray_op_gpu.o fully_connected_gpu.o activation_gpu.o elementwise_sum_gpu.o pooling_gpu.o
 endif
 
 .PHONY: clean all test lint doc
@@ -81,12 +81,19 @@ narray.o: src/narray/narray.cc
 narray_op_cpu.o: src/narray/narray_op_cpu.cc src/narray/narray_op-inl.h
 narray_op_gpu.o: src/narray/narray_op_gpu.cu src/narray/narray_op-inl.h
 symbol.o: src/symbol/symbol.cc
+graph_executor.o: src/symbol/graph_executor.cc
 static_graph.o : src/symbol/static_graph.cc
 registry.o: src/registry.cc
 c_api.o: src/c_api.cc
 operator.o: src/operator/static_operator_wrapper.cc
 fully_connected_cpu.o: src/operator/fully_connected.cc
 fully_connected_gpu.o: src/operator/fully_connected.cu
+activation_cpu.o: src/operator/activation.cc
+activation_gpu.o: src/operator/activation.cu
+elementwise_sum_cpu.o: src/operator/elementwise_sum.cc
+elementwise_sum_gpu.o: src/operator/elementwise_sum.cu
+pooling_cpu.o: src/operator/pooling.cc
+pooling_gpu.o: src/operator/pooling.cu
 
 
 lib/libmxnet.a: $(OBJ) $(OBJCXX11) $(CUOBJ)
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index a9a15c4a8007..38132cb169a5 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -34,12 +34,11 @@ typedef void *AtomicSymbolCreator;
 typedef void *SymbolHandle;
 /*! \brief handle to a AtomicSymbol */
 typedef void *AtomicSymbolHandle;
-/*! \brief handle to a NArrayOperator */
-typedef void *OperatorHandle;
+/*! \brief handle to an Executor */
+typedef void *ExecutorHandle;
 /*! \brief handle to a DataIterator */
 typedef void *DataIterHandle;
-
 /*!
  * \brief return str message of the last error
  *  all function in this file will return 0 when success
  *  and -1 when an error occured,
@@ -49,10 +48,9 @@ typedef void *DataIterHandle;
  *  \return error info
  */
 MXNET_DLL const char *MXGetLastError();
-
-//--------------------------------
+//-------------------------------------
 // Part 1: NArray creation and deletion
-//--------------------------------
+//-------------------------------------
 /*!
  * \brief create a NArray handle that is not initialized
  *  can be used to pass in as mutate variables
@@ -189,7 +187,6 @@ MXNET_DLL int MXFuncDescribe(FunctionHandle fun,
                              mx_uint *num_scalars,
                              mx_uint *num_mutate_vars,
                              int *type_mask);
-
 /*!
  * \brief invoke a function, the array size of passed in arguments
  *   must match the values in the
@@ -301,8 +298,8 @@ MXNET_DLL int MXSymbolListArguments(SymbolHandle symbol,
  * \return 0 when success, -1 when failure happens
  */
 MXNET_DLL int MXSymbolListReturns(SymbolHandle symbol,
-                                    mx_uint *out_size,
-                                    const char ***out_str_array);
+                                  mx_uint *out_size,
+                                  const char ***out_str_array);
 /*!
  * \brief Compose the symbol on other symbols.
  *
@@ -322,82 +319,91 @@ MXNET_DLL int MXSymbolCompose(SymbolHandle sym,
                               mx_uint num_args,
                               const char** keys,
                               SymbolHandle* args);
+/*!
+ * \brief infer shape of unknown input shapes given the known one.
+ *  The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data
+ *  The call will be treated as a kwargs call if keys != nullptr or num_args==0, otherwise it is positional.
+ *
+ * \param sym symbol handle
+ * \param num_args number of input arguments.
+ * \param keys the key of keyword args (optional)
+ * \param arg_ind_ptr the head pointer of the rows in CSR
+ * \param arg_shape_data the content of the CSR
+ * \param in_shape_size size of the returning array of in_shapes
+ * \param in_shape_ndim returning array of shape dimensions of each input shape.
+ * \param in_shape_data returning array of pointers to head of the input shape.
+ * \param out_shape_size size of the returning array of out_shapes
+ * \param out_shape_ndim returning array of shape dimensions of each output shape.
+ * \param out_shape_data returning array of pointers to head of the output shape.
+ * \param complete whether infer shape completes or more information is needed.
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXSymbolInferShape(SymbolHandle sym,
+                                 mx_uint num_args,
+                                 const char** keys,
+                                 const mx_uint *arg_ind_ptr,
+                                 const mx_uint *arg_shape_data,
+                                 mx_uint *in_shape_size,
+                                 const mx_uint **in_shape_ndim,
+                                 const mx_uint ***in_shape_data,
+                                 mx_uint *out_shape_size,
+                                 const mx_uint **out_shape_ndim,
+                                 const mx_uint ***out_shape_data,
+                                 int *complete);
 //--------------------------------------------
-// Part 4: operator interface on NArray
+// Part 4: Executor interface
 //--------------------------------------------
 /*!
- * \brief create operator from symbol
- * \param sym the symbol to create operator from
- * \param dev_mask device mask to indicate the device type
- * \param dev_id the device id we want to bind the symbol to
- * \param out the corresponding function handle
+ * \brief Executor forward method
+ *
+ * \param handle executor handle
  * \return 0 when success, -1 when failure happens
  */
-MXNET_DLL int MXOpCreate(SymbolHandle sym,
-                         int dev_mask,
-                         int dev_id,
-                         OperatorHandle *out);
+MXNET_DLL int MXExecutorForward(ExecutorHandle handle);
 /*!
- * \brief free the operator handle
- * \param op the handle to be freed
+ * \brief Executor run backward
+ *
+ * \param handle executor handle
+ * \param len length
+ * \param head_grads NArray handle for heads' gradient
+ *
  * \return 0 when success, -1 when failure happens
  */
-MXNET_DLL int MXOpFree(OperatorHandle op);
+MXNET_DLL int MXExecutorBackward(ExecutorHandle handle,
+                                 mx_uint len,
+                                 NArrayHandle *head_grads);
+
 /*!
- * \brief return an array to describe the arguments
- *  of this operator
- * \param out_size the size of output array
- * \param out_array the array of parameter requirments
+ * \brief Get executor's head NArray
+ *
+ * \param handle executor handle
+ * \param out_size output narray vector size
+ * \param out output narray handles
  * \return 0 when success, -1 when failure happens
  */
-MXNET_DLL int MXOpDescribeArgs(mx_uint *out_size,
-                               int **out_array);
+MXNET_DLL int MXExecutorHeads(ExecutorHandle handle,
+                              mx_uint *out_size,
+                              NArrayHandle **out);
+
 /*!
- * \brief infer shape of unknown input shapes given the known one
- *  this function do not return the shape of output
- *  the shapes are packed into a CSR matrix represened by ind_ptr and shape_array
+ * \brief Generate Executor from symbol
  *
- *  When the function returns, it return a new CSR matrix by updating ind_ptr,
- *  and return the content in the return value
- *
- * \param ind_ptr the head pointer of the rows in CSR
- * \param shape_array the content of the CSR
- * \param out_nout number of output arguments of this operation
- * \param out_array another content of CSR with infered shape
- * \return 0 when success, -1 when failure happens
- */
-MXNET_DLL int MXOpInferShape(mx_uint *ind_ptr,
-                             mx_uint *shape_array,
-                             mx_uint *out_nout,
-                             mx_uint *out_array);
-/*!
- * \brief call forward on the operator
- * \param op the operator handle
- * \param in_data array of input narray to the operator
- * \param out_data array of output NArray to hold the result
- * \return 0 when success, -1 when failure happens
- */
-MXNET_DLL int MXOpForward(OperatorHandle op,
-                          NArrayHandle *in_data,
-                          NArrayHandle *out_data);
-/*!
- * \brief call backward on the operator
- * \param op the operator handle
- * \param grad_next array of output gradients
- * \param in_data array of input narray to the operator
- * \param out_data array of output narray to the operator
- * \param out_grad array to holds the gradient on these input
- *    can be NULL if that position request is kNullOp
- * \param reqs gradient request type
- * \return 0 when success, -1 when failure happens
- * \sa mxnet::Operator::GradReqType
- */
-MXNET_DLL int MXOpBackward(OperatorHandle op,
-                           NArrayHandle *grad_next,
-                           NArrayHandle *in_data,
-                           NArrayHandle *out_data,
-                           NArrayHandle *out_grad,
-                           mx_uint *reqs);
+ * \param symbol_handle symbol handle
+ * \param len length
+ * \param in_args in args array
+ * \param arg_grad_store arg grads handle array
+ * \param grad_req_type grad req array
+ * \param out output executor handle
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXExecutorBind(SymbolHandle symbol_handle,
+                             int dev_mask,
+                             int dev_id,
+                             mx_uint len,
+                             NArrayHandle *in_args,
+                             NArrayHandle *arg_grad_store,
+                             mx_uint *grad_req_type,
+                             ExecutorHandle *out);
 
 //--------------------------------------------
 // Part 5: IO Interface
diff --git a/include/mxnet/context.h b/include/mxnet/context.h
index 262ba2e787d4..700bb36f0abb 100644
--- a/include/mxnet/context.h
+++ b/include/mxnet/context.h
@@ -6,6 +6,8 @@
 #ifndef MXNET_CONTEXT_H_
 #define MXNET_CONTEXT_H_
 
+#include "./base.h"
+
 namespace mxnet {
 
 /*! \brief Context information about the execution enviroment */
@@ -31,6 +33,14 @@ struct Context {
   inline bool operator==(const Context &b) const {
     return dev_mask == b.dev_mask && dev_id == b.dev_id;
   }
+  /*!
+   * \brief check if current context not equals another one
+   * \param b another context to compare
+   * \return whether they are not the same
+   */
+  inline bool operator!=(const Context &b) const {
+    return !(*this == b);
+  }
 };
 
 /*!
diff --git a/include/mxnet/narray.h b/include/mxnet/narray.h
index 92257b3f0269..ed2b72bc4cc5 100644
--- a/include/mxnet/narray.h
+++ b/include/mxnet/narray.h
@@ -35,7 +35,7 @@ class NArray {
    */
   NArray(const TShape &shape, Context ctx,
          bool delay_alloc = false)
-      : ptr_(new Chunk(shape, ctx, delay_alloc)) {
+      : ptr_(new Chunk(shape.Size(), ctx, delay_alloc)), shape_(shape), offset_(0) {
   }
   /*!
    * \brief constructing a static NArray that shares data with TBlob
@@ -45,19 +45,20 @@ class NArray {
    * \param dev_id the device id this tensor sits at
    */
   NArray(const TBlob &data, int dev_id)
-      : ptr_(new Chunk(data, dev_id)) {
+      : ptr_(new Chunk(data, dev_id)), shape_(data.shape_), offset_(0) {
   }
   /*!
    * \return the shape of current NArray
    */
   inline const TShape &shape() const {
-    return ptr_->data.shape_;
+    return shape_;
   }
   /*!
    * \return the data TBlob
    */
-  inline const TBlob &data() const {
-    return ptr_->data;
+  inline TBlob data() const {
+    return TBlob(static_cast<real_t*>(ptr_->shandle.dptr) + offset_, \
+                                      shape_, ptr_->shandle.ctx.dev_mask);
   }
   /*!
    * \return the context of NArray, this function is only valid when the NArray is not empty
@@ -123,6 +124,43 @@ class NArray {
    * \return the new copy
    */
   NArray Copy(Context ctx) const;
+  /*!
+   * \brief Slice a NArray
+   *
+   * \param begin begin index in first dim
+   * \param end end index in first dim
+   *
+   * \return sliced NArray
+   */
+  inline NArray Slice(index_t begin, index_t end) const {
+    // an empty shape (ndim() == 0) has no first dimension to slice
+    CHECK_GT(shape_.ndim(), 0) << "NArray not initialized";
+    CHECK_GE(shape_[0], end) << "Chunk is smaller than required";
+    CHECK_LE(begin, end) << "Slice begin must not exceed end";
+    // number of elements covered by one index of the first dimension
+    size_t length = 1;
+    for (index_t i = 1; i < shape_.ndim(); ++i) {
+      length *= shape_[i];
+    }
+    NArray ret = *this;
+    // add to the offset copied from *this so slicing a slice stays correct
+    ret.offset_ += begin * length;
+    ret.shape_[0] = end - begin;
+    return ret;
+  }
+  /*!
+   * \brief Reshape current NArray
+   *
+   * \param shape new shape
+   * \return NArray in new shape
+   */
+  inline NArray Reshape(const TShape &shape) const {
+    CHECK_GE(shape_.Size(), shape.Size()) \
+      << "required shape is larger than chunk";
+    NArray ret = *this;
+    ret.shape_ = shape;
+    return ret;
+  }
 
  private:
   /*! \brief the real data chunk that backs NArray */
@@ -131,8 +169,6 @@ class NArray {
     Storage::Handle shandle;
     /*! \brief variable from DAG engine */
     DAGEngine::Variable var;
-    /*! \brief holds the data content */
-    TBlob data;
     /*!
      * \brief if this is true, this means the data do not come
      * from Storage, and do not need to be freed
@@ -146,25 +182,25 @@ class NArray {
     }
     /*! \brief construct from static data */
     Chunk(const TBlob &data, int dev_id)
-        : data(data),
-          static_data(true),
+        : static_data(true),
           delay_alloc(false) {
       var = DAGEngine::Get()->NewVar();
       shandle.ctx = Context(data.dev_mask_, dev_id);
+      shandle.dptr = data.dptr_;
+      shandle.size = data.shape_.Size() * sizeof(real_t);
     }
     /*! \brief construct a new chunk */
-    Chunk(const TShape &shape, Context ctx, bool delay_alloc_)
+    Chunk(uint64_t size, Context ctx, bool delay_alloc_)
         : static_data(false), delay_alloc(true) {
       var = DAGEngine::Get()->NewVar();
-      data.shape_ = shape;
+      shandle.size = size * sizeof(real_t);
       shandle.ctx = ctx;
       if (!delay_alloc_) this->CheckAndAlloc();
     }
     /*! \brief check if delay alloc is on, do alloc if not yet done */
     inline void CheckAndAlloc(void) {
       if (delay_alloc) {
-        shandle = Storage::Get()->Alloc(data.shape_.Size() * sizeof(real_t), shandle.ctx);
-        data = TBlob(static_cast<real_t*>(shandle.dptr), data.shape_, shandle.ctx.dev_mask);
+        shandle = Storage::Get()->Alloc(shandle.size, shandle.ctx);
         delay_alloc = false;
       }
     }
@@ -183,6 +219,11 @@ class NArray {
   };
   /*! \brief internal data of NArray */
   std::shared_ptr<Chunk> ptr_;
+  /*! \brief shape of current NArray */
+  TShape shape_;
+  /*! \brief offset in chunk */
+  size_t offset_;
+
   // add friend to helper functions
   friend void CopyFromTo(const NArray &from, NArray *to);
   template<typename OP>
diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h
index 938083dbab33..e60afe6948a7 100644
--- a/include/mxnet/operator.h
+++ b/include/mxnet/operator.h
@@ -8,6 +8,7 @@
 #define MXNET_OPERATOR_H_
 
 #include <dmlc/base.h>
+#include <dmlc/logging.h>
 #include <vector>
 #include <string>
 #include <utility>
@@ -40,16 +41,18 @@ enum OpReqType {
 struct OpContext {
   /*! \brief whether it is training phase */
   int is_train;
-  /*! \brief Stream we are running on */
-  void *stream;
+  /*! \brief RunContext related resources */
+  RunContext run_ctx;
   /*! \brief Resources requested by the operator */
   std::vector<Resource> requested;
   /*!
-   * \brief set the RunContext related parts
-   * \param ctx the context
+   * \brief get mshadow stream from Context
+   * \return the mshadow stream
+   * \tparam xpu the device type of the stream
    */
-  inline void SetRunContext(const RunContext &ctx) {
-    stream = ctx.stream;
+  template<typename xpu>
+  inline mshadow::Stream<xpu>* get_stream() const {
+    return static_cast<mshadow::Stream<xpu>*>(run_ctx.stream);
   }
 };
 
@@ -84,20 +87,31 @@ class Operator {
                        const std::vector<TBlob> &out_data) = 0;
   /*!
    * \brief Perform a Backward Operation, write gradient to the in_grad.
+   *
+   * Convention:
+   *   out_grad.size() == OperatorProperty.NumVisibleReturns()
+   *   out_data.size() == OperatorProperty.NumReturns()
+   * out_data can contain additional invisible returns that remembers the
+   * state carried from the Forward pass. For example mask in the dropout.
+   *
+   * The gradients are passed from visible returns in this function.
+   *
    * \param ctx runtime context available to this call
-   * \param out_grad the gradient value we get from output of the Operator
+   * \param out_grad the gradient value we get from output of the Operator.
    * \param in_data the array of input data.
    * \param out_data the array of output data.
    * \param req request types of the saving operation, can be all types.
    * \param in_grad the array of gradient we need to write to.
-   * \sa OpReqType, OpContext
+   * \sa OpReqType, OpContext, OperatorProperty
    */
   virtual void Backward(const OpContext &ctx,
                         const std::vector<TBlob> &out_grad,
                         const std::vector<TBlob> &in_data,
                         const std::vector<TBlob> &out_data,
                         const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad) = 0;
+                        const std::vector<TBlob> &in_grad) {
+    LOG(FATAL) << "Backward is not implemented";
+  }
 };
 
 #if DMLC_USE_CXX11
@@ -115,6 +129,12 @@ class OperatorProperty {
    * \brief virtual destructor
    */
   virtual ~OperatorProperty() {}
+  /*!
+   *  \brief Initialize the Operator by setting the parameters
+   *  This function need to be called before all other functions.
+   *  \param kwargs the keyword arguments parameters
+   */
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) = 0;
   /*!
    * \brief Get input arguments of the Operator.
    * \return vector of arguments.
@@ -148,12 +168,6 @@ class OperatorProperty {
   virtual int NumVisibleReturns() const {
     return NumReturns();
   }
-  /*!
-   *  \brief Set the parameters of the Operator.
-   *  \param name parameter name
-   *  \param val string for the configuration
-   */
-  virtual void SetParam(const char *name, const char *val) {}
   /*!
    * \brief infer the shapes of outputs and unknown input arguments
    * \param in_shape the shape of input arguments of the operator
@@ -166,7 +180,8 @@ class OperatorProperty {
    *
    * \param out_shape the shape of outputs of the operator
    *     InferShape will modify the vector to fill output TShape
-   * \return if the shape inference is successful, return true, else return false.
+   * \return true if the shape inference is successful, false if there is not enough information.
+   * \throws dmlc::Error if the known arg_shapes are inconsistent.
    */
   virtual bool InferShape(std::vector<TShape> *in_shape,
                           std::vector<TShape> *out_shape) const = 0;
@@ -243,28 +258,36 @@ class OperatorProperty {
    *  This function enables optimization to reuse memory of inputs in output.
    *  Only override when necessary, by default in-place is disabled.
    *
+   *  The reason for void* type in the out_data is to distinguish the order
+   *  of mappings between the two, compiler will report error when
+   *  in_data and out_data's order in the pair get reversed.
+   *
    * \code
    *  // The following code says out_data[0] can share data with in_data[0]
-   *  vector<pair<int,int> > ForwardInplaceOption(const vector<int> &in_data,
-   *                                              const vector<int> &out_data) const {
-   *    return {{out_data[0], in_data[0]}};
+   *  vector<pair<int, void*> > ForwardInplaceOption(const vector<int> &in_data,
+   *                                                 const vector<void*> &out_data) const {
+   *    return {{in_data[0], out_data[0]}};
    *  }
    * \endcode
    * \param in_data The input data in forward pass.
    * \param out_data The output data in forward pass.
-   * \return list of pair of integers taken from the inputs vector,
+   * \return list of pairs that map input->output,
    *   indicating possible in place operations.
    */
-  virtual std::vector<std::pair<int, int> > ForwardInplaceOption(
+  virtual std::vector<std::pair<int, void*> > ForwardInplaceOption(
       const std::vector<int> &in_data,
-      const std::vector<int> &out_data) const {
-    return std::vector<std::pair<int, int> >();
+      const std::vector<void*> &out_data) const {
+    return std::vector<std::pair<int, void*> >();
   }
   /*!
    * \brief Get possible backward inplace options.
    *  This function enables optimization to reuse memory of inputs in output.
    *  Only override when necessary, by default in-place is disabled.
    *
+   *  The reason for void* type in the in_grad is to distinguish the order
+   *  of mappings between the two, compiler will report error when
+   *  in_data and in_grad's order in the pair get reversed.
+   *
    * \code
    *  // The following code says in_grad[0] can share data with in_data[0]
    *  vector<pair<int,int> > BackwardInplaceOption(
@@ -272,22 +295,22 @@ class OperatorProperty {
    *                 const std::vector<int> &in_data,
    *                 const std::vector<int> &out_data,
    *                 const std::vector<int> &in_grad) const {
-   *    return {in_grad[0], in_data[0]}};
+   *    return {{in_data[0], in_grad[0]}};
    *  }
    * \endcode
    * \param in_data The input data in forward pass.
    * \param out_data The output data in forward pass.
    * \param in_grad Gradient of inputs in backward pass.
    * \param out_grad Gradient of outputs in backward pass.
-   * \return list of pair of integers taken from the inputs vector,
+   * \return list of pairs that map input->output,
    *   indicating possible in place operations.
    */
-  virtual std::vector<std::pair<int, int> > BackwardInplaceOption(
+  virtual std::vector<std::pair<int, void*> > BackwardInplaceOption(
       const std::vector<int> &out_grad,
       const std::vector<int> &in_data,
       const std::vector<int> &out_data,
-      const std::vector<int> &in_grad) const {
-    return std::vector<std::pair<int, int> >();
+      const std::vector<void*> &in_grad) const {
+    return std::vector<std::pair<int, void*> >();
   }
   /*!
    * \brief Get Backward Input Dependency for generic types of data.
@@ -302,31 +325,35 @@ class OperatorProperty {
    * \sa DeclareBackwardDependency
    */
   template<typename T>
-  inline std::vector<T> BackwardInputs(const std::vector<T> &in_data,
-                                       const std::vector<T> &out_data,
-                                       const std::vector<T> &out_grad) const {
-    int cnt = 0;
-    std::vector<T> all_vec;
-    std::vector<int> in_data_idx, out_data_idx, out_grad_idx;
-    for (size_t i = 0; i < in_data.size(); ++i) {
-      in_data_idx.push_back(cnt++);
-      all_vec.push_back(in_data[i]);
+  inline std::vector<T> BackwardInputs(const std::vector<T> &out_grad,
+                                       const std::vector<T> &in_data,
+                                       const std::vector<T> &out_data) const {
+    int counter = 0;
+    std::vector<int> out_grad_index(out_grad.size());
+    std::vector<int> in_data_index(in_data.size());
+    std::vector<int> out_data_index(out_data.size());
+    for (size_t i = 0; i < out_grad_index.size(); ++i) {
+      out_grad_index[i] = counter++;
     }
-    for (size_t i = 0; i < out_data.size(); ++i) {
-      out_data_idx.push_back(cnt++);
-      all_vec.push_back(out_data[i]);
+    for (size_t i = 0; i < in_data_index.size(); ++i) {
+      in_data_index[i] = counter++;
     }
-    for (size_t i = 0; i < out_grad.size(); ++i) {
-      out_grad_idx.push_back(cnt++);
-      all_vec.push_back(out_data[i]);
+    for (size_t i = 0; i < out_data_index.size(); ++i) {
+      out_data_index[i] = counter++;
     }
-    std::vector<int> ret_idx = this->DeclareBackwardDependency(
-        in_data_idx, out_data_idx, out_grad_idx);
-    std::vector<T> ret;
-    for (size_t i = 0; i < ret_idx.size(); ++i) {
-      ret.push_back(all_vec[ret_idx[i]]);
+    std::vector<T> all_data;
+    all_data.insert(all_data.end(), out_grad.begin(), out_grad.end());
+    all_data.insert(all_data.end(), in_data.begin(), in_data.end());
+    all_data.insert(all_data.end(), out_data.begin(), out_data.end());
+
+    std::vector<int> ret_index = this->DeclareBackwardDependency(
+        out_grad_index, in_data_index, out_data_index);
+
+    std::vector<T> ret(ret_index.size());
+    for (size_t i = 0; i < ret_index.size(); ++i) {
+      ret[i] = all_data[ret_index[i]];
     }
-    return ret;
+    return std::move(ret);
   }
   /*!
    * \brief create OperatorProperty
diff --git a/include/mxnet/symbolic.h b/include/mxnet/symbolic.h
index dc00f5a33fb6..df06c4913de8 100644
--- a/include/mxnet/symbolic.h
+++ b/include/mxnet/symbolic.h
@@ -12,6 +12,7 @@
 #include <memory>
 #include <string>
 #include <utility>
+#include <functional>
 #include <unordered_map>
 #include <unordered_set>
 #include "./base.h"
@@ -37,8 +38,45 @@ class StaticGraph {
     uint32_t source_id;
     /*! \brief index of output from the source. */
     uint32_t index;
+    /*! \brief default constructor */
+    DataEntry() {}
+    /*!
+     * \brief constructor with source and index
+     * \param source_id source id
+     * \param index node index
+     */
+    DataEntry(uint32_t source_id, uint32_t index)
+        : source_id(source_id), index(index) {}
+    /*!
+     * \brief compare equality
+     * \param other the other entry to compare
+     * \return whether two entries equals to each other
+     */
+    inline bool operator==(const DataEntry &other) const {
+      return source_id == other.source_id && index == other.index;
+    }
+    /*!
+     * \brief comparator, allows to use map
+     * \param other the other entry to compare
+     * \return whether two entries is smaller than the other
+     */
+    inline bool operator<(const DataEntry &other) const {
+      if (source_id == other.source_id) return index < other.index;
+      return source_id < other.source_id;
+    }
   };
-  /*! \brief Operation Node in static graph */
+  /*!
+   * \brief Operation Node in static graphs.
+   *  There are two types of node, Forward and Backward Node.
+   *
+   *  - Forward node corresponds to the op.Forward
+   *  - Backward node corresponds to the Backward pass,
+   *    where the corresponding forward node is indicated by backward_source_id.
+   *    The op field in Backward node is nullptr
+   *
+   *  The reason we explicit support Backward node is to allow special treatment
+   *  such as shape inference and state sharing with Forward pass.
+   */
   struct Node {
     /*! \brief wrapped operator property */
     std::unique_ptr<OperatorProperty> op;
@@ -46,13 +84,36 @@ class StaticGraph {
     std::string name;
     /*! \brief inputs (node_id, index) for of the nodes*/
     std::vector<DataEntry> inputs;
+    /*!
+     * \brief If this field is nonnegative, this indicates this
+     *  Node corresponds to a Backward Operation of Operator.
+     *  backward_source_id will point to the corresponding Forward Node.
+     *
+     *  For normal node, this field is -1.
+     *  When the node is a Backward node, the op field will be nullptr
+     */
+    int32_t backward_source_id;
+    /*! \brief default constructor */
+    Node() : backward_source_id(-1) {}
+    /*! \return whether the node is forward op node */
+    inline bool is_forward() const {
+      return op != nullptr;
+    }
+    /*! \return whether the node is backward op node */
+    inline bool is_backward() const {
+      return backward_source_id != -1;
+    }
+    /*! \return whether the node is variable node */
+    inline bool is_variable() const {
+      return op == nullptr && !is_backward();
+    }
   };
   /*! \brief all nodes in the graph */
   std::vector<Node> nodes;
-  /*! \brief index is nodes that correspods to arguments */
+  /*! \brief index of nodes that correspond to arguments */
   std::vector<uint32_t> arg_nodes;
-  /*! \brief outputs(heads) of the graph */
-  std::vector<DataEntry> outputs;
+  /*! \brief heads outputs of the graph */
+  std::vector<DataEntry> heads;
   // funtions to help inference in static graph
   /*!
    * \brief Perform a topological sort on the graph
@@ -85,8 +146,29 @@ class StaticGraph {
    *     InferShape will modify the vector to fill output TShape
    * \return if the shape inference is successful, return true, else return false.
    */
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape) const;
+  bool InferShape(std::vector<TShape>* in_shape,
+                  std::vector<TShape>* out_shape) const;
+  /*!
+   * \brief Add a full backward pass in the static graph.
+   *  This function will add gradient nodes for each heads,
+   *  and add the backward pass to backprop the gradients all
+   *  the way to the arguments.
+   *
+   *  This will change the nodes field in the StaticGraph, but will not change other fields.
+   *  The head and input of Backward pass will be returned by head_grad_nodes and arg_grads.
+   *
+   * \param head_grad_nodes used to store the created head gradient inputs for backward pass.
+   * \param arg_grads used to store gradients to args, can be multiple one if an argument is used by operator
+   */
+  void MakeBackwardPass(std::vector<uint32_t> *head_grad_nodes,
+                        std::vector<std::vector<DataEntry> > *arg_grads);
+
+  /*!
+   * \brief create a sum node that aggregates gradient together
+   * \param grad_source the source of the inputs.
+   * \return a created ElementWiseSum node
+   */
+  static Node CreateSumNode(const std::vector<DataEntry> &grad_source);
 };
 
 /*!
@@ -174,7 +256,7 @@ class Symbol {
                       const std::string& name) const;
   /*!
    * \brief infer the shapes of outputs and unknown input arguments
-   * \param in_shape the shape of input arguments of the operator
+   * \param arg_shapes the shape of input arguments of the operator
    *     this should be of same length as the vector returned by ListArguments
    *     in_shape allows unknown elements, which are checked by shape.ndim() == 0.
    *     For unknown shapes, InferShape will try to fill in the correct Shape in in_shape
@@ -182,11 +264,23 @@ class Symbol {
    *
    *     common practice: set the shape of data input, and usually weight's shape can be infered
    *
-   * \param out_shape the shape of outputs of the operator
-   *     InferShape will modify the vector to fill output TShape
-   * \return if the shape inference is successful, return true, else return false.
+   * \param out_shapes Used to store the inferred shapes of outputs.
+   * \return true if the shape inference is successful, false if there is not enough information.
+   * \throws dmlc::Error if the known arg_shapes are inconsistent.
+   */
+  bool InferShape(std::vector<TShape> *arg_shapes,
+                  std::vector<TShape> *out_shapes) const;
+  /*!
+   * \brief infer the shapes by providing shapes of known arguments.
+   * \param known_arg_shapes map of argument name to shape of arguments with known shapes.
+   * \param arg_shapes used to store inferred shapes of arguments.
+   * \param out_shapes used to store inferred shapes of outputs.
+   * \return true if the shape inference is successful, false if there is not enough information.
+   * \throws dmlc::Error if the known arg_shapes are inconsistent.
    */
-  bool InferShape(std::vector<TShape> *in_shape, std::vector<TShape> *out_shape) const;
+  bool InferShape(const std::unordered_map<std::string, TShape> &known_arg_shapes,
+                  std::vector<TShape> *arg_shapes,
+                  std::vector<TShape> *out_shapes) const;
   /*!
    * \brief get number of outputs of this symbol
    * \return number of outputs
diff --git a/python/mxnet/base.py b/python/mxnet/base.py
index 8cb698aa8219..e30c77d382a3 100644
--- a/python/mxnet/base.py
+++ b/python/mxnet/base.py
@@ -69,7 +69,7 @@ def _load_lib():
 FunctionHandle = ctypes.c_void_p
 SymbolCreatorHandle = ctypes.c_void_p
 SymbolHandle = ctypes.c_void_p
-
+ExecutorHandle = ctypes.c_void_p
 #----------------------------
 # helper function definition
 #----------------------------
diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py
new file mode 100644
index 000000000000..7352bfe2f289
--- /dev/null
+++ b/python/mxnet/executor.py
@@ -0,0 +1,57 @@
+# coding: utf-8
+""" code for executor. """
+from __future__ import absolute_import
+
+import ctypes
+from .base import _LIB
+from .base import c_array, c_str, mx_uint, NArrayHandle, ExecutorHandle
+from .base import check_call
+from .narray import NArray
+
+class Executor(object):
+    """Executor wraps an ExecutorHandle and runs forward/backward through the C API."""
+    def __init__(self, handle):
+        """Init an executor from handle; raises TypeError for any other type.
+
+        Parameters
+        ----------
+        handle: ExecutorHandle
+            ExecutorHandle generated by calling Bind
+        """
+        if not isinstance(handle, ExecutorHandle):
+            raise TypeError("Handle type error")
+        self.handle = handle
+
+    def forward(self):
+        """Run the forward pass by calling MXExecutorForward on the handle."""
+        check_call(_LIB.MXExecutorForward(self.handle))
+
+    def backward(self, grads):
+        """Run the backward pass from the gradients of the heads.
+
+        Parameters
+        ----------
+        grads: Array of NArray
+            heads' gradient; TypeError is raised if any item is not an NArray
+        """
+        for obj in grads:
+            if not isinstance(obj, NArray):
+                raise TypeError("inputs must be NArray")
+        narray = c_array(NArrayHandle, [item.handle for item in grads])
+        check_call(_LIB.MXExecutorBackward(self.handle, len(grads), narray))
+
+    def heads(self):
+        """List the output NArrays of all heads.
+
+        Returns
+        -------
+        A list of NArray bound to the heads of the executor.
+        """
+        # TODO: think about access control, make heads read only
+        # (consider supporting a read-only NArray, e.g. NArrayView).
+        # Otherwise some of the internals might depend on out_data:
+        # if the user sets the content of a head, the backward behavior can be incorrect.
+        out_size = mx_uint()
+        handles = ctypes.POINTER(NArrayHandle)()
+        check_call(_LIB.MXExecutorHeads(self.handle, ctypes.byref(out_size), ctypes.byref(handles)))
+        return [NArray(NArrayHandle(handles[i])) for i in range(out_size.value)]
diff --git a/python/mxnet/narray.py b/python/mxnet/narray.py
index 26a2198bd765..61839ecc0a60 100644
--- a/python/mxnet/narray.py
+++ b/python/mxnet/narray.py
@@ -134,7 +134,7 @@ def shape(self):
         pdata = ctypes.POINTER(mx_uint)()
         check_call(_LIB.MXNArrayGetShape(
             self.handle, ctypes.byref(ndim), ctypes.byref(pdata)))
-        return tuple(pdata[i] for i in range(ndim.value))
+        return tuple(pdata[:ndim.value])
 
     @property
     def context(self):
diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py
index 031b18ab862f..6c72442cb3f9 100644
--- a/python/mxnet/symbol.py
+++ b/python/mxnet/symbol.py
@@ -1,16 +1,18 @@
 # coding: utf-8
-# pylint: disable=invalid-name, protected-access
+# pylint: disable=invalid-name, protected-access, too-many-locals
 """Symbol support of mxnet"""
 from __future__ import absolute_import
 
 import ctypes
 from .base import _LIB
-from .base import c_array, c_str
-from .base import SymbolHandle
+from .base import c_array, c_str, mx_uint, NArrayHandle, ExecutorHandle, SymbolHandle
 from .base import check_call
+from .narray import NArray
+from .context import Context
+from .executor import Executor
 
 class Symbol(object):
-    """SymbolCreator is a function that takes Param and return symbol"""
+    """Symbol is symbolic graph of the mxnet."""
     _registry = None
 
     @staticmethod
@@ -136,6 +138,80 @@ def list_returns(self):
                 self.handle, ctypes.byref(size), ctypes.byref(sarr)))
         return [sarr[i] for i in range(size.value)]
 
+    def infer_shape(self, *args, **kwargs):
+        """Infer the shape of outputs and arguments of given known shapes of arguments.
+
+        User can either pass in the known shapes in positional way or keyword argument way.
+        Pair of Nones is returned if there is not enough information passed in.
+        An error will be raised if there is inconsistency found in the known shapes passed in.
+
+        Parameters
+        ----------
+        *args :
+            Provide shape of arguments in a positional way.
+            Unknown shape can be marked as None
+
+        **kwargs :
+            Provide keyword arguments of known shapes.
+
+        Returns
+        -------
+        arg_shapes : list of tuple or None
+            List of shapes of arguments.
+            The order is in the same order as list_arguments()
+        out_shapes : list of tuple or None
+            List of shapes of outputs.
+            The order is in the same order as list_returns()
+        """
+        if len(args) != 0 and len(kwargs) != 0:
+            raise ValueError('Can only specify known argument '
+                             'shapes either by positional or kwargs way.')
+        sdata = []
+        indptr = [0]
+        if len(args) != 0:
+            keys = None
+            for s in args:
+                if s is not None:
+                    if not isinstance(s, tuple):
+                        raise TypeError('Argument need to be shapes(tuple)')
+                    sdata.extend(s)
+                indptr.append(len(sdata))
+        else:
+            keys = []
+            for k, v in kwargs.items():
+                keys.append(c_str(k))
+                if not isinstance(v, tuple):
+                    raise TypeError('Argument need to be shapes(tuple)')
+                sdata.extend(v)
+                indptr.append(len(sdata))
+        arg_shape_size = mx_uint()
+        arg_shape_ndim = ctypes.POINTER(mx_uint)()
+        arg_shape_data = ctypes.POINTER(ctypes.POINTER(mx_uint))()
+        out_shape_size = mx_uint()
+        out_shape_ndim = ctypes.POINTER(mx_uint)()
+        out_shape_data = ctypes.POINTER(ctypes.POINTER(mx_uint))()
+        complete = ctypes.c_int()
+        # positional mode must hand keys to C as NULL: MXSymbolInferShape tests keys == nullptr
+        check_call(_LIB.MXSymbolInferShape( \
+                self.handle, len(indptr) - 1, \
+                None if keys is None else c_array(ctypes.c_char_p, keys), \
+                c_array(mx_uint, indptr), \
+                c_array(mx_uint, sdata), \
+                ctypes.byref(arg_shape_size), \
+                ctypes.byref(arg_shape_ndim), \
+                ctypes.byref(arg_shape_data), \
+                ctypes.byref(out_shape_size), \
+                ctypes.byref(out_shape_ndim), \
+                ctypes.byref(out_shape_data), \
+                ctypes.byref(complete)))
+        if complete.value == 0:
+            return (None, None)
+        arg_shapes = [tuple(arg_shape_data[i][:arg_shape_ndim[i]]) \
+                for i in range(arg_shape_size.value)]
+        out_shapes = [tuple(out_shape_data[i][:out_shape_ndim[i]]) \
+                for i in range(out_shape_size.value)]
+        return (arg_shapes, out_shapes)
+
     def debug_str(self):
         """Get a debug string.
 
@@ -148,3 +224,37 @@ def debug_str(self):
         check_call(_LIB.MXSymbolPrint( \
                 self.handle, ctypes.byref(debug_str)))
         return debug_str.value
+
+    def bind(self, ctx, args, args_grad, reqs):
+        """bind current symbol to get an executor.
+
+        Parameters
+        ----------
+        ctx: Context
+            context executor to run on
+        args: Array of NArray
+            input args to the symbol
+        args_grad: Array of NArray
+            input args' gradient
+        reqs: Array of str, each one of "null", "write_to", "in_place", "add_to"
+            gradient requirements
+        """
+        # TODO(bing): consider a more friendly interface
+        # For example, pass in args_grad by dict
+
+        enum = {"null" : 0, "write_to" : 1, "in_place":2, "add_to" : 3}
+        if not isinstance(ctx, Context):
+            raise TypeError("Context type error")
+        args_handle = c_array(NArrayHandle, [item.handle for item in args])
+        args_grad_handle = c_array(NArrayHandle, [item.handle for item in args_grad])
+        reqs_array = c_array(mx_uint, [mx_uint(enum[item]) for item in reqs])
+        handle = ExecutorHandle()
+        check_call(_LIB.MXExecutorBind(self.handle,
+                                       mx_uint(ctx.device_mask),
+                                       mx_uint(ctx.device_id),
+                                       len(args),
+                                       args_handle,
+                                       args_grad_handle,
+                                       reqs_array,
+                                       ctypes.byref(handle)))
+        return Executor(handle)
diff --git a/python/mxnet/symbol_creator.py b/python/mxnet/symbol_creator.py
index c81deebaef11..d507a9c2871a 100644
--- a/python/mxnet/symbol_creator.py
+++ b/python/mxnet/symbol_creator.py
@@ -54,7 +54,7 @@ def __call__(self, *args, **kwargs):
             if isinstance(v, Symbol):
                 symbol_kwargs[k] = v
             else:
-                param_keys.append(k)
+                param_keys.append(c_str(k))
                 param_vals.append(c_str(str(v)))
 
         # create atomic symbol
diff --git a/python/test_infer_shape.py b/python/test_infer_shape.py
new file mode 100644
index 000000000000..b94388e5546d
--- /dev/null
+++ b/python/test_infer_shape.py
@@ -0,0 +1,19 @@
+# pylint: skip-file
+import mxnet as mx
+
+data = mx.sym.Variable('data')
+
+fc1 = mx.sym.FullyConnected(data=data, name='fc1', num_hidden=1000)
+fc2 = mx.sym.FullyConnected(data=fc1, name='fc2', num_hidden=10)
+fc3 = mx.sym.FullyConnected( name='fc2', num_hidden=10)
+
+print fc2.list_arguments()
+
+data_shape = (100, 100)
+arg_shapes, out_shapes = fc2.infer_shape(data=data_shape)
+print dict(zip(fc2.list_arguments(), arg_shapes))
+print dict(zip(fc2.list_returns(), out_shapes))
+
+weight_shape= (1, 100)
+data_shape = (100, 100)
+arg_shapes, out_shapes = fc2.infer_shape(data=data_shape, fc1_weight=weight_shape)
diff --git a/python/test_mnist.py b/python/test_mnist.py
new file mode 100644
index 000000000000..f9f37d2e82e3
--- /dev/null
+++ b/python/test_mnist.py
@@ -0,0 +1,140 @@
+# pylint: skip-file
+import mxnet as mx
+import numpy as np
+import os, cPickle, gzip
+
+def Softmax(x):
+    """Row-wise softmax of a 2-D array; the caller's array is left unmodified."""
+    batch = x.shape[0]
+    # subtract each row's max before exp() so large activations cannot overflow
+    x = np.exp(x - np.max(x, axis=1).reshape(batch, 1))
+    norm = np.sum(x, axis=1)
+    prob = x / norm.reshape((batch, 1))
+    return prob
+
+def CalAcc(out, label):
+    pred = np.argmax(out, axis=1)
+    return np.sum(pred == label) * 1.0 / out.shape[0]
+
+def SetGradient(out_grad, label):
+    assert(out_grad.shape[0] == label.shape[0])
+    for i in xrange(label.shape[0]):
+        k = label[i]
+        out_grad[i][k] -= 1.0
+
+# load data
+class MNISTIter(object):
+    """Batch iterator over one split of mnist.pkl.gz ('train', 'valid', anything else = test)."""
+    def __init__(self, which_set, batch_size=100):
+        if not os.path.exists('mnist.pkl.gz'):
+            os.system("wget http://deeplearning.net/data/mnist/mnist.pkl.gz")
+        f = gzip.open('mnist.pkl.gz', 'rb')
+        train_set, valid_set, test_set = cPickle.load(f)
+        f.close()
+        if which_set == 'train':
+            self.data = train_set[0]
+            self.label = np.asarray(train_set[1])
+        elif which_set == 'valid':
+            self.data = valid_set[0]
+            self.label = np.asarray(valid_set[1])
+        else:
+            self.data = test_set[0]
+            self.label = np.asarray(test_set[1])
+        self.batch_size = batch_size
+        self.nbatch = self.data.shape[0] / batch_size
+        assert(self.data.shape[0] % batch_size == 0) # I am lazy
+        self.now_idx = -1
+    def BeforeFirst(self):
+        self.now_idx = -1
+    def Next(self):
+        """Advance to the next batch; returns False when the index reaches nbatch."""
+        self.now_idx += 1
+        return self.now_idx != self.nbatch
+    def Get(self):
+        if self.now_idx < 0:
+            raise Exception("Iterator is at head")
+        elif self.now_idx >= self.nbatch:
+            raise Exception("Iterator is at end")
+        start = self.now_idx * self.batch_size
+        end = (self.now_idx + 1) * self.batch_size
+        return (self.data[start:end, :], self.label[start:end])
+
+
+
+# symbol net
+batch_size = 100
+data = mx.sym.Variable('data')
+fc1 = mx.sym.FullyConnected(data=data, name='fc1', num_hidden=160)
+act1 = mx.sym.Activation(data = fc1, name='relu1', type="relu")
+fc2 = mx.sym.FullyConnected(data = act1, name='fc2', num_hidden=10)
+args_list = fc2.list_arguments()
+# infer shape
+data_shape = (batch_size, 784)
+arg_shapes, out_shapes = fc2.infer_shape(data=data_shape)
+arg_narrays = [mx.narray.create(shape) for shape in arg_shapes]
+grad_narrays = [mx.narray.create(shape) for shape in arg_shapes]
+mom_narrays = [mx.narray.create(shape) for shape in arg_shapes]
+inputs = dict(zip(args_list, arg_narrays))
+
+np.random.seed(0)
+# set random weight
+for name, narray in inputs.items():
+    if "weight" in name:
+        narray.numpy[:, :] = np.random.uniform(-0.001, 0.001, narray.numpy.shape)
+    if "bias" in name:
+        narray.numpy[:] = 0.0
+
+req = ['write_to' for i in range(len(arg_narrays))]
+# bind executer
+# TODO(bing): think of a better bind interface
+executor = fc2.bind(mx.Context('cpu'), arg_narrays, grad_narrays, req)
+# update
+
+out_narray = executor.heads()[0]
+grad_narray = mx.narray.create(out_narray.shape)
+
+epoch = 10
+momentum = 0.9
+lr = 0.001
+wd = 0.0004
+
+def Update(mom, grad, weight):
+    weight.numpy[:] -= lr * grad.numpy[:]
+
+block = zip(mom_narrays, grad_narrays, arg_narrays)
+
+
+train = MNISTIter("train", batch_size)
+valid = MNISTIter("valid", batch_size)
+
+for i in xrange(epoch):
+    # train
+    print "Epoch %d" % i
+    train_acc = 0.0
+    val_acc = 0.0
+    while train.Next():
+        data, label = train.Get()
+        inputs["data"].numpy[:] = data
+        executor.forward()
+        out_narray.numpy[:] = Softmax(out_narray.numpy)
+        train_acc += CalAcc(out_narray.numpy, label)
+        grad_narray.numpy[:] = out_narray.numpy
+        SetGradient(grad_narray.numpy, label)
+        executor.backward([grad_narray])
+
+        for mom, grad, weight in block:
+            Update(mom, grad, weight)
+
+    # evaluate
+    while valid.Next():
+        data, label = valid.Get()
+        inputs["data"].numpy[:] = data
+        executor.forward()
+        val_acc += CalAcc(out_narray.numpy, label)
+    print "Train Acc: ", train_acc / train.nbatch
+    print "Valid Acc: ", val_acc / valid.nbatch
+    train.BeforeFirst()
+    valid.BeforeFirst()
+
+
+
diff --git a/src/c_api.cc b/src/c_api.cc
index d5a1a67d70c6..3d5e03cc0748 100644
--- a/src/c_api.cc
+++ b/src/c_api.cc
@@ -11,6 +11,7 @@
 #include <mxnet/registry.h>
 #include <mxnet/operator.h>
 #include <mxnet/c_api.h>
+#include <vector>
 #include <mutex>
 #include <memory>
 
@@ -27,61 +28,78 @@
 #message("Warning: Threadlocal is not enabled");
 #endif
 
-/*! \brief symbol wrapper to easily hold returning information */
-struct MXAPISymbolWrapper {
-  /*! \brief the actual symbol */
-  mxnet::Symbol sym;
+using namespace mxnet;
+
+/*! \brief entry to easily hold returning information */
+struct MXAPIThreadLocalEntry {
+  /*! \brief holds last error message */
+  std::string last_error;
   /*! \brief result holder for returning string */
   std::string ret_str;
   /*! \brief result holder for returning strings */
   std::vector<std::string> ret_vec_str;
   /*! \brief result holder for returning string pointers */
   std::vector<const char *> ret_vec_charp;
+  /*! \brief result holder for returning handles */
+  std::vector<void *> ret_handles;
+  /*! \brief result holder for returning shapes */
+  std::vector<TShape> arg_shapes, out_shapes;
+  /*! \brief result holder for returning shape dimensions */
+  std::vector<mx_uint> arg_shape_ndim, out_shape_ndim;
+  /*! \brief result holder for returning shape pointer */
+  std::vector<const mx_uint*> arg_shape_data, out_shape_data;
+  // helper function to setup return value of shape array
+  inline static void SetupShapeArrayReturn(
+      const std::vector<TShape> &shapes,
+      std::vector<mx_uint> *ndim,
+      std::vector<const mx_uint*> *data) {
+    ndim->resize(shapes.size());
+    data->resize(shapes.size());
+    for (size_t i = 0; i < shapes.size(); ++i) {
+      ndim->at(i) = shapes[i].ndim();
+      data->at(i) = shapes[i].data();
+    }
+  }
 };
 
 /*!
- * \brief helper to store error message in threadlocal storage
+ * \brief A threadlocal store to store threadlocal variables.
+ *  Will return a thread local singleton of type T
+ * \note T is not a template parameter; it is a typedef of MXAPIThreadLocalEntry.
  */
-class MXAPIErrorMessageHelper {
+class MXAPIThreadLocalStore {
  public:
-  /*! \brief get a single instance out from */
-  static MXAPIErrorMessageHelper *Get() {
-    static MXAPIErrorMessageHelper inst;
-    return &inst;
-  }
-  /*!
-   * \brief a helper function for error handling
-   *  will set the last error to be str_set when it is not NULL
-   * \param str_set the error to set
-   * \return a pointer message to last error
-   */
-  static const char *SetGetLastError(const char *str_set) {
-    // use last_error to record last error
-    static MX_TREAD_LOCAL std::string *last_error = NULL;
-    if (last_error == NULL) {
-      last_error = new std::string();
-      Get()->RegisterDelete(last_error);
-    }
-    if (str_set != NULL) {
-      *last_error = str_set;
+  /*! \brief store return entry */
+  typedef MXAPIThreadLocalEntry T;
+  /*! \return get a thread local singleton */
+  static T* Get() {
+    static MX_TREAD_LOCAL T* ptr = nullptr;
+    if (ptr == nullptr) {
+      ptr = new T();
+      Singleton()->RegisterDelete(ptr);
     }
-    return last_error->c_str();
+    return ptr;
   }
 
  private:
   /*! \brief constructor */
-  MXAPIErrorMessageHelper() {}
+  MXAPIThreadLocalStore() {}
   /*! \brief destructor */
-  ~MXAPIErrorMessageHelper() {
+  ~MXAPIThreadLocalStore() {
     for (size_t i = 0; i < data_.size(); ++i) {
       delete data_[i];
     }
   }
+  /*! \return singleton of the store */
+  static MXAPIThreadLocalStore *Singleton() {
+    static MXAPIThreadLocalStore inst;
+    return &inst;
+  }
   /*!
    * \brief register str for internal deletion
    * \param str the string pointer
    */
-  void RegisterDelete(std::string *str) {
+  void RegisterDelete(T *str) {
     std::unique_lock<std::mutex> lock(mutex_);
     data_.push_back(str);
     lock.unlock();
@@ -89,13 +107,12 @@ class MXAPIErrorMessageHelper {
   /*! \brief internal mutex */
   std::mutex mutex_;
   /*!\brief internal data */
-  std::vector<std::string*> data_;
+  std::vector<T*> data_;
 };
 
 // NOTE: all functions return 0 upon success
 // consider add try/catch block for user error
 // handling in the future
-using namespace mxnet;
 
 /*! \brief  macro to guard beginning and end section of all functions */
 #define API_BEGIN() try {
@@ -111,7 +128,7 @@ using namespace mxnet;
 
 /*! \brief return str message of the last error */
 const char *MXGetLastError() {
-  return MXAPIErrorMessageHelper::SetGetLastError(NULL);
+  return MXAPIThreadLocalStore::Get()->last_error.c_str();
 }
 
 /*!
@@ -120,7 +137,7 @@ const char *MXGetLastError() {
  * \return the return value of API after exception is handled
  */
 int MXHandleException(const dmlc::Error &e) {
-  MXAPIErrorMessageHelper::SetGetLastError(e.what());
+  MXAPIThreadLocalStore::Get()->last_error = e.what();
   return -1;
 }
 
@@ -295,24 +312,26 @@ int MXSymbolCreateAtomicSymbol(AtomicSymbolCreator creator,
                                const char **keys,
                                const char **vals,
                                SymbolHandle *out) {
-  MXAPISymbolWrapper *s = new MXAPISymbolWrapper();
+  Symbol *s = new Symbol();
   OperatorProperty *op = nullptr;
 
   API_BEGIN();
   OperatorPropertyEntry *e = static_cast<OperatorPropertyEntry *>(creator);
   op = (*e)();
+  std::vector<std::pair<std::string, std::string> > kwargs;
   for (int i = 0; i < num_param; ++i) {
-    op->SetParam(keys[i], vals[i]);
+    kwargs.push_back({std::string(keys[i]), std::string(vals[i])});
   }
-  s->sym = Symbol::Create(op);
+  op->Init(kwargs);
+  *s = Symbol::Create(op);
   *out = s;
   API_END_HANDLE_ERROR(delete s; delete op);
 }
 
 int MXSymbolCreateVariable(const char *name, SymbolHandle *out) {
-  MXAPISymbolWrapper *s = new MXAPISymbolWrapper();
+  Symbol *s = new Symbol();
   API_BEGIN();
-  s->sym = Symbol::CreateVariable(name);
+  *s = Symbol::CreateVariable(name);
   *out = s;
   API_END_HANDLE_ERROR(delete s);
 }
@@ -320,71 +339,72 @@ int MXSymbolCreateVariable(const char *name, SymbolHandle *out) {
 int MXSymbolCreateGroup(mx_uint num_symbols,
                         SymbolHandle *symbols,
                         SymbolHandle *out) {
-  MXAPISymbolWrapper *s = new MXAPISymbolWrapper();
-  MXAPISymbolWrapper **sym_arr = (MXAPISymbolWrapper**)symbols; // NOLINT(*)
+  Symbol *s = new Symbol();
+  Symbol **sym_arr = (Symbol**)symbols; // NOLINT(*)
   API_BEGIN();
   std::vector<Symbol> syms;
   for (mx_uint i = 0; i < num_symbols; ++i) {
-    syms.push_back(sym_arr[i]->sym);
+    syms.push_back(*sym_arr[i]);
   }
-  s->sym = Symbol::CreateGroup(syms);
+  *s = Symbol::CreateGroup(syms);
   *out = s;
   API_END_HANDLE_ERROR(delete s);
 }
 
 int MXSymbolFree(SymbolHandle symbol) {
   API_BEGIN();
-  delete static_cast<MXAPISymbolWrapper*>(symbol);
+  delete static_cast<Symbol*>(symbol);
   API_END();
 }
 
 int MXSymbolCopy(SymbolHandle symbol, SymbolHandle *out) {
-  MXAPISymbolWrapper *s = new MXAPISymbolWrapper();
-
+  Symbol *s = new Symbol();
   API_BEGIN();
-  s->sym = (static_cast<const MXAPISymbolWrapper*>(symbol)->sym).Copy();
+  *s = static_cast<const Symbol*>(symbol)->Copy();
   *out = s;
   API_END_HANDLE_ERROR(delete s);
 }
 
 int MXSymbolPrint(SymbolHandle symbol, const char **out_str) {
-  MXAPISymbolWrapper *s = static_cast<MXAPISymbolWrapper*>(symbol);
-
+  Symbol *s = static_cast<Symbol*>(symbol);
+  MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
   API_BEGIN();
   std::ostringstream os;
-  (s->sym).Print(os);
-  s->ret_str = os.str();
-  *out_str = (s->ret_str).c_str();
+  s->Print(os);
+  ret->ret_str = os.str();
+  *out_str = (ret->ret_str).c_str();
   API_END();
 }
 
 int MXSymbolListArguments(SymbolHandle symbol,
                           mx_uint *out_size,
                           const char ***out_str_array) {
-  MXAPISymbolWrapper *s = static_cast<MXAPISymbolWrapper*>(symbol);
+  Symbol *s = static_cast<Symbol*>(symbol);
+  MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
   API_BEGIN();
-  s->ret_vec_str = std::move((s->sym).ListArguments());
-  s->ret_vec_charp.clear();
-  for (size_t i = 0; i < s->ret_vec_str.size(); ++i) {
-    s->ret_vec_charp.push_back(s->ret_vec_str[i].c_str());
+  ret->ret_vec_str = std::move(s->ListArguments());
+  ret->ret_vec_charp.clear();
+  for (size_t i = 0; i < ret->ret_vec_str.size(); ++i) {
+    ret->ret_vec_charp.push_back(ret->ret_vec_str[i].c_str());
   }
-  *out_size = static_cast<mx_uint>(s->ret_vec_charp.size());
-  *out_str_array = dmlc::BeginPtr(s->ret_vec_charp);
+  *out_size = static_cast<mx_uint>(ret->ret_vec_charp.size());
+  *out_str_array = dmlc::BeginPtr(ret->ret_vec_charp);
   API_END();
 }
 
 int MXSymbolListReturns(SymbolHandle symbol,
-                          mx_uint *out_size,
-                          const char ***out_str_array) {
-  MXAPISymbolWrapper *s = static_cast<MXAPISymbolWrapper*>(symbol);
+                        mx_uint *out_size,
+                        const char ***out_str_array) {
+  Symbol *s = static_cast<Symbol*>(symbol);
+  MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
   API_BEGIN();
-  s->ret_vec_str = std::move((s->sym).ListReturns());
-  s->ret_vec_charp.clear();
-  for (size_t i = 0; i < s->ret_vec_str.size(); ++i) {
-    s->ret_vec_charp.push_back(s->ret_vec_str[i].c_str());
+  ret->ret_vec_str = std::move(s->ListReturns());
+  ret->ret_vec_charp.clear();
+  for (size_t i = 0; i < ret->ret_vec_str.size(); ++i) {
+    ret->ret_vec_charp.push_back(ret->ret_vec_str[i].c_str());
   }
-  *out_size = static_cast<mx_uint>(s->ret_vec_charp.size());
-  *out_str_array = dmlc::BeginPtr(s->ret_vec_charp);
+  *out_size = static_cast<mx_uint>(ret->ret_vec_charp.size());
+  *out_str_array = dmlc::BeginPtr(ret->ret_vec_charp);
   API_END();
 }
 
@@ -397,19 +417,133 @@ int MXSymbolCompose(SymbolHandle sym,
   std::string s_name;
   if (name != nullptr) s_name = name;
 
-  MXAPISymbolWrapper* s = static_cast<MXAPISymbolWrapper*>(sym);
+  Symbol* s = static_cast<Symbol*>(sym);
   if (keys == nullptr && num_args != 0) {
     std::vector<Symbol> pos_args;
     for (mx_uint i = 0; i < num_args; ++i) {
-      pos_args.push_back(((MXAPISymbolWrapper*)(args[i]))->sym);  //  NOLINT(*)
+      pos_args.push_back(*((Symbol*)args[i]));  //  NOLINT(*)
     }
-    (s->sym).Compose(pos_args, s_name);
+    s->Compose(pos_args, s_name);
   } else {
     std::unordered_map<std::string, Symbol> kwargs;
     for (mx_uint i = 0; i < num_args; ++i) {
-      kwargs[keys[i]] = ((MXAPISymbolWrapper*)(args[i]))->sym;  //  NOLINT(*)
+      kwargs[keys[i]] = *((Symbol*)args[i]);  //  NOLINT(*)
     }
-    (s->sym).Compose(kwargs, s_name);
+    s->Compose(kwargs, s_name);
   }
   API_END();
 }
+
+int MXSymbolInferShape(SymbolHandle sym,
+                       mx_uint num_args,
+                       const char** keys,
+                       const mx_uint *arg_ind_ptr,
+                       const mx_uint *arg_shape_data,
+                       mx_uint *in_shape_size,
+                       const mx_uint **in_shape_ndim,
+                       const mx_uint ***in_shape_data,
+                       mx_uint *out_shape_size,
+                       const mx_uint **out_shape_ndim,
+                       const mx_uint ***out_shape_data,
+                       int *complete) {
+  Symbol *s = static_cast<Symbol*>(sym);
+  MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
+  bool succ;
+  API_BEGIN();
+  if (keys == nullptr && num_args != 0) {
+    ret->arg_shapes.clear();
+    for (mx_uint i = 0; i < num_args; ++i) {
+      ret->arg_shapes.push_back(TShape(arg_shape_data + arg_ind_ptr[i],
+                                       arg_shape_data + arg_ind_ptr[i+1]));
+    }
+    succ = s->InferShape(&(ret->arg_shapes), &(ret->out_shapes));
+  } else {
+    std::unordered_map<std::string, TShape> kwargs;
+    for (mx_uint i = 0; i < num_args; ++i) {
+      kwargs[keys[i]] = TShape(arg_shape_data + arg_ind_ptr[i],
+                               arg_shape_data + arg_ind_ptr[i+1]);
+    }
+    succ = s->InferShape(kwargs, &(ret->arg_shapes), &(ret->out_shapes));
+  }
+  if (succ) {
+    MXAPIThreadLocalEntry::SetupShapeArrayReturn(
+        ret->arg_shapes, &(ret->arg_shape_ndim), &(ret->arg_shape_data));
+    MXAPIThreadLocalEntry::SetupShapeArrayReturn(
+        ret->out_shapes, &(ret->out_shape_ndim), &(ret->out_shape_data));
+    *in_shape_size = static_cast<mx_uint>(ret->arg_shapes.size());
+    *in_shape_ndim = dmlc::BeginPtr(ret->arg_shape_ndim);
+    *in_shape_data = dmlc::BeginPtr(ret->arg_shape_data);
+    *out_shape_size = static_cast<mx_uint>(ret->out_shapes.size());
+    *out_shape_ndim = dmlc::BeginPtr(ret->out_shape_ndim);
+    *out_shape_data = dmlc::BeginPtr(ret->out_shape_data);
+    *complete = 1;
+  } else {
+    *complete = 0;
+  }
+  API_END();
+}
+
+int MXExecutorForward(ExecutorHandle handle) {
+  API_BEGIN();
+  Executor *exec = static_cast<Executor*>(handle);
+  exec->Forward();
+  API_END();
+}
+
+int MXExecutorBackward(ExecutorHandle handle,
+                       mx_uint len,
+                       NArrayHandle *head_grads) {
+  API_BEGIN();
+  Executor *exec = static_cast<Executor*>(handle);
+  std::vector<NArray> narrays;
+  NArray **args_ptr = reinterpret_cast<NArray**>(head_grads);
+  for (mx_uint i = 0; i < len; ++i) {
+    narrays.push_back(*args_ptr[i]);
+  }
+  exec->Backward(narrays);
+  API_END();
+}
+
+int MXExecutorHeads(ExecutorHandle handle,
+                    mx_uint *out_size,
+                    NArrayHandle **out) {
+  MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
+  API_BEGIN();
+  Executor *exec = static_cast<Executor*>(handle);
+  std::vector<NArray> heads = exec->heads();
+  ret->ret_handles.resize(heads.size());
+  for (size_t i = 0; i < heads.size(); ++i) {
+    // every head is handed out as its own heap-allocated NArray handle;
+    // the array of handles itself is held by the thread-local entry.
+    ret->ret_handles[i] = new NArray(heads[i]);
+  }
+  *out_size = static_cast<mx_uint>(heads.size());
+  *out = dmlc::BeginPtr(ret->ret_handles);
+  API_END();
+}
+
+int MXExecutorBind(SymbolHandle symbol_handle,
+                   int dev_mask,
+                   int dev_id,
+                   mx_uint len,
+                   NArrayHandle *in_args,
+                   NArrayHandle *arg_grad_store,
+                   mx_uint *grad_req_type,
+                   ExecutorHandle *out) {
+  API_BEGIN();
+  Symbol *symb = static_cast<Symbol*>(symbol_handle);
+  Context ctx = Context(dev_mask, dev_id);
+  NArray **in_args_ptr = reinterpret_cast<NArray**>(in_args);
+  NArray **arg_grad_ptr = reinterpret_cast<NArray**>(arg_grad_store);
+  std::vector<NArray> in_args_vec;
+  std::vector<NArray> arg_grad_vec;
+  std::vector<OpReqType> grad_req_vec;
+  for (mx_uint i = 0; i < len; ++i) {
+    in_args_vec.push_back(*(in_args_ptr[i]));
+    arg_grad_vec.push_back(*(arg_grad_ptr[i]));
+    grad_req_vec.push_back(static_cast<OpReqType>(grad_req_type[i]));
+  }
+  *out = Executor::Bind(*symb, ctx, in_args_vec, arg_grad_vec, grad_req_vec);
+  API_END();
+}
+
diff --git a/src/narray/narray.cc b/src/narray/narray.cc
index 831041bd1496..3618a38c9d59 100644
--- a/src/narray/narray.cc
+++ b/src/narray/narray.cc
@@ -37,14 +37,16 @@ inline void BinaryOp(const NArray &lhs,
     case cpu::kDevMask:
       DAGEngine::Get()->Push([lhs, rhs, ret](RunContext ctx) {
           ret.ptr_->CheckAndAlloc();
-          narray::Eval<cpu, OP>(lhs.ptr_->data, rhs.ptr_->data, &ret.ptr_->data, ctx);
+          TBlob tmp = ret.data();
+          narray::Eval<cpu, OP>(lhs.data(), rhs.data(), &tmp, ctx);
         }, lhs.ctx(), {lhs.ptr_->var, rhs.ptr_->var}, {ret.ptr_->var});
       break;
 #if MXNET_USE_CUDA
     case gpu::kDevMask:
       DAGEngine::Get()->Push([lhs, rhs, ret](RunContext ctx) {
           ret.ptr_->CheckAndAlloc();
-          narray::Eval<gpu, OP>(lhs.ptr_->data, rhs.ptr_->data, &ret.ptr_->data, ctx);
+          TBlob tmp = ret.data();
+          narray::Eval<gpu, OP>(lhs.data(), rhs.data(), &tmp, ctx);
         }, lhs.ctx(), {lhs.ptr_->var, rhs.ptr_->var}, {ret.ptr_->var});
       break;
 #endif
@@ -64,14 +66,16 @@ void CopyFromTo(const NArray &from, NArray *to) {
   if (a == cpu::kDevMask && b == cpu::kDevMask) {
     DAGEngine::Get()->Push([from, ret](RunContext ctx) {
         ret.ptr_->CheckAndAlloc();
-        narray::Copy<cpu, cpu>(from.ptr_->data, &ret.ptr_->data,
+        TBlob tmp = ret.data();
+        narray::Copy<cpu, cpu>(from.data(), &tmp,
                                from.ctx(), ret.ctx(), ctx);
       }, from.ctx(), {from.ptr_->var}, {ret.ptr_->var});
   } else if (a == cpu::kDevMask && b == gpu::kDevMask) {
 #if MXNET_USE_CUDA
     DAGEngine::Get()->Push([from, ret](RunContext ctx) {
         ret.ptr_->CheckAndAlloc();
-        narray::Copy<cpu, gpu>(from.ptr_->data, &ret.ptr_->data,
+        TBlob tmp = ret.data();
+        narray::Copy<cpu, gpu>(from.data(), &tmp,
                                from.ctx(), ret.ctx(), ctx);
       }, ret.ctx(), {from.ptr_->var}, {ret.ptr_->var});
 #else
@@ -81,7 +85,8 @@ void CopyFromTo(const NArray &from, NArray *to) {
 #if MXNET_USE_CUDA
     DAGEngine::Get()->Push([from, ret](RunContext ctx) {
         ret.ptr_->CheckAndAlloc();
-        narray::Copy<gpu, cpu>(from.ptr_->data, &ret.ptr_->data,
+        TBlob tmp = ret.data();
+        narray::Copy<gpu, cpu>(from.data(), &tmp,
                                from.ctx(), ret.ctx(), ctx);
       }, from.ctx(), {from.ptr_->var}, {ret.ptr_->var});
 #else
@@ -91,7 +96,8 @@ void CopyFromTo(const NArray &from, NArray *to) {
 #if MXNET_USE_CUDA
     DAGEngine::Get()->Push([from, ret](RunContext ctx) {
         ret.ptr_->CheckAndAlloc();
-        narray::Copy<gpu, gpu>(from.ptr_->data, &ret.ptr_->data,
+        TBlob tmp = ret.data();
+        narray::Copy<gpu, gpu>(from.data(), &tmp,
                                from.ctx(), ret.ctx(), ctx);
       }, from.ctx(), {from.ptr_->var}, {ret.ptr_->var});
 #else
diff --git a/src/operator/activation-inl.h b/src/operator/activation-inl.h
new file mode 100644
index 000000000000..3d57d6a88102
--- /dev/null
+++ b/src/operator/activation-inl.h
@@ -0,0 +1,142 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file activation-inl.h
+ * \brief Activation operator
+ * \author Bing Xu
+*/
+#ifndef MXNET_OPERATOR_ACTIVATION_INL_H_
+#define MXNET_OPERATOR_ACTIVATION_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+#include <utility>
+#include "./operator_common.h"
+
+namespace mxnet {
+namespace op {
+// Declare enumeration of input order to make code more intuitive.
+// These enums are only visible within this header
+enum ActivationOpInputs {kData};
+enum ActivationOpOutputs {kOut};
+enum ActivationOpType {kReLU, kSigmoid, kTanh};
+
+struct ActivationParam : public dmlc::Parameter<ActivationParam> {
+  // use int for enumeration
+  int type;
+  DMLC_DECLARE_PARAMETER(ActivationParam) {
+    DMLC_DECLARE_FIELD(type).set_default(kReLU).add_enum("relu", kReLU).\
+      add_enum("sigmoid", kSigmoid).add_enum("tanh", kTanh);
+  }
+};
+
+/**
+ * \brief This is the implementation of activation operator.
+ * \tparam xpu The device that the op will be executed on.
+ */
+template<typename xpu, typename ForwardOp, typename BackwardOp>
+class ActivationOp : public Operator {
+ public:
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 1);
+    CHECK_EQ(out_data.size(), 1);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 2> data = in_data[kData].FlatTo2D<xpu, real_t>(s);
+    Tensor<xpu, 2> out = out_data[kOut].FlatTo2D<xpu, real_t>(s);
+    Assign(out, req[kOut], F<ForwardOp>(data));
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1);
+    CHECK(in_data.size() == 1 && in_grad.size() == 1);
+    CHECK_EQ(req.size(), 1);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 2> m_out_grad = out_grad[kOut].FlatTo2D<xpu, real_t>(s);
+    Tensor<xpu, 2> m_out_data = out_data[kOut].FlatTo2D<xpu, real_t>(s);
+    Tensor<xpu, 2> m_in_grad = in_grad[kData].FlatTo2D<xpu, real_t>(s);
+    Assign(m_in_grad, req[kData], F<BackwardOp>(m_out_data) * m_out_grad);
+  }
+};  // class ActivationOp
+
+// Declare factory function, used for dispatch specialization
+template<typename xpu>
+Operator* CreateOp(ActivationParam type);
+
+#if DMLC_USE_CXX11
+class ActivationProp : public OperatorProperty {
+ public:
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    // TODO(bing) change directly to vector of pairs begin end
+    std::map<std::string, std::string> kmap(kwargs.begin(), kwargs.end());
+    param_.Init(kmap);
+  }
+
+  virtual bool InferShape(std::vector<TShape> *in_shape,
+                          std::vector<TShape> *out_shape) const {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 1) << "Input:[data]";
+    const TShape &dshape = in_shape->at(0);
+    if (dshape.ndim() == 0) return false;
+    out_shape->clear();
+    out_shape->push_back(dshape);
+    return true;
+  }
+
+  virtual OperatorProperty* Copy() const {
+    auto ptr = new ActivationProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  virtual std::string TypeString() const {
+    return "Activation";
+  }
+
+  // declare dependency and inplace optimization options
+  virtual std::vector<int> DeclareBackwardDependency(
+      const std::vector<int> &out_grad,
+      const std::vector<int> &in_data,
+      const std::vector<int> &out_data) const {
+    return {out_grad[kOut], out_data[kOut]};
+  }
+
+  virtual std::vector<std::pair<int, void*> > BackwardInplaceOption(
+      const std::vector<int> &out_grad,
+      const std::vector<int> &in_data,
+      const std::vector<int> &out_data,
+      const std::vector<void*> &in_grad) const {
+    return {{out_grad[kOut], in_grad[kData]}};
+  }
+
+  virtual std::vector<std::pair<int, void*> > ForwardInplaceOption(
+      const std::vector<int> &in_data,
+      const std::vector<void*> &out_data) const {
+    return {{in_data[kData], out_data[kOut]}};
+  }
+
+  Operator* CreateOperator(Context ctx) const;
+
+ private:
+  ActivationParam param_;
+};
+#endif  // DMLC_USE_CXX11
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_ACTIVATION_INL_H_
+
diff --git a/src/operator/activation.cc b/src/operator/activation.cc
new file mode 100644
index 000000000000..275588e099af
--- /dev/null
+++ b/src/operator/activation.cc
@@ -0,0 +1,36 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file activation.cc
+ * \brief activation op
+ * \author Bing Xu
+*/
+
+#include <mxnet/registry.h>
+#include "./activation-inl.h"
+#include "./mshadow_op.h"
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<cpu>(ActivationParam param) {
+  switch (param.type) {
+    case kReLU: return new ActivationOp<cpu, mshadow_op::relu, mshadow_op::relu_grad>();
+    case kSigmoid: return new ActivationOp<cpu, mshadow_op::sigmoid, mshadow_op::sigmoid_grad>();
+    case kTanh: return new ActivationOp<cpu, mshadow_op::tanh, mshadow_op::tanh_grad>();
+    default:
+      LOG(FATAL) << "unknown activation type";
+      return NULL;
+  }
+}
+
+// DO_BIND_DISPATCH comes from operator_common.h
+Operator *ActivationProp::CreateOperator(Context ctx) const {
+  DO_BIND_DISPATCH(CreateOp, param_);
+}
+
+DMLC_REGISTER_PARAMETER(ActivationParam);
+
+REGISTER_OP_PROPERTY(Activation, ActivationProp);
+}  // namespace op
+}  // namespace mxnet
+
diff --git a/src/operator/activation.cu b/src/operator/activation.cu
new file mode 100644
index 000000000000..5b7b576e59d7
--- /dev/null
+++ b/src/operator/activation.cu
@@ -0,0 +1,25 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file activation.cu
+ * \brief
+ * \author Bing Xu
+*/
+#include "./activation-inl.h"
+#include "./mshadow_op.h"
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<gpu>(ActivationParam param) {
+  switch(param.type) {
+    case kReLU: return new ActivationOp<gpu, mshadow_op::relu, mshadow_op::relu_grad>();
+    case kSigmoid: return new ActivationOp<gpu, mshadow_op::sigmoid, mshadow_op::sigmoid_grad>();
+    case kTanh: return new ActivationOp<gpu, mshadow_op::tanh, mshadow_op::tanh_grad>();
+    default:
+      LOG(FATAL) << "unknown activation";
+      return NULL;
+  }
+}
+}  // namespace op
+}  // namespace mxnet
+
diff --git a/src/operator/elementwise_sum-inl.h b/src/operator/elementwise_sum-inl.h
new file mode 100644
index 000000000000..4a0d6e3fdd57
--- /dev/null
+++ b/src/operator/elementwise_sum-inl.h
@@ -0,0 +1,173 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file elementwise_sum-inl.h
+ * \brief elementwise sum
+ * \author Bing Xu
+*/
+#ifndef MXNET_OPERATOR_ELEMENTWISE_SUM_INL_H_
+#define MXNET_OPERATOR_ELEMENTWISE_SUM_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+#include <utility>
+#include "./operator_common.h"
+
+namespace mxnet {
+namespace op {
+
+enum ElementWiseSumOpInputs {kData0, kData1, kData2, kData3};
+enum ElementWiseSumOpOutputs {kOut};
+
+struct ElementWiseSumParam : public dmlc::Parameter<ElementWiseSumParam> {
+  int size;
+  DMLC_DECLARE_PARAMETER(ElementWiseSumParam) {
+    DMLC_DECLARE_FIELD(size).set_range(1, 100);
+  }
+};
+
+template<typename xpu>
+class ElementWiseSumOp : public Operator {
+ public:
+  explicit ElementWiseSumOp(ElementWiseSumParam param)
+      : size_(param.size) {}
+
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(static_cast<int>(in_data.size()), size_);
+    CHECK_EQ(out_data.size(), 1);
+    if (req[kOut] == kNullOp) return;
+    // out_data[kOut] <- in_data[0] + ... + in_data[size_ - 1], applied according to req[kOut]
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 2> out = out_data[kOut].FlatTo2D<xpu, real_t>(s);
+    switch (size_) {  // 2, 3 and 4 inputs are summed in a single expression
+      case 2: {
+        Tensor<xpu, 2> in_0 = in_data[kData0].FlatTo2D<xpu, real_t>(s);
+        Tensor<xpu, 2> in_1 = in_data[kData1].FlatTo2D<xpu, real_t>(s);
+        Assign(out, req[kOut], in_0 + in_1);
+        break;
+      }
+      case 3: {
+        Tensor<xpu, 2> in_0 = in_data[kData0].FlatTo2D<xpu, real_t>(s);
+        Tensor<xpu, 2> in_1 = in_data[kData1].FlatTo2D<xpu, real_t>(s);
+        Tensor<xpu, 2> in_2 = in_data[kData2].FlatTo2D<xpu, real_t>(s);
+        Assign(out, req[kOut], in_0 + in_1 + in_2);
+        break;
+      }
+      case 4: {
+        Tensor<xpu, 2> in_0 = in_data[kData0].FlatTo2D<xpu, real_t>(s);
+        Tensor<xpu, 2> in_1 = in_data[kData1].FlatTo2D<xpu, real_t>(s);
+        Tensor<xpu, 2> in_2 = in_data[kData2].FlatTo2D<xpu, real_t>(s);
+        Tensor<xpu, 2> in_3 = in_data[kData3].FlatTo2D<xpu, real_t>(s);
+        Assign(out, req[kOut], in_0 + in_1 + in_2 + in_3);
+        break;
+      }
+      default: {  // any other size_: seed out with input 0, then accumulate the rest
+        Tensor<xpu, 2> in_0 = in_data[kData0].FlatTo2D<xpu, real_t>(s);
+        Assign(out, req[kOut], in_0);
+        for (int i = 1; i < size_; ++i) {  // start at 1: input 0 is already in out
+          out += in_data[i].FlatTo2D<xpu, real_t>(s);
+        }
+      }
+    }
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_grad.size(), static_cast<size_t>(size_));  // one gradient slot per input
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 2> ograd = out_grad[kOut].FlatTo2D<xpu, real_t>(s);
+    // each requested input gradient receives ograd; null and in-place requests are skipped
+    for (int i = 0; i < size_; ++i) {
+      if (req[i] == kNullOp || req[i] == kWriteInplace) continue;
+      Tensor<xpu, 2> igrad = in_grad[i].FlatTo2D<xpu, real_t>(s);
+      Assign(igrad, req[i], ograd);
+    }
+  }
+
+ private:
+  int size_;
+};  // class ElementWiseSumOp
+
+template<typename xpu>
+Operator* CreateOp(ElementWiseSumParam param);
+
+#if DMLC_USE_CXX11
+class ElementWiseSumProp : public OperatorProperty {
+ public:
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    // TODO(bing) change directly to vector of pairs begin end
+    std::map<std::string, std::string> kmap(kwargs.begin(), kwargs.end());
+    param_.Init(kmap);
+  }
+
+  virtual bool InferShape(std::vector<TShape> *in_shape,
+                          std::vector<TShape> *out_shape) const {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), static_cast<size_t>(param_.size));
+    const TShape &dshape = in_shape->at(0);
+    if (dshape.ndim() == 0) return false;
+    for (int i = 1; i < param_.size; ++i) {
+      SHAPE_ASSIGN_CHECK(*in_shape, i, dshape);
+    }
+    out_shape->clear();
+    out_shape->push_back(dshape);
+    return true;
+  }
+
+  virtual OperatorProperty* Copy() const {
+    auto ptr = new ElementWiseSumProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  virtual std::string TypeString() const {
+    return "ElementWiseSum";
+  }
+
+  virtual std::vector<int> DeclareBackwardDependency(
+      const std::vector<int> &out_grad,
+      const std::vector<int> &in_data,
+      const std::vector<int> &out_data) const {
+    return out_grad;
+  }
+
+  virtual std::vector<std::pair<int, void*> > BackwardInplaceOption(
+      const std::vector<int> &out_grad,
+      const std::vector<int> &in_data,
+      const std::vector<int> &out_data,
+      const std::vector<void*> &in_grad) const {
+    return {{out_grad[0], in_grad[0]}};
+  }
+
+  virtual std::vector<std::pair<int, void*> > ForwardInplaceOption(
+      const std::vector<int> &in_data,
+      const std::vector<void*> &out_data) const {
+    return {{in_data[0], out_data[0]}};
+  }
+
+  Operator* CreateOperator(Context ctx) const;
+
+ private:
+  ElementWiseSumParam param_;
+};  // class ElementWiseSumProp
+
+#endif  // DMLC_USE_CXX11
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_ELEMENTWISE_SUM_INL_H_
diff --git a/src/operator/elementwise_sum.cc b/src/operator/elementwise_sum.cc
new file mode 100644
index 000000000000..38e29141c7b3
--- /dev/null
+++ b/src/operator/elementwise_sum.cc
@@ -0,0 +1,24 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file elementwise_sum.cc
+ * \brief elementwise sum operator
+*/
+#include <mxnet/registry.h>
+#include "./elementwise_sum-inl.h"
+namespace mxnet {
+namespace op {
+template<>
+Operator* CreateOp<cpu>(ElementWiseSumParam param) {
+  return new ElementWiseSumOp<cpu>(param);
+}
+
+// DO_BIND_DISPATCH comes from operator_common.h
+Operator* ElementWiseSumProp::CreateOperator(Context ctx) const {
+  DO_BIND_DISPATCH(CreateOp, param_);
+}
+
+DMLC_REGISTER_PARAMETER(ElementWiseSumParam);
+
+REGISTER_OP_PROPERTY(ElementWiseSum, ElementWiseSumProp);
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/elementwise_sum.cu b/src/operator/elementwise_sum.cu
new file mode 100644
index 000000000000..7a9b443dad82
--- /dev/null
+++ b/src/operator/elementwise_sum.cu
@@ -0,0 +1,14 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file elementwise_sum.cu
+ * \brief elementwise sum operator
+*/
+#include "./elementwise_sum-inl.h"
+namespace mxnet {
+namespace op {
+template<>
+Operator* CreateOp<gpu>(ElementWiseSumParam param) {
+  return new ElementWiseSumOp<gpu>(param);
+}
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/fully_connected-inl.h b/src/operator/fully_connected-inl.h
index 5c54d37220ee..b49e5c422739 100644
--- a/src/operator/fully_connected-inl.h
+++ b/src/operator/fully_connected-inl.h
@@ -7,12 +7,14 @@
 #define MXNET_OPERATOR_FULLY_CONNECTED_INL_H_
 
 #include <dmlc/logging.h>
+#include <dmlc/parameter.h>
 #include <mxnet/operator.h>
+#include <map>
 #include <vector>
 #include <string>
 #include <utility>
 #include "./operator_common.h"
-#include "./param.h"
+
 
 namespace mxnet {
 namespace op {
@@ -22,6 +24,17 @@ namespace op {
 enum FullyConnectedOpInputs {kData, kWeight, kBias};
 enum FullyConnectedOpOutputs {kOut};
 
+struct FullyConnectedParam : public dmlc::Parameter<FullyConnectedParam> {
+  int num_hidden;
+  bool no_bias;
+  DMLC_DECLARE_PARAMETER(FullyConnectedParam) {
+    // TODO(bing) change to only set lower bound
+    // add support for boolean
+    DMLC_DECLARE_FIELD(num_hidden).set_range(1, 100000);
+    DMLC_DECLARE_FIELD(no_bias).set_default(false);
+  }
+};
+
 /**
  * \brief This is the implementation of fully connected operator.
  * \tparam xpu The device that the op will be executed on.
@@ -29,7 +42,7 @@ enum FullyConnectedOpOutputs {kOut};
 template<typename xpu>
 class FullyConnectedOp : public Operator {
  public:
-  explicit FullyConnectedOp(Param p) {
+  explicit FullyConnectedOp(FullyConnectedParam p) {
     this->param_ = p;
   }
 
@@ -40,17 +53,17 @@ class FullyConnectedOp : public Operator {
     using namespace mshadow;
     using namespace mshadow::expr;
     CHECK_EQ(req[kOut], kWriteTo);
-    size_t expected = param_.no_bias == 0 ? 3 : 2;
+    size_t expected = param_.no_bias ? 2 : 3;
     CHECK_EQ(in_data.size(), expected);
     CHECK_EQ(out_data.size(), 1);
     // TODO(bing): check the BLAS Handle, be careful
     // maybe need blas handle from context
-    Stream<xpu> *s = static_cast<Stream<xpu> *>(ctx.stream);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
     Tensor<xpu, 2> data = in_data[kData].FlatTo2D<xpu, real_t>(s);
     Tensor<xpu, 2> wmat = in_data[kWeight].get<xpu, 2, real_t>(s);
     Tensor<xpu, 2> out = out_data[kOut].FlatTo2D<xpu, real_t>(s);
     out = dot(data, wmat.T());
-    if (param_.no_bias == 0) {
+    if (!param_.no_bias) {
       Tensor<xpu, 1> bias = in_data[kBias].get<xpu, 1, real_t>(s);
       out += repmat(bias, data.size(0));
     }
@@ -65,12 +78,12 @@ class FullyConnectedOp : public Operator {
     using namespace mshadow;
     using namespace mshadow::expr;
     CHECK_EQ(out_grad.size(), 1);
-    size_t expected = param_.no_bias == 0 ? 3 : 2;
+    size_t expected = param_.no_bias ? 2 : 3;
     CHECK(in_data.size() == expected && in_grad.size() == expected);
     CHECK_EQ(req.size(), expected);
     // TODO(bing): check the BLAS Handle, be careful
     //  maybe need blas handle from context
-    Stream<xpu> *s = static_cast<Stream<xpu> *>(ctx.stream);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
     Tensor<xpu, 2> data = in_data[kData].FlatTo2D<xpu, real_t>(s);
     Tensor<xpu, 2> wmat = in_data[kWeight].get<xpu, 2, real_t>(s);
     Tensor<xpu, 2> grad = out_grad[kOut].FlatTo2D<xpu, real_t>(s);
@@ -80,7 +93,7 @@ class FullyConnectedOp : public Operator {
     Tensor<xpu, 2> gwmat = in_grad[kWeight].get<xpu, 2, real_t>(s);
     Assign(gwmat, req[kWeight], dot(grad.T(), data));
     // gradient of bias
-    if (param_.no_bias == 0) {
+    if (!param_.no_bias) {
       Tensor<xpu, 1> gbias = in_grad[kBias].get<xpu, 1, real_t>(s);
       Assign(gbias, req[kBias], sum_rows(grad));
     }
@@ -90,49 +103,57 @@ class FullyConnectedOp : public Operator {
   }
 
  private:
-  /** The param of the fully connected layer.*/
-  Param param_;
+  FullyConnectedParam param_;
 };  // class FullyConnectedOp
 
 // Decalre Factory function, used for dispatch specialization
 template<typename xpu>
-Operator* CreateFullyConnectedOp(Param param);
+Operator* CreateOp(FullyConnectedParam param);
 
 #if DMLC_USE_CXX11
 class FullyConnectedProp : public OperatorProperty {
  public:
   virtual std::vector<std::string> ListArguments() const {
-    if (param_.no_bias == 0) {
+    if (!param_.no_bias) {
       return {"data", "weight", "bias"};
     } else {
       return {"data", "weight"};
     }
   }
 
-  virtual void SetParam(const char *name, const char *val) {
-    param_.SetParam(name, val);
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    param_.Init(kwargs);
   }
 
   virtual bool InferShape(std::vector<TShape> *in_shape,
                           std::vector<TShape> *out_shape) const {
     using namespace mshadow;
-    if (param_.no_bias == 0) {
+    if (!param_.no_bias) {
       CHECK_EQ(in_shape->size(), 3) << "Input:[data, weight, bias]";
     } else {
       CHECK_EQ(in_shape->size(), 2) << "Input:[data, weight]";
     }
     CHECK_GT(param_.num_hidden, 0);
     const TShape &dshape = (*in_shape)[0];
-    CHECK_EQ(dshape.ndim(), 4) << \
-        "Input data should be 4D in batch-1-1-hidden";
-    CHECK_NE(dshape.ndim(), 0) << "Require data shape to be known";
-    ShapeAssignCheck((*in_shape)[kWeight], Shape2(param_.num_hidden, dshape[3]));
-    if (param_.no_bias == 0) {
-      ShapeAssignCheck((*in_shape)[kBias], Shape1(param_.num_hidden));
+    // require data to be known
+    if (dshape.ndim() ==  0) return false;
+
+    index_t num_input;
+    if (dshape.ndim() == 4) {
+      // TODO(bing) consider deprecate 4D input
+      CHECK(dshape[1] == 1 && dshape[2] == 1);
+      num_input = dshape[3];
+    } else {
+      CHECK_EQ(dshape.ndim(), 2)
+          << "FullyConnecteded: Input data should be 2D in (batch, num_hidden)";
+      num_input = dshape[1];
+    }
+    SHAPE_ASSIGN_CHECK(*in_shape, kWeight, Shape2(param_.num_hidden, num_input));
+    if (!param_.no_bias) {
+      SHAPE_ASSIGN_CHECK(*in_shape, kBias, Shape1(param_.num_hidden));
     }
     out_shape->clear();
-    out_shape->push_back(dshape);
-    (*out_shape)[0][3] = param_.num_hidden;
+    out_shape->push_back(Shape2(dshape[0], param_.num_hidden));
     return true;
   }
 
@@ -153,18 +174,18 @@ class FullyConnectedProp : public OperatorProperty {
     return {out_grad[kOut], in_data[kData], in_data[kWeight]};
   }
 
-  virtual std::vector<std::pair<int, int> > BackwardInplaceOption(
+  virtual std::vector<std::pair<int, void*> > BackwardInplaceOption(
       const std::vector<int> &out_grad,
       const std::vector<int> &in_data,
       const std::vector<int> &out_data,
-      const std::vector<int> &in_grad) const {
-    return {{in_grad[kData], in_data[kData]}};
+      const std::vector<void*> &in_grad) const {
+    return {{in_data[kData], in_grad[kData]}};
   }
 
   Operator* CreateOperator(Context ctx) const;
 
  private:
-  Param param_;
+  FullyConnectedParam param_;
 };  // class FullyConnectedSymbol
 #endif
 }  // namespace op
diff --git a/src/operator/fully_connected.cc b/src/operator/fully_connected.cc
index 362d3c5698aa..7d529cb3ed64 100644
--- a/src/operator/fully_connected.cc
+++ b/src/operator/fully_connected.cc
@@ -8,15 +8,17 @@
 namespace mxnet {
 namespace op {
 template<>
-Operator* CreateFullyConnectedOp<cpu>(Param param) {
+Operator* CreateOp<cpu>(FullyConnectedParam param) {
   return new FullyConnectedOp<cpu>(param);
 }
 
 // DO_BIND_DISPATCH comes from static_operator_common.h
 Operator* FullyConnectedProp::CreateOperator(Context ctx) const {
-  DO_BIND_DISPATCH(CreateFullyConnectedOp, param_);
+  DO_BIND_DISPATCH(CreateOp, param_);
 }
 
+DMLC_REGISTER_PARAMETER(FullyConnectedParam);
+
 REGISTER_OP_PROPERTY(FullyConnected, FullyConnectedProp);
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/fully_connected.cu b/src/operator/fully_connected.cu
index 223ef5166cc9..b97df8afb44c 100644
--- a/src/operator/fully_connected.cu
+++ b/src/operator/fully_connected.cu
@@ -7,7 +7,7 @@
 namespace mxnet {
 namespace op {
 template<>
-Operator* CreateFullyConnectedOp<gpu>(Param param) {
+Operator* CreateOp<gpu>(FullyConnectedParam param) {
   return new FullyConnectedOp<gpu>(param);
 }
 }  // namespace op
diff --git a/src/operator/static_operator/mshadow_op.h b/src/operator/mshadow_op.h
similarity index 87%
rename from src/operator/static_operator/mshadow_op.h
rename to src/operator/mshadow_op.h
index bb33471f168a..010cf0ce7cc9 100644
--- a/src/operator/static_operator/mshadow_op.h
+++ b/src/operator/mshadow_op.h
@@ -1,17 +1,18 @@
 /*!
  * Copyright (c) 2015 by Contributors
  * \file mshadow_op.h
- * \brief extra mshadow operation for mxnet
+ * \brief
  * \author Bing Xu
- */
-#ifndef MXNET_OPERATOR_STATIC_OPERATOR_MSHADOW_OP_H_
-#define MXNET_OPERATOR_STATIC_OPERATOR_MSHADOW_OP_H_
+*/
+#ifndef MXNET_OPERATOR_MSHADOW_OP_H_
+#define MXNET_OPERATOR_MSHADOW_OP_H_
+
 #include <mxnet/base.h>
-#include <algorithm>
 
 namespace mxnet {
-/*! \brief operations for ActivationLayer */
 namespace op {
+namespace mshadow_op {
+/*! \brief identity Operation */
 struct identity {
   MSHADOW_XINLINE static real_t Map(real_t a) {
     return a;
@@ -98,9 +99,7 @@ struct square_root {
     return sqrt(a);
   }
 };
-
+}  // namespace mshadow_op
 }  // namespace op
 }  // namespace mxnet
-
-#endif  // MXNET_OPERATOR_STATIC_OPERATOR_MSHADOW_OP_H_
-
+#endif  // MXNET_OPERATOR_MSHADOW_OP_H_
diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h
index 87b581f28278..eea731c8fbe6 100644
--- a/src/operator/operator_common.h
+++ b/src/operator/operator_common.h
@@ -11,6 +11,7 @@
 #include <dmlc/logging.h>
 #include <mxnet/operator.h>
 #include <mxnet/base.h>
+#include <string>
 
 namespace mxnet {
 namespace op {
@@ -34,20 +35,39 @@ inline void Assign(OType &out, // NOLINT(*)
     default: LOG(FATAL) << "not reached";
   }
 }
+
+/*! \brief exception thrown by InferShape error */
+struct InferShapeError {
+  /*! \brief analyze message */
+  std::string msg;
+  /*! \brief corresponding input index */
+  int index;
+  // constructor
+  InferShapeError(std::string msg, int index)
+      : msg(msg), index(index) {}
+};
+
 /*!
- * \brief assign shape to out if out is unknown
- *  otherwise check consistency
- * \param out the output shape to be stored
+ * \brief macro assign shape to out if out is unknown otherwise check consistency
+ *  Use macro so we can see the error file more clearly
+ * \param shape_array the shape array to store the result
+ * \param index the index in the array
  * \param shape the infered shape
  */
-template<typename TS>
-inline void ShapeAssignCheck(TShape &out, const TS &shape) { // NOLINT(*)
-  if (out.ndim() == 0) {
-    out = shape;
-  } else {
-    CHECK(out == shape) << "InferShape:: shape inconsistent";
+#define SHAPE_ASSIGN_CHECK(shape_array, index, shape)                   \
+  {                                                                     \
+    auto &out = (shape_array)[index];                                   \
+    if (out.ndim() == 0) {                                              \
+      out = shape;                                                      \
+    } else {                                                            \
+      if (out != shape) {                                               \
+        std::ostringstream os;                                          \
+        os << "Shape inconsistent, Provided " <<  '='<< out << ','      \
+           << " inferred shape=" << shape;                              \
+        throw ::mxnet::op::InferShapeError(os.str(), index);            \
+      }                                                                 \
+    }                                                                   \
   }
-}
 
 // helper macro to implement bind dispatch
 #if MXNET_USE_CUDA
diff --git a/src/operator/param.h b/src/operator/param.h
index e1f6b4ee58d8..f0ce5886e2fb 100644
--- a/src/operator/param.h
+++ b/src/operator/param.h
@@ -35,10 +35,6 @@ struct Param {
   int no_bias;
   /*! \brief maximum temp_col_size allowed in each layer */
   int temp_col_max;
-  /*! \brief number of input channels */
-  int num_input_channel;
-  /*! \brief number of input hidden nodes, used by fullc */
-  int num_input_node;
   /*! \brief reserved fields, for future compatibility */
   int reserved[64];
 
@@ -48,11 +44,9 @@ struct Param {
   }
 
   inline void SetParam(const char *name, const char* val) {
-    if (!strcmp(name, "nhidden")) num_hidden = atoi(val);
-    if (!strcmp(name, "num_input_node")) num_input_node = atoi(val);
-    if (!strcmp(name, "num_input_channel")) num_input_channel = atoi(val);
-    if (!strcmp(name, "nchannel")) num_channel = atoi(val);
-    if (!strcmp(name, "ngroup")) num_group = atoi(val);
+    if (!strcmp(name, "num_hidden")) num_hidden = atoi(val);
+    if (!strcmp(name, "num_channel")) num_channel = atoi(val);
+    if (!strcmp(name, "num_group")) num_group = atoi(val);
     if (!strcmp(name, "kernel_size")) {
       kernel_y = kernel_x = atoi(val);
     }
diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h
new file mode 100644
index 000000000000..8b223e2476a2
--- /dev/null
+++ b/src/operator/pooling-inl.h
@@ -0,0 +1,201 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file pooling-inl.h
+ * \brief
+ * \author Bing Xu
+*/
+
+#ifndef MXNET_OPERATOR_POOLING_INL_H_
+#define MXNET_OPERATOR_POOLING_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "./operator_common.h"
+
+namespace mxnet {
+namespace op {
+enum PoolingOpInputs {kData};
+enum PoolingOpOutputs {kOut};
+enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling};
+
+struct PoolingParam : public dmlc::Parameter<PoolingParam> {  // Pooling operator parameters
+  int kernel_x;  // pooling window size along x
+  int kernel_y;  // pooling window size along y
+  int stride_x;  // stride along x; in this file only InferShape reads it
+  int stride_y;  // stride along y
+  int pad_x;  // padding added on the x axis before pooling
+  int pad_y;  // padding added on the y axis before pooling
+  int type;  // pooling mode, one of the PoolingOpType values
+  DMLC_DECLARE_PARAMETER(PoolingParam) {
+    // TODO(bing) change to only set lower bound
+    DMLC_DECLARE_FIELD(kernel_x).set_range(1, 10000);
+    DMLC_DECLARE_FIELD(kernel_y).set_range(1, 10000);
+    DMLC_DECLARE_FIELD(stride_x).set_range(1, 10000);
+    DMLC_DECLARE_FIELD(stride_y).set_range(1, 10000);
+    DMLC_DECLARE_FIELD(pad_x).set_default(0).set_range(0, 10000);
+    DMLC_DECLARE_FIELD(pad_y).set_default(0).set_range(0, 10000);
+    DMLC_DECLARE_FIELD(type).set_default(kMaxPooling)\
+      .add_enum("max", kMaxPooling).add_enum("avg", kAvgPooling)\
+      .add_enum("sum", kSumPooling);
+  }
+};
+
+/*! \brief max / avg / sum pooling over the last two axes of a 4D blob */
+template<typename xpu, typename Reducer>
+class PoolingOp : public Operator {
+ public:
+  explicit PoolingOp(PoolingParam p) : param_(p) {}
+
+  /*! \brief pool in_data[kData] into out_data[kOut]; only req kWriteTo is accepted */
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(req[kOut], kWriteTo);
+    CHECK_EQ(in_data.size(), 1);
+    CHECK_EQ(out_data.size(), 1);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4> data = in_data[kData].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4> out = out_data[kOut].get<xpu, 4, real_t>(s);
+    mshadow::Shape<2> out_shape = Shape2(out.shape_[2], out.shape_[3]);
+    // TODO(bing): dual stride in mshadow; until then stride_y is applied to both axes
+    if (param_.type == kMaxPooling || param_.type == kSumPooling) {
+      out = pool<Reducer>(pad(data, param_.pad_y, param_.pad_x),
+                          out_shape,
+                          param_.kernel_y,
+                          param_.kernel_x,
+                          param_.stride_y);
+    } else if (param_.type == kAvgPooling) {  // sum reducer rescaled by 1 / window area
+      out = (1.0f / (param_.kernel_y * param_.kernel_x)) * \
+            pool<Reducer>(pad(data, param_.pad_y, param_.pad_x),
+                          out_shape,
+                          param_.kernel_y,
+                          param_.kernel_x,
+                          param_.stride_y);
+    }
+  }
+
+  /*! \brief unpool out_grad[kOut] into in_grad[kData], combined according to req[kData] */
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1);
+    CHECK_EQ(in_data.size(), 1);
+    CHECK_EQ(out_data.size(), 1);
+    CHECK_EQ(req.size(), 1);
+    CHECK_EQ(in_grad.size(), 1);
+    // TODO(bing): remove pad (0,0)
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4> grad = out_grad[kOut].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4> data = in_data[kData].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4> output_data = out_data[kOut].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4> input_grad = in_grad[kData].get<xpu, 4, real_t>(s);
+    mshadow::Shape<2> in_shape = Shape2(data.shape_[2], data.shape_[3]);  // crop target
+
+    if (param_.type == kMaxPooling || param_.type == kSumPooling) {
+      Assign(input_grad, req[kData],
+             crop(unpool<Reducer>(pad(data, param_.pad_y, param_.pad_x),
+                                  pad(output_data, 0, 0),
+                                  pad(grad, 0, 0),
+                                  param_.kernel_y,
+                                  param_.kernel_x,
+                                  param_.stride_y),
+                  in_shape,
+                  param_.pad_y,
+                  param_.pad_x));
+    } else if (param_.type == kAvgPooling) {
+      Assign(input_grad, req[kData],
+             (1.0f / param_.kernel_y / param_.kernel_x) *\
+             crop(unpool<Reducer>(pad(data, param_.pad_y, param_.pad_x),
+                                  pad(output_data, 0, 0),
+                                  pad(grad, 0, 0),
+                                  param_.kernel_y,
+                                  param_.kernel_x,
+                                  param_.stride_y),
+                  in_shape,
+                  param_.pad_y,
+                  param_.pad_x));
+    }
+  }
+
+ private:
+  PoolingParam param_;
+};  // class PoolingOp
+
+template<typename xpu>
+Operator* CreateOp(PoolingParam param);
+
+
+#if DMLC_USE_CXX11
+class PoolingProp : public OperatorProperty {  // shape inference and factory for PoolingOp
+ public:
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    param_.Init(kwargs);
+  }
+
+  /*! \brief infer the output shape; returns false while the input shape is still unknown */
+  virtual bool InferShape(std::vector<TShape> *in_shape,
+                          std::vector<TShape> *out_shape) const {
+    CHECK_EQ(in_shape->size(), 1);
+    const TShape &dshape = (*in_shape)[0];
+    if (dshape.ndim() == 0) return false;  // shape not known yet; must precede the 4D check
+    CHECK_EQ(dshape.ndim(), 4) << \
+      "Pooling: Input data should be 4D in (batch, channel, y, x)";
+    TShape oshape = dshape;
+    oshape[2] = std::min(dshape[2] + 2 * param_.pad_y - param_.kernel_y + param_.stride_y - 1,
+                         dshape[2] + 2 * param_.pad_y - 1) / param_.stride_y + 1;
+    oshape[3] = std::min(dshape[3] + 2 * param_.pad_x - param_.kernel_x + param_.stride_x - 1,
+                         dshape[3] + 2 * param_.pad_x - 1) / param_.stride_x + 1;
+    CHECK(oshape[2] > 0 && oshape[3] > 0) << "Pooling: kernel size exceed input";
+    out_shape->clear();
+    out_shape->push_back(oshape);
+    return true;
+  }
+
+  virtual OperatorProperty* Copy() const {
+    PoolingProp *prop_sym = new PoolingProp();
+    prop_sym->param_ = this->param_;
+    return prop_sym;
+  }
+
+  virtual std::string TypeString() const { return "Pooling"; }
+
+  virtual std::vector<int> DeclareBackwardDependency(
+      const std::vector<int> &out_grad,
+      const std::vector<int> &in_data,
+      const std::vector<int> &out_data) const {
+    return {out_grad[kOut], in_data[kData], out_data[kOut]};
+  }
+
+  // NOTE(review): GraphExecutor passes in_grad as std::vector<void*>; verify this overrides
+  virtual std::vector<std::pair<int, int> > BackwardInplaceOption(
+      const std::vector<int> &out_grad,
+      const std::vector<int> &in_data,
+      const std::vector<int> &out_data,
+      const std::vector<int> &in_grad) const {
+    return {{in_data[kData], in_grad[kData]}};
+  }
+
+  Operator* CreateOperator(Context ctx) const;
+
+ private:
+  PoolingParam param_;
+};  // class PoolingProp
+#endif  // DMLC_USE_CXX11
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_POOLING_INL_H_
diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc
new file mode 100644
index 000000000000..a6ebc91e0873
--- /dev/null
+++ b/src/operator/pooling.cc
@@ -0,0 +1,34 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file pooling.cc
+ * \brief
+ * \author Bing Xu
+*/
+
+#include <mxnet/registry.h>
+#include "./pooling-inl.h"
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<cpu>(PoolingParam param) {
+  switch (param.type) {  // avg reuses red::sum; PoolingOp rescales the result
+    case kMaxPooling: return new PoolingOp<cpu, mshadow::red::maximum>(param);
+    case kAvgPooling: return new PoolingOp<cpu, mshadow::red::sum>(param);
+    case kSumPooling: return new PoolingOp<cpu, mshadow::red::sum>(param);
+    default:
+      LOG(FATAL) << "unknown pooling type";
+      return NULL;
+  }
+}
+
+Operator* PoolingProp::CreateOperator(Context ctx) const {
+  DO_BIND_DISPATCH(CreateOp, param_);  // presumably picks CreateOp<cpu>/<gpu> from ctx - confirm
+}
+
+DMLC_REGISTER_PARAMETER(PoolingParam);
+
+REGISTER_OP_PROPERTY(Pooling, PoolingProp);
+}  // namespace op
+}  // namespace mxnet
+
diff --git a/src/operator/pooling.cu b/src/operator/pooling.cu
new file mode 100644
index 000000000000..2db6d9ea549a
--- /dev/null
+++ b/src/operator/pooling.cu
@@ -0,0 +1,26 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file pooling.cu
+ * \brief
+ * \author Bing Xu
+*/
+
+#include "./pooling-inl.h"
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<gpu>(PoolingParam param) {
+  switch (param.type) {  // avg reuses red::sum; PoolingOp rescales the result
+    case kMaxPooling: return new PoolingOp<gpu, mshadow::red::maximum>(param);
+    case kAvgPooling: return new PoolingOp<gpu, mshadow::red::sum>(param);
+    case kSumPooling: return new PoolingOp<gpu, mshadow::red::sum>(param);
+    default:
+      LOG(FATAL) << "unknown pooling type";
+      return NULL;
+  }
+}
+
+}  // namespace op
+}  // namespace mxnet
+
diff --git a/src/operator/static_operator/activation_op-inl.h b/src/operator/static_operator/activation_op-inl.h
deleted file mode 100644
index cfb0b7cec8b5..000000000000
--- a/src/operator/static_operator/activation_op-inl.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*!
- *  Copyright (c) 2015 by Contributors
- * \file activation_op-inl.h
- * \brief activation operator of mxnet
- */
-
-#ifndef MXNET_OPERATOR_STATIC_OPERATOR_ACTIVATION_OP_INL_H_
-#define MXNET_OPERATOR_STATIC_OPERATOR_ACTIVATION_OP_INL_H_
-
-#include <dmlc/logging.h>
-#include <mxnet/operator.h>
-#include <vector>
-#include "./static_operator_common.h"
-
-namespace mxnet {
-namespace op {
-template<typename xpu, typename ForwardOp, typename BackOp>
-class ActivationOp : public StaticOperator {
- public:
-  virtual void InferShape(std::vector<TShape> *in_shape,
-                          std::vector<TShape> *out_shape) {
-    CHECK_EQ(in_shape->size(), 1) << "Only 1 input is allowed";
-    CHECK_NE((*in_shape)[0].ndim(), 0) << "Require data shape to be known";
-    out_shape->clear();
-    out_shape->push_back((*in_shape)[0]);
-  }
-  virtual void Forward(Option opt,
-                       RunContext ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<TBlob> &out_data) {
-    CHECK_EQ(out_data.size(), 1);
-    CHECK_EQ(in_data.size(), 1);
-    mshadow::Stream<xpu> *stream = \
-      static_cast<mshadow::Stream<xpu> *>(ctx.stream);
-    mshadow::Tensor<xpu, 2> in = in_data[0].FlatTo2D<xpu, real_t>(stream);
-    mshadow::Tensor<xpu, 2> out = out_data[0].FlatTo2D<xpu, real_t>(stream);
-    out = mshadow::expr::F<ForwardOp>(in);
-  }
-  virtual void Backward(RunContext ctx,
-                        const std::vector<TBlob> &grad_next,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<GradReqType> &req) {
-    CHECK_EQ(grad_next.size(), 1);
-    CHECK_EQ(in_data.size(), 1);
-    CHECK_EQ(out_grad.size(), 1);
-    CHECK_EQ(req.size(), 1);
-    mshadow::Stream<xpu> *stream = \
-      static_cast<mshadow::Stream<xpu> *>(ctx.stream);
-    mshadow::Tensor<xpu, 2> grad = grad_next[0].FlatTo2D<xpu, real_t>(stream);
-    mshadow::Tensor<xpu, 2> data = in_data[0].FlatTo2D<xpu, real_t>(stream);
-    mshadow::Tensor<xpu, 2> out = out_grad[0].FlatTo2D<xpu, real_t>(stream);
-    Assign(out, req[0], mshadow::expr::F<BackOp>(
-        mshadow::expr::F<ForwardOp>(data)) * grad);
-  }
-};  // class ActivationOp
-}  // namespace op
-}  // namespace mxnet
-
-#endif  // MXNET_OPERATOR_STATIC_OPERATOR_ACTIVATION_OP_INL_H_
diff --git a/src/operator/static_operator/pooling_op-inl.h b/src/operator/static_operator/pooling_op-inl.h
deleted file mode 100644
index 8c6014a8c2cf..000000000000
--- a/src/operator/static_operator/pooling_op-inl.h
+++ /dev/null
@@ -1,153 +0,0 @@
-/*!
- * Copyright (c) 2015 by Contributors
- * \file pooling_op-inl.h
- * \brief pooling operator
- * \author Bing Xu
-*/
-#ifndef MXNET_OPERATOR_STATIC_OPERATOR_POOLING_OP_INL_H_
-#define MXNET_OPERATOR_STATIC_OPERATOR_POOLING_OP_INL_H_
-
-#include <mxnet/operator.h>
-#include <algorithm>
-#include <vector>
-#include "./param.h"
-#include "./static_operator_common.h"
-
-
-namespace mxnet {
-namespace op {
-template<typename xpu, typename Reducer, OpType mode>
-class PoolingOp : public StaticOperator {
- public:
-  virtual void SetParam(const char *name, const char *val) {
-    param_.SetParam(name, val);
-  }
-  virtual void InferShape(std::vector<TShape> *in_shape,
-                          std::vector<TShape> *out_shape) {
-    CHECK_EQ(in_shape->size(), 1) << "Input: [data]";
-    CHECK_GT(param_.kernel_y, 0);
-    CHECK_GT(param_.kernel_x, 0);
-    const int ksize_y = static_cast<index_t>(param_.kernel_y);
-    const int ksize_x = static_cast<index_t>(param_.kernel_x);
-    const int pad_y = static_cast<index_t>(param_.pad_y);
-    const int pad_x = static_cast<index_t>(param_.pad_x);
-    // TODO(bing): dual stride
-    const int kstride = static_cast<index_t>(param_.stride_y);
-    mshadow::Shape<4> ishape = (*in_shape)[0].get<4>();
-    oshape_ = ishape;
-    fea_shape_ = mshadow::Shape2(ishape[2], ishape[3]);
-    oshape_[2] = std::min(ishape[2] + 2 * pad_y - ksize_y + kstride - 1,
-                             ishape[2] + 2 * pad_y - 1) / kstride + 1;
-    oshape_[3] = std::min(ishape[3] + 2 * pad_x - ksize_x + kstride - 1,
-                             ishape[3] + 2 * pad_x - 1) / kstride + 1;
-    CHECK(oshape_[2] > 0 && oshape_[3] > 0) << "kernel size exceed input";
-    out_shape->clear();
-    out_shape->push_back((*in_shape)[0]);
-    (*out_shape)[0][2] = oshape_[2];
-    (*out_shape)[0][3] = oshape_[3];
-  }
-  virtual void Forward(Option opt,
-                       RunContext ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<TBlob> &out_data) {
-    CHECK_EQ(in_data.size(), 1);
-    CHECK_EQ(out_data.size(), 0);
-    if (!(temp_.shape_ == oshape_)) {
-      temp_.Resize(oshape_);
-    }
-    const int ksize_y = param_.kernel_y;
-    const int ksize_x = param_.kernel_x;
-    const int pad_y = param_.pad_y;
-    const int pad_x = param_.pad_x;
-    // TODO(bing): dual stride
-    const int kstride = param_.stride_y;
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    Stream<xpu> *s = static_cast<Stream<xpu> *>(ctx.stream);
-    Tensor<xpu, 4> data = in_data[0].get<xpu, 4, real_t>(s);
-    Tensor<xpu, 4> out = out_data[0].get<xpu, 4, real_t>(s);
-    mshadow::Shape<2> pshape = Shape2(out.shape_[2], out.shape_[3]);
-    if (mode == kMaxPooling || mode == kSumPooling) {
-      temp_ = pool<Reducer>(pad(data, pad_y, pad_x),
-                          pshape,
-                          ksize_y,
-                          ksize_x,
-                          kstride);
-    } else if (mode == kAvgPooling) {
-      temp_ = (1.0f / (ksize_y * ksize_x)) * \
-            pool<Reducer>(pad(data, pad_y, pad_x),
-                          pshape,
-                          ksize_y,
-                          ksize_x,
-                          kstride);
-    } else {
-      LOG(FATAL) << "Unknown pooling mode";
-    }
-    Copy(out, temp_, s);
-  }
-  virtual void Backward(RunContext ctx,
-                        const std::vector<TBlob> &grad_next,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<GradReqType> &req) {
-    CHECK_EQ(grad_next.size(), 1);
-    CHECK_EQ(in_data.size(), 1);
-    CHECK_EQ(out_grad.size(), 1);
-    CHECK_EQ(req.size(), 1);
-    const int ksize_y = param_.kernel_y;
-    const int ksize_x = param_.kernel_x;
-    const int pad_y = param_.pad_y;
-    const int pad_x = param_.pad_x;
-    // TODO(bing): dual stride
-    const int kstride = param_.stride_y;
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    Stream<xpu> *s = static_cast<Stream<xpu> *>(ctx.stream);
-    Tensor<xpu, 4> grad = grad_next[0].get<xpu, 4, real_t>(s);
-    Tensor<xpu, 4> data = in_data[0].get<xpu, 4, real_t>(s);
-    Tensor<xpu, 4> out = out_grad[0].get<xpu, 4, real_t>(s);
-    if (mode == kMaxPooling || mode == kSumPooling) {
-      Assign(out,
-             req[0],
-             crop(unpool<Reducer>(pad(data, pad_y, pad_x),
-                                  pad(temp_, 0, 0),
-                                  pad(grad, 0, 0),
-                                  ksize_y,
-                                  ksize_x,
-                                  kstride),
-                  fea_shape_,
-                  pad_y,
-                  pad_x));
-    } else if (mode == kAvgPooling) {
-      Assign(out,
-             req[0],
-             (1.0f / (ksize_y * ksize_x)) * \
-             crop(unpool<Reducer>(pad(data, pad_y, pad_x),
-                                  pad(temp_, 0, 0),
-                                  pad(grad, 0, 0),
-                                  ksize_y,
-                                  ksize_x,
-                                  kstride),
-                  fea_shape_,
-                  pad_y,
-                  pad_x));
-    } else {
-      LOG(FATAL) << "Unknown pooling mode";
-    }
-  }
-
- private:
-  /*! \brief parameters that potentially be useful */
-  Param param_;
-  /*! \brief temp space to save pooled result */
-  mshadow::TensorContainer<xpu, 4> temp_;
-  /*! \brief pooled output shape */
-  mshadow::Shape<4> oshape_;
-  /*! \brief input feature map shape */
-  mshadow::Shape<2> fea_shape_;
-};  // class PoolingOp
-
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_STATIC_OPERATOR_POOLING_OP_INL_H_
diff --git a/src/registry.cc b/src/registry.cc
index 42fef1df3423..f64980d8bacc 100644
--- a/src/registry.cc
+++ b/src/registry.cc
@@ -25,12 +25,18 @@ Registry<Entry> *Registry<Entry>::Get() {
   return &instance;
 }
 
-#if DMLC_USE_CXX11
+
 template NArrayFunctionEntry &Registry<NArrayFunctionEntry>::Register(const std::string& name);
 template Registry<NArrayFunctionEntry> *Registry<NArrayFunctionEntry>::Get();
-#endif
 
 template OperatorPropertyEntry &Registry<OperatorPropertyEntry>::Register(const std::string& name);
 template Registry<OperatorPropertyEntry> *Registry<OperatorPropertyEntry>::Get();
 
+// implementation of all factory functions
+OperatorProperty *OperatorProperty::Create(const char* type_name) {
+  auto *creator = Registry<OperatorPropertyEntry>::Find(type_name);  // look up entry by name
+  CHECK_NE(creator, nullptr)  // the check fails when no entry is registered under type_name
+      << "Cannot find Operator " << type_name << " in registry";
+  return (*creator)();  // invoke the registered entry to obtain the property object
+}
 }  // namespace mxnet
diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc
new file mode 100644
index 000000000000..a434f22a2fc6
--- /dev/null
+++ b/src/symbol/graph_executor.cc
@@ -0,0 +1,496 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file graph_executor.cc
+ * \brief Executor to execute the Graph.
+*/
+#include <dmlc/logging.h>
+#include <mxnet/symbolic.h>
+#include <memory>
+#include "./graph_executor.h"
+
+namespace mxnet {
+/*!
+ * \brief wrapper class that wraps Backward operation as Forward.
+ */
+class GraphExecutor::BackwardOpWrapper : public Operator {
+ public:
+  /*!
+   * \brief build a wrapper that exposes forward_op->Backward through Forward.
+   * \param prop property of the forward operator; sizes the internal buffers
+   *  and decides which buffer slots the wrapper inputs are written to.
+   * \param forward_op shared pointer to the forward operator
+   */
+  explicit BackwardOpWrapper(const OperatorProperty *prop,
+                             std::shared_ptr<Operator> forward_op)
+      : op_(forward_op) {
+    out_grad_.resize(prop->NumVisibleReturns());
+    in_data_.resize(prop->ListArguments().size());
+    out_data_.resize(prop->NumReturns());
+    // hand the property the address of every slot; it returns, in input
+    // order, the slots that Forward has to fill in
+    std::vector<TBlob*> out_grad_ptr = AddressesOf(&out_grad_);
+    std::vector<TBlob*> in_data_ptr = AddressesOf(&in_data_);
+    std::vector<TBlob*> out_data_ptr = AddressesOf(&out_data_);
+    arg_data_ptr_ = prop->BackwardInputs(
+        out_grad_ptr, in_data_ptr, out_data_ptr);
+  }
+  /*! \brief copy the incoming blobs into their slots, then run Backward of the wrapped op */
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data) {
+    CHECK(arg_data_ptr_.size() == in_data.size());
+    size_t slot = 0;
+    for (const TBlob &blob : in_data) {
+      *(arg_data_ptr_[slot++]) = blob;
+    }
+    // the wrapper's outputs are passed as the in_grad argument of Backward
+    op_->Backward(ctx, out_grad_, in_data_, out_data_, req, out_data);
+  }
+
+ private:
+  /*! \brief collect a pointer to every element of *blobs, in order */
+  static std::vector<TBlob*> AddressesOf(std::vector<TBlob> *blobs) {
+    std::vector<TBlob*> ptrs;
+    ptrs.reserve(blobs->size());
+    for (TBlob &blob : *blobs) {
+      ptrs.push_back(&blob);
+    }
+    return ptrs;
+  }
+  /*! \brief the wrapped forward operator */
+  std::shared_ptr<Operator> op_;
+  /*! \brief slots passed as out_grad to Backward */
+  std::vector<TBlob> out_grad_;
+  /*! \brief slots passed as in_data to Backward */
+  std::vector<TBlob> in_data_;
+  /*! \brief slots passed as out_data to Backward */
+  std::vector<TBlob> out_data_;
+  /*!
+   * \brief arg_data_ptr_[i] points at the slot that receives in_data[i] of Forward.
+   */
+  std::vector<TBlob*> arg_data_ptr_;
+};
+
+// resource requests of a node; backward nodes ask the property of their forward source
+inline std::vector<ResourceRequest>
+GraphExecutor::GetResource(uint32_t node_id) const {
+  const StaticGraph::Node &cur = graph_.nodes[node_id];
+  if (!cur.is_forward()) {
+    CHECK(cur.is_backward());
+    const StaticGraph::Node &source = graph_.nodes[cur.backward_source_id];
+    return source.op->BackwardResource();
+  }
+  return cur.op->ForwardResource();
+}
+
+inline int GraphExecutor::GetNumOutputs(uint32_t node_id) const {
+  const StaticGraph::Node &cur = graph_.nodes[node_id];
+  if (cur.is_forward()) return cur.op->NumReturns();
+  if (cur.is_backward()) {
+    // one output per argument listed by the forward source's property
+    const StaticGraph::Node &source = graph_.nodes[cur.backward_source_id];
+    return static_cast<int>(source.op->ListArguments().size());
+  }
+  // anything else has to be a variable, which has a single output
+  CHECK(cur.is_variable());
+  return 1;
+}
+
+// map the inplace options declared by a node's property onto caller entries of type T
+template<typename T>
+inline std::vector<std::pair<T, T> > GraphExecutor::GetInplaceOption(
+    uint32_t node_id,
+    const std::vector<T> &in_data,
+    const std::vector<T> &out_data) const {
+  // only forward and backward nodes are handled; anything else fails the CHECK below
+  const StaticGraph::Node &node = graph_.nodes[node_id];
+
+  if (node.is_forward()) {
+    std::vector<int> in_data_index(in_data.size());  // inputs are identified by position
+    for (size_t i = 0; i < in_data.size(); ++i) {
+      in_data_index[i] = static_cast<int>(i);
+    }
+    std::vector<void*> out_data_ptr(out_data.size());  // outputs travel as opaque pointers
+    for (size_t i = 0; i < out_data.size(); ++i) {
+      out_data_ptr[i] = (void*)&out_data[i];  // NOLINT(*)
+    }
+    auto rmap_index = node.op->ForwardInplaceOption(in_data_index, out_data_ptr);
+    std::vector<std::pair<T, T> > remap(rmap_index.size());
+    for (size_t i = 0; i < remap.size(); ++i) {
+      remap[i].first = in_data[rmap_index[i].first];
+      remap[i].second = *static_cast<const T*>(rmap_index[i].second);
+    }
+    return remap;  // plain return: std::move here would only inhibit copy elision
+  } else {
+    CHECK(node.is_backward());
+    // property of the forward node this backward node refers to
+    const OperatorProperty *fwd = graph_.nodes[node.backward_source_id].op.get();
+
+    std::vector<int> out_grad_index(fwd->NumVisibleReturns());
+    std::vector<int> in_data_index(fwd->ListArguments().size());
+    std::vector<int> out_data_index(fwd->NumReturns());
+    CHECK_EQ(in_data_index.size(), out_data.size());
+    int counter = 0;  // number the slots: out_grad first, then in_data, then out_data
+    for (size_t i = 0; i < out_grad_index.size(); ++i) {
+      out_grad_index[i] = counter++;
+    }
+    for (size_t i = 0; i < in_data_index.size(); ++i) {
+      in_data_index[i] = counter++;
+    }
+    for (size_t i = 0; i < out_data_index.size(); ++i) {
+      out_data_index[i] = counter++;
+    }
+    auto args_index = fwd->DeclareBackwardDependency(
+        out_grad_index, in_data_index, out_data_index);
+    std::vector<const T*> args_array(counter, nullptr);  // slot id -> bound input
+    CHECK_EQ(args_index.size(), in_data.size());
+    for (size_t i = 0; i < in_data.size(); ++i) {
+      args_array[args_index[i]] = &in_data[i];
+    }
+    std::vector<void*> in_grad_ptr(out_data.size());
+    for (size_t i = 0; i < in_grad_ptr.size(); ++i) {
+      in_grad_ptr[i] = (void*)&out_data[i];  // NOLINT(*)
+    }
+    auto remap_index = fwd->BackwardInplaceOption(
+        out_grad_index, in_data_index, out_data_index, in_grad_ptr);
+    std::vector<std::pair<T, T> > remap(remap_index.size());
+    for (size_t i = 0; i < remap_index.size(); ++i) {
+      CHECK_NE(args_array[remap_index[i].first], nullptr)
+          << "BackwardInplaceOption uses input that is returned by DeclareBackwardDependency";
+      remap[i].first = *args_array[remap_index[i].first];
+      remap[i].second = *static_cast<T*>(remap_index[i].second);
+    }
+    return remap;  // plain return: std::move here would only inhibit copy elision
+  }
+}
+
+// build the executable closure of node nid together with the variables it reads and mutates
+inline GraphExecutor::OpExecEntry GraphExecutor::GetOpExecEntry(uint32_t nid) {
+  OpNode& op_node = op_nodes_[nid];
+  Operator *op = op_node.op.get();
+  std::vector<OpReqType> req;
+  std::vector<TBlob> in_data, out_data;
+  in_data.reserve(graph_.nodes[nid].inputs.size());
+  out_data.reserve(op_node.outputs.size());
+  req.reserve(op_node.outputs.size());
+
+  OpExecEntry exec;
+  for (const DataEntryInfo& out : op_node.outputs) {
+    out_data.push_back(out.data.data());
+    exec.mutate_vars.push_back(out.data.var());
+    req.push_back(out.op_req);
+  }
+
+  for (StaticGraph::DataEntry e : graph_.nodes[nid].inputs) {
+    const DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index];
+    in_data.push_back(info.data.data());
+    // skip inplace since they already appear in mutate vars
+    if (info.inplace_op_id != static_cast<int>(nid)) {
+      exec.use_vars.push_back(info.data.var());
+    }
+  }
+
+  OpContext* op_ctx_ptr = &op_node.op_ctx;  // raw pointers: op_node must outlive the closure
+  exec.exec_fun = [op, op_ctx_ptr, in_data, req, out_data] (RunContext ctx) {
+    op_ctx_ptr->run_ctx = ctx;
+    op->Forward(*op_ctx_ptr, in_data, req, out_data);
+  };
+  return exec;  // plain return: std::move here would only inhibit copy elision
+}
+
+void GraphExecutor::InitGraph(Symbol symbol, Context ctx, bool need_backward) {
+  // initialize all internal data structures
+  symbol.ToStaticGraph(&graph_);
+  num_forward_nodes_  = graph_.nodes.size();  // taken before MakeBackwardPass is called
+  if (need_backward) {
+    graph_.MakeBackwardPass(&head_grad_nodes_, &arg_grads_);
+  }
+  // reorganize so backward node always follow forward
+  // note that this may not be the case, because existence of head_grad_nodes
+  std::vector<uint32_t> topo = graph_.TopoSort();
+  std::vector<uint32_t>  backward;
+  for (uint32_t nid : topo) {
+    if (nid < num_forward_nodes_) {  // ids below the recorded count are treated as forward
+      topo_order_.push_back(nid);
+    } else {
+      backward.push_back(nid);
+    }
+  }
+  topo_order_.insert(topo_order_.end(), backward.begin(), backward.end());
+  // setup all the operator nodes data structure
+  op_nodes_.resize(graph_.nodes.size());
+  for (size_t i = 0; i < graph_.nodes.size(); ++i) {
+    op_nodes_[i].ctx = ctx;  // every node is given the same context
+    op_nodes_[i].outputs.resize(GetNumOutputs(i));
+  }
+}
+
+void GraphExecutor::InitDataEntryInfo(const std::vector<NArray> &in_args,
+                                      const std::vector<NArray> &arg_grad_store,
+                                      const std::vector<OpReqType> &grad_req_type) {
+  CHECK_EQ(arg_grad_store.size(), grad_req_type.size());  // one request type per grad store
+  CHECK_EQ(in_args.size(), graph_.arg_nodes.size());  // one NArray per argument node
+  // bind inputs: output 0 of every argument node is backed by the caller's NArray
+  for (size_t i = 0; i < graph_.arg_nodes.size(); ++i) {
+    DataEntryInfo &info = op_nodes_[graph_.arg_nodes[i]].outputs[0];
+    info.type = kBindByExternal;
+    info.data = in_args[i];
+  }
+  // setup ref for head nodes: count one reference per head and activate its producer
+  for (StaticGraph::DataEntry e : graph_.heads) {
+    DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index];
+    ++info.ref_count;
+    op_nodes_[e.source_id].activated = true;
+  }
+  // need Backward pass (arg_grads_ is only filled when InitGraph ran MakeBackwardPass)
+  if (arg_grads_.size() != 0) {
+    CHECK_EQ(arg_grads_.size(), arg_grad_store.size());
+    CHECK_EQ(arg_grads_.size(), grad_req_type.size());
+    // setup gradient placeholders
+    for (size_t i = 0; i < arg_grads_.size(); ++i) {
+      if (grad_req_type[i] == kNullOp) continue;  // gradient not requested for this argument
+      CHECK_NE(grad_req_type[i], kWriteInplace)
+          << "Gradient request can only be nullop, add, write";
+      std::vector<StaticGraph::DataEntry> &grad_source = arg_grads_[i];
+      CHECK_GE(grad_source.size(), 1);
+      // TODO(bing) add a aggregation node here
+      if (grad_source.size() > 1) {  // several sources share one store, so only kAddTo is allowed
+        CHECK_EQ(grad_req_type[i], kAddTo)
+            << "The gradient contains multiple variables,";
+      }
+      for (StaticGraph::DataEntry e : grad_source) {
+        DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index];
+        info.type = kBindByExternal;
+        info.op_req = grad_req_type[i];
+        info.data = arg_grad_store[i];  // the source writes into the caller's gradient NArray
+        ++info.ref_count;
+        op_nodes_[e.source_id].activated = true;
+      }
+    }
+    // setup head gradient (presumably bound by the caller later - confirm against the binder)
+    for (uint32_t nid : head_grad_nodes_) {
+      DataEntryInfo &info = op_nodes_[nid].outputs[0];
+      info.type = kTobeBindByExternal;
+    }
+  }
+  // update ref counters for all other nodes, in reverse topo order:
+  for (auto it = topo_order_.rbegin(); it != topo_order_.rend(); ++it) {
+    uint32_t nid = *it;
+    if (op_nodes_[nid].activated) {  // an activated node activates the producers of its inputs
+      for (StaticGraph::DataEntry e : graph_.nodes[nid].inputs) {
+        DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index];
+        ++info.ref_count;
+        op_nodes_[e.source_id].activated = true;
+      }
+    }
+  }
+
+  // shape inference, seeded with the shapes of the bound arguments
+  std::vector<std::vector<TShape> > out_shapes(op_nodes_.size());
+  for (size_t i = 0; i < out_shapes.size(); ++i) {
+    out_shapes[i].resize(op_nodes_[i].outputs.size());
+  }
+  for (size_t i = 0; i < graph_.arg_nodes.size(); ++i) {
+    out_shapes[graph_.arg_nodes[i]][0] = in_args[i].shape();
+  }
+  CHECK(graph_.InferNodeShapes(topo_order_, &out_shapes))
+      << "Shape inference cannot be complete in bind";
+  for (size_t i = 0; i < out_shapes.size(); ++i) {  // copy inferred shapes into the entries
+    for (size_t j = 0; j < out_shapes[i].size(); ++j) {
+      op_nodes_[i].outputs[j].shape = out_shapes[i][j];
+    }
+  }
+}
+
+void GraphExecutor::InitDataEntryMemory() {
+  // use allocator to allocate memory: this loop only assigns storage ids,
+  GraphStorageAllocator allocator(&graph_);  // real memory is created by InitStorages() below
+  for (size_t i = 0; i < topo_order_.size(); ++i) {
+    uint32_t nid = topo_order_[i];
+    if (!op_nodes_[nid].activated) continue;
+    if (graph_.nodes[nid].is_variable()) continue;
+
+    // check inplace option
+    std::vector<DataEntryInfo*> in_data;
+    in_data.reserve(graph_.nodes[nid].inputs.size());
+    // check inputs are ready.
+    for (StaticGraph::DataEntry e : graph_.nodes[nid].inputs) {
+      DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index];
+      CHECK_NE(info.type, kNotInitialized);
+      CHECK_NE(info.ref_count, 0);
+      in_data.push_back(&info);
+    }
+    std::vector<DataEntryInfo*> out_data(op_nodes_[nid].outputs.size());
+    for (size_t i = 0; i < op_nodes_[nid].outputs.size(); ++i) {  // NOTE: shadows the outer i
+      out_data[i] = &op_nodes_[nid].outputs[i];
+      CHECK_NE(out_data[i]->type, kInternalAllocated);
+    }
+    auto inplace = GetInplaceOption(nid, in_data, out_data);
+
+    for (std::pair<DataEntryInfo*, DataEntryInfo*> kv : inplace) {
+      DataEntryInfo* in = kv.first;
+      DataEntryInfo* out = kv.second;
+      if (in->ref_count == 1 &&
+          in->type == kInternalAllocated &&
+          out->type == kNotInitialized) {
+        // we can only do inplace if we are last user of in
+        // and out is not initialized.
+        out->type = kInternalAllocated;
+        out->op_req = kWriteInplace;
+        out->storage_id = in->storage_id;  // output reuses the input's storage
+        // set inplace op id
+        in->ref_count = 0;
+        in->inplace_op_id = static_cast<int>(nid);
+      }
+    }
+    // allocate output,
+    for (DataEntryInfo *out : out_data) {
+      if (out->op_req == kNullOp && out->ref_count != 0) {
+        out->op_req = kWriteTo;  // referenced output with no request yet: write it
+      }
+      if (out->type == kNotInitialized) {
+        out->storage_id = allocator.Request(
+            op_nodes_[nid].ctx, out->shape, nid);
+        out->type = kInternalAllocated;
+      }
+    }
+    // then free inputs
+    for (DataEntryInfo *in : in_data) {
+      // ref_count == 0 means it is taken by inplace op
+      if (in->ref_count == 0) {
+        CHECK_EQ(in->inplace_op_id, static_cast<int>(nid));
+        continue;
+      }
+      // if we decrease it to zero, it means we are ready to release
+      --in->ref_count;
+      if (in->ref_count == 0 && in->type == kInternalAllocated) {
+        allocator.Release(in->storage_id, nid);
+      }
+    }
+    // check out again, if there is ref_count == 0, release it
+    for (DataEntryInfo *out : out_data) {
+      if (out->ref_count == 0 && out->type == kInternalAllocated) {
+        allocator.Release(out->storage_id, nid);
+      }
+    }
+  }
+  // one pass complete, allocate real memory
+  allocator.InitStorages();
+  // get the real data NArray into the DataEntryInfo
+  for (size_t i = 0; i < topo_order_.size(); ++i) {
+    uint32_t nid = topo_order_[i];
+    if (!op_nodes_[nid].activated) continue;
+    for (DataEntryInfo &out : op_nodes_[nid].outputs) {
+      CHECK_NE(out.type, kNotInitialized);
+      if (out.type == kInternalAllocated) {  // externally bound entries already carry data
+        out.data = allocator.Get(out.storage_id, out.shape);
+      }
+    }
+  }
+  for (StaticGraph::DataEntry e : graph_.heads) {
+    DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index];
+    CHECK_EQ(info.type, kInternalAllocated);  // head outputs must be executor-allocated
+    heads_narray_.push_back(info.data);
+  }
+}
+
+void GraphExecutor::InitOpNodes() {  // create the operator of every active op node
+  for (size_t i = 0; i < topo_order_.size(); ++i) {
+    uint32_t nid = topo_order_[i];
+    if (!op_nodes_[nid].activated) continue;
+    if (graph_.nodes[nid].is_variable()) continue;
+    OpNode& op_node = op_nodes_[nid];
+    if (graph_.nodes[nid].is_forward()) {
+      op_node.op.reset(graph_.nodes[nid].op->CreateOperator(op_node.ctx));
+    } else {
+      CHECK(graph_.nodes[nid].is_backward());  // non-variable nodes are forward or backward
+      op_node.op.reset(new BackwardOpWrapper(  // wraps the forward node's property and operator
+          graph_.nodes[graph_.nodes[nid].backward_source_id].op.get(),
+          op_nodes_[graph_.nodes[nid].backward_source_id].op));
+    }
+    bool allow_cache = true;  // cleared when any input/output is kTobeBindByExternal
+    for (StaticGraph::DataEntry e : graph_.nodes[nid].inputs) {
+      DataEntryInfo& info = op_nodes_[e.source_id].outputs[e.index];
+      if (info.type == kTobeBindByExternal) allow_cache = false;
+    }
+    for (DataEntryInfo& info : op_node.outputs) {
+      if (info.type == kTobeBindByExternal) allow_cache = false;
+    }
+    if (allow_cache) {  // all entries are known now, so build the exec entry once
+      op_node.cached_exec = GetOpExecEntry(nid);
+    }
+  }
+}
+
+void GraphExecutor::RunOps(size_t topo_start, size_t topo_end) {  // range is [start, end)
+  for (size_t i = topo_start; i < topo_end; ++i) {
+    uint32_t nid = topo_order_[i];
+    if (!op_nodes_[nid].activated) continue;
+    if (graph_.nodes[nid].is_variable()) continue;
+    OpNode& opnode = op_nodes_[nid];
+    if (opnode.cached_exec.exec_fun != nullptr) {  // entry was pre-built in InitOpNodes
+      DAGEngine::Get()->Push(
+          opnode.cached_exec.exec_fun,
+          opnode.ctx,
+          opnode.cached_exec.use_vars,
+          opnode.cached_exec.mutate_vars);
+    } else {  // no cached entry: build one for this run
+      auto exec = GetOpExecEntry(nid);
+      DAGEngine::Get()->Push(
+          exec.exec_fun,
+          opnode.ctx,
+          exec.use_vars,
+          exec.mutate_vars);
+    }
+  }
+}
+
+std::string GraphExecutor::DebugStr() const {  // text dump of active nodes and their outputs
+  std::ostringstream os;
+  os << "num_forward_nodes=" << num_forward_nodes_ << '\n';
+  for (size_t i = 0; i < topo_order_.size(); ++i) {
+    uint32_t nid = topo_order_[i];
+    if (!op_nodes_[nid].activated) continue;
+    os << "Op " << i << ":" << graph_.nodes[nid].name << '\n';  // i is the topo position
+    for (size_t j = 0; j < op_nodes_[nid].outputs.size(); ++j) {
+      const DataEntryInfo &info = op_nodes_[nid].outputs[j];
+      os << "\toutput[" << j << "]: shape=" << info.shape;
+      if (info.storage_id != GraphStorageAllocator::kBadStorageID) {  // has planned storage
+        os << ", storage_id=" << info.storage_id;
+      }
+      if (info.inplace_op_id != -1) {  // -1 is the default: not consumed inplace
+        os << ", inplace_consumer=" << graph_.nodes[info.inplace_op_id].name;
+      }
+      os << '\n';
+    }
+  }
+  return os.str();
+}
+
+void GraphExecutor::Forward() {
+  RunOps(0, num_forward_nodes_);  // forward nodes occupy the front of topo_order_
+}
+
+void GraphExecutor::Backward(const std::vector<NArray> &head_grads) {
+  CHECK_EQ(head_grad_nodes_.size(), head_grads.size());  // one gradient per head
+  for (size_t i = 0; i < head_grad_nodes_.size(); ++i) {
+    uint32_t nid = head_grad_nodes_[i];
+    CHECK(graph_.nodes[nid].is_variable());
+    DataEntryInfo &info = op_nodes_[nid].outputs[0];
+    CHECK_EQ(info.type, kTobeBindByExternal);
+    info.data = head_grads[i];  // bind the caller-supplied head gradient
+  }
+  RunOps(num_forward_nodes_, topo_order_.size());  // run everything after the forward nodes
+}
+
+Executor *Executor::Bind(Symbol symbol,
+                         Context ctx,
+                         const std::vector<NArray> &in_args,
+                         const std::vector<NArray> &arg_grad_store,
+                         const std::vector<OpReqType> &grad_req_type) {
+  std::unique_ptr<GraphExecutor> exec(new GraphExecutor());  // freed if Init throws
+  exec->Init(symbol, ctx, in_args, arg_grad_store, grad_req_type);
+  return exec.release();  // success: ownership passes to the caller
+}
+}  // namespace mxnet
diff --git a/src/symbol/graph_executor.h b/src/symbol/graph_executor.h
new file mode 100644
index 000000000000..a072eee69b68
--- /dev/null
+++ b/src/symbol/graph_executor.h
@@ -0,0 +1,186 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file graph_executor.h
+ * \brief Executor to execute the Forward and Backward on Composition Graph.
+*/
+#ifndef MXNET_SYMBOL_GRAPH_EXECUTOR_H_
+#define MXNET_SYMBOL_GRAPH_EXECUTOR_H_
+
+#include <mxnet/symbolic.h>
+#include <memory>
+#include <vector>
+#include <utility>
+#include "./graph_memory_allocator.h"
+
+namespace mxnet {
+/*!
+ * \brief Executor of a computation graph.
+ */
+class GraphExecutor : public Executor {
+ public:
+  virtual ~GraphExecutor() {}
+  virtual void Forward();
+  virtual void Backward(const std::vector<NArray> &head_grads);
+  virtual const std::vector<NArray> &heads() const {
+    return heads_narray_;
+  }
+  // implement Executor::Bind, only call it once.
+  inline void Init(Symbol symbol,
+                   Context ctx,
+                   const std::vector<NArray> &in_args,
+                   const std::vector<NArray> &arg_grad_store,
+                   const std::vector<OpReqType> &grad_req_type) {
+    CHECK_EQ(grad_req_type.size(), arg_grad_store.size());
+    bool need_backward = false;  // true as soon as any argument requests a gradient
+    for (auto req : grad_req_type) {
+      if (req != kNullOp) need_backward = true;
+    }
+    this->InitGraph(symbol, ctx, need_backward);
+    this->InitDataEntryInfo(in_args, arg_grad_store, grad_req_type);
+    this->InitDataEntryMemory();
+    this->InitOpNodes();
+    // TODO(bing): remove me when things are OK
+    LOG(INFO) << "-----Execution memory plan-----\n"
+              << DebugStr() << '\n'
+              << "------------------------------\n";
+  }
+
+ protected:
+  // internal class of wrapping BackwardOp as ForwardOp
+  class BackwardOpWrapper;
+  // type of data entry
+  enum DataEntryType {
+    // memory is bound by external NArray in Bind
+    kBindByExternal,
+    // to be bound by external NArray in Forward and Backward
+    kTobeBindByExternal,
+    // internal memory, allocated
+    kInternalAllocated,
+    // internal memory, to be allocated
+    kNotInitialized
+  };
+  // Additional information about each data entry
+  struct DataEntryInfo {
+    // the actual data for the entry
+    NArray data;
+    // write request to this entry
+    OpReqType op_req;
+    // the operation node that will take
+    // this DataEntry as inplace input
+    int inplace_op_id;
+    // data entry type
+    DataEntryType type;
+    // shape of this entry
+    TShape shape;
+    // storage id from allocator if it is internal allocation.
+    GraphStorageAllocator::StorageID storage_id;
+    // reference count on how many times this entry is being used.
+    // That is how many operators and heads need this DataEntry
+    // this is a temporary variable that is used during initialization.
+    uint32_t ref_count;
+    // constructor
+    DataEntryInfo()
+        : op_req(kNullOp),
+          inplace_op_id(-1),
+          type(kNotInitialized),
+          storage_id(GraphStorageAllocator::kBadStorageID),
+          ref_count(0) {}
+  };
+  // all the information needed to push the op to engine
+  struct OpExecEntry {
+    // execution function that is pushed to the engine
+    DAGEngine::Op exec_fun;
+    // variables to read from
+    std::vector<DAGEngine::Variable> use_vars;
+    // variables to mutate
+    std::vector<DAGEngine::Variable> mutate_vars;
+    // constructor
+    OpExecEntry() : exec_fun(nullptr) {}
+  };
+  // Information about operational node
+  struct OpNode {
+    // whether this op node is activated
+    bool activated;
+    // the context of the node
+    Context ctx;
+    // data entry information about outputs of op
+    std::vector<DataEntryInfo> outputs;
+    // The following parts are constructed in InitOpNodes
+    // the real operator
+    std::shared_ptr<Operator> op;
+    // op context, that is defined for this op.
+    OpContext op_ctx;
+    // executor, this is only allocated for nodes
+    // whose inputs, outputs are pre-defined.
+    // otherwise cached_exec.exec_fun == nullptr
+    OpExecEntry cached_exec;
+    // constructor
+    OpNode() : activated(false) {}
+  };
+  /*!
+   * \brief Get inplace option of a node.
+   *  This function is overridden for both Forward and Backward node.
+   *
+   * \param node_id node index of node in StaticGraph
+   * \param in_data the input data entry to the node
+   * \param out_data the output data entry in the graph
+   * \return the paired inplace option.
+   */
+  template<typename T>
+  inline std::vector<std::pair<T, T> > GetInplaceOption(
+      uint32_t node_id,
+      const std::vector<T> &in_data,
+      const std::vector<T> &out_data) const;
+  /*!
+   * \brief Get resource requirement of a node.
+   *  This function is overridden for both Forward and Backward node.
+   * \param node_id node index of node in StaticGraph
+   * \return the desired resource request.
+   */
+  inline std::vector<ResourceRequest> GetResource(uint32_t node_id) const;
+  /*!
+   * \brief Get number of outputs of a node.
+   *  This function is overridden for both Forward and Backward node.
+   * \param node_id node index of node in StaticGraph
+   * \return the number of outputs of the node.
+   */
+  inline int GetNumOutputs(uint32_t node_id) const;
+  /*!
+   * \brief get execution entry for an OpNode.
+   *  This function can only be called after initialization is done.
+   * \param node_id the id of operational node.
+   * \return the execution entry.
+   */
+  inline OpExecEntry GetOpExecEntry(uint32_t node_id);
+  // initialize the internal graph structure
+  void InitGraph(Symbol symbol, Context ctx, bool need_backward);
+  // initialize internal DataEntryInfo, reference counting
+  void InitDataEntryInfo(const std::vector<NArray> &in_args,
+                         const std::vector<NArray> &arg_grad_store,
+                         const std::vector<OpReqType> &grad_req_type);
+  // initialize internal data entries NArray
+  void InitDataEntryMemory();
+  // initialize OpNode data structure
+  void InitOpNodes();
+  // run ops from topo order start to end
+  void RunOps(size_t topo_start, size_t topo_end);
+  // get debug string
+  std::string DebugStr() const;
+  // internal computational graph
+  StaticGraph graph_;
+  // topological order of nodes in computation graph
+  // backward nodes always follow forward nodes
+  std::vector<uint32_t> topo_order_;
+  // number of forward nodes in the graph
+  size_t num_forward_nodes_;
+  // head gradient node in the graph, if there is backward pass
+  std::vector<uint32_t> head_grad_nodes_;
+  // gradient entries of each argument, if there is backward pass
+  std::vector<std::vector<StaticGraph::DataEntry> > arg_grads_;
+  // operational nodes
+  std::vector<OpNode> op_nodes_;
+  // head NArrays
+  std::vector<NArray> heads_narray_;
+};  // class GraphExecutor
+}  // namespace mxnet
+#endif  // MXNET_SYMBOL_GRAPH_EXECUTOR_H_
diff --git a/src/symbol/graph_memory_allocator.h b/src/symbol/graph_memory_allocator.h
new file mode 100644
index 000000000000..b7bd2db2081e
--- /dev/null
+++ b/src/symbol/graph_memory_allocator.h
@@ -0,0 +1,145 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file graph_memory_allocator.h
+ * \brief Memory allocator for graph executor.
+*/
+#ifndef MXNET_SYMBOL_GRAPH_MEMORY_ALLOCATOR_H_
+#define MXNET_SYMBOL_GRAPH_MEMORY_ALLOCATOR_H_
+
+#include <mxnet/symbolic.h>
+#include <mxnet/narray.h>
+#include <map>
+#include <vector>
+
+namespace mxnet {
+/*!
+ * \brief Memory allocators for the GraphExecutor.
+ *  This class is intended to be used by GraphExecutor
+ *  to allocate the memory for each DataEntryInfo.
+ *
+ *  The class algorithm works in the following phases:
+ *  (1) Planning Phase: GraphExecutor call Request and Release
+ *      to request and release resources according to dependency.
+ *      - Each call to Request will get a StorageID that is used to
+ *        identify the memory block assigned to each DataEntryInfo.
+ *  (2) Allocating phase: GraphExecutor call InitStorages.
+ *      - Then each DataEntry will call Get to get the real NArray.
+ *  (3) All the memory will be freed up when reference to all the related NArray ends.
+ */
+class GraphStorageAllocator {
+ public:
+  /*! \brief storage index */
+  typedef int64_t StorageID;
+  /*! \brief bad storage id */
+  static const StorageID kBadStorageID = -1;
+  /*! \brief constructor to the graph memory allocator */
+  explicit GraphStorageAllocator(StaticGraph *graph);
+  /*!
+   * \brief Request a memory.
+   * \param ctx the context of the graph
+   * \param shape shape of the NArray we want
+   * \param node_id the node that is requesting the memory, used as hint.
+   */
+  StorageID Request(Context ctx, TShape shape, uint32_t node_id);
+  /*!
+   * \brief Release a memory.
+   * \param id the storage ID of the memory.
+   * \param node_id the node id in the graph that is releasing the memory.
+   */
+  void Release(StorageID id, uint32_t node_id);
+  /*! \brief Initialize all the memories requested */
+  void InitStorages();
+  /*!
+   * \brief Get the memory allocated in planning phase.
+   * \param id the storage id allocated in planning phase.
+   * \param shape the shape of the NArray requested.
+   */
+  NArray Get(StorageID id, TShape shape);
+
+ private:
+  /*! \brief internal storage entry */
+  struct StorageEntry {
+    /*! \brief id of the storage */
+    StorageID id;
+    /*! \brief the context of the storage */
+    Context ctx;
+    /*! \brief maximum size of the storage that is requested */
+    size_t max_size;
+    /*! \brief the actual NArray to hold the data */
+    NArray data;
+    /*! \brief constructor */
+    StorageEntry() : max_size(0) {}
+  };
+  /*!
+   * \brief Allocate a StorageID when Request cannot find an existing one.
+   * \param ctx the context of the graph
+   * \param size size of the storage, as computed by TShape::Size() in Request
+   */
+  StorageID Alloc(Context ctx, size_t size);
+
+  /*! \brief reference to the computation graph */
+  StaticGraph *graph_;
+  /*! \brief all the resources available */
+  std::vector<std::unique_ptr<StorageEntry> > data_;
+  /*!
+   * \brief free list of storage entries, maps size to free list
+   */
+  std::multimap<size_t, StorageEntry*> free_;
+};
+
+// put implementation in header files for now
+inline GraphStorageAllocator::GraphStorageAllocator(StaticGraph *graph)
+    : graph_(graph) {}  // inline: defined in a header, may be seen by several translation units
+
+inline GraphStorageAllocator::StorageID  // inline: header-defined, avoids duplicate symbols
+GraphStorageAllocator::Alloc(Context ctx, size_t size) {
+  StorageID id = static_cast<StorageID>(data_.size());  // ids are indices into data_
+  std::unique_ptr<StorageEntry> ptr(new StorageEntry());
+  ptr->id = id;
+  ptr->ctx = ctx;
+  ptr->max_size = size;
+  data_.push_back(std::move(ptr));
+  return id;
+}
+
+inline GraphStorageAllocator::StorageID  // inline: header-defined, avoids duplicate symbols
+GraphStorageAllocator::Request(Context ctx, TShape shape, uint32_t node_id) {
+  size_t size = shape.Size();
+  auto begin = free_.lower_bound(size);
+  auto end = free_.upper_bound(size);
+  // candidates: free entries whose recorded size equals the requested size
+  for (auto it = begin; it != end; ++it) {
+    StorageEntry *e = it->second;
+    if (e->ctx != ctx) continue;
+    // Use exact matching strategy
+    // TODO(bing): think of other strategies, for example, rough match.
+    if (e->max_size != size) continue;
+    // found an exact match, erase from map and return
+    free_.erase(it);
+    return e->id;
+  }
+  // cannot find anything return a new one.
+  return this->Alloc(ctx, size);
+}
+
+inline void GraphStorageAllocator::Release(StorageID id, uint32_t node_id) {  // header-defined
+  CHECK_NE(id, kBadStorageID);
+  StorageEntry *e = data_[id].get();
+  free_.insert({e->max_size, e});  // make the entry available to later Request calls
+}
+
+inline void GraphStorageAllocator::InitStorages() {  // inline: defined in a header
+  for (size_t i = 0; i < data_.size(); ++i) {
+    StorageEntry *e = data_[i].get();
+    TShape shape = mshadow::Shape1(e->max_size);  // one flat buffer per planned storage entry
+    e->data = NArray(shape, e->ctx);
+  }
+}
+
+inline NArray GraphStorageAllocator::Get(StorageID id, TShape shape) {  // header-defined
+  CHECK_NE(id, kBadStorageID);
+  StorageEntry *e = data_[id].get();
+  return e->data.Slice(0, shape.Size()).Reshape(shape);  // view of the flat buffer as `shape`
+}
+}  // namespace mxnet
+#endif  // MXNET_SYMBOL_GRAPH_MEMORY_ALLOCATOR_H_
diff --git a/src/symbol/static_graph.cc b/src/symbol/static_graph.cc
index 5419e26afe86..5eb0ad14a282 100644
--- a/src/symbol/static_graph.cc
+++ b/src/symbol/static_graph.cc
@@ -7,14 +7,19 @@
 #include <mxnet/symbolic.h>
 #include <vector>
 #include <queue>
+#include <map>
+#include "../operator/operator_common.h"
 
 namespace mxnet {
 std::vector<uint32_t> StaticGraph::TopoSort() const {
   std::vector<int> out_degree(nodes.size(), 0);
-  for (const Node &n : nodes) {
-    for (const DataEntry &e : n.inputs) {
+  for (const Node& n : nodes) {
+    for (const DataEntry& e : n.inputs) {
       ++out_degree[e.source_id];
     }
+    if (n.is_backward()) {
+      ++out_degree[n.backward_source_id];
+    }
   }
   std::vector<uint32_t> ret(nodes.size());
   auto result = ret.rbegin();
@@ -29,12 +34,17 @@ std::vector<uint32_t> StaticGraph::TopoSort() const {
     queue.pop();
     *result = node_id;
     ++result;
-    for (const DataEntry &e : nodes[node_id].inputs) {
-      out_degree[e.source_id] -= 1;
-      if (out_degree[e.source_id] == 0) {
+    const Node& n = nodes[node_id];
+    for (const DataEntry& e : n.inputs) {
+      if (--out_degree[e.source_id] == 0) {
         queue.push(e.source_id);
       }
     }
+    if (n.is_backward()) {
+      if (--out_degree[n.backward_source_id] == 0) {
+        queue.push(n.backward_source_id);
+      }
+    }
   }
   return std::move(ret);
 }
@@ -42,19 +52,73 @@ std::vector<uint32_t> StaticGraph::TopoSort() const {
 bool StaticGraph::InferNodeShapes(const std::vector<uint32_t> &topo_order,
                                   std::vector<std::vector<TShape> > *node_out_shapes) const {
   for (uint32_t nid : topo_order) {
-    const Node &node = nodes[nid];
-    if (node.op != nullptr) {
+    const Node& node = nodes[nid];
+    if (node.is_forward()) {
       std::vector<TShape> in_shape;
-      for (const DataEntry &e : node.inputs) {
+      for (const DataEntry& e : node.inputs) {
         in_shape.push_back((*node_out_shapes)[e.source_id][e.index]);
       }
-      if (!node.op->InferShape(&in_shape, &(*node_out_shapes)[nid])) return false;
+      try {
+        if (!node.op->InferShape(&in_shape, &(*node_out_shapes)[nid])) return false;
+      } catch (const op::InferShapeError &err) {
+        // rethrow as dmlc::Error naming the operator and the offending argument
+        const std::string &op_name = node.name;
+        std::string arg_name = node.op->ListArguments()[err.index];
+        std::ostringstream os;
+        os << "InferShape Error in "
+           << op_name << "\'s" << ' ' << arg_name << " argument\n";
+        auto &source = nodes[node.inputs[err.index].source_id];
+        if (source.is_variable()) {
+          os << "Corresponding keyword of symbol: " << source.name << '\n' << err.msg;
+        }
+        throw dmlc::Error(os.str());
+      }
       for (size_t i = 0; i < node.inputs.size(); ++i) {
-        const DataEntry &e = node.inputs[i];
+        const DataEntry& e = node.inputs[i];
         (*node_out_shapes)[e.source_id][e.index] = in_shape[i];
       }
+    } else if (nodes[nid].is_backward()) {
+      // simply use shapes from forward pass to assign backward shape
+      const Node& forward = nodes[node.backward_source_id];
+      CHECK(forward.is_forward());
+      std::vector<TShape>& in_grad_shapes = (*node_out_shapes)[nid];
+      CHECK(in_grad_shapes.size() == forward.inputs.size());
+      // assign the input shape to output gradients
+      for (size_t i = 0; i < forward.inputs.size(); ++i) {
+        const DataEntry &e = forward.inputs[i];
+        try {
+          SHAPE_ASSIGN_CHECK(in_grad_shapes, i, (*node_out_shapes)[e.source_id][e.index]);
+        } catch (const op::InferShapeError &err) {
+          const std::string &op_name = forward.name;
+          std::string arg_name = forward.op->ListArguments()[i];  // i = argument position
+          std::ostringstream os;
+          os << "InferShape Error in "
+             << op_name << "\'s" << ' ' << arg_name << " gradient argument\n"
+             << err.msg;
+          throw dmlc::Error(os.str());
+        }
+      }
+      // consistent check for input shapes
+      auto& out_data_shapes = (*node_out_shapes)[node.backward_source_id];
+      // use BackwardInputs to select entries corresponding to node.inputs
+      auto in_shape = forward.op->BackwardInputs(
+          out_data_shapes, in_grad_shapes, out_data_shapes);
+      for (size_t i = 0; i < node.inputs.size(); ++i) {
+        const DataEntry& e = node.inputs[i];
+        try {
+          SHAPE_ASSIGN_CHECK((*node_out_shapes)[e.source_id], e.index, in_shape[i]);
+        } catch (const op::InferShapeError &err) {
+          const std::string &op_name = nodes[e.source_id].name;
+          std::ostringstream os;
+          os << "InferShape Error in "
+             << op_name << "\'s" << " gradient values\n"
+             << err.msg;
+          throw dmlc::Error(os.str());
+        }
+      }
     }
   }
+  // TODO(bing) assign shape for head gradient
   return true;
 }
 
@@ -63,8 +127,10 @@ bool StaticGraph::InferShape(std::vector<TShape> *in_shape,
   std::vector<std::vector<TShape> > node_out_shapes(nodes.size());
   for (size_t i = 0; i < nodes.size(); ++i) {
     int nout = 1;
-    if (nodes[i].op != nullptr) {
+    if (nodes[i].is_forward()) {
       nout = nodes[i].op->NumReturns();
+    } else if (nodes[i].is_backward()) {
+      nout = static_cast<int>(nodes[nodes[i].backward_source_id].inputs.size());
     }
     node_out_shapes[i].resize(nout);
   }
@@ -78,10 +144,117 @@ bool StaticGraph::InferShape(std::vector<TShape> *in_shape,
   for (size_t i = 0; i < arg_nodes.size(); ++i) {
     (*in_shape)[i] = node_out_shapes[arg_nodes[i]][0];
   }
-  for (size_t i = 0; i < outputs.size(); ++i) {
-    DataEntry e = outputs[i];
+  out_shape->resize(heads.size());
+  for (size_t i = 0; i < heads.size(); ++i) {
+    const DataEntry &e = heads[i];
     (*out_shape)[i] = node_out_shapes[e.source_id][e.index];
   }
   return true;
 }
+
+StaticGraph::Node StaticGraph::CreateSumNode(
+    const std::vector<DataEntry> &grad_source) {
+  // build an ElementWiseSum node over all entries in grad_source; the caller sets its name
+  std::ostringstream os_size;
+  Node agg_node;
+  agg_node.op.reset(OperatorProperty::Create("ElementWiseSum"));
+  os_size << grad_source.size();
+  agg_node.op->Init({{"size", os_size.str()}});
+  agg_node.inputs = grad_source;
+  return agg_node;  // returning the local by name is already a move and allows elision
+}
+
+void StaticGraph::MakeBackwardPass(std::vector<uint32_t> *head_grad_nodes,
+                                   std::vector<std::vector<DataEntry> > *arg_grads) {
+  arg_grads->clear();
+  head_grad_nodes->clear();
+  // get topo order of nodes, before new nodes are added
+  std::vector<uint32_t> topo_order = TopoSort();
+  // map out_data entry to out_grad
+  std::map<DataEntry, std::vector<DataEntry> > grad_map;
+  // allocate head gradient nodes
+  for (DataEntry head : heads) {
+    Node node;
+    std::ostringstream os;
+    os << nodes[head.source_id].name << '_' << head.index << "_grad";
+    // the name is <source node name>_<output index>_grad
+    node.name = os.str();
+    // id the new node receives once it is appended
+    uint32_t nid = static_cast<uint32_t>(nodes.size());
+    nodes.push_back(std::move(node));
+    // output 0 of the new variable node is the gradient input for this head
+    DataEntry igrad(nid, 0);
+    head_grad_nodes->push_back(nid);
+    // update gradient map
+    auto it = grad_map.find(head);
+    if (it == grad_map.end()) {
+      grad_map[head] = {igrad};
+    } else {
+      it->second.push_back(igrad);
+    }
+  }
+  // do backward pass traverse
+  for (auto it = topo_order.rbegin(); it != topo_order.rend(); ++it) {
+    uint32_t nid = *it;
+    // skip variables
+    if (nodes[nid].is_variable()) continue;
+    CHECK(nodes[nid].is_forward()) << "Do not support Backward of Backward";
+    // get out_grad and out_data entry
+    std::vector<DataEntry> out_grad, out_data;
+    // nvisible is out_grad.size()
+    int nvisible = nodes[nid].op->NumVisibleReturns();
+    // ntotal is out_data.size()
+    int ntotal = nodes[nid].op->NumReturns();
+    // check all outputs
+    for (int i = 0; i < ntotal; ++i) {
+      DataEntry odata(nid, static_cast<uint32_t>(i));
+      out_data.push_back(odata);
+      if (i >= nvisible) continue;  // only the first nvisible outputs get an out_grad
+      // get out_grad
+      auto it = grad_map.find(odata);  // NOTE(review): shadows the outer loop iterator `it`
+      CHECK(it != grad_map.end()) << "bad graph";
+      std::vector<DataEntry> &gnodes = it->second;
+      if (gnodes.size() == 1) {
+        out_grad.push_back(gnodes[0]);
+      } else {
+        std::ostringstream os_name;
+        Node agg_node = StaticGraph::CreateSumNode(gnodes);
+        os_name << nodes[nid].name << '_' << i << "_out_grad_agg";
+        agg_node.name = os_name.str();
+        uint32_t agg_node_id = static_cast<uint32_t>(nodes.size());
+        nodes.push_back(std::move(agg_node));
+        out_grad.push_back(DataEntry(agg_node_id, 0));
+      }
+    }
+    // Create a gradient backward node
+    Node grad_node;
+    // Point to the corresponding source
+    grad_node.backward_source_id = nid;
+    // select out the dependent inputs
+    grad_node.inputs = nodes[nid].op->BackwardInputs(
+        out_grad, nodes[nid].inputs, out_data);
+    grad_node.name = nodes[nid].name + "_backward";
+    uint32_t grad_node_id = static_cast<uint32_t>(nodes.size());
+    nodes.push_back(std::move(grad_node));
+    // update gradient map
+    for (size_t i = 0; i < nodes[nid].inputs.size(); ++i) {
+      DataEntry idata = nodes[nid].inputs[i];
+      DataEntry igrad(grad_node_id, static_cast<uint32_t>(i));  // output i = grad of input i
+      auto it = grad_map.find(idata);
+      if (it == grad_map.end()) {
+        grad_map[idata] = {igrad};
+      } else {
+        it->second.push_back(igrad);
+      }
+    }
+  }
+  // create return values of arg_grads
+  arg_grads->resize(arg_nodes.size());
+  for (size_t i = 0; i < arg_nodes.size(); ++i) {
+    DataEntry odata(arg_nodes[i], 0);
+    auto it = grad_map.find(odata);
+    CHECK(it != grad_map.end()) << "bad graph";
+    arg_grads->at(i) = it->second;
+  }
+}
 }  // namespace mxnet
diff --git a/src/symbol/symbol.cc b/src/symbol/symbol.cc
index 86cf54feabfa..54a5fe9422b2 100644
--- a/src/symbol/symbol.cc
+++ b/src/symbol/symbol.cc
@@ -1,7 +1,7 @@
 /*!
  *  Copyright (c) 2015 by Contributors
- * \file symbol.cc
- * \brief symbol of mxnet
+  *\file symbol.cc
+  *\brief symbol of mxnet
  */
 #include <dmlc/logging.h>
 #include <mxnet/symbolic.h>
@@ -12,13 +12,13 @@
 
 namespace mxnet {
 /*!
- * \brief Node is represents node of an operator in the symbolic graph.
+  *\brief Node is represents node of an operator in the symbolic graph.
  *
- * It stores connection to the inputs to function represented by OperatorProperty
- * NOTE on data structure: there are three types of node:
- * - Normal node: contains all the necessary elements of a graph.
- * - OperatorProperty: the inputs_ is empty, represents an OperatorProperty that has not been applied.
- * - Variable: the sym_ is nullptr, represents an named Variable of tensors that can be composed.
+  *It stores connection to the inputs to function represented by OperatorProperty
+  *NOTE on data structure: there are three types of node:
+  *- Normal node: contains all the necessary elements of a graph.
+  *- OperatorProperty: the inputs_ is empty, represents an OperatorProperty that has not been applied.
+  *- Variable: the sym_ is nullptr, represents an named Variable of tensors that can be composed.
  */
 struct Symbol::Node {
   /*! \brief Operator of this node */
@@ -28,11 +28,11 @@ struct Symbol::Node {
   /*! \brief inputs to this node */
   std::vector<DataEntry> inputs;
   /*!
-   * \brief constructor
-   * \param op the OperatorProperty to construct the Node
-   * \param name the name of the symbol
+    *\brief constructor
+    *\param op the OperatorProperty to construct the Node
+    *\param name the name of the symbol
    */
-  explicit Node(OperatorProperty* op = nullptr, const std::string& name = "")
+  explicit Node(OperatorProperty *op = nullptr, const std::string& name = "")
       : op(op), name(name) {
   }
   /*! \return Whether the symbol is atomic */
@@ -63,7 +63,7 @@ inline void Symbol::DFSVisit(FVisit fvisit) const {
     }
   }
   while (!stack.empty()) {
-    Node* back = stack.back();
+    Node *back = stack.back();
     stack.pop_back();
     fvisit(back);
     for (auto it = back->inputs.rbegin(); it != back->inputs.rend(); ++it) {
@@ -76,6 +76,28 @@ inline void Symbol::DFSVisit(FVisit fvisit) const {
   }
 }
 
+// helper function to handle keyword argument mismatch:
+// LOG(FATAL) on the first key of kwargs that is not in args, listing the candidates
+template<typename TMap>
+inline void KeywordArgumentMismatch(const char *source,
+                                    const TMap &kwargs,
+                                    const std::vector<std::string> &args) {
+  std::unordered_set<std::string> keys(args.begin(), args.end());
+  std::ostringstream msg;
+  msg << "\nCandidate arguments:\n";
+  for (size_t i = 0; i < args.size(); ++i) {
+    msg << "\t[" << i << ']' << args[i] << '\n';
+  }
+
+  for (const auto& kv : kwargs) {
+    if (keys.count(kv.first) == 0) {
+      LOG(FATAL) << source << ": "
+                 << "Keyword argument name " << kv.first << " not found."
+                 << msg.str();
+    }
+  }
+}
+
 int Symbol::FindDuplicateArgs(std::unordered_map<std::string, int> *out) const {
   out->clear();
   int max_dup = 1;
@@ -328,19 +350,8 @@ void Symbol::Compose(const std::unordered_map<std::string, Symbol>& kwargs,
     }
   }
   if (nmatched != kwargs.size()) {
-    // Error message handling
-    std::vector<std::string> req_args = this->ListArguments();
-    std::unordered_set<std::string> keys(req_args.begin(), req_args.end());
-    std::ostringstream msg;
-    msg << "\nCandidate arguments:\n";
-    for (size_t i = 0; i < req_args.size(); ++i) {
-      msg << "\t[" << i << ']' << req_args[i] << '\n';
-    }
-    for (const auto& kv : kwargs) {
-      CHECK_NE(keys.count(kv.first), 0)
-          << "Keyword Argument " << kv.first << " not found in arguments."
-          << msg.str();
-    }
+    KeywordArgumentMismatch(
+        "Symbol.Compose", kwargs, ListArguments());
   }
 }
 
@@ -358,11 +369,34 @@ Symbol Symbol::operator () (const std::unordered_map<std::string, Symbol>& kwarg
   return s;
 }
 
-bool Symbol::InferShape(std::vector<TShape> *in_shape,
-                        std::vector<TShape> *out_shape) const {
+bool Symbol::InferShape(std::vector<TShape> *arg_shapes,
+                        std::vector<TShape> *out_shapes) const {
+  StaticGraph graph;  // static graph filled in from this symbol
+  ToStaticGraph(&graph);
+  return graph.InferShape(arg_shapes, out_shapes);  // inference is delegated to the graph
+}
+
+bool Symbol::InferShape(const std::unordered_map<std::string, TShape>& known_arg_shapes,
+                        std::vector<TShape> *arg_shapes,
+                        std::vector<TShape> *out_shapes) const {
   StaticGraph g;
   this->ToStaticGraph(&g);
-  return g.InferShape(in_shape, out_shape);
+  arg_shapes->clear();
+  arg_shapes->resize(g.arg_nodes.size(), TShape());  // one slot per argument node
+  size_t nmatched = 0;
+  for (size_t i = 0; i < g.arg_nodes.size(); ++i) {
+    const std::string& name = g.nodes[g.arg_nodes[i]].name;
+    auto it = known_arg_shapes.find(name);
+    if (it != known_arg_shapes.end()) {
+      arg_shapes->at(i) = it->second;  // seed the slot with the caller-supplied shape
+      ++nmatched;
+    }
+  }
+  if (nmatched != known_arg_shapes.size()) {  // match count differs from supplied shapes
+    KeywordArgumentMismatch(
+        "Symbol.InferShape", known_arg_shapes, ListArguments());
+  }
+  return g.InferShape(arg_shapes, out_shapes);
 }
 
 Symbol Symbol::Create(OperatorProperty *op)  {
@@ -424,12 +458,12 @@ void Symbol::ToStaticGraph(StaticGraph *out_graph) const {
     }
   }
   // setup heads
-  out_graph->outputs.clear();
+  out_graph->heads.clear();
   for (auto &head : heads_) {
     StaticGraph::DataEntry e;
     e.source_id = node_index[head.source.get()];
     e.index = head.index;
-    out_graph->outputs.push_back(e);
+    out_graph->heads.push_back(e);
   }
 }
 }  // namespace mxnet
diff --git a/windows/mxnet.sln b/windows/mxnet.sln
deleted file mode 100755
index 16f82f6b6fb1..000000000000
--- a/windows/mxnet.sln
+++ /dev/null
@@ -1,28 +0,0 @@
-﻿
-Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio 2013
-VisualStudioVersion = 12.0.21005.1
-MinimumVisualStudioVersion = 10.0.40219.1
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mxnet", "mxnet.vcxproj", "{2DA41CBC-B8B2-4696-86CD-9AFBAB029661}"
-EndProject
-Global
-	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug|Win32 = Debug|Win32
-		Debug|x64 = Debug|x64
-		Release|Win32 = Release|Win32
-		Release|x64 = Release|x64
-	EndGlobalSection
-	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{2DA41CBC-B8B2-4696-86CD-9AFBAB029661}.Debug|Win32.ActiveCfg = Debug|Win32
-		{2DA41CBC-B8B2-4696-86CD-9AFBAB029661}.Debug|Win32.Build.0 = Debug|Win32
-		{2DA41CBC-B8B2-4696-86CD-9AFBAB029661}.Debug|x64.ActiveCfg = Debug|x64
-		{2DA41CBC-B8B2-4696-86CD-9AFBAB029661}.Debug|x64.Build.0 = Debug|x64
-		{2DA41CBC-B8B2-4696-86CD-9AFBAB029661}.Release|Win32.ActiveCfg = Release|Win32
-		{2DA41CBC-B8B2-4696-86CD-9AFBAB029661}.Release|Win32.Build.0 = Release|Win32
-		{2DA41CBC-B8B2-4696-86CD-9AFBAB029661}.Release|x64.ActiveCfg = Release|x64
-		{2DA41CBC-B8B2-4696-86CD-9AFBAB029661}.Release|x64.Build.0 = Release|x64
-	EndGlobalSection
-	GlobalSection(SolutionProperties) = preSolution
-		HideSolutionNode = FALSE
-	EndGlobalSection
-EndGlobal
diff --git a/windows/mxnet.vcxproj b/windows/mxnet.vcxproj
deleted file mode 100755
index 2823478cc51f..000000000000
--- a/windows/mxnet.vcxproj
+++ /dev/null
@@ -1,148 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|Win32">
-      <Configuration>Debug</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Debug|x64">
-      <Configuration>Debug</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|Win32">
-      <Configuration>Release</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|x64">
-      <Configuration>Release</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-  </ItemGroup>
-  <PropertyGroup Label="Globals">
-    <ProjectGuid>{2DA41CBC-B8B2-4696-86CD-9AFBAB029661}</ProjectGuid>
-    <Keyword>Win32Proj</Keyword>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-  <ImportGroup Label="ExtensionSettings">
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <LinkIncremental>true</LinkIncremental>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <LinkIncremental>true</LinkIncremental>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <LinkIncremental>true</LinkIncremental>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <LinkIncremental>true</LinkIncremental>
-  </PropertyGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <ClCompile>
-      <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
-      <WarningLevel>Level3</WarningLevel>
-      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
-      <Optimization>Disabled</Optimization>
-    </ClCompile>
-    <Link>
-      <TargetMachine>MachineX86</TargetMachine>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <SubSystem>Windows</SubSystem>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <ClCompile>
-      <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
-      <WarningLevel>Level3</WarningLevel>
-      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
-      <Optimization>Disabled</Optimization>
-      <AdditionalIncludeDirectories>$(solutionDir)\..\src</AdditionalIncludeDirectories>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <SubSystem>Console</SubSystem>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <ClCompile>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
-      <WarningLevel>Level3</WarningLevel>
-      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
-    </ClCompile>
-    <Link>
-      <TargetMachine>MachineX86</TargetMachine>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <SubSystem>Windows</SubSystem>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <ClCompile>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
-      <WarningLevel>Level3</WarningLevel>
-      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
-      <AdditionalIncludeDirectories>$(solutionDir)\..\src</AdditionalIncludeDirectories>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <SubSystem>Console</SubSystem>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemGroup>
-    <ClCompile Include="..\src\narray\narray.cpp" />
-    <ClCompile Include="..\src\test\testInterface.cpp" />
-  </ItemGroup>
-  <ItemGroup>
-    <ClInclude Include="..\src\common\all_ops.h" />
-    <ClInclude Include="..\src\common\common.h" />
-    <ClInclude Include="..\src\engine\dagengine.h" />
-    <ClInclude Include="..\src\layer\dummylayer.h" />
-    <ClInclude Include="..\src\layer\layer.h" />
-    <ClInclude Include="..\src\narray\narray.h" />
-    <ClInclude Include="..\src\storage\storage.h" />
-  </ItemGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-  <ImportGroup Label="ExtensionTargets">
-  </ImportGroup>
-</Project>
\ No newline at end of file
diff --git a/windows/mxnet.vcxproj.filters b/windows/mxnet.vcxproj.filters
deleted file mode 100755
index 1ff068b088be..000000000000
--- a/windows/mxnet.vcxproj.filters
+++ /dev/null
@@ -1,48 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup>
-    <Filter Include="Source Files">
-      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
-      <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
-    </Filter>
-    <Filter Include="Header Files">
-      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
-      <Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
-    </Filter>
-    <Filter Include="Resource Files">
-      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
-      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav</Extensions>
-    </Filter>
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="..\src\narray\narray.cpp">
-      <Filter>Source Files</Filter>
-    </ClCompile>
-    <ClCompile Include="..\src\test\testInterface.cpp">
-      <Filter>Source Files</Filter>
-    </ClCompile>
-  </ItemGroup>
-  <ItemGroup>
-    <ClInclude Include="..\src\common\all_ops.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\common\common.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\engine\dagengine.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\layer\dummylayer.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\layer\layer.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\narray\narray.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\storage\storage.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-  </ItemGroup>
-</Project>
\ No newline at end of file
diff --git a/windows/mxnet.vcxproj.user b/windows/mxnet.vcxproj.user
deleted file mode 100755
index ef5ff2a1fae6..000000000000
--- a/windows/mxnet.vcxproj.user
+++ /dev/null
@@ -1,4 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Project ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <PropertyGroup />
-</Project>
\ No newline at end of file