From 6b21c24cceda50c40adc19bb4f6aa7da42530065 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Thu, 20 Aug 2015 12:15:18 -0600 Subject: [PATCH 01/20] Read docs --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b1ef53bc6148..4659ce9f0413 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # MXNet [![Build Status](https://travis-ci.org/dmlc/mxnet.svg?branch=master)](https://travis-ci.org/dmlc/mxnet) +[![Documentation Status](https://readthedocs.org/projects/mxnet/badge/?version=latest)](https://readthedocs.org/projects/mxnet/?badge=latest) This is a project that combines lessons and ideas we learnt from [cxxnet](https://github.com/dmlc/cxxnet), [minerva](https://github.com/dmlc/minerva) and [purine2](https://github.com/purine/purine2). - The interface is designed in collaboration by authors of three projects. From ec0f128cb0234f51bf04a780990ab68a89448f88 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Thu, 20 Aug 2015 23:17:39 -0600 Subject: [PATCH 02/20] simplify symbol creator as discussed --- python/mxnet/__init__.py | 5 +- python/mxnet/symbol.py | 130 +++++++++++++++++++++++++++----- python/mxnet/symbol_creator.py | 132 --------------------------------- python/test_mnist.py | 8 +- 4 files changed, 119 insertions(+), 156 deletions(-) delete mode 100644 python/mxnet/symbol_creator.py diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index 2a70190fd3cd..94b71bce16cc 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -12,11 +12,10 @@ from .context import Context, current_context from .narray import NArray from .function import _FunctionRegistry -from .symbol import Symbol -from .symbol_creator import _SymbolCreatorRegistry +from . import symbol __version__ = "0.1.0" # this is a global function registry that can be used to invoke functions op = NArray._init_function_registry(_FunctionRegistry()) -sym = Symbol._init_symbol_creator_registry(_SymbolCreatorRegistry()) + diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py index b4f8cd1b7914..cb5daf2225a1 100644 --- a/python/mxnet/symbol.py +++ b/python/mxnet/symbol.py @@ -4,30 +4,16 @@ from __future__ import absolute_import import ctypes +import sys from .base import _LIB -from .base import c_array, c_str, mx_uint, NArrayHandle, ExecutorHandle, SymbolHandle +from .base import c_array, c_str, mx_uint, string_types +from .base import NArrayHandle, ExecutorHandle, SymbolHandle from .base import check_call from .context import Context from .executor import Executor class Symbol(object): """Symbol is symbolic graph of the mxnet.""" - _registry = None - - @staticmethod - def _init_symbol_creator_registry(symbol_creator_registry): - """Initialize symbol creator registry - - Parameters - ---------- - symbol_creator_registry: - pass in symbol_creator_registry - Returns - ------- - the passed in registry - """ - _registry = symbol_creator_registry - return _registry def __init__(self, handle): """Initialize the function with handle @@ -257,3 +243,113 @@ def bind(self, ctx, args, args_grad, reqs): reqs_array, ctypes.byref(handle))) return Executor(handle) + + +def Variable(name): + """Create a symbolic variable with specified name. + + Parameters + ---------- + name : str + Name of the variable. + + Returns + ------- + variable : Symbol + The created variable symbol. 
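+
+    Examples
+    --------
+    A minimal sketch of the intended call pattern (the variable name
+    'data' below is only illustrative)::
+
+        >>> import mxnet as mx
+        >>> data = mx.symbol.Variable('data')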
+ """ + if not isinstance(name, string_types): + raise TypeError('Expect a string for variable `name`') + handle = SymbolHandle() + check_call(_LIB.MXSymbolCreateVariable(name, ctypes.byref(handle))) + return Symbol(handle) + + +def Group(symbols): + """Create a symbolic variable that groups several symbols together. + + Parameters + ---------- + symbols : list + List of symbols to be grouped. + + Returns + ------- + sym : Symbol + The created group symbol. + """ + ihandles = [] + for sym in symbols: + if not isinstance(sym, Symbol): + raise TypeError('Expect Symbols in the list input') + ihandles.append(sym.handle) + handle = SymbolHandle() + check_call(_LIB.MXSymbolCreateGroup( + len(ihandles), c_array(SymbolHandle, ihandles), ctypes.byref(handle))) + return Symbol(handle) + + +def _make_atomic_symbol_function(handle, func_name): + """Create an atomic symbol function by handle and funciton name.""" + def creator(*args, **kwargs): + """Activation Operator of Neural Net. + The parameters listed below can be passed in as keyword arguments. + + Parameters + ---------- + name : string, required. + Name of the resulting symbol. + + Returns + ------- + symbol: Symbol + the resulting symbol + """ + param_keys = [] + param_vals = [] + symbol_kwargs = {} + name = kwargs.pop('name', None) + + for k, v in kwargs.items(): + if isinstance(v, Symbol): + symbol_kwargs[k] = v + else: + param_keys.append(c_str(k)) + param_vals.append(c_str(str(v))) + # create atomic symbol + param_keys = c_array(ctypes.c_char_p, param_keys) + param_vals = c_array(ctypes.c_char_p, param_vals) + sym_handle = SymbolHandle() + check_call(_LIB.MXSymbolCreateAtomicSymbol( + handle, len(param_keys), + param_keys, param_vals, + ctypes.byref(sym_handle))) + + if len(args) != 0 and len(symbol_kwargs) != 0: + raise TypeError('%s can only accept input \ + Symbols either as positional or keyword arguments, not both' % func_name) + + s = Symbol(sym_handle) + s._compose(*args, name=name, **symbol_kwargs) + return s + creator.__name__ = func_name + return creator + + +def _init_module_functions(): + """List and add all the atomic symbol functions to current module.""" + plist = ctypes.POINTER(ctypes.c_void_p)() + size = ctypes.c_uint() + check_call(_LIB.MXSymbolListAtomicSymbolCreators(ctypes.byref(size), + ctypes.byref(plist))) + module_obj = sys.modules[__name__] + for i in range(size.value): + hdl = ctypes.c_void_p(plist[i]) + name = ctypes.c_char_p() + check_call(_LIB.MXSymbolGetAtomicSymbolName(hdl, ctypes.byref(name))) + function = _make_atomic_symbol_function(hdl, name.value) + setattr(module_obj, function.__name__, function) + +# Initialize the atomic symbo in startups +_init_module_functions() + diff --git a/python/mxnet/symbol_creator.py b/python/mxnet/symbol_creator.py deleted file mode 100644 index bcadbe7daacb..000000000000 --- a/python/mxnet/symbol_creator.py +++ /dev/null @@ -1,132 +0,0 @@ -# coding: utf-8 -# pylint: disable=invalid-name, protected-access, no-self-use -"""Symbol support of mxnet""" -from __future__ import absolute_import - -import ctypes -from .base import _LIB -from .base import c_array, c_str, string_types -from .base import SymbolHandle -from .base import check_call -from .symbol import Symbol - -class _SymbolCreator(object): - """SymbolCreator is a function that takes Param and return symbol""" - - def __init__(self, name, handle): - """Initialize the function with handle - - Parameters - ---------- - handle : SymbolCreatorHandle - the function handle of the function - - name : string - the name of the 
function - """ - self.name = name - self.handle = handle - - def __call__(self, *args, **kwargs): - """Invoke creator of symbol by passing kwargs - - Parameters - ---------- - name : string - Name of the resulting symbol. - - *args - Positional arguments - - **kwargs - Provide the params necessary for the symbol creation. - - Returns - ------- - the resulting symbol - """ - param_keys = [] - param_vals = [] - symbol_kwargs = {} - name = kwargs.pop('name', None) - - for k, v in kwargs.items(): - if isinstance(v, Symbol): - symbol_kwargs[k] = v - else: - param_keys.append(c_str(k)) - param_vals.append(c_str(str(v))) - - # create atomic symbol - param_keys = c_array(ctypes.c_char_p, param_keys) - param_vals = c_array(ctypes.c_char_p, param_vals) - sym_handle = SymbolHandle() - check_call(_LIB.MXSymbolCreateAtomicSymbol( - self.handle, len(param_keys), - param_keys, param_vals, - ctypes.byref(sym_handle))) - - if len(args) != 0 and len(symbol_kwargs) != 0: - raise TypeError('%s can only accept input \ - Symbols either as positional or keyword arguments, not both' % self.name) - - s = Symbol(sym_handle) - s._compose(*args, name=name, **symbol_kwargs) - return s - -class _SymbolCreatorRegistry(object): - """Function Registry""" - def __init__(self): - plist = ctypes.POINTER(ctypes.c_void_p)() - size = ctypes.c_uint() - check_call(_LIB.MXSymbolListAtomicSymbolCreators(ctypes.byref(size), - ctypes.byref(plist))) - hmap = {} - for i in range(size.value): - hdl = ctypes.c_void_p(plist[i]) - name = ctypes.c_char_p() - check_call(_LIB.MXSymbolGetAtomicSymbolName(hdl, ctypes.byref(name))) - hmap[name.value] = _SymbolCreator(name, hdl) - self.__dict__.update(hmap) - - def Variable(self, name): - """Create a symbolic variable with specified name. - - Parameters - ---------- - name : str - Name of the variable. - - Returns - ------- - variable : Symbol - The created variable symbol. - """ - if not isinstance(name, string_types): - raise TypeError('Expect a string for variable `name`') - handle = SymbolHandle() - check_call(_LIB.MXSymbolCreateVariable(name, ctypes.byref(handle))) - return Symbol(handle) - - def Group(self, symbols): - """Create a symbolic variable that groups several symbols together. - - Parameters - ---------- - symbols : list - List of symbols to be grouped. - - Returns - ------- - sym : Symbol - The created group symbol. 
- """ - ihandles = [] - for sym in symbols: - if not isinstance(sym, Symbol): - raise TypeError('Expect Symbols in the list input') - ihandles.append(sym.handle) - handle = SymbolHandle() - check_call(_LIB.MXSymbolCreateGroup( - len(ihandles), c_array(SymbolHandle, ihandles), ctypes.byref(handle))) - return Symbol(handle) diff --git a/python/test_mnist.py b/python/test_mnist.py index f9f37d2e82e3..9b61654f8897 100644 --- a/python/test_mnist.py +++ b/python/test_mnist.py @@ -63,10 +63,10 @@ def Get(self): # symbol net batch_size = 100 -data = mx.sym.Variable('data') -fc1 = mx.sym.FullyConnected(data=data, name='fc1', num_hidden=160) -act1 = mx.sym.Activation(data = fc1, name='relu1', type="relu") -fc2 = mx.sym.FullyConnected(data = act1, name='fc2', num_hidden=10) +data = mx.symbol.Variable('data') +fc1 = mx.symbol.FullyConnected(data=data, name='fc1', num_hidden=160) +act1 = mx.symbol.Activation(data = fc1, name='relu1', type="relu") +fc2 = mx.symbol.FullyConnected(data = act1, name='fc2', num_hidden=10) args_list = fc2.list_arguments() # infer shape data_shape = (batch_size, 784) From 140449b73f1afe212f25a0def6dd00a0d89f053c Mon Sep 17 00:00:00 2001 From: linmin Date: Fri, 21 Aug 2015 13:51:51 +0800 Subject: [PATCH 03/20] return value does not need std move http://stackoverflow.com/questions/11817873/using-stdmove-when-returning-a-value-from-a-function-to-avoid-to-copy --- include/mxnet/operator.h | 2 +- src/symbol/graph_executor.cc | 6 +++--- src/symbol/static_graph.cc | 4 ++-- src/symbol/symbol.cc | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h index e60afe6948a7..b44a50d2db7d 100644 --- a/include/mxnet/operator.h +++ b/include/mxnet/operator.h @@ -353,7 +353,7 @@ class OperatorProperty { for (size_t i = 0; i < ret_index.size(); ++i) { ret[i] = all_data[ret_index[i]]; } - return std::move(ret); + return ret; } /*! 
* \brief create OperatorProperty diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc index a434f22a2fc6..8dbadb34e24e 100644 --- a/src/symbol/graph_executor.cc +++ b/src/symbol/graph_executor.cc @@ -121,7 +121,7 @@ inline std::vector > GraphExecutor::GetInplaceOption( remap[i].first = in_data[rmap_index[i].first]; remap[i].second = *static_cast(rmap_index[i].second); } - return std::move(remap); + return remap; } else { CHECK(node.is_backward()); // forward property @@ -161,7 +161,7 @@ inline std::vector > GraphExecutor::GetInplaceOption( remap[i].first = *args_array[remap_index[i].first]; remap[i].second = *static_cast(remap_index[i].second); } - return std::move(remap); + return remap; } } @@ -196,7 +196,7 @@ GraphExecutor::GetOpExecEntry(uint32_t nid) { op_ctx_ptr->run_ctx = ctx; op->Forward(*op_ctx_ptr, in_data, req, out_data); }; - return std::move(exec); + return exec; } void GraphExecutor::InitGraph(Symbol symbol, Context ctx, bool need_backward) { diff --git a/src/symbol/static_graph.cc b/src/symbol/static_graph.cc index 5eb0ad14a282..c9ed278b8f7e 100644 --- a/src/symbol/static_graph.cc +++ b/src/symbol/static_graph.cc @@ -46,7 +46,7 @@ std::vector StaticGraph::TopoSort() const { } } } - return std::move(ret); + return ret; } bool StaticGraph::InferNodeShapes(const std::vector &topo_order, @@ -161,7 +161,7 @@ StaticGraph::Node StaticGraph::CreateSumNode( os_size << grad_source.size(); agg_node.op->Init({{"size", os_size.str()}}); agg_node.inputs = grad_source; - return std::move(agg_node); + return agg_node; } void StaticGraph::MakeBackwardPass(std::vector *head_grad_nodes, diff --git a/src/symbol/symbol.cc b/src/symbol/symbol.cc index 54a5fe9422b2..56ad0a869540 100644 --- a/src/symbol/symbol.cc +++ b/src/symbol/symbol.cc @@ -201,7 +201,7 @@ std::vector Symbol::ListReturns() const { } } } - return std::move(ret); + return ret; } Symbol Symbol::operator[] (size_t index) const { @@ -415,13 +415,13 @@ Symbol Symbol::CreateGroup(const std::vector &symbols) { for (const auto &s : symbols) { ret.heads_.insert(ret.heads_.end(), s.heads_.begin(), s.heads_.end()); } - return std::move(ret); + return ret; } Symbol Symbol::CreateVariable(const std::string &name) { Symbol s; s.heads_.push_back(DataEntry(std::make_shared(nullptr, name), 0)); - return std::move(s); + return s; } void Symbol::ToStaticGraph(StaticGraph *out_graph) const { From 4d61deddfd7b9a43e1afc61e23a57be1e51c2cf6 Mon Sep 17 00:00:00 2001 From: linmin Date: Fri, 21 Aug 2015 17:32:23 +0800 Subject: [PATCH 04/20] support backward of backward --- include/mxnet/symbolic.h | 29 +++++++++++++++++++---------- src/symbol/graph_executor.cc | 10 +++++----- src/symbol/static_graph.cc | 21 +++++++++++++-------- 3 files changed, 37 insertions(+), 23 deletions(-) diff --git a/include/mxnet/symbolic.h b/include/mxnet/symbolic.h index df06c4913de8..f97a695b7f19 100644 --- a/include/mxnet/symbolic.h +++ b/include/mxnet/symbolic.h @@ -85,27 +85,36 @@ class StaticGraph { /*! \brief inputs (node_id, index) for of the nodes*/ std::vector inputs; /*! - * \brief If this field is nonnegative, this indicates this - * Node is corresponds to a Backward Operation of Operator. - * backward_source_id will points to the corresponding Forward Node. + * \brief source node id; if this field is negative, it means this + * Node is a forward node. If this field is nonnegative, it + * means this Node is the gradient of the source node. + */ + int32_t source_id; + /*! 
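+ * Taken together, source_id and backward encode the three node
+ * kinds tested by the predicates below: a variable has
+ * op == nullptr and source_id == -1; a backward node has
+ * backward == true and borrows its op from nodes[source_id];
+ * every other node is a forward op, which may itself be the
+ * gradient of another node when source_id is nonnegative.
+ */
+ /*!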
+ * \brief backward; if this field is true, that means this node + * represents the backward function of the op. Else, it + * represents the forward function. When it represents the + * backward function, itself has not op but shares from the + * source node. It is because the backward function shares the + * states from the forward, and they need to share op. * - * For normal node, this field is -1. - * When the node is a Backward node, the op field will be nullptr + * Since we support gradient of gradient, a forward node can also + * be the gradient of another node. See source id. */ - int32_t backward_source_id; + bool backward; /*! \brief default constructor */ - Node() : backward_source_id(-1) {} + Node() : source_id(-1), backward(false) {} /*! \return whether the node is forward op node */ inline bool is_forward() const { - return op != nullptr; + return !backward && !is_variable(); } /*! \return whether the node is backward op node */ inline bool is_backward() const { - return backward_source_id != -1; + return backward; } /*! \return whether the node is variable node */ inline bool is_variable() const { - return op == nullptr && !is_backward(); + return op == nullptr && source_id == -1; } }; /*! \brief all nodes in the graph */ diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc index 8dbadb34e24e..d058242ef11e 100644 --- a/src/symbol/graph_executor.cc +++ b/src/symbol/graph_executor.cc @@ -80,7 +80,7 @@ GraphExecutor::GetResource(uint32_t node_id) const { return node.op->ForwardResource(); } else { CHECK(node.is_backward()); - return graph_.nodes[node.backward_source_id].op->BackwardResource(); + return graph_.nodes[node.source_id].op->BackwardResource(); } } @@ -90,7 +90,7 @@ inline int GraphExecutor::GetNumOutputs(uint32_t node_id) const { return node.op->NumReturns(); } else if (node.is_backward()) { return static_cast( - graph_.nodes[node.backward_source_id].op->ListArguments().size()); + graph_.nodes[node.source_id].op->ListArguments().size()); } else { CHECK(node.is_variable()); return 1; @@ -125,7 +125,7 @@ inline std::vector > GraphExecutor::GetInplaceOption( } else { CHECK(node.is_backward()); // forward property - const OperatorProperty *fwd = graph_.nodes[node.backward_source_id].op.get(); + const OperatorProperty *fwd = graph_.nodes[node.source_id].op.get(); std::vector out_grad_index(fwd->NumVisibleReturns()); std::vector in_data_index(fwd->ListArguments().size()); @@ -406,8 +406,8 @@ void GraphExecutor::InitOpNodes() { } else { CHECK(graph_.nodes[nid].is_backward()); op_node.op.reset(new BackwardOpWrapper( - graph_.nodes[graph_.nodes[nid].backward_source_id].op.get(), - op_nodes_[graph_.nodes[nid].backward_source_id].op)); + graph_.nodes[graph_.nodes[nid].source_id].op.get(), + op_nodes_[graph_.nodes[nid].source_id].op)); } bool allow_cache = true; for (StaticGraph::DataEntry e : graph_.nodes[nid].inputs) { diff --git a/src/symbol/static_graph.cc b/src/symbol/static_graph.cc index c9ed278b8f7e..82dc61193566 100644 --- a/src/symbol/static_graph.cc +++ b/src/symbol/static_graph.cc @@ -18,7 +18,7 @@ std::vector StaticGraph::TopoSort() const { ++out_degree[e.source_id]; } if (n.is_backward()) { - ++out_degree[n.backward_source_id]; + ++out_degree[n.source_id]; } } std::vector ret(nodes.size()); @@ -41,8 +41,8 @@ std::vector StaticGraph::TopoSort() const { } } if (n.is_backward()) { - if (--out_degree[n.backward_source_id] == 0) { - queue.push(n.backward_source_id); + if (--out_degree[n.source_id] == 0) { + queue.push(n.source_id); } } } @@ -79,7 
+79,7 @@ bool StaticGraph::InferNodeShapes(const std::vector &topo_order, } } else if (nodes[nid].is_backward()) { // simply use shapes from forward pass to assign backward shape - const Node& forward = nodes[node.backward_source_id]; + const Node& forward = nodes[node.source_id]; CHECK(forward.is_forward()); std::vector& in_grad_shapes = (*node_out_shapes)[nid]; CHECK(in_grad_shapes.size() == forward.inputs.size()); @@ -99,7 +99,7 @@ bool StaticGraph::InferNodeShapes(const std::vector &topo_order, } } // consistent check for input shapes - auto& out_data_shapes = (*node_out_shapes)[node.backward_source_id]; + auto& out_data_shapes = (*node_out_shapes)[node.source_id]; // use BackwardInputs to select entries corresponding to node.inputs auto in_shape = forward.op->BackwardInputs( out_data_shapes, in_grad_shapes, out_data_shapes); @@ -130,7 +130,7 @@ bool StaticGraph::InferShape(std::vector *in_shape, if (nodes[i].is_forward()) { nout = nodes[i].op->NumReturns(); } else if (nodes[i].is_backward()) { - nout = static_cast(nodes[nodes[i].backward_source_id].inputs.size()); + nout = static_cast(nodes[nodes[i].source_id].inputs.size()); } node_out_shapes[i].resize(nout); } @@ -198,7 +198,6 @@ void StaticGraph::MakeBackwardPass(std::vector *head_grad_nodes, uint32_t nid = *it; // skip variables if (nodes[nid].is_variable()) continue; - CHECK(nodes[nid].is_forward()) << "Do not support Backward of Backward"; // get out_grad and out_data entry std::vector out_grad, out_data; // nvisible is out_grad.size() @@ -229,7 +228,13 @@ void StaticGraph::MakeBackwardPass(std::vector *head_grad_nodes, // Create a gradient backward node Node grad_node; // Point to the corresponding source - grad_node.backward_source_id = nid; + grad_node.source_id = nid; + // reverse the source node + grad_node.backward = !(nodes[grad_node.source_id].backward); + // if grad node is a forward node, needs to have its own OpProperty + if (!grad_node.backward) { + grad_node.op.reset(nodes[nodes[nid].source_id].op->Copy()); + } // select out the dependent inputs grad_node.inputs = nodes[nid].op->BackwardInputs( out_grad, nodes[nid].inputs, out_data); From bb4cb6a2510316312390a3e90a733dd30633e731 Mon Sep 17 00:00:00 2001 From: linmin Date: Fri, 21 Aug 2015 17:44:46 +0800 Subject: [PATCH 05/20] format annotation --- src/symbol/symbol.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/symbol/symbol.cc b/src/symbol/symbol.cc index 56ad0a869540..aecac3dda487 100644 --- a/src/symbol/symbol.cc +++ b/src/symbol/symbol.cc @@ -1,7 +1,7 @@ /*! - * Copyright (c) 2015 by Contributors - *\file symbol.cc - *\brief symbol of mxnet + * Copyright (c) 2015 by Contributors + * \file symbol.cc + * \brief symbol of mxnet */ #include #include @@ -12,13 +12,13 @@ namespace mxnet { /*! - *\brief Node is represents node of an operator in the symbolic graph. + * \brief Node is represents node of an operator in the symbolic graph. * - *It stores connection to the inputs to function represented by OperatorProperty - *NOTE on data structure: there are three types of node: - *- Normal node: contains all the necessary elements of a graph. - *- OperatorProperty: the inputs_ is empty, represents an OperatorProperty that has not been applied. - *- Variable: the sym_ is nullptr, represents an named Variable of tensors that can be composed. 
+ * It stores connection to the inputs to function represented by OperatorProperty + * NOTE on data structure: there are three types of node: + * - Normal node: contains all the necessary elements of a graph. + * - OperatorProperty: the inputs_ is empty, represents an OperatorProperty that has not been applied. + * - Variable: the sym_ is nullptr, represents an named Variable of tensors that can be composed. */ struct Symbol::Node { /*! \brief Operator of this node */ From 6de67dd839e419200da60263f4b27861b3f97574 Mon Sep 17 00:00:00 2001 From: Lin Min Date: Fri, 21 Aug 2015 23:03:34 +0800 Subject: [PATCH 06/20] Revert "support backward of backward" --- include/mxnet/operator.h | 2 +- include/mxnet/symbolic.h | 29 ++++++++++------------------- src/symbol/graph_executor.cc | 16 ++++++++-------- src/symbol/static_graph.cc | 25 ++++++++++--------------- src/symbol/symbol.cc | 24 ++++++++++++------------ 5 files changed, 41 insertions(+), 55 deletions(-) diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h index b44a50d2db7d..e60afe6948a7 100644 --- a/include/mxnet/operator.h +++ b/include/mxnet/operator.h @@ -353,7 +353,7 @@ class OperatorProperty { for (size_t i = 0; i < ret_index.size(); ++i) { ret[i] = all_data[ret_index[i]]; } - return ret; + return std::move(ret); } /*! * \brief create OperatorProperty diff --git a/include/mxnet/symbolic.h b/include/mxnet/symbolic.h index f97a695b7f19..df06c4913de8 100644 --- a/include/mxnet/symbolic.h +++ b/include/mxnet/symbolic.h @@ -85,36 +85,27 @@ class StaticGraph { /*! \brief inputs (node_id, index) for of the nodes*/ std::vector inputs; /*! - * \brief source node id; if this field is negative, it means this - * Node is a forward node. If this field is nonnegative, it - * means this Node is the gradient of the source node. - */ - int32_t source_id; - /*! - * \brief backward; if this field is true, that means this node - * represents the backward function of the op. Else, it - * represents the forward function. When it represents the - * backward function, itself has not op but shares from the - * source node. It is because the backward function shares the - * states from the forward, and they need to share op. + * \brief If this field is nonnegative, this indicates this + * Node is corresponds to a Backward Operation of Operator. + * backward_source_id will points to the corresponding Forward Node. * - * Since we support gradient of gradient, a forward node can also - * be the gradient of another node. See source id. + * For normal node, this field is -1. + * When the node is a Backward node, the op field will be nullptr */ - bool backward; + int32_t backward_source_id; /*! \brief default constructor */ - Node() : source_id(-1), backward(false) {} + Node() : backward_source_id(-1) {} /*! \return whether the node is forward op node */ inline bool is_forward() const { - return !backward && !is_variable(); + return op != nullptr; } /*! \return whether the node is backward op node */ inline bool is_backward() const { - return backward; + return backward_source_id != -1; } /*! \return whether the node is variable node */ inline bool is_variable() const { - return op == nullptr && source_id == -1; + return op == nullptr && !is_backward(); } }; /*! 
\brief all nodes in the graph */ diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc index d058242ef11e..a434f22a2fc6 100644 --- a/src/symbol/graph_executor.cc +++ b/src/symbol/graph_executor.cc @@ -80,7 +80,7 @@ GraphExecutor::GetResource(uint32_t node_id) const { return node.op->ForwardResource(); } else { CHECK(node.is_backward()); - return graph_.nodes[node.source_id].op->BackwardResource(); + return graph_.nodes[node.backward_source_id].op->BackwardResource(); } } @@ -90,7 +90,7 @@ inline int GraphExecutor::GetNumOutputs(uint32_t node_id) const { return node.op->NumReturns(); } else if (node.is_backward()) { return static_cast( - graph_.nodes[node.source_id].op->ListArguments().size()); + graph_.nodes[node.backward_source_id].op->ListArguments().size()); } else { CHECK(node.is_variable()); return 1; @@ -121,11 +121,11 @@ inline std::vector > GraphExecutor::GetInplaceOption( remap[i].first = in_data[rmap_index[i].first]; remap[i].second = *static_cast(rmap_index[i].second); } - return remap; + return std::move(remap); } else { CHECK(node.is_backward()); // forward property - const OperatorProperty *fwd = graph_.nodes[node.source_id].op.get(); + const OperatorProperty *fwd = graph_.nodes[node.backward_source_id].op.get(); std::vector out_grad_index(fwd->NumVisibleReturns()); std::vector in_data_index(fwd->ListArguments().size()); @@ -161,7 +161,7 @@ inline std::vector > GraphExecutor::GetInplaceOption( remap[i].first = *args_array[remap_index[i].first]; remap[i].second = *static_cast(remap_index[i].second); } - return remap; + return std::move(remap); } } @@ -196,7 +196,7 @@ GraphExecutor::GetOpExecEntry(uint32_t nid) { op_ctx_ptr->run_ctx = ctx; op->Forward(*op_ctx_ptr, in_data, req, out_data); }; - return exec; + return std::move(exec); } void GraphExecutor::InitGraph(Symbol symbol, Context ctx, bool need_backward) { @@ -406,8 +406,8 @@ void GraphExecutor::InitOpNodes() { } else { CHECK(graph_.nodes[nid].is_backward()); op_node.op.reset(new BackwardOpWrapper( - graph_.nodes[graph_.nodes[nid].source_id].op.get(), - op_nodes_[graph_.nodes[nid].source_id].op)); + graph_.nodes[graph_.nodes[nid].backward_source_id].op.get(), + op_nodes_[graph_.nodes[nid].backward_source_id].op)); } bool allow_cache = true; for (StaticGraph::DataEntry e : graph_.nodes[nid].inputs) { diff --git a/src/symbol/static_graph.cc b/src/symbol/static_graph.cc index 82dc61193566..5eb0ad14a282 100644 --- a/src/symbol/static_graph.cc +++ b/src/symbol/static_graph.cc @@ -18,7 +18,7 @@ std::vector StaticGraph::TopoSort() const { ++out_degree[e.source_id]; } if (n.is_backward()) { - ++out_degree[n.source_id]; + ++out_degree[n.backward_source_id]; } } std::vector ret(nodes.size()); @@ -41,12 +41,12 @@ std::vector StaticGraph::TopoSort() const { } } if (n.is_backward()) { - if (--out_degree[n.source_id] == 0) { - queue.push(n.source_id); + if (--out_degree[n.backward_source_id] == 0) { + queue.push(n.backward_source_id); } } } - return ret; + return std::move(ret); } bool StaticGraph::InferNodeShapes(const std::vector &topo_order, @@ -79,7 +79,7 @@ bool StaticGraph::InferNodeShapes(const std::vector &topo_order, } } else if (nodes[nid].is_backward()) { // simply use shapes from forward pass to assign backward shape - const Node& forward = nodes[node.source_id]; + const Node& forward = nodes[node.backward_source_id]; CHECK(forward.is_forward()); std::vector& in_grad_shapes = (*node_out_shapes)[nid]; CHECK(in_grad_shapes.size() == forward.inputs.size()); @@ -99,7 +99,7 @@ bool 
StaticGraph::InferNodeShapes(const std::vector &topo_order, } } // consistent check for input shapes - auto& out_data_shapes = (*node_out_shapes)[node.source_id]; + auto& out_data_shapes = (*node_out_shapes)[node.backward_source_id]; // use BackwardInputs to select entries corresponding to node.inputs auto in_shape = forward.op->BackwardInputs( out_data_shapes, in_grad_shapes, out_data_shapes); @@ -130,7 +130,7 @@ bool StaticGraph::InferShape(std::vector *in_shape, if (nodes[i].is_forward()) { nout = nodes[i].op->NumReturns(); } else if (nodes[i].is_backward()) { - nout = static_cast(nodes[nodes[i].source_id].inputs.size()); + nout = static_cast(nodes[nodes[i].backward_source_id].inputs.size()); } node_out_shapes[i].resize(nout); } @@ -161,7 +161,7 @@ StaticGraph::Node StaticGraph::CreateSumNode( os_size << grad_source.size(); agg_node.op->Init({{"size", os_size.str()}}); agg_node.inputs = grad_source; - return agg_node; + return std::move(agg_node); } void StaticGraph::MakeBackwardPass(std::vector *head_grad_nodes, @@ -198,6 +198,7 @@ void StaticGraph::MakeBackwardPass(std::vector *head_grad_nodes, uint32_t nid = *it; // skip variables if (nodes[nid].is_variable()) continue; + CHECK(nodes[nid].is_forward()) << "Do not support Backward of Backward"; // get out_grad and out_data entry std::vector out_grad, out_data; // nvisible is out_grad.size() @@ -228,13 +229,7 @@ void StaticGraph::MakeBackwardPass(std::vector *head_grad_nodes, // Create a gradient backward node Node grad_node; // Point to the corresponding source - grad_node.source_id = nid; - // reverse the source node - grad_node.backward = !(nodes[grad_node.source_id].backward); - // if grad node is a forward node, needs to have its own OpProperty - if (!grad_node.backward) { - grad_node.op.reset(nodes[nodes[nid].source_id].op->Copy()); - } + grad_node.backward_source_id = nid; // select out the dependent inputs grad_node.inputs = nodes[nid].op->BackwardInputs( out_grad, nodes[nid].inputs, out_data); diff --git a/src/symbol/symbol.cc b/src/symbol/symbol.cc index aecac3dda487..54a5fe9422b2 100644 --- a/src/symbol/symbol.cc +++ b/src/symbol/symbol.cc @@ -1,7 +1,7 @@ /*! - * Copyright (c) 2015 by Contributors - * \file symbol.cc - * \brief symbol of mxnet + * Copyright (c) 2015 by Contributors + *\file symbol.cc + *\brief symbol of mxnet */ #include #include @@ -12,13 +12,13 @@ namespace mxnet { /*! - * \brief Node is represents node of an operator in the symbolic graph. + *\brief Node is represents node of an operator in the symbolic graph. * - * It stores connection to the inputs to function represented by OperatorProperty - * NOTE on data structure: there are three types of node: - * - Normal node: contains all the necessary elements of a graph. - * - OperatorProperty: the inputs_ is empty, represents an OperatorProperty that has not been applied. - * - Variable: the sym_ is nullptr, represents an named Variable of tensors that can be composed. + *It stores connection to the inputs to function represented by OperatorProperty + *NOTE on data structure: there are three types of node: + *- Normal node: contains all the necessary elements of a graph. + *- OperatorProperty: the inputs_ is empty, represents an OperatorProperty that has not been applied. + *- Variable: the sym_ is nullptr, represents an named Variable of tensors that can be composed. */ struct Symbol::Node { /*! 
\brief Operator of this node */ @@ -201,7 +201,7 @@ std::vector Symbol::ListReturns() const { } } } - return ret; + return std::move(ret); } Symbol Symbol::operator[] (size_t index) const { @@ -415,13 +415,13 @@ Symbol Symbol::CreateGroup(const std::vector &symbols) { for (const auto &s : symbols) { ret.heads_.insert(ret.heads_.end(), s.heads_.begin(), s.heads_.end()); } - return ret; + return std::move(ret); } Symbol Symbol::CreateVariable(const std::string &name) { Symbol s; s.heads_.push_back(DataEntry(std::make_shared(nullptr, name), 0)); - return s; + return std::move(s); } void Symbol::ToStaticGraph(StaticGraph *out_graph) const { From 8b9489e5def1c7ad31642eef39f41a7bd7ed8f8c Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Fri, 21 Aug 2015 09:54:50 -0600 Subject: [PATCH 07/20] init sphinx --- doc/.gitignore | 5 + doc/Makefile | 192 ++++++++++++++++++++++++++++++++++++++ doc/conf.py | 183 ++++++++++++++++++++++++++++++++++++ doc/index.md | 15 +++ doc/python/python_api.rst | 8 ++ doc/sphinx_util.py | 16 ++++ make/readthedocs.mk | 75 +++++++++++++++ 7 files changed, 494 insertions(+) create mode 100644 doc/Makefile create mode 100644 doc/conf.py create mode 100644 doc/index.md create mode 100644 doc/python/python_api.rst create mode 100644 doc/sphinx_util.py create mode 100644 make/readthedocs.mk diff --git a/doc/.gitignore b/doc/.gitignore index 8e786536c590..4d81c70e7cc8 100644 --- a/doc/.gitignore +++ b/doc/.gitignore @@ -1 +1,6 @@ +html +latex +*.sh +_* doxygen +*.pyc diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 000000000000..40bba2a280db --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,192 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
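+
+# A typical local run (assuming sphinx-build is installed) is just
+# `make html`; the rendered pages land in $(BUILDDIR)/html, as the
+# html target below reports.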
+ +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" + +clean: + rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/rabit.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/rabit.qhc" + +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." 
+ @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/rabit" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/rabit" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 000000000000..46d91d95b41a --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,183 @@ +# -*- coding: utf-8 -*- +# +# documentation build configuration file, created by +# sphinx-quickstart on Thu Jul 23 19:40:08 2015. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. 
+# +# All configuration values have a default; values that are commented out +# serve to show the default. +import sys +import os, subprocess +import shlex +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +libpath = os.path.join(curr_path, '../python/') +sys.path.insert(0, libpath) +sys.path.insert(0, curr_path) + +from sphinx_util import MarkdownParser, AutoStructify + +# -- mock out modules +import mock +MOCK_MODULES = ['numpy', 'scipy', 'scipy.sparse', 'sklearn', 'matplotlib'] +for mod_name in MOCK_MODULES: + sys.modules[mod_name] = mock.Mock() + +# -- General configuration ------------------------------------------------ + +# General information about the project. +project = u'mxnet' +author = u'%s developers' % project +copyright = u'2015, %s' % author +github_doc_root = 'https://github.com/dmlc/mxnet/tree/master/doc/' + +# add markdown parser +MarkdownParser.github_doc_root = github_doc_root +source_parsers = { + '.md': MarkdownParser, +} +os.environ['MXNET_BUILD_DOC'] = '1' +# Version information. +import mxnet +version = mxnet.__version__ +release = mxnet.__version__ + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon', + 'sphinx.ext.mathjax', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = ['.rst', '.md'] + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. 
+todo_include_todos = False
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+# html_theme = 'alabaster'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = project + 'doc'
+
+# -- Options for LaTeX output ---------------------------------------------
+latex_elements = {
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, '%s.tex' % project, project,
+     author, 'manual'),
+]
+
+# hook for doxygen
+def run_doxygen(folder):
+    """Run the doxygen make command in the designated folder."""
+    try:
+        retcode = subprocess.call("cd %s; make doxygen" % folder, shell=True)
+        if retcode < 0:
+            sys.stderr.write("doxygen terminated by signal %s" % (-retcode))
+    except OSError as e:
+        sys.stderr.write("doxygen execution failed: %s" % e)
+
+def run_build_mxnet(folder):
+    """Build mxnet and its dmlc-core/mshadow dependencies in the designated folder."""
+    try:
+        subprocess.call('cd ..; rm -rf dmlc-core;' +
+                        'git clone https://github.com/dmlc/dmlc-core', shell=True)
+        subprocess.call('cd ..; rm -rf mshadow;' +
+                        'git clone https://github.com/dmlc/mshadow', shell=True)
+        subprocess.call('cd ..; cp make/readthedocs.mk config.mk', shell=True)
+        retcode = subprocess.call("cd %s; make" % folder, shell=True)
+        if retcode < 0:
+            sys.stderr.write("build terminated by signal %s" % (-retcode))
+    except OSError as e:
+        sys.stderr.write("build execution failed: %s" % e)
+
+def generate_doxygen_xml(app):
+    """Run the doxygen make commands if we're on the ReadTheDocs server"""
+    read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
+    if read_the_docs_build:
+        run_doxygen('..')
+        run_build_mxnet('..')
+        sys.stderr.write('The Lib path: %s\n' % str(os.listdir('../lib')))
+
+def setup(app):
+    # Add hook for building doxygen xml when needed
+    # no c++ API for now
+    # app.connect("builder-inited", generate_doxygen_xml)
+    app.add_config_value('recommonmark_config', {
+        'url_resolver': lambda url: github_doc_root + url,
+    }, True)
+    app.add_transform(AutoStructify)
diff --git a/doc/index.md b/doc/index.md
new file mode 100644
index 000000000000..86099881930a
--- /dev/null
+++ b/doc/index.md
@@ -0,0 +1,15 @@
+MXNet Documentation
+===================
+
+Contents
+--------
+* [Python API Reference](python/python_api.rst)
+
+Indices and tables
+------------------
+
+```eval_rst
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+```
\ No newline at end of file
diff --git a/doc/python/python_api.rst b/doc/python/python_api.rst
new file mode 100644
index 000000000000..78fbe60c6d49
--- /dev/null
+++ b/doc/python/python_api.rst
@@ -0,0 +1,8 @@
+Python API Reference
+====================
+This page gives the Python API reference of mxnet.
+
+Symbolic Interface
+------------------
+..
automodule:: mxnet.symbol + :functions: diff --git a/doc/sphinx_util.py b/doc/sphinx_util.py new file mode 100644 index 000000000000..f6a33ffa375d --- /dev/null +++ b/doc/sphinx_util.py @@ -0,0 +1,16 @@ +# -*- coding: utf-8 -*- +"""Helper utilty function for customization.""" +import sys +import os +import docutils +import subprocess + +if os.environ.get('READTHEDOCS', None) == 'True': + subprocess.call('cd ..; rm -rf recommonmark;' + + 'git clone https://github.com/tqchen/recommonmark', shell=True) + +sys.path.insert(0, os.path.abspath('../recommonmark/')) +from recommonmark import parser, transform + +MarkdownParser = parser.CommonMarkParser +AutoStructify = transform.AutoStructify diff --git a/make/readthedocs.mk b/make/readthedocs.mk new file mode 100644 index 000000000000..d91f6797b11a --- /dev/null +++ b/make/readthedocs.mk @@ -0,0 +1,75 @@ +#-------------------------------------------------------- +# Configuration for document generation with less deps +# The library may not run, but doc generation could work +#-------------------------------------------------------- + +# choice of compiler +export CC = gcc +export CXX = g++ +export NVCC = nvcc + +# whether use CUDA during compile +USE_CUDA = 0 + +# add the path to CUDA libary to link and compile flag +# if you have already add them to enviroment variable, leave it as NONE +USE_CUDA_PATH = NONE + +# whether use opencv during compilation +# you can disable it, however, you will not able to use +# imbin iterator +USE_OPENCV = 0 +USE_OPENCV_DECODER = 0 +# whether use CUDNN R3 library +USE_CUDNN = 0 +# add the path to CUDNN libary to link and compile flag +# if you do not need that, or do not have that, leave it as NONE +USE_CUDNN_PATH = NONE + +# +# choose the version of blas you want to use +# can be: mkl, blas, atlas, openblas +USE_STATIC_MKL = NONE +USE_BLAS = blas +# +# add path to intel libary, you may need it +# for MKL, if you did not add the path to enviroment variable +# +USE_INTEL_PATH = NONE + +# whether compile with parameter server +USE_DIST_PS = 0 +PS_PATH = NONE +PS_THIRD_PATH = NONE + +# whether compile with rabit +USE_RABIT_PS = 0 +RABIT_PATH = rabit + +# use openmp iterator +USE_OPENMP_ITER = 0 +# the additional link flags you want to add +ADD_LDFLAGS = + +# the additional compile flags you want to add +ADD_CFLAGS = +# +# If use MKL, choose static link automaticly to fix python wrapper +# +ifeq ($(USE_BLAS), mkl) + USE_STATIC_MKL = 1 +endif + +#------------------------ +# configuration for DMLC +#------------------------ +# whether use HDFS support during compile +# this will allow cxxnet to directly save/load model from hdfs +USE_HDFS = 0 + +# whether use AWS S3 support during compile +# this will allow cxxnet to directly save/load model from s3 +USE_S3 = 0 + +# path to libjvm.so +LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server From 85c51c04baaef685edd92f57039ff4ed8c317d3b Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Fri, 21 Aug 2015 09:57:13 -0600 Subject: [PATCH 08/20] doc --- doc/README | 9 +++++++++ doc/README.md | 4 ---- 2 files changed, 9 insertions(+), 4 deletions(-) create mode 100644 doc/README delete mode 100644 doc/README.md diff --git a/doc/README b/doc/README new file mode 100644 index 000000000000..3b1d081e2c53 --- /dev/null +++ b/doc/README @@ -0,0 +1,9 @@ +This document is generated by sphinx. +Make sure you cloned the following repos in the root. 
+ +- https://github.com/dmlc/dmlc-core +- https://github.com/dmlc/mshadow +- https://github.com/tqchen/recommonmark +- Type make in root foler to make the library + +Type make html in doc folder. diff --git a/doc/README.md b/doc/README.md deleted file mode 100644 index dae2fff081f2..000000000000 --- a/doc/README.md +++ /dev/null @@ -1,4 +0,0 @@ -Documents -======= -* Type ```make doc``` in project root - From 985484b53b5ac5879af69b3cb2706c5cb143ca31 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Fri, 21 Aug 2015 12:52:48 -0600 Subject: [PATCH 09/20] try --- Makefile | 9 ++- doc/python/python_api.rst | 2 +- doc/sphinx_util.py | 5 ++ make/config.mk | 6 +- make/readthedocs.mk | 4 +- src/operator/softmax-inl.h | 148 +++++++++++++++++++++++++++++++++++++ src/operator/softmax.cc | 27 +++++++ src/operator/softmax.cu | 19 +++++ 8 files changed, 210 insertions(+), 10 deletions(-) create mode 100644 src/operator/softmax-inl.h create mode 100644 src/operator/softmax.cc create mode 100644 src/operator/softmax.cu diff --git a/Makefile b/Makefile index 50e9a21c50e8..b06942ae0352 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ ifdef CXXNET_CONFIG config = $(CXXNET_CONFIG) else ifneq ("$(wildcard ./config.mk)","") config = config.mk -else +else config = make/config.mk endif endif @@ -58,14 +58,14 @@ endif BIN = test/api_registry_test test/test_storage OBJ = narray_op_cpu.o # add threaded engine after it is done -OBJCXX11 = engine.o narray.o c_api.o registry.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o elementwise_sum_cpu.o graph_executor.o pooling_cpu.o +OBJCXX11 = engine.o narray.o c_api.o registry.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o elementwise_sum_cpu.o graph_executor.o pooling_cpu.o softmax_cpu.o CUOBJ = SLIB = lib/libmxnet.so ALIB = lib/libmxnet.a LIB_DEP = $(DMLC_CORE)/libdmlc.a ifeq ($(USE_CUDA), 1) - CUOBJ += narray_op_gpu.o fully_connected_gpu.o activation_gpu.o elementwise_sum_gpu.o pooling_gpu.o + CUOBJ += narray_op_gpu.o fully_connected_gpu.o activation_gpu.o elementwise_sum_gpu.o pooling_gpu.o softmax_gpu.o endif .PHONY: clean all test lint doc @@ -94,7 +94,8 @@ elementwise_sum_cpu.o: src/operator/elementwise_sum.cc elementwise_sum_gpu.o: src/operator/elementwise_sum.cu pooling_cpu.o: src/operator/pooling.cc pooling_gpu.o: src/operator/pooling.cu - +softmax_cpu.o: src/operator/softmax.cc +softmax_gpu.o: src/operator/softmax.cu lib/libmxnet.a: $(OBJ) $(OBJCXX11) $(CUOBJ) lib/libmxnet.so: $(OBJ) $(OBJCXX11) $(CUOBJ) diff --git a/doc/python/python_api.rst b/doc/python/python_api.rst index 78fbe60c6d49..3b726ef575e7 100644 --- a/doc/python/python_api.rst +++ b/doc/python/python_api.rst @@ -5,4 +5,4 @@ This page gives the Python API reference of mxnet. Symbolic Interface ------------------ .. 
automodule:: mxnet.symbol - :functions: + :members: diff --git a/doc/sphinx_util.py b/doc/sphinx_util.py index f6a33ffa375d..917ef3f68ddb 100644 --- a/doc/sphinx_util.py +++ b/doc/sphinx_util.py @@ -9,7 +9,12 @@ subprocess.call('cd ..; rm -rf recommonmark;' + 'git clone https://github.com/tqchen/recommonmark', shell=True) +if os.path.exists("../lib/libmxnet.so") == False: + subprocess.call('cd ..; cp make/readthedocs.mk ./config.mk;', shell = True) + subprocess.call('cd ..; sh ./scripts/build_dmlc.sh; make clean; make;', shell = True) sys.path.insert(0, os.path.abspath('../recommonmark/')) + + from recommonmark import parser, transform MarkdownParser = parser.CommonMarkParser diff --git a/make/config.mk b/make/config.mk index 48587a4f9114..a23ff2147612 100644 --- a/make/config.mk +++ b/make/config.mk @@ -24,8 +24,8 @@ USE_CUDA_PATH = NONE # whether use opencv during compilation # you can disable it, however, you will not able to use # imbin iterator -USE_OPENCV = 1 -USE_OPENCV_DECODER = 1 +USE_OPENCV = 0 +USE_OPENCV_DECODER = 0 # whether use CUDNN R3 library USE_CUDNN = 0 # add the path to CUDNN libary to link and compile flag @@ -55,7 +55,7 @@ RABIT_PATH = rabit # use openmp iterator USE_OPENMP_ITER = 1 # the additional link flags you want to add -ADD_LDFLAGS = -ljpeg +ADD_LDFLAGS = # the additional compile flags you want to add ADD_CFLAGS = diff --git a/make/readthedocs.mk b/make/readthedocs.mk index d91f6797b11a..016d35be6abd 100644 --- a/make/readthedocs.mk +++ b/make/readthedocs.mk @@ -30,7 +30,7 @@ USE_CUDNN_PATH = NONE # choose the version of blas you want to use # can be: mkl, blas, atlas, openblas USE_STATIC_MKL = NONE -USE_BLAS = blas +USE_BLAS = NONE # # add path to intel libary, you may need it # for MKL, if you did not add the path to enviroment variable @@ -52,7 +52,7 @@ USE_OPENMP_ITER = 0 ADD_LDFLAGS = # the additional compile flags you want to add -ADD_CFLAGS = +ADD_CFLAGS = -DMSHADOW_STAND_ALONE=1 # # If use MKL, choose static link automaticly to fix python wrapper # diff --git a/src/operator/softmax-inl.h b/src/operator/softmax-inl.h new file mode 100644 index 000000000000..247bb1aa4978 --- /dev/null +++ b/src/operator/softmax-inl.h @@ -0,0 +1,148 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file softmax-inl.h + * \brief + * \author Bing Xu +*/ +#ifndef MXNET_OPERATOR_SOFTMAX_INL_H_ +#define MXNET_OPERATOR_SOFTMAX_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" + +namespace mxnet { +namespace op { + +enum SoftmaxOpInputs {kData, kLabel}; +enum SoftmaxOpOutputs {kOut}; + +struct SoftmaxParam : public dmlc::Parameter { + float grad_scale; + DMLC_DECLARE_PARAMETER(SoftmaxParam) { + DMLC_DECLARE_FIELD(grad_scale).set_default(1.0f); + }; +}; + +template +class SoftmaxOp : public Operator { + public: + explicit SoftmaxOp(SoftmaxParam param) : param_(param) {} + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), 2) << "Softmax Input: [data, label]"; + CHECK_EQ(out_data.size(), 1) << "Softmax Output: [output]"; + Stream *s = ctx.get_stream(); + Tensor data = in_data[kData].FlatTo2D(s); + Tensor out = out_data[kOut].FlatTo2D(s); + Softmax(data, out); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), 2); + CHECK_EQ(out_grad.size(), 1); + CHECK_EQ(in_grad.size(), 1); + CHECK_EQ(req.size(), 1); + Stream *s = ctx.get_stream(); + Tensor label = in_data[kLabel].get(s); + Tensor out = out_grad[kOut].FlatTo2D(s); + Tensor grad = in_grad[kData].FlatTo2D(s); + SoftmaxGrad(grad, out, label); + if (param_.grad_scale < 1.0) { + grad *= param_.grad_scale; + } + } + + private: + SoftmaxParam param_; +}; // class SoftmaxOp + +// Decalre Factory function, used for dispatch specialization +template +Operator* CreateOp(SoftmaxParam param); + +#if DMLC_USE_CXX11 +class SoftmaxProp : public OperatorProperty { + public: + virtual std::vector ListArguments() const override { + return {"data", "label"}; + } + + virtual void Init(const std::vector >& kwargs) { + param_.Init(kwargs); + } + + virtual bool InferShape(std::vector *in_shape, + std::vector *out_shape) const { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 2) << "Input:[data, label]"; + const TShape &dshape = in_shape->at(kData); + TShape &lshape = in_shape->at(kLabel); + if (dshape.ndim() == 0) return false; + if (lshape.ndim() == 0) { + SHAPE_ASSIGN_CHECK(*in_shape, kLabel, Shape1(dshape[1])) + } + out_shape->clear(); + out_shape->push_back(dshape); + return true; + } + + virtual OperatorProperty* Copy() const { + auto ptr = new SoftmaxProp(); + ptr->param_ = param_; + return ptr; + } + + virtual std::string TypeString() const { + return "Softmax"; + } + + virtual std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const { + return {out_data[kOut], in_data[kLabel]}; + } + + virtual std::vector > BackwardInplaceOption( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &in_grad) const { + return {{out_grad[kOut], in_grad[kData]}}; + } + + virtual std::vector > ForwardInplaceOption( + const std::vector &in_data, + const std::vector &out_data) const { + return {{in_data[kData], out_data[kOut]}}; + } + + Operator* CreateOperator(Context ctx) const; + private: + SoftmaxParam param_; + +}; // 
class SoftmaxProp
+#endif  // DMLC_USE_CXX11
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_SOFTMAX_INL_H_
diff --git a/src/operator/softmax.cc b/src/operator/softmax.cc
new file mode 100644
index 000000000000..91fd7a1170ce
--- /dev/null
+++ b/src/operator/softmax.cc
@@ -0,0 +1,27 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file softmax.cc
+ * \brief
+ * \author Bing Xu
+*/
+
+#include <mxnet/registry.h>
+#include "./softmax-inl.h"
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<cpu>(SoftmaxParam param) {
+  return new SoftmaxOp<cpu>(param);
+}
+
+Operator *SoftmaxProp::CreateOperator(Context ctx) const {
+  DO_BIND_DISPATCH(CreateOp, param_);
+}
+
+DMLC_REGISTER_PARAMETER(SoftmaxParam);
+
+REGISTER_OP_PROPERTY(Softmax, SoftmaxProp);
+}  // namespace op
+}  // namespace mxnet
+
diff --git a/src/operator/softmax.cu b/src/operator/softmax.cu
new file mode 100644
index 000000000000..0ebbfc16ce68
--- /dev/null
+++ b/src/operator/softmax.cu
@@ -0,0 +1,19 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file softmax.cu
+ * \brief
+ * \author Bing Xu
+*/
+
+#include "./softmax-inl.h"
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<gpu>(SoftmaxParam param) {
+  return new SoftmaxOp<gpu>(param);
+}
+
+}  // namespace op
+}  // namespace mxnet
+
From ae64c5fc49dbfe23363b66a2d7f69ead9556b0e0 Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Fri, 21 Aug 2015 22:52:16 -0600
Subject: [PATCH 10/20] autodoc
---
 doc/conf.py | 2 ++
 include/mxnet/c_api.h | 8 ++++++
 include/mxnet/registry.h | 37 ++++++++++++++++++++++++
 src/c_api.cc | 45 ++++++++++++++++++++++++++++++
 src/operator/activation-inl.h | 7 +++--
 src/operator/activation.cc | 6 +++-
 src/operator/fully_connected-inl.h | 6 ++--
 src/operator/fully_connected.cc | 7 ++++-
 src/operator/softmax-inl.h | 7 ++---
 9 files changed, 114 insertions(+), 11 deletions(-)
diff --git a/doc/conf.py b/doc/conf.py
index 46d91d95b41a..981ab273c98c 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -57,6 +57,8 @@
 'sphinx.ext.mathjax',
 ]
+autodoc_member_order = 'bysource'
+
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index d194edb049c7..32a2156a7725 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -221,6 +221,14 @@ MXNET_DLL int MXSymbolListAtomicSymbolCreators(mx_uint *out_size,
 */
MXNET_DLL int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator,
                                          const char **out);
+/*!
+ * \brief Get the docstring of AtomicSymbol.
+ * \param creator the AtomicSymbolCreator
+ * \param out the returned docstring of the creator
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXSymbolGetAtomicSymbolDoc(AtomicSymbolCreator creator,
+                                         const char **out);
/*!
 * \brief Create an AtomicSymbol.
 * \param creator the AtomicSymbolCreator
diff --git a/include/mxnet/registry.h b/include/mxnet/registry.h
index ddc0a3ca22a0..716438ec4c72 100644
--- a/include/mxnet/registry.h
+++ b/include/mxnet/registry.h
@@ -224,6 +224,12 @@ struct OperatorPropertyEntry {
 bool use_param;
 /*! \brief name of the entry */
 std::string name;
+ /*! \brief description of operator */
+ std::string description;
+ /*! \brief name-description pairs of the arguments */
+ std::vector<std::pair<std::string, std::string> > arguments;
+ /*! \brief documentation for the parameters */
+ std::string param_doc;
 /*! \brief function body to create OperatorProperty */
 Creator body;
 /*! \brief constructor */
 explicit OperatorPropertyEntry(const std::string& name)
     : use_param(true), name(name), body(NULL) {}
 /*!
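+ * Example usage (an illustrative sketch; MyOp, MyOpProp and MyOpParam are
+ * placeholder names, and the chaining mirrors how the setters added below
+ * are used from an operator's .cc file):
+ * \code
+ * REGISTER_OP_PROPERTY(MyOp, MyOpProp)
+ * .describe("What MyOp does.")
+ * .add_argument("data", "Input data to MyOp.")
+ * .set_param_doc(MyOpParam::__DOC__());
+ * \endcode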
* \brief set the function body
+ * \param body body to set
+ * \return reference to self.
 */
 inline OperatorPropertyEntry &set_body(Creator body) {
   this->body = body;
   return *this;
 }
+ /*!
+  * \brief describe the operator.
+  * \param description the description.
+  * \return reference to self.
+  */
+ inline OperatorPropertyEntry &describe(const std::string &description) {
+   this->description = description;
+   return *this;
+ }
+ /*!
+  * \brief set the parameter documents.
+  * \param description the description.
+  * \return reference to self.
+  */
+ inline OperatorPropertyEntry &set_param_doc(const std::string &param_doc) {
+   this->param_doc = param_doc;
+   return *this;
+ }
+ /*!
+  * \brief add argument to the entry.
+  * \param name name of the argument.
+  * \param desc description of the argument.
+  * \return reference to self.
+  */
+ inline OperatorPropertyEntry &add_argument(const std::string &name,
+                                            const std::string &desc) {
+   arguments.push_back(std::make_pair(name, desc));
+   return *this;
+ }
 /*!
  * \brief invoke the function
  * \return the created OperatorProperty
diff --git a/src/c_api.cc b/src/c_api.cc
index 3d5e03cc0748..04ecbda25bd9 100644
--- a/src/c_api.cc
+++ b/src/c_api.cc
@@ -12,6 +12,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
@@ -307,6 +309,49 @@ int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator,
 API_END();
}
+int MXSymbolGetAtomicSymbolDoc(AtomicSymbolCreator creator,
+                               const char **out) {
+  OperatorPropertyEntry *e = static_cast<OperatorPropertyEntry*>(creator);
+  MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
+  API_BEGIN();
+  std::ostringstream os, os_param;
+  if (e->description.length() != 0) {
+    os << e->description << "\n\n";
+  } else {
+    os << "Symbolic Operator " << e->name << "\n\n";
+  }
+  // get parameter doc
+  for (auto kv : e->arguments) {
+    os_param << kv.first << " : Symbol\n";
+    if (kv.second.length() != 0) {
+      os_param << "    " << kv.second << '\n';
+    }
+  }
+  os_param << e->param_doc;
+  std::string param_doc = os_param.str();
+  if (param_doc.length() != 0) {
+    os << "Parameters\n"
+       << "----------\n"
+       << param_doc << '\n';
+  } else {
+    os << "Parameters\n"
+       << "----------\n"
+       << "args\n"
+       << "    Positional arguments to the Symbol.\n\n"
+       << "kwargs\n"
+       << "    Keyword arguments to the Symbol.\n\n";
+  }
+  // generate return
+  os << "Returns\n"
+     << "-------\n"
+     << "output : Symbol\n"
+     << "    "
+     << "The result output symbol.\n";
+  ret->ret_str = os.str();
+  *out = ret->ret_str.c_str();
+  API_END();
+}
+
int MXSymbolCreateAtomicSymbol(AtomicSymbolCreator creator,
                               int num_param,
                               const char **keys,
diff --git a/src/operator/activation-inl.h b/src/operator/activation-inl.h
index 3d57d6a88102..c6ade088c545 100644
--- a/src/operator/activation-inl.h
+++ b/src/operator/activation-inl.h
@@ -29,8 +29,11 @@ struct ActivationParam : public dmlc::Parameter<ActivationParam> {
 // use int for enumeration
 int type;
 DMLC_DECLARE_PARAMETER(ActivationParam) {
-  DMLC_DECLARE_FIELD(type).set_default(kReLU).add_enum("relu", kReLU).\
-  add_enum("sigmoid", kSigmoid).add_enum("tanh", kTanh);
+  DMLC_DECLARE_FIELD(type).set_default(kReLU)
+  .add_enum("relu", kReLU)
+  .add_enum("sigmoid", kSigmoid)
+  .add_enum("tanh", kTanh)
+  .describe("Activation function to be applied.");
 }
};
diff --git a/src/operator/activation.cc b/src/operator/activation.cc
index 275588e099af..c2cdabc4144d 100644
--- a/src/operator/activation.cc
+++ b/src/operator/activation.cc
@@ -30,7 +30,11 @@ Operator *ActivationProp::CreateOperator(Context ctx) const {
DMLC_REGISTER_PARAMETER(ActivationParam);
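+// Note: DMLC_REGISTER_PARAMETER above supplies the definitions behind the
+// generated parameter helpers (ActivationParam::__DOC__() here), which the
+// registration below feeds into set_param_doc().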
-REGISTER_OP_PROPERTY(Activation, ActivationProp);
+REGISTER_OP_PROPERTY(Activation, ActivationProp)
+.describe("Apply activation function to input.")
+.add_argument("data", "Input data to activation function.")
+.set_param_doc(ActivationParam::__DOC__());
+
}  // namespace op
}  // namespace mxnet
diff --git a/src/operator/fully_connected-inl.h b/src/operator/fully_connected-inl.h
index b49e5c422739..8f4efa6f6b3f 100644
--- a/src/operator/fully_connected-inl.h
+++ b/src/operator/fully_connected-inl.h
@@ -30,8 +30,10 @@ struct FullyConnectedParam : public dmlc::Parameter<FullyConnectedParam> {
 DMLC_DECLARE_PARAMETER(FullyConnectedParam) {
   // TODO(bing) change to only set lower bound
   // add support for boolean
-  DMLC_DECLARE_FIELD(num_hidden).set_range(1, 100000);
-  DMLC_DECLARE_FIELD(no_bias).set_default(false);
+  DMLC_DECLARE_FIELD(num_hidden).set_range(1, 100000)
+  .describe("Number of hidden nodes of the output.");
+  DMLC_DECLARE_FIELD(no_bias).set_default(false)
+  .describe("Whether to disable bias parameter.");
 }
};
diff --git a/src/operator/fully_connected.cc b/src/operator/fully_connected.cc
index 7d529cb3ed64..dad936e79ea5 100644
--- a/src/operator/fully_connected.cc
+++ b/src/operator/fully_connected.cc
@@ -19,6 +19,11 @@ Operator* FullyConnectedProp::CreateOperator(Context ctx) const {
DMLC_REGISTER_PARAMETER(FullyConnectedParam);
-REGISTER_OP_PROPERTY(FullyConnected, FullyConnectedProp);
+REGISTER_OP_PROPERTY(FullyConnected, FullyConnectedProp)
+.describe("Apply matrix multiplication to input then add a bias.")
+.add_argument("data", "Input data to the FullyConnectedOp.")
+.add_argument("weight", "Weight matrix.")
+.add_argument("bias", "Bias parameter.")
+.set_param_doc(FullyConnectedParam::__DOC__());
}  // namespace op
}  // namespace mxnet
diff --git a/src/operator/softmax-inl.h b/src/operator/softmax-inl.h
index 247bb1aa4978..47f52089d99d 100644
--- a/src/operator/softmax-inl.h
+++ b/src/operator/softmax-inl.h
@@ -94,14 +94,11 @@ class SoftmaxProp : public OperatorProperty {
                          std::vector<TShape> *out_shape) const {
   using namespace mshadow;
   CHECK_EQ(in_shape->size(), 2) << "Input:[data, label]";
-  const TShape &dshape = in_shape->at(kData);
-  TShape &lshape = in_shape->at(kLabel);
+  const TShape &dshape = in_shape->at(0);
   if (dshape.ndim() == 0) return false;
-  if (lshape.ndim() == 0) {
-    SHAPE_ASSIGN_CHECK(*in_shape, kLabel, Shape1(dshape[1]))
-  }
   out_shape->clear();
   out_shape->push_back(dshape);
+  out_shape->emplace_back(Shape1(dshape[0]));
   return true;
 }
From 225fabd9ae7a6397256abfc4e1347009a713bb50 Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Fri, 21 Aug 2015 23:42:14 -0600
Subject: [PATCH 11/20] add pythondoc
---
 python/mxnet/symbol.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py
index cb5daf2225a1..2fc603a9d759 100644
--- a/python/mxnet/symbol.py
+++ b/python/mxnet/symbol.py
@@ -289,8 +289,14 @@ def Group(symbols):
 return Symbol(handle)
-def _make_atomic_symbol_function(handle, func_name):
+def _make_atomic_symbol_function(handle):
 """Create an atomic symbol function by handle and function name."""
+ name = ctypes.c_char_p()
+ docs = ctypes.c_char_p()
+ check_call(_LIB.MXSymbolGetAtomicSymbolName(handle, ctypes.byref(name)))
+ check_call(_LIB.MXSymbolGetAtomicSymbolDoc(handle, ctypes.byref(docs)))
+ func_name = name.value;
+
 def creator(*args, **kwargs):
 """Activation Operator of Neural Net.
 The parameters listed below can be passed in as keyword arguments.
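For orientation, here is a small sketch of what each generated Python
`creator` boils down to at the C API level. This is illustrative only: the
`"Activation"`/`"relu"` values are example inputs, and all return-code
checking is omitted.

```c++
#include <cstring>
#include <mxnet/c_api.h>

// Find the "Activation" creator and build an atomic symbol with type=relu,
// mirroring what the Python creator() wrapper does via ctypes.
SymbolHandle MakeReluSymbol() {
  mx_uint num_creators = 0;
  AtomicSymbolCreator *creators = NULL;
  MXSymbolListAtomicSymbolCreators(&num_creators, &creators);
  SymbolHandle out = NULL;
  for (mx_uint i = 0; i < num_creators; ++i) {
    const char *name = NULL;
    MXSymbolGetAtomicSymbolName(creators[i], &name);
    if (std::strcmp(name, "Activation") == 0) {
      const char *keys[] = {"type"};
      const char *vals[] = {"relu"};
      MXSymbolCreateAtomicSymbol(creators[i], 1, keys, vals, &out);
      break;
    }
  }
  return out;  // input symbols are composed onto this afterwards
}
```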
@@ -333,6 +339,8 @@ def creator(*args, **kwargs):
 s._compose(*args, name=name, **symbol_kwargs)
 return s
 creator.__name__ = func_name
+ creator.__doc__ = docs.value
+ print creator.__doc__
 return creator
@@ -345,9 +353,7 @@ def _init_module_functions():
 module_obj = sys.modules[__name__]
 for i in range(size.value):
 hdl = ctypes.c_void_p(plist[i])
- name = ctypes.c_char_p()
- check_call(_LIB.MXSymbolGetAtomicSymbolName(hdl, ctypes.byref(name)))
- function = _make_atomic_symbol_function(hdl, name.value)
+ function = _make_atomic_symbol_function(hdl)
 setattr(module_obj, function.__name__, function)
# Initialize the atomic symbols at startup
From daf41e17319d231ca5c946fa98f6ab9b2be39f66 Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Fri, 21 Aug 2015 23:49:07 -0600
Subject: [PATCH 12/20] add cpp doc
---
 Makefile | 4 ++--
 doc/Doxyfile | 2 +-
 doc/conf.py | 5 +++++
 doc/cpp/c_api.md | 15 +++++++++++++++
 doc/cpp/cpp_guide.md | 9 +++++++++
 doc/cpp/symbolic.md | 15 +++++++++++++++
 doc/index.md | 5 +++--
 doc/python/{python_api.rst => python_api.md} | 2 ++
 include/mxnet/registry.h | 2 +-
 python/mxnet/symbol.py | 1 -
 10 files changed, 53 insertions(+), 7 deletions(-)
 create mode 100644 doc/cpp/c_api.md
 create mode 100644 doc/cpp/cpp_guide.md
 create mode 100644 doc/cpp/symbolic.md
 rename doc/python/{python_api.rst => python_api.md} (92%)
diff --git a/Makefile b/Makefile
index b06942ae0352..73b9e357460e 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@ ifdef CXXNET_CONFIG
 config = $(CXXNET_CONFIG)
else ifneq ("$(wildcard ./config.mk)","")
 config = config.mk
-else
+else
 config = make/config.mk
endif
endif
@@ -129,7 +129,7 @@ $(CUBIN) :
lint:
 python dmlc-core/scripts/lint.py mxnet ${LINT_LANG} include src scripts test python
-doc:
+doxygen:
 doxygen doc/Doxyfile
clean:
diff --git a/doc/Doxyfile b/doc/Doxyfile
index 41c86905b59f..aeef012f2384 100644
--- a/doc/Doxyfile
+++ b/doc/Doxyfile
@@ -1821,7 +1821,7 @@ MAN_LINKS = NO
# captures the structure of the code including all documentation.
# The default value is: NO.
-GENERATE_XML = NO
+GENERATE_XML = YES
# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
diff --git a/doc/conf.py b/doc/conf.py
index 981ab273c98c..186759119c2e 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -55,8 +55,13 @@
 'sphinx.ext.autodoc',
 'sphinx.ext.napoleon',
 'sphinx.ext.mathjax',
+ 'breathe',
]
+# Use breathe to include doxygen documents
+breathe_projects = {'mxnet' : 'doxygen/xml/'}
+breathe_default_project = 'mxnet'
+
autodoc_member_order = 'bysource'
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
diff --git a/doc/cpp/c_api.md b/doc/cpp/c_api.md
new file mode 100644
index 000000000000..45172f74bd61
--- /dev/null
+++ b/doc/cpp/c_api.md
@@ -0,0 +1,15 @@
+C API
+=====
+MXNet provides a C API interface that you can invoke.
+This allows MXNet to interface with most other languages that support
+interfacing with C.
+
+If you are interested in porting MXNet to a new language, take a look at
+the API reference below.
+
+
+API Reference
+-------------
+```eval_rst
+.. doxygenfile:: c_api.h
+```
\ No newline at end of file
diff --git a/doc/cpp/cpp_guide.md b/doc/cpp/cpp_guide.md
new file mode 100644
index 000000000000..494aa63d6a99
--- /dev/null
+++ b/doc/cpp/cpp_guide.md
@@ -0,0 +1,9 @@
+C++ API Document
+================
+This is the place where you can find general guidelines on mxnet's internals
+as well as API references for each module.
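+
+As a quick taste of the registration flow behind these APIs (a fragment
+lifted from the `Softmax` operator in `src/operator/softmax.cc`, shown here
+for illustration rather than as a complete listing):
+
+```c++
+// In the operator's .cc file: register the parameter struct and the
+// OperatorProperty subclass; the operator then becomes visible through
+// the C API (MXSymbolListAtomicSymbolCreators) and the language bindings.
+DMLC_REGISTER_PARAMETER(SoftmaxParam);
+REGISTER_OP_PROPERTY(Softmax, SoftmaxProp);
+```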
+ +Contents +-------- +* [Symbolic Interface](symbolic.md) +* [C API](c_api.md) diff --git a/doc/cpp/symbolic.md b/doc/cpp/symbolic.md new file mode 100644 index 000000000000..19addab2b91f --- /dev/null +++ b/doc/cpp/symbolic.md @@ -0,0 +1,15 @@ +Symbolic Interface +================== + +API Reference +------------- +```eval_rst +.. doxygenclass:: mxnet::StaticGraph + :members: + +.. doxygenclass:: mxnet::Symbol + :members: + +.. doxygenclass:: mxnet::Executor + :members: +``` \ No newline at end of file diff --git a/doc/index.md b/doc/index.md index 86099881930a..e5d2703b829a 100644 --- a/doc/index.md +++ b/doc/index.md @@ -3,7 +3,8 @@ MXNet Documentation Contents -------- -* [Python API Reference](python/python_api.rst) +* [Python API Reference](python/python_api.md) +* [C++ Developer Guide](cpp/cpp_guide.md) Indices and tables ------------------ @@ -12,4 +13,4 @@ Indices and tables * :ref:`genindex` * :ref:`modindex` * :ref:`search` -``` \ No newline at end of file +``` diff --git a/doc/python/python_api.rst b/doc/python/python_api.md similarity index 92% rename from doc/python/python_api.rst rename to doc/python/python_api.md index 3b726ef575e7..e79a3519b83e 100644 --- a/doc/python/python_api.rst +++ b/doc/python/python_api.md @@ -4,5 +4,7 @@ This page gives the Python API reference of mxnet. Symbolic Interface ------------------ +```eval_rst .. automodule:: mxnet.symbol :members: +``` \ No newline at end of file diff --git a/include/mxnet/registry.h b/include/mxnet/registry.h index 716438ec4c72..6be358cdbd80 100644 --- a/include/mxnet/registry.h +++ b/include/mxnet/registry.h @@ -255,7 +255,7 @@ struct OperatorPropertyEntry { } /*! * \brief set the parameter documents. - * \param description the description. + * \param param_doc the parameter documentation. * \return reference to self. 
*/
 inline OperatorPropertyEntry &set_param_doc(const std::string &param_doc) {
diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py
index 2fc603a9d759..9c66f7a74f57 100644
--- a/python/mxnet/symbol.py
+++ b/python/mxnet/symbol.py
@@ -340,7 +340,6 @@ def creator(*args, **kwargs):
 return s
 creator.__name__ = func_name
 creator.__doc__ = docs.value
- print creator.__doc__
 return creator
From 661cfd498e43804da7f1e9340ca671bfe7a40efe Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Sat, 22 Aug 2015 14:12:56 -0600
Subject: [PATCH 13/20] refactor reg
---
 Makefile | 12 +-
 include/mxnet/c_api.h | 24 ++-
 include/mxnet/registry.h | 282 +++++-------------
 python/mxnet/symbol.py | 40 ++-
 python/test_infer_shape.py | 8 +-
 python/test_symbol.py | 12 +-
 src/c_api.cc | 82 +++--
 src/narray/narray.cc | 16 +-
 ...{narray_op-inl.h => narray_function-inl.h} | 12 +-
 .../{narray_op_cpu.cc => narray_function.cc} | 6 +-
 .../{narray_op_gpu.cu => narray_function.cu} | 10 +-
 src/narray/{narray_op.h => narray_function.h} | 0
 src/operator/activation.cc | 6 +-
 src/operator/elementwise_sum-inl.h | 3 +-
 src/operator/elementwise_sum.cc | 5 +-
 src/operator/fully_connected.cc | 10 +-
 src/operator/pooling-inl.h | 5 +-
 src/operator/pooling.cc | 6 +-
 src/operator/softmax.cc | 6 +-
 src/registry.cc | 39 +--
 test/api_registry_test.cc | 10 -
 21 files changed, 234 insertions(+), 360 deletions(-)
 rename src/narray/{narray_op-inl.h => narray_function-inl.h} (85%)
 rename src/narray/{narray_op_cpu.cc => narray_function.cc} (81%)
 rename src/narray/{narray_op_gpu.cu => narray_function.cu} (92%)
 rename src/narray/{narray_op.h => narray_function.h} (100%)
 delete mode 100644 test/api_registry_test.cc
diff --git a/Makefile b/Makefile
index 73b9e357460e..f574afde31b0 100644
--- a/Makefile
+++ b/Makefile
@@ -55,17 +55,16 @@ ifneq ($(ADD_LDFLAGS), NONE)
endif
#BIN = test/test_threaded_engine test/api_registry_test
-BIN = test/api_registry_test test/test_storage
-OBJ = narray_op_cpu.o
+OBJ = narray_function_cpu.o
# add threaded engine after it is done
-OBJCXX11 = engine.o narray.o c_api.o registry.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o elementwise_sum_cpu.o graph_executor.o pooling_cpu.o softmax_cpu.o
+OBJCXX11 = engine.o narray.o c_api.o registry.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o graph_executor.o softmax_cpu.o elementwise_sum_cpu.o pooling_cpu.o
CUOBJ =
SLIB = lib/libmxnet.so
ALIB = lib/libmxnet.a
LIB_DEP = $(DMLC_CORE)/libdmlc.a
ifeq ($(USE_CUDA), 1)
- CUOBJ += narray_op_gpu.o fully_connected_gpu.o activation_gpu.o elementwise_sum_gpu.o pooling_gpu.o softmax_gpu.o
+ CUOBJ += narray_function_gpu.o fully_connected_gpu.o activation_gpu.o elementwise_sum_gpu.o pooling_gpu.o softmax_gpu.o
endif
.PHONY: clean all test lint doc
@@ -78,8 +77,8 @@
$(DMLC_CORE)/libdmlc.a:
storage.o: src/storage/storage.cc
engine.o: src/dag_engine/simple_engine.cc
narray.o: src/narray/narray.cc
-narray_op_cpu.o: src/narray/narray_op_cpu.cc src/narray/narray_op-inl.h
-narray_op_gpu.o: src/narray/narray_op_gpu.cu src/narray/narray_op-inl.h
+narray_function_cpu.o: src/narray/narray_function.cc src/narray/narray_function-inl.h
+narray_function_gpu.o: src/narray/narray_function.cu src/narray/narray_function-inl.h
symbol.o: src/symbol/symbol.cc
graph_executor.o: src/symbol/graph_executor.cc
static_graph.o : src/symbol/static_graph.cc
@@ -100,7 +99,6 @@ softmax_gpu.o: src/operator/softmax.cu
lib/libmxnet.a: $(OBJ) $(OBJCXX11) $(CUOBJ)
lib/libmxnet.so: $(OBJ) $(OBJCXX11) $(CUOBJ)
-test/api_registry_test:
test/api_registry_test.cc lib/libmxnet.a test/test_storage: test/test_storage.cc lib/libmxnet.a #test/test_threaded_engine: test/test_threaded_engine.cc api/libmxnet.a diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 32a2156a7725..e2dcc2df835a 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -214,13 +214,23 @@ MXNET_DLL int MXFuncInvoke(FunctionHandle fun, MXNET_DLL int MXSymbolListAtomicSymbolCreators(mx_uint *out_size, AtomicSymbolCreator **out_array); /*! - * \brief Get the name of AtomicSymbol. - * \param creator the AtomicSymbolCreator - * \param out the returned name of the creator - * \return 0 when success, -1 when failure happens - */ -MXNET_DLL int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator, - const char **out); + * \brief Get the detailed information about atomic symbol. + * \param creator the AtomicSymbolCreator. + * \param name The returned name of the creator. + * \param description The returned description of the symbol. + * \param num_args Number of arguments. + * \param arg_names Name of the arguments. + * \param arg_type_infos Type informations about the arguments. + * \param arg_descriptions Description information about the arguments. + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXSymbolGetAtomicSymbolInfo(AtomicSymbolCreator creator, + const char **name, + const char **description, + mx_uint *num_args, + const char ***arg_names, + const char ***arg_type_infos, + const char ***arg_descriptions); /*! * \brief Get the docstring of AtomicSymbol. * \param creator the AtomicSymbolCreator diff --git a/include/mxnet/registry.h b/include/mxnet/registry.h index 6be358cdbd80..2e02881411dd 100644 --- a/include/mxnet/registry.h +++ b/include/mxnet/registry.h @@ -7,6 +7,7 @@ #define MXNET_REGISTRY_H_ #include +#include #include #include #include @@ -16,56 +17,12 @@ #include "./operator.h" namespace mxnet { - -/*! \brief registry template */ -template -class Registry { - public: - /*! \return get a singleton */ - static Registry *Get(); - /*! - * \brief register a name function under name - * \param name name of the function - * \return ref to the registered entry, used to set properties - */ - Entry &Register(const std::string& name); - /*! \return list of functions in the registry */ - inline static const std::vector &List() { - return Get()->fun_list_; - } - /*! - * \brief find an function entry with corresponding name - * \param name name of the function - * \return the corresponding function, can be NULL - */ - inline static const Entry *Find(const std::string &name) { - const std::map &fmap = Get()->fmap_; - typename std::map::const_iterator p = fmap.find(name); - if (p != fmap.end()) { - return p->second; - } else { - return NULL; // c++11 is not required - } - } - - private: - /*! \brief list of functions */ - std::vector fun_list_; - /*! \brief map of name->function */ - std::map fmap_; - /*! \brief constructor */ - Registry() {} - /*! \brief destructor */ - ~Registry() { - for (typename std::map::iterator p = fmap_.begin(); - p != fmap_.end(); ++p) { - delete p->second; - } - } -}; - +/*! \brief definition of NArray function */ +typedef std::function NArrayAPIFunction; /*! \brief mask information on how functions can be exposed */ -enum FunctionTypeMask { +enum NArrayFunctionTypeMask { /*! \brief all the use_vars should go before scalar */ kNArrayArgBeforeScalar = 1, /*! 
\brief all the scalar should go before use_vars */ @@ -81,14 +38,10 @@ enum FunctionTypeMask { kAcceptEmptyMutateTarget = 1 << 2 }; -/*! \brief registry entry */ -struct NArrayFunctionEntry { - /*! \brief definition of NArray function */ - typedef std::function Function; - /*! \brief function name */ - std::string name; +/*! \brief Registry entry for NArrayFunction */ +struct NArrayFunctionReg + : public dmlc::FunctionRegEntryBase { /*! \brief number of variable used by this function */ unsigned num_use_vars; /*! \brief number of variable mutated by this function */ @@ -97,74 +50,31 @@ struct NArrayFunctionEntry { unsigned num_scalars; /*! \brief information on how function should be called from API */ int type_mask; - /*! \brief the real function */ - Function body; /*! * \brief constructor - * \param name name of the function */ - explicit NArrayFunctionEntry(const std::string &name) - : name(name), - num_use_vars(0), + explicit NArrayFunctionReg() + : num_use_vars(0), num_mutate_vars(0), num_scalars(0), - type_mask(0), - body(nullptr) {} - /*! - * \brief set the number of mutate variables - * \param n number of mutate variablesx - * \return ref to the registered entry, used to set properties - */ - inline NArrayFunctionEntry &set_num_use_vars(unsigned n) { - num_use_vars = n; return *this; - } - /*! - * \brief set the number of mutate variables - * \param n number of mutate variablesx - * \return ref to the registered entry, used to set properties - */ - inline NArrayFunctionEntry &set_num_mutate_vars(unsigned n) { - num_mutate_vars = n; return *this; - } - /*! - * \brief set the number of scalar arguments - * \param n number of scalar arguments - * \return ref to the registered entry, used to set properties - */ - inline NArrayFunctionEntry &set_num_scalars(unsigned n) { - num_scalars = n; return *this; - } - /*! - * \brief set the function body - * \param f function body to set - * \return ref to the registered entry, used to set properties - */ - inline NArrayFunctionEntry &set_body(Function f) { - body = f; return *this; - } - /*! - * \brief set type mask - * \param tmask typemask - * \return ref to the registered entry, used to set properties - */ - inline NArrayFunctionEntry &set_type_mask(int tmask) { - type_mask = tmask; return *this; - } + type_mask(0) {} /*! * \brief set the function body to a binary NArray function * this will also auto set the parameters correctly * \param fbinary function body to set * \return ref to the registered entry, used to set properties */ - inline NArrayFunctionEntry &set_function(void fbinary(const NArray &lhs, - const NArray &rhs, - NArray *out)) { + inline NArrayFunctionReg &set_function(void fbinary(const NArray &lhs, + const NArray &rhs, + NArray *out)) { body = [fbinary] (NArray **used_vars, real_t *s, NArray **mutate_vars) { fbinary(*used_vars[0], *used_vars[1], mutate_vars[0]); }; num_use_vars = 2; num_mutate_vars = 1; type_mask = kNArrayArgBeforeScalar | kAcceptEmptyMutateTarget; + this->add_argument("lhs", "NArray", "Left operand to the function."); + this->add_argument("rhs", "NArray", "Right operand to the function."); return *this; } /*! 
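To make the calling convention concrete, here is a small illustrative sketch
(not part of the patch series; error handling omitted) of how a registered
binary entry such as the `plus` function from `src/narray/narray.cc` is
invoked through its generic `body`:

```c++
#include <mxnet/registry.h>

// Look up the "plus" entry and invoke it the way the C API dispatch does.
// use_vars carries the two inputs, mutate_vars the output; the scalar slot
// is unused for a binary function, so NULL is passed.
void RunPlus(mxnet::NArray *lhs, mxnet::NArray *rhs, mxnet::NArray *out) {
  auto *f = dmlc::Registry<mxnet::NArrayFunctionReg>::Find("plus");
  if (f == NULL) return;  // entry not registered
  mxnet::NArray *use_vars[] = {lhs, rhs};
  mxnet::NArray *mutate_vars[] = {out};
  f->body(use_vars, NULL, mutate_vars);  // runs fbinary(*lhs, *rhs, out)
}
```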
@@ -173,139 +83,91 @@ struct NArrayFunctionEntry {
  * \param funary function body to set
  * \return ref to the registered entry, used to set properties
  */
- inline NArrayFunctionEntry &set_function(void funary(const NArray &src,
-                                                      NArray *out)) {
+ inline NArrayFunctionReg &set_function(void funary(const NArray &src,
+                                                    NArray *out)) {
   body = [funary] (NArray **used_vars,
                    real_t *s, NArray **mutate_vars) {
     funary(*used_vars[0], mutate_vars[0]);
   };
   num_use_vars = 1; num_mutate_vars = 1;
   type_mask = kNArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
+  this->add_argument("src", "NArray", "Source input to the function.");
   return *this;
 }
 /*!
- * \brief invoke the function
- * \param use_vars variables used by the function
- * \param scalars the scalar arguments passed to function
- * \param mutate_vars the variables mutated by the function
- */
- inline void operator()(NArray **use_vars,
-                        real_t *scalars,
-                        NArray **mutate_vars) const {
-   body(use_vars, scalars, mutate_vars);
- }
-};  // NArrayFunctionEntry
-
-/*!
- * \brief macro to register NArray function
- *
- * Example: the following code is example to register aplus
- * \code
- *
- * REGISTER_NARRAY_FUN(Plus)
- * .set_body([] (NArray **used_vars, real_t *scalars, NArray **mutate_vars) {
- *    BinaryPlus(*used_vars[0], *used_vars[1], mutate_vars[0]);
- *  })
- * .set_num_use_vars(2)
- * .set_num_mutate_vars(1);
- *
- * \endcode
- */
-#define REGISTER_NARRAY_FUN(name) \
- static auto __ ## name ## _narray_fun__ = \
- ::mxnet::Registry::Get()->Register("" # name)
-
-
-/*! \brief OperatorPropertyEntry to register */
-struct OperatorPropertyEntry {
- /*! \brief typedef Creator function */
- typedef OperatorProperty*(*Creator)();
- /*! \brief if OperatorProperty use param */
- bool use_param;
- /*! \brief name of the entry */
- std::string name;
- /*! \brief description of operator */
- std::string description;
- /*! \brief name-description pairs of the arguments */
- std::vector<std::pair<std::string, std::string> > arguments;
- /*! \brief documentation for the parameters */
- std::string param_doc;
- /*! \brief function body to create OperatorProperty */
- Creator body;
- /*! \brief constructor */
- explicit OperatorPropertyEntry(const std::string& name)
-     : use_param(true), name(name), body(NULL) {}
- /*!
-  * \brief set the function body
-  * \param body body to set
-  * \return reference to self.
-  */
- inline OperatorPropertyEntry &set_body(Creator body) {
-   this->body = body;
-   return *this;
- }
- /*!
-  * \brief describe the operator.
-  * \param description the description.
-  * \return reference to self.
+ * \brief set the number of use variables
+ * \param n number of use variables
+ * \return ref to the registered entry, used to set properties
  */
- inline OperatorPropertyEntry &describe(const std::string &description) {
-   this->description = description;
-   return *this;
+ inline NArrayFunctionReg &set_num_use_vars(unsigned n) {
+   num_use_vars = n; return *this;
 }
 /*!
- * \brief set the parameter documents.
- * \param param_doc the parameter documentation.
- * \return reference to self.
+ * \brief set the number of mutate variables
+ * \param n number of mutate variables
+ * \return ref to the registered entry, used to set properties
  */
- inline OperatorPropertyEntry &set_param_doc(const std::string &param_doc) {
-   this->param_doc = param_doc;
-   return *this;
+ inline NArrayFunctionReg &set_num_mutate_vars(unsigned n) {
+   num_mutate_vars = n; return *this;
 }
 /*!
- * \brief add argument to the entry.
- * \param name name of the argument.
- * \param desc description of the argument.
- * \return reference to self.
+ * \brief set the number of scalar arguments
+ * \param n number of scalar arguments
+ * \return ref to the registered entry, used to set properties
  */
- inline OperatorPropertyEntry &add_argument(const std::string &name,
-                                            const std::string &desc) {
-   arguments.push_back(std::make_pair(name, desc));
-   return *this;
+ inline NArrayFunctionReg &set_num_scalars(unsigned n) {
+   num_scalars = n; return *this;
 }
 /*!
- * \brief invoke the function
- * \return the created OperatorProperty
+ * \brief set type mask
+ * \param tmask typemask
+ * \return ref to the registered entry, used to set properties
  */
- inline OperatorProperty* operator () () const {
-   return body();
+ inline NArrayFunctionReg &set_type_mask(int tmask) {
+   type_mask = tmask; return *this;
 }
+};  // NArrayFunctionReg
- private:
- /*! \brief disable copy constructor */
- OperatorPropertyEntry(const OperatorPropertyEntry& other) {}
- /*! \brief disable assignment operator */
- const OperatorPropertyEntry& operator = (const OperatorPropertyEntry& other) { return *this; }
+/*!
+ * \brief Macro to register NArray function
+ *
+ * Example: the following code is an example of registering plus
+ * \code
+ *
+ * MXNET_REGISTER_NARRAY_FUN(Plus)
+ * .set_function(Plus);
+ *
+ * \endcode
+ */
+#define MXNET_REGISTER_NARRAY_FUN(name) \
+ DMLC_REGISTRY_REGISTER(::mxnet::NArrayFunctionReg, NArrayFunctionReg, name)
+
+/*! \brief typedef the factory function of operator property */
+typedef OperatorProperty *(*OperatorPropertyFactory)();
+/*!
+ * \brief Registry entry for OperatorProperty factory functions.
+ */
+struct OperatorPropertyReg
+    : public dmlc::FunctionRegEntryBase {
};
/*!
- * \brief macro to register OperatorProperty to OperatorPropertyFactory
+ * \brief Macro to register OperatorProperty
 *
- * Example: the following code is example to register aplus
 * \code
- *
- * REGISTER_ATOMIC_SYMBOL(fullc)
- * .set_use_param(false)
+ * // example of registering a fully connected operator
+ * MXNET_REGISTER_OP_PROPERTY(FullyConnected, FullyConnectedOpProp)
+ * .describe("Fully connected layer");
 *
 * \endcode
 */
-#define REGISTER_OP_PROPERTY(name, OperatorPropertyType) \
- ::mxnet::OperatorProperty* __make_ ## OperatorPropertyType ## __() { \
   return new OperatorPropertyType; \
 } \
- static ::mxnet::OperatorPropertyEntry& __ ## name ## _atomic_symbol__ = \
- ::mxnet::Registry< ::mxnet::OperatorPropertyEntry >::Get()->Register("" # name) \
- .set_body(__make_ ## OperatorPropertyType ## __)
+#define MXNET_REGISTER_OP_PROPERTY(name, OperatorPropertyType) \
+ static ::mxnet::OperatorProperty* __create__ ## OperatorPropertyType ## __() { \
   return new OperatorPropertyType; \
 } \
+ DMLC_REGISTRY_REGISTER(::mxnet::OperatorPropertyReg, OperatorPropertyReg, name) \
+ .set_body(__create__ ## OperatorPropertyType ## __)
}  // namespace mxnet
#endif  // MXNET_REGISTRY_H_
diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py
index 9c66f7a74f57..6fb5eda53912 100644
--- a/python/mxnet/symbol.py
+++ b/python/mxnet/symbol.py
@@ -292,10 +292,38 @@ def Group(symbols):
def _make_atomic_symbol_function(handle):
 """Create an atomic symbol function by handle and function name."""
 name = ctypes.c_char_p()
- docs = ctypes.c_char_p()
- check_call(_LIB.MXSymbolGetAtomicSymbolName(handle, ctypes.byref(name)))
- check_call(_LIB.MXSymbolGetAtomicSymbolDoc(handle, ctypes.byref(docs)))
- func_name = name.value;
+ desc = ctypes.c_char_p()
+ num_args = mx_uint()
+ arg_names = ctypes.POINTER(ctypes.c_char_p)()
+ arg_types = ctypes.POINTER(ctypes.c_char_p)()
+ arg_descs =
ctypes.POINTER(ctypes.c_char_p)() + + check_call(_LIB.MXSymbolGetAtomicSymbolInfo( + handle, ctypes.byref(name), ctypes.byref(desc), + ctypes.byref(num_args), + ctypes.byref(arg_names), + ctypes.byref(arg_types), + ctypes.byref(arg_descs))) + func_name = name.value + param_str = [] + for i in range(num_args.value): + ret = '%s : %s' % (arg_names[i], arg_types[i]) + if len(arg_descs[i]) != 0: + ret += '\n ' + arg_descs[i] + param_str.append(ret) + + doc_str = ('%s\n\n' + + 'Parameters\n' + + '----------\n' + + '%s\n' + + 'name : string, required.\n' + + ' Name of the resulting symbol.\n\n' + + 'Returns\n' + + '-------\n' + + 'symbol: Symbol\n'+ + ' The result symbol.') + + doc_str = doc_str % (desc.value, '\n'.join(param_str)) def creator(*args, **kwargs): """Activation Operator of Neural Net. @@ -338,8 +366,9 @@ def creator(*args, **kwargs): s = Symbol(sym_handle) s._compose(*args, name=name, **symbol_kwargs) return s + creator.__name__ = func_name - creator.__doc__ = docs.value + creator.__doc__ = doc_str return creator @@ -347,6 +376,7 @@ def _init_module_functions(): """List and add all the atomic symbol functions to current module.""" plist = ctypes.POINTER(ctypes.c_void_p)() size = ctypes.c_uint() + check_call(_LIB.MXSymbolListAtomicSymbolCreators(ctypes.byref(size), ctypes.byref(plist))) module_obj = sys.modules[__name__] diff --git a/python/test_infer_shape.py b/python/test_infer_shape.py index b94388e5546d..236ad1e7ae71 100644 --- a/python/test_infer_shape.py +++ b/python/test_infer_shape.py @@ -1,11 +1,11 @@ # pylint: skip-file import mxnet as mx -data = mx.sym.Variable('data') +data = mx.symbol.Variable('data') -fc1 = mx.sym.FullyConnected(data=data, name='fc1', num_hidden=1000) -fc2 = mx.sym.FullyConnected(data=fc1, name='fc2', num_hidden=10) -fc3 = mx.sym.FullyConnected( name='fc2', num_hidden=10) +fc1 = mx.symbol.FullyConnected(data=data, name='fc1', num_hidden=1000) +fc2 = mx.symbol.FullyConnected(data=fc1, name='fc2', num_hidden=10) +fc3 = mx.symbol.FullyConnected( name='fc2', num_hidden=10) print fc2.list_arguments() diff --git a/python/test_symbol.py b/python/test_symbol.py index 6d876fd46fb8..451ee39775c9 100644 --- a/python/test_symbol.py +++ b/python/test_symbol.py @@ -1,18 +1,18 @@ # pylint: skip-file import mxnet as mx -data = mx.sym.Variable('data') +data = mx.symbol.Variable('data') print data.debug_str() -fc1 = mx.sym.FullyConnected(data=data, name='fc1', no_bias=0) -fc2 = mx.sym.FullyConnected(data=fc1, name='fc2', no_bias=0) +fc1 = mx.symbol.FullyConnected(data=data, name='fc1', no_bias=0) +fc2 = mx.symbol.FullyConnected(data=fc1, name='fc2', no_bias=0) print fc2.debug_str() print fc2.list_arguments() -fc3 = mx.sym.FullyConnected(name='fc3') -fc4 = mx.sym.FullyConnected(data=fc3, name='fc4') +fc3 = mx.symbol.FullyConnected(name='fc3') +fc4 = mx.symbol.FullyConnected(data=fc3, name='fc4') print fc4.debug_str() @@ -20,7 +20,7 @@ composed_fc4 = fc4(fc3_data=fc2, name='composed') print composed_fc4.debug_str() -multi_out = mx.sym.Group([composed_fc4, fc2]) +multi_out = mx.symbol.Group([composed_fc4, fc2]) print multi_out.debug_str() print multi_out.list_arguments() diff --git a/src/c_api.cc b/src/c_api.cc index 04ecbda25bd9..9b2a72a94688 100644 --- a/src/c_api.cc +++ b/src/c_api.cc @@ -241,7 +241,7 @@ int MXNArrayGetContext(NArrayHandle handle, int MXListFunctions(mx_uint *out_size, FunctionHandle **out_array) { API_BEGIN(); - auto &vec = Registry::List(); + auto &vec = dmlc::Registry::List(); *out_size = static_cast(vec.size()); *out_array = 
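+ // Registry entries are exposed to callers as opaque FunctionHandles; the
+ // cast below just reinterprets the registry's array of entry pointers.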
(FunctionHandle*)(dmlc::BeginPtr(vec)); // NOLINT(*) API_END(); @@ -250,14 +250,14 @@ int MXListFunctions(mx_uint *out_size, int MXGetFunction(const char *name, FunctionHandle *out) { API_BEGIN(); - *out = Registry::Find(name); + *out = dmlc::Registry::Find(name); API_END(); } int MXFuncGetName(FunctionHandle fun, const char **out_name) { API_BEGIN(); - auto *f = static_cast(fun); + auto *f = static_cast(fun); *out_name = f->name.c_str(); API_END(); } @@ -268,7 +268,7 @@ int MXFuncDescribe(FunctionHandle fun, mx_uint *num_mutate_vars, int *type_mask) { API_BEGIN(); - auto *f = static_cast(fun); + auto *f = static_cast(fun); *num_use_vars = f->num_use_vars; *num_scalars = f->num_scalars; *num_mutate_vars = f->num_mutate_vars; @@ -281,10 +281,10 @@ int MXFuncInvoke(FunctionHandle fun, mx_float *scalar_args, NArrayHandle *mutate_vars) { API_BEGIN(); - auto *f = static_cast(fun); - (*f)((NArray**)(use_vars), // NOLINT(*) - scalar_args, - (NArray**)(mutate_vars)); // NOLINT(*) + auto *f = static_cast(fun); + f->body((NArray**)(use_vars), // NOLINT(*) + scalar_args, + (NArray**)(mutate_vars)); // NOLINT(*) API_END(); } @@ -295,7 +295,7 @@ int MXFuncInvoke(FunctionHandle fun, int MXSymbolListAtomicSymbolCreators(mx_uint *out_size, AtomicSymbolCreator **out_array) { API_BEGIN(); - auto &vec = Registry::List(); + auto &vec = dmlc::Registry::List(); *out_size = static_cast(vec.size()); *out_array = (AtomicSymbolCreator*)(dmlc::BeginPtr(vec)); // NOLINT(*) API_END(); @@ -304,51 +304,39 @@ int MXSymbolListAtomicSymbolCreators(mx_uint *out_size, int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator, const char **out) { API_BEGIN(); - OperatorPropertyEntry *e = static_cast(creator); + OperatorPropertyReg *e = static_cast(creator); *out = e->name.c_str(); API_END(); } -int MXSymbolGetAtomicSymbolDoc(AtomicSymbolCreator creator, - const char **out) { - OperatorPropertyEntry *e = static_cast(creator); +int MXSymbolGetAtomicSymbolInfo(AtomicSymbolCreator creator, + const char **name, + const char **description, + mx_uint *num_args, + const char ***arg_names, + const char ***arg_type_infos, + const char ***arg_descriptions) { MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); + OperatorPropertyReg *e = static_cast(creator); + API_BEGIN(); - std::ostringstream os, os_param; - if (e->description.length() != 0) { - os << e->description << "\n\n"; - } else { - os << "Symbolic Operator "<< e->name << "\n\n"; + + *name = e->name.c_str(); + *description = e->description.c_str(); + *num_args = static_cast(e->arguments.size()); + ret->ret_vec_charp.clear(); + for (size_t i = 0; i < e->arguments.size(); ++i) { + ret->ret_vec_charp.push_back(e->arguments[i].name.c_str()); } - // get parameter doc - for (auto kv : e->arguments) { - os_param << kv.first << " : Symbol\n"; - if (kv.second.length() != 0) { - os_param << " " << kv.second << '\n'; - } + for (size_t i = 0; i < e->arguments.size(); ++i) { + ret->ret_vec_charp.push_back(e->arguments[i].type_info_str.c_str()); } - os_param << e->param_doc; - std::string param_doc = os_param.str(); - if (param_doc.length() != 0) { - os << "Parameters\n" - << "----------\n" - << param_doc << '\n'; - } else { - os << "Parameters\n" - << "----------\n" - << "args\n" - << " Positional arguments to the Symbol.\n\n" - << "kwargs\n" - << " Keyword arguments to the Symbol.\n\n"; + for (size_t i = 0; i < e->arguments.size(); ++i) { + ret->ret_vec_charp.push_back(e->arguments[i].description.c_str()); } - // generate return - os << "Returns\n" - << "-------\n" - << "output : 
Symbol\n" - << " " - << "The result output symbol.\n"; - ret->ret_str = os.str(); - *out = ret->ret_str.c_str(); + *arg_names = dmlc::BeginPtr(ret->ret_vec_charp); + *arg_type_infos = dmlc::BeginPtr(ret->ret_vec_charp) + e->arguments.size(); + *arg_descriptions = dmlc::BeginPtr(ret->ret_vec_charp) + (e->arguments.size() * 2); API_END(); } @@ -361,8 +349,8 @@ int MXSymbolCreateAtomicSymbol(AtomicSymbolCreator creator, OperatorProperty *op = nullptr; API_BEGIN(); - OperatorPropertyEntry *e = static_cast(creator); - op = (*e)(); + OperatorPropertyReg *e = static_cast(creator); + op = e->body(); std::vector > kwargs; for (int i = 0; i < num_param; ++i) { kwargs.push_back({std::string(keys[i]), std::string(vals[i])}); diff --git a/src/narray/narray.cc b/src/narray/narray.cc index 3618a38c9d59..2faf0789c607 100644 --- a/src/narray/narray.cc +++ b/src/narray/narray.cc @@ -7,7 +7,7 @@ #include #include #include -#include "./narray_op.h" +#include "./narray_function.h" namespace mxnet { /*! @@ -150,14 +150,14 @@ NArray &NArray::operator/=(const NArray &src) { } // register API function -REGISTER_NARRAY_FUN(plus).set_function(BinaryOp); -REGISTER_NARRAY_FUN(minus).set_function(BinaryOp); -REGISTER_NARRAY_FUN(mul).set_function(BinaryOp); -REGISTER_NARRAY_FUN(div).set_function(BinaryOp); +MXNET_REGISTER_NARRAY_FUN(plus).set_function(BinaryOp); +MXNET_REGISTER_NARRAY_FUN(minus).set_function(BinaryOp); +MXNET_REGISTER_NARRAY_FUN(mul).set_function(BinaryOp); +MXNET_REGISTER_NARRAY_FUN(div).set_function(BinaryOp); -// copy function is special -// that we need to remove kAcceptEmptyMutateTarget from it -REGISTER_NARRAY_FUN(copy) +// copy function is special +// that we need to remove kAcceptEmptyMutateTarget from it +MXNET_REGISTER_NARRAY_FUN(copy) .set_function(CopyFromTo) .set_type_mask(kNArrayArgBeforeScalar); diff --git a/src/narray/narray_op-inl.h b/src/narray/narray_function-inl.h similarity index 85% rename from src/narray/narray_op-inl.h rename to src/narray/narray_function-inl.h index 0f378583529c..6488652ffe80 100644 --- a/src/narray/narray_op-inl.h +++ b/src/narray/narray_function-inl.h @@ -1,15 +1,15 @@ /*! * Copyright (c) 2015 by Contributors - * \file narray_op-inl.h + * \file narray_function-inl.h * \brief */ -#ifndef MXNET_NARRAY_NARRAY_OP_INL_H_ -#define MXNET_NARRAY_NARRAY_OP_INL_H_ -#include "./narray_op.h" +#ifndef MXNET_NARRAY_NARRAY_FUNCTION_INL_H_ +#define MXNET_NARRAY_NARRAY_FUNCTION_INL_H_ +#include "./narray_function.h" // this file will be included twice by CPU and GPU // macro to help specialize evaluation function #ifndef DECL_BINARY -#define DECL_BINARY(XPU, OP, FUN) \ +#define DECL_BINARY(XPU, OP, FUN) \ template<> \ void Eval(const TBlob &lhs, const TBlob &rhs, TBlob *ret, RunContext ctx) { \ FUN(lhs, rhs, ret, ctx); \ @@ -42,4 +42,4 @@ DECL_BINARY(DEVICE, Div, Eval_) } // namespace narray } // namespace mxnet -#endif // MXNET_NARRAY_NARRAY_OP_INL_H_ +#endif // MXNET_NARRAY_NARRAY_FUNCTION_INL_H_ diff --git a/src/narray/narray_op_cpu.cc b/src/narray/narray_function.cc similarity index 81% rename from src/narray/narray_op_cpu.cc rename to src/narray/narray_function.cc index 8b6507ed0fb6..d67bb91a23aa 100644 --- a/src/narray/narray_op_cpu.cc +++ b/src/narray/narray_function.cc @@ -1,12 +1,12 @@ /*! 
* Copyright (c) 2015 by Contributors
- * \file narray_op_cpu.cc
+ * \file narray_function.cc
 * \brief
 */
// this will be invoked by gcc and compile CPU version
-#include "./narray_op.h"
-#include "./narray_op-inl.h"
+#include "./narray_function.h"
+#include "./narray_function-inl.h"
namespace mxnet {
namespace narray {
diff --git a/src/narray/narray_op_gpu.cu b/src/narray/narray_function.cu
similarity index 92%
rename from src/narray/narray_op_gpu.cu
rename to src/narray/narray_function.cu
index 571757e41ee8..f632b5dd65c3 100644
--- a/src/narray/narray_op_gpu.cu
+++ b/src/narray/narray_function.cu
@@ -1,7 +1,7 @@
// this will be invoked by nvcc and compile GPU version
#include
-#include "./narray_op.h"
-#include "./narray_op-inl.h"
+#include "./narray_function.h"
+#include "./narray_function-inl.h"
namespace mxnet {
namespace narray {
@@ -11,7 +11,7 @@
void Copy(const TBlob &from, TBlob *to, RunContext ctx) {
 mshadow::Copy(to->FlatTo2D(),
               from.FlatTo2D(),
-              static_cast<mshadow::Stream<gpu>*>(ctx.stream));
+              static_cast<mshadow::Stream<gpu>*>(ctx.stream));
}
template<>
@@ -20,7 +20,7 @@
void Copy(const TBlob &from, TBlob *to, RunContext ctx) {
 mshadow::Copy(to->FlatTo2D(),
               from.FlatTo2D(),
-              static_cast<mshadow::Stream<gpu>*>(ctx.stream));
+              static_cast<mshadow::Stream<gpu>*>(ctx.stream));
}
template<>
@@ -33,7 +33,7 @@
void Copy(const TBlob &from, TBlob *to,
               static_cast<mshadow::Stream<gpu>*>(ctx.stream));
 } else {
   CHECK(from.CheckContiguous() && to->CheckContiguous())
-       << "copy across only support continugous memory";
+       << "copy across devices only supports contiguous memory";
   mshadow::Stream<gpu> *s = static_cast<mshadow::Stream<gpu>*>(ctx.stream);
   CHECK(s != NULL) << "need stream in GPU context";
   cudaMemcpyPeerAsync(to->dptr_,
diff --git a/src/narray/narray_op.h b/src/narray/narray_function.h
similarity index 100%
rename from src/narray/narray_op.h
rename to src/narray/narray_function.h
diff --git a/src/operator/activation.cc b/src/operator/activation.cc
index c2cdabc4144d..1872327fcd7b 100644
--- a/src/operator/activation.cc
+++ b/src/operator/activation.cc
@@ -30,10 +30,10 @@ Operator *ActivationProp::CreateOperator(Context ctx) const {
DMLC_REGISTER_PARAMETER(ActivationParam);
-REGISTER_OP_PROPERTY(Activation, ActivationProp)
.describe("Apply activation function to input.")
-.add_argument("data", "Input data to activation function.")
-.set_param_doc(ActivationParam::__DOC__());
+MXNET_REGISTER_OP_PROPERTY(Activation, ActivationProp)
+.describe("Apply activation function to input.")
+.add_argument("data", "Symbol", "Input data to activation function.")
+.add_arguments(ActivationParam::__FIELDS__());
}  // namespace op
}  // namespace mxnet
diff --git a/src/operator/elementwise_sum-inl.h b/src/operator/elementwise_sum-inl.h
index 4a0d6e3fdd57..d4b28eb43f4c 100644
--- a/src/operator/elementwise_sum-inl.h
+++ b/src/operator/elementwise_sum-inl.h
@@ -26,7 +26,8 @@ enum ElementWiseSumOpOutputs {kOut};
struct ElementWiseSumParam : public dmlc::Parameter<ElementWiseSumParam> {
 int size;
 DMLC_DECLARE_PARAMETER(ElementWiseSumParam) {
-  DMLC_DECLARE_FIELD(size).set_range(1, 100);
+  DMLC_DECLARE_FIELD(size).set_range(1, 100)
+  .describe("Number of inputs to be summed.");
 }
};
diff --git a/src/operator/elementwise_sum.cc b/src/operator/elementwise_sum.cc
index 38e29141c7b3..840e2e179868 100644
--- a/src/operator/elementwise_sum.cc
+++ b/src/operator/elementwise_sum.cc
@@ -19,6 +19,9 @@ Operator* ElementWiseSumProp::CreateOperator(Context ctx) const {
DMLC_REGISTER_PARAMETER(ElementWiseSumParam);
-REGISTER_OP_PROPERTY(ElementWiseSum, ElementWiseSumProp);
+MXNET_REGISTER_OP_PROPERTY(ElementWiseSum, ElementWiseSumProp)
+.describe("Perform an elementwise sum over all the inputs.")
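+// __FIELDS__() is generated from the DMLC_DECLARE_PARAMETER block of
+// ElementWiseSumParam; add_arguments() below appends the declared fields
+// (here: size) to the operator's documented arguments.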
+.add_arguments(ElementWiseSumParam::__FIELDS__()); + } // namespace op } // namespace mxnet diff --git a/src/operator/fully_connected.cc b/src/operator/fully_connected.cc index dad936e79ea5..69524800a916 100644 --- a/src/operator/fully_connected.cc +++ b/src/operator/fully_connected.cc @@ -19,11 +19,11 @@ Operator* FullyConnectedProp::CreateOperator(Context ctx) const { DMLC_REGISTER_PARAMETER(FullyConnectedParam); -REGISTER_OP_PROPERTY(FullyConnected, FullyConnectedProp) +MXNET_REGISTER_OP_PROPERTY(FullyConnected, FullyConnectedProp) .describe("Apply matrix multiplication to input then add a bias.") -.add_argument("data", "Input data to the FullyConnectedOp.") -.add_argument("weight", "Weight matrix.") -.add_argument("bias", "Bias parameter.") -.set_param_doc(FullyConnectedParam::__DOC__()); +.add_argument("data", "Symbol", "Input data to the FullyConnectedOp.") +.add_argument("weight", "Symbol", "Weight matrix.") +.add_argument("bias", "Symbol", "Bias parameter.") +.add_arguments(FullyConnectedParam::__FIELDS__()); } // namespace op } // namespace mxnet diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h index 8b223e2476a2..7b577c51f944 100644 --- a/src/operator/pooling-inl.h +++ b/src/operator/pooling-inl.h @@ -40,8 +40,9 @@ struct PoolingParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(stride_y).set_range(1, 10000); DMLC_DECLARE_FIELD(pad_x).set_default(0).set_range(0, 10000); DMLC_DECLARE_FIELD(pad_y).set_default(0).set_range(0, 10000); - DMLC_DECLARE_FIELD(type).set_default(kMaxPooling)\ - .add_enum("max", kMaxPooling).add_enum("avg", kAvgPooling)\ + DMLC_DECLARE_FIELD(type).set_default(kMaxPooling) + .add_enum("max", kMaxPooling) + .add_enum("avg", kAvgPooling) .add_enum("sum", kSumPooling); } }; diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc index a6ebc91e0873..883948e27981 100644 --- a/src/operator/pooling.cc +++ b/src/operator/pooling.cc @@ -28,7 +28,11 @@ Operator* PoolingProp::CreateOperator(Context ctx) const { DMLC_REGISTER_PARAMETER(PoolingParam); -REGISTER_OP_PROPERTY(Pooling, PoolingProp); +MXNET_REGISTER_OP_PROPERTY(Pooling, PoolingProp) +.describe("Perform spatial pooling on inputs.") +.add_argument("data", "Symbol", "Input data to the pooling operator.") +.add_arguments(PoolingParam::__FIELDS__()); + } // namespace op } // namespace mxnet diff --git a/src/operator/softmax.cc b/src/operator/softmax.cc index 91fd7a1170ce..8cab48d6f28a 100644 --- a/src/operator/softmax.cc +++ b/src/operator/softmax.cc @@ -21,7 +21,11 @@ Operator *SoftmaxProp::CreateOperator(Context ctx) const { DMLC_REGISTER_PARAMETER(SoftmaxParam); -REGISTER_OP_PROPERTY(Softmax, SoftmaxProp); +MXNET_REGISTER_OP_PROPERTY(Softmax, SoftmaxProp) +.describe("Perform a softmax transformation on input.") +.add_argument("data", "Symbol", "Input data to softmax.") +.add_arguments(SoftmaxParam::__FIELDS__()); + } // namespace op } // namespace mxnet diff --git a/src/registry.cc b/src/registry.cc index f64980d8bacc..8587f4666aab 100644 --- a/src/registry.cc +++ b/src/registry.cc @@ -1,42 +1,25 @@ /*! * Copyright (c) 2015 by Contributors - * \file api_registry.cc - * \brief + * \file registry.cc + * \brief central place for registry definition in mxnet. 
*/
#include
-#include
+#include
#include
#include
-namespace mxnet {
-
-template<typename Entry>
-Entry &Registry<Entry>::Register(const std::string& name) {
- CHECK_EQ(fmap_.count(name), 0);
- Entry *e = new Entry(name);
- fmap_[name] = e;
- fun_list_.push_back(e);
- return *e;
-}
-
-template<typename Entry>
-Registry<Entry> *Registry<Entry>::Get() {
- static Registry<Entry> instance;
- return &instance;
-}
-
-
-template NArrayFunctionEntry &Registry<NArrayFunctionEntry>::Register(const std::string& name);
-template Registry<NArrayFunctionEntry> *Registry<NArrayFunctionEntry>::Get();
-
-template OperatorPropertyEntry &Registry<OperatorPropertyEntry>::Register(const std::string& name);
-template Registry<OperatorPropertyEntry> *Registry<OperatorPropertyEntry>::Get();
+// enable the registries
+namespace dmlc {
+DMLC_REGISTRY_ENABLE(::mxnet::NArrayFunctionReg);
+DMLC_REGISTRY_ENABLE(::mxnet::OperatorPropertyReg);
+}  // namespace dmlc
+namespace mxnet {
// implementation of all factory functions
OperatorProperty *OperatorProperty::Create(const char* type_name) {
- auto *creator = Registry<OperatorPropertyEntry>::Find(type_name);
+ auto *creator = dmlc::Registry<OperatorPropertyReg>::Find(type_name);
 CHECK_NE(creator, nullptr)
     << "Cannot find Operator " << type_name << " in registry";
- return (*creator)();
+ return creator->body();
}
}  // namespace mxnet
diff --git a/test/api_registry_test.cc b/test/api_registry_test.cc
deleted file mode 100644
index 0f7cef3ba858..000000000000
--- a/test/api_registry_test.cc
+++ /dev/null
@@ -1,10 +0,0 @@
-// Copyright (c) 2015 by Contributors
-// dummy code to test layer interface
-// used to demonstrate how interface can be used
-#include <mxnet/registry.h>
-
-int main(int argc, char *argv[]) {
- auto fadd = mxnet::Registry<mxnet::NArrayFunctionEntry>::Find("Plus");
- printf("f.name=%s\n", fadd->name.c_str());
- return 0;
-}
From f8d4e0db736bb0fc4517884f9d08387e900446e1 Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Sat, 22 Aug 2015 15:43:10 -0600
Subject: [PATCH 14/20] minor fix
---
 src/operator/softmax-inl.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/src/operator/softmax-inl.h b/src/operator/softmax-inl.h
index 47f52089d99d..b0486cdb9a9e 100644
--- a/src/operator/softmax-inl.h
+++ b/src/operator/softmax-inl.h
@@ -59,8 +59,8 @@ class SoftmaxOp : public Operator {
 using namespace mshadow::expr;
 CHECK_EQ(in_data.size(), 2);
 CHECK_EQ(out_grad.size(), 1);
- CHECK_EQ(in_grad.size(), 1);
- CHECK_EQ(req.size(), 1);
+ CHECK_GE(in_grad.size(), 1);
+ CHECK_GE(req.size(), 1);
 Stream<xpu> *s = ctx.get_stream<xpu>();
 Tensor<xpu, 1> label = in_data[kLabel].get<xpu, 1, real_t>(s);
 Tensor<xpu, 2> out = out_grad[kOut].FlatTo2D<xpu, real_t>(s);
 Tensor<xpu, 2> grad = in_grad[kData].FlatTo2D<xpu, real_t>(s);
@@ -96,9 +96,9 @@ class SoftmaxProp : public OperatorProperty {
 CHECK_EQ(in_shape->size(), 2) << "Input:[data, label]";
 const TShape &dshape = in_shape->at(0);
 if (dshape.ndim() == 0) return false;
+ SHAPE_ASSIGN_CHECK(*in_shape, kLabel, Shape1(dshape[0]));
 out_shape->clear();
 out_shape->push_back(dshape);
- out_shape->emplace_back(Shape1(dshape[0]));
 return true;
}
@@ -116,7 +116,7 @@ class SoftmaxProp : public OperatorProperty {
 const std::vector<int> &out_grad,
 const std::vector<int> &in_data,
 const std::vector<int> &out_data) const {
- return {out_data[kOut], in_data[kLabel]};
+ return {in_data[kLabel], out_data[kOut]};
}
From 50dc62f11e98f6d1d17a0e0ed7740eba6fe1583e Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Sat, 22 Aug 2015 20:02:58 -0600
Subject: [PATCH 15/20] change registry
---
 Makefile | 5 +-
 doc/index.md | 2 +-
 include/mxnet/c_api.h | 20 ++-
 include/mxnet/narray.h | 129 ++++++++++++++++++
 include/mxnet/operator.h | 32 +++++
 include/mxnet/registry.h | 173 -----------------------
 python/mxnet/__init__.py | 6 +-
 python/mxnet/base.py | 2 +-
 python/mxnet/narray.py | 234
+++++++++++++++++++++++++------- python/mxnet/symbol.py | 7 +- python/test_python.py | 2 +- src/c_api.cc | 77 +++++++---- src/narray/narray.cc | 19 ++- src/operator/activation.cc | 2 - src/operator/elementwise_sum.cc | 1 - src/operator/fully_connected.cc | 1 - src/operator/pooling.cc | 2 - src/operator/softmax.cc | 2 - src/registry.cc | 25 ---- 19 files changed, 431 insertions(+), 310 deletions(-) delete mode 100644 include/mxnet/registry.h delete mode 100644 src/registry.cc diff --git a/Makefile b/Makefile index f574afde31b0..fe906dcd1a1e 100644 --- a/Makefile +++ b/Makefile @@ -57,7 +57,7 @@ endif #BIN = test/test_threaded_engine test/api_registry_test OBJ = narray_function_cpu.o # add threaded engine after it is done -OBJCXX11 = engine.o narray.o c_api.o registry.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o graph_executor.o softmax_cpu.o elementwise_sum_cpu.o pooling_cpu.o +OBJCXX11 = engine.o narray.o c_api.o operator.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o graph_executor.o softmax_cpu.o elementwise_sum_cpu.o pooling_cpu.o CUOBJ = SLIB = lib/libmxnet.so ALIB = lib/libmxnet.a @@ -82,9 +82,8 @@ narray_function_gpu.o: src/narray/narray_function.cu src/narray/narray_function- symbol.o: src/symbol/symbol.cc graph_executor.o: src/symbol/graph_executor.cc static_graph.o : src/symbol/static_graph.cc -registry.o: src/registry.cc +operator.o: src/operator/operator.cc c_api.o: src/c_api.cc -operator.o: src/operator/static_operator_wrapper.cc fully_connected_cpu.o: src/operator/fully_connected.cc fully_connected_gpu.o: src/operator/fully_connected.cu activation_cpu.o: src/operator/activation.cc diff --git a/doc/index.md b/doc/index.md index e5d2703b829a..54c20e37c729 100644 --- a/doc/index.md +++ b/doc/index.md @@ -3,7 +3,7 @@ MXNet Documentation Contents -------- -* [Python API Reference](python/python_api.md) +* [Python User Guide](python/python_guide.md) * [C++ Developer Guide](cpp/cpp_guide.md) Indices and tables diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index e2dcc2df835a..1178af9db5fd 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -165,13 +165,23 @@ MXNET_DLL int MXListFunctions(mx_uint *out_size, MXNET_DLL int MXGetFunction(const char *name, FunctionHandle *out); /*! - * \brief get the name of function handle - * \param fun the function handle - * \param out_name the name of the function + * \brief Get the information of the function handle. + * \param fun The function handle. + * \param name The returned name of the function. + * \param description The returned description of the function. + * \param num_args Number of arguments. + * \param arg_names Name of the arguments. + * \param arg_type_infos Type informations about the arguments. + * \param arg_descriptions Description information about the arguments. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXFuncGetName(FunctionHandle fun, - const char **out_name); +MXNET_DLL int MXFuncGetInfo(FunctionHandle fun, + const char **name, + const char **description, + mx_uint *num_args, + const char ***arg_names, + const char ***arg_type_infos, + const char ***arg_descriptions); /*! 
* \brief get the argument requirements of the function * \param fun input function handle diff --git a/include/mxnet/narray.h b/include/mxnet/narray.h index 3202fd676cb7..7bcce54aafef 100644 --- a/include/mxnet/narray.h +++ b/include/mxnet/narray.h @@ -8,6 +8,7 @@ #include #include +#include #include #include "./base.h" #include "./context.h" @@ -272,5 +273,133 @@ NArray operator*(const NArray &lhs, const NArray &rhs); * \return a new result narray */ NArray operator/(const NArray &lhs, const NArray &rhs); + +//-------------------------------------------------------------- +// The following part are API Registration of NArray functions. +//-------------------------------------------------------------- +/*! \brief definition of NArray function */ +typedef std::function NArrayAPIFunction; +/*! \brief mask information on how functions can be exposed */ +enum NArrayFunctionTypeMask { + /*! \brief all the use_vars should go before scalar */ + kNArrayArgBeforeScalar = 1, + /*! \brief all the scalar should go before use_vars */ + kScalarArgBeforeNArray = 1 << 1, + /*! + * \brief whether this function allows the handles in the target to + * be empty NArray that are not yet initialized, and will initialize + * them when the function is invoked. + * + * most function should support this, except copy between different + * devices, which requires the NArray to be pre-initialized with context + */ + kAcceptEmptyMutateTarget = 1 << 2 +}; +/*! \brief Registry entry for NArrayFunction */ +struct NArrayFunctionReg + : public dmlc::FunctionRegEntryBase { + /*! \brief number of variable used by this function */ + unsigned num_use_vars; + /*! \brief number of variable mutated by this function */ + unsigned num_mutate_vars; + /*! \brief number of scalars used by this function */ + unsigned num_scalars; + /*! \brief information on how function should be called from API */ + int type_mask; + /*! + * \brief constructor + */ + explicit NArrayFunctionReg() + : num_use_vars(0), + num_mutate_vars(0), + num_scalars(0), + type_mask(0) {} + /*! + * \brief set the function body to a binary NArray function + * this will also auto set the parameters correctly + * \param fbinary function body to set + * \return ref to the registered entry, used to set properties + */ + inline NArrayFunctionReg &set_function(void fbinary(const NArray &lhs, + const NArray &rhs, + NArray *out)) { + body = [fbinary] (NArray **used_vars, + real_t *s, NArray **mutate_vars) { + fbinary(*used_vars[0], *used_vars[1], mutate_vars[0]); + }; + num_use_vars = 2; num_mutate_vars = 1; + type_mask = kNArrayArgBeforeScalar | kAcceptEmptyMutateTarget; + this->add_argument("lhs", "NArray", "Left operand to the function."); + this->add_argument("rhs", "NArray", "Right operand to the function."); + return *this; + } + /*! + * \brief set the function body to a unary NArray function + * this will also auto set the parameters correctly + * \param funary function body to set + * \return ref to the registered entry, used to set properties + */ + inline NArrayFunctionReg &set_function(void funary(const NArray &src, + NArray *out)) { + body = [funary] (NArray **used_vars, + real_t *s, NArray **mutate_vars) { + funary(*used_vars[0], mutate_vars[0]); + }; + num_use_vars = 1; num_mutate_vars = 1; + type_mask = kNArrayArgBeforeScalar | kAcceptEmptyMutateTarget; + this->add_argument("src", "NArray", "Source input to the function."); + return *this; + } + /*! 
+   * \brief set the number of use variables
+   * \param n number of use variables
+   * \return ref to the registered entry, used to set properties
+   */
+  inline NArrayFunctionReg &set_num_use_vars(unsigned n) {
+    num_use_vars = n; return *this;
+  }
+  /*!
+   * \brief set the number of mutate variables
+   * \param n number of mutate variables
+   * \return ref to the registered entry, used to set properties
+   */
+  inline NArrayFunctionReg &set_num_mutate_vars(unsigned n) {
+    num_mutate_vars = n; return *this;
+  }
+  /*!
+   * \brief set the number of scalar arguments
+   * \param n number of scalar arguments
+   * \return ref to the registered entry, used to set properties
+   */
+  inline NArrayFunctionReg &set_num_scalars(unsigned n) {
+    num_scalars = n; return *this;
+  }
+  /*!
+   * \brief set type mask
+   * \param tmask typemask
+   * \return ref to the registered entry, used to set properties
+   */
+  inline NArrayFunctionReg &set_type_mask(int tmask) {
+    type_mask = tmask; return *this;
+  }
+};  // NArrayFunctionReg
+
+/*!
+ * \brief Macro to register NArray function
+ *
+ * Example: the following code is example to register a plus
+ * \code
+ *
+ * REGISTER_NARRAY_FUN(Plus)
+ * .set_function(Plus);
+ *
+ * \endcode
+ */
+#define MXNET_REGISTER_NARRAY_FUN(name) \
+  DMLC_REGISTRY_REGISTER(::mxnet::NArrayFunctionReg, NArrayFunctionReg, name)
+
 }  // namespace mxnet
 #endif  // MXNET_NARRAY_H_
diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h
index e60afe6948a7..247f6ae5b2c9 100644
--- a/include/mxnet/operator.h
+++ b/include/mxnet/operator.h
@@ -9,6 +9,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -363,5 +364,36 @@ class OperatorProperty {
   */
  static OperatorProperty *Create(const char* type_name);
 };
 #endif
+
+
+/*! \brief typedef the factory function of operator property */
+typedef OperatorProperty *(*OperatorPropertyFactory)();
+/*!
+ * \brief Registry entry for OperatorProperty factory functions.
+ */
+struct OperatorPropertyReg
+    : public dmlc::FunctionRegEntryBase {
+};
+
+//--------------------------------------------------------------
+// The following part are API Registration of Operators
+//--------------------------------------------------------------
+/*!
+ * \brief Macro to register OperatorProperty
+ *
+ * \code
+ * // example of registering a fully connected operator
+ * REGISTER_OP_PROPERTY(FullyConnected, FullyConnectedOpProp)
+ * .describe("Fully connected layer");
+ *
+ * \endcode
+ */
+#define MXNET_REGISTER_OP_PROPERTY(name, OperatorPropertyType) \
+  static ::mxnet::OperatorProperty* __create__ ## OperatorPropertyType ## __() { \
+    return new OperatorPropertyType; \
+  } \
+  DMLC_REGISTRY_REGISTER(::mxnet::OperatorPropertyReg, OperatorPropertyReg, name) \
+  .set_body(__create__ ## OperatorPropertyType ## __)
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_H_
diff --git a/include/mxnet/registry.h b/include/mxnet/registry.h
deleted file mode 100644
index 2e02881411dd..000000000000
--- a/include/mxnet/registry.h
+++ /dev/null
@@ -1,173 +0,0 @@
-/*!
- * Copyright (c) 2015 by Contributors
- * \file registry.h
- * \brief registry that registers all sorts of functions
- */
-#ifndef MXNET_REGISTRY_H_
-#define MXNET_REGISTRY_H_
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include "./base.h"
-#include "./narray.h"
-#include "./operator.h"
-
-namespace mxnet {
-/*! \brief definition of NArray function */
-typedef std::function NArrayAPIFunction;
-/*! \brief mask information on how functions can be exposed */
-enum NArrayFunctionTypeMask {
-  /*!
\brief all the use_vars should go before scalar */ - kNArrayArgBeforeScalar = 1, - /*! \brief all the scalar should go before use_vars */ - kScalarArgBeforeNArray = 1 << 1, - /*! - * \brief whether this function allows the handles in the target to - * be empty NArray that are not yet initialized, and will initialize - * them when the function is invoked. - * - * most function should support this, except copy between different - * devices, which requires the NArray to be pre-initialized with context - */ - kAcceptEmptyMutateTarget = 1 << 2 -}; - -/*! \brief Registry entry for NArrayFunction */ -struct NArrayFunctionReg - : public dmlc::FunctionRegEntryBase { - /*! \brief number of variable used by this function */ - unsigned num_use_vars; - /*! \brief number of variable mutated by this function */ - unsigned num_mutate_vars; - /*! \brief number of scalars used by this function */ - unsigned num_scalars; - /*! \brief information on how function should be called from API */ - int type_mask; - /*! - * \brief constructor - */ - explicit NArrayFunctionReg() - : num_use_vars(0), - num_mutate_vars(0), - num_scalars(0), - type_mask(0) {} - /*! - * \brief set the function body to a binary NArray function - * this will also auto set the parameters correctly - * \param fbinary function body to set - * \return ref to the registered entry, used to set properties - */ - inline NArrayFunctionReg &set_function(void fbinary(const NArray &lhs, - const NArray &rhs, - NArray *out)) { - body = [fbinary] (NArray **used_vars, - real_t *s, NArray **mutate_vars) { - fbinary(*used_vars[0], *used_vars[1], mutate_vars[0]); - }; - num_use_vars = 2; num_mutate_vars = 1; - type_mask = kNArrayArgBeforeScalar | kAcceptEmptyMutateTarget; - this->add_argument("lhs", "NArray", "Left operand to the function."); - this->add_argument("rhs", "NArray", "Right operand to the function."); - return *this; - } - /*! - * \brief set the function body to a unary NArray function - * this will also auto set the parameters correctly - * \param funary function body to set - * \return ref to the registered entry, used to set properties - */ - inline NArrayFunctionReg &set_function(void funary(const NArray &src, - NArray *out)) { - body = [funary] (NArray **used_vars, - real_t *s, NArray **mutate_vars) { - funary(*used_vars[0], mutate_vars[0]); - }; - num_use_vars = 1; num_mutate_vars = 1; - type_mask = kNArrayArgBeforeScalar | kAcceptEmptyMutateTarget; - this->add_argument("src", "NArray", "Source input to the function."); - return *this; - } - /*! - * \brief set the number of mutate variables - * \param n number of mutate variablesx - * \return ref to the registered entry, used to set properties - */ - inline NArrayFunctionReg &set_num_use_vars(unsigned n) { - num_use_vars = n; return *this; - } - /*! - * \brief set the number of mutate variables - * \param n number of mutate variablesx - * \return ref to the registered entry, used to set properties - */ - inline NArrayFunctionReg &set_num_mutate_vars(unsigned n) { - num_mutate_vars = n; return *this; - } - /*! - * \brief set the number of scalar arguments - * \param n number of scalar arguments - * \return ref to the registered entry, used to set properties - */ - inline NArrayFunctionReg &set_num_scalars(unsigned n) { - num_scalars = n; return *this; - } - /*! 
- * \brief set type mask - * \param tmask typemask - * \return ref to the registered entry, used to set properties - */ - inline NArrayFunctionReg &set_type_mask(int tmask) { - type_mask = tmask; return *this; - } -}; // NArrayFunctionReg - -/*! - * \brief Macro to register NArray function - * - * Example: the following code is example to register a plus - * \code - * - * REGISTER_NARRAY_FUN(Plus) - * .set_function(Plus); - * - * \endcode - */ -#define MXNET_REGISTER_NARRAY_FUN(name) \ - DMLC_REGISTRY_REGISTER(::mxnet::NArrayFunctionReg, NArrayFunctionReg, name) - -/*! \brief typedef the factory function of operator property */ -typedef OperatorProperty *(*OperatorPropertyFactory)(); -/*! - * \brief Registry entry for OperatorProperty factory functions. - */ -struct OperatorPropertyReg - : public dmlc::FunctionRegEntryBase { -}; - -/*! - * \brief Macro to register OperatorProperty - * - * \code - * // example of registering a fully connected operator - * REGISTER_OP_PROPERTY(FullyConnected, FullyConnectedOpProp) - * .describe("Fully connected layer"); - * - * \endcode - */ -#define MXNET_REGISTER_OP_PROPERTY(name, OperatorPropertyType) \ - static ::mxnet::OperatorProperty* __create__ ## OperatorPropertyType ## __() { \ - return new OperatorPropertyType; \ - } \ - DMLC_REGISTRY_REGISTER(::mxnet::OperatorPropertyReg, OperatorPropertyReg, name) \ - .set_body(__create__ ## OperatorPropertyType ## __) - -} // namespace mxnet -#endif // MXNET_REGISTRY_H_ diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index 94b71bce16cc..616ffbcd9270 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -10,12 +10,10 @@ from __future__ import absolute_import from .context import Context, current_context -from .narray import NArray -from .function import _FunctionRegistry +from . import narray from . import symbol __version__ = "0.1.0" -# this is a global function registry that can be used to invoke functions -op = NArray._init_function_registry(_FunctionRegistry()) + diff --git a/python/mxnet/base.py b/python/mxnet/base.py index e30c77d382a3..c514d6939988 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -70,10 +70,10 @@ def _load_lib(): SymbolCreatorHandle = ctypes.c_void_p SymbolHandle = ctypes.c_void_p ExecutorHandle = ctypes.c_void_p + #---------------------------- # helper function definition #---------------------------- - def check_call(ret): """Check the return value of C API call diff --git a/python/mxnet/narray.py b/python/mxnet/narray.py index 61839ecc0a60..df762623bbad 100644 --- a/python/mxnet/narray.py +++ b/python/mxnet/narray.py @@ -1,15 +1,13 @@ # coding: utf-8 - """NArray interface of mxnet""" from __future__ import absolute_import import ctypes from .base import _LIB from .base import c_array -from .base import mx_uint, mx_float, NArrayHandle +from .base import mx_uint, mx_float, NArrayHandle, FunctionHandle from .base import ctypes2numpy_shared from .base import check_call -from .base import MXNetError from .context import Context def _new_empty_handle(): @@ -45,26 +43,10 @@ def _new_alloc_handle(shape, ctx, delay_alloc): return hdl class NArray(object): - """NArray object in mxnet + """NArray object in mxnet. - NArray is basic ndarray like data structure in mxnet + NArray is basic ndarray/Tensor like data structure in mxnet. """ - # NArray static constants - _op = None - - @staticmethod - def _init_function_registry(function_registry): - """Initialize the global variable op with new_op. 
-
-        This function is used to resolve cyclic dependency of .narray on function
-
-        Parameters
-        ----------
-        function_registry : function._FunctionRegistry
-            FunctionRegistry to pass in in startup
-        """
-        NArray._op = function_registry
-        return function_registry
 
     def __init__(self, handle):
         """initialize a new NArray
 
        Parameters
        ----------
        handle : NArrayHandle
            NArray handle of C API
        """
        assert isinstance(handle, NArrayHandle)
        self.handle = handle
 
     def __del__(self):
         check_call(_LIB.MXNArrayFree(self.handle))
 
     def __add__(self, other):
-        hret = _new_empty_handle()
         if isinstance(other, NArray):
-            NArray._op.plus.invoke_with_handle_((other.handle, self.handle), (), (hret,))
+            return NArray._plus(self, other)
         else:
-            raise MXNetError('type %s not supported' % str(type(other)))
-        return NArray(handle=hret)
+            raise TypeError('type %s not supported' % str(type(other)))
 
     def __radd__(self, other):
         return self.__add__(other)
 
     def __sub__(self, other):
-        hret = _new_empty_handle()
         if isinstance(other, NArray):
-            NArray._op.minus.invoke_with_handle_((other.handle, self.handle), (), (hret,))
+            return NArray._minus(self, other)
         else:
-            raise MXNetError('type %s not supported' % str(type(other)))
-        return NArray(handle=hret)
+            raise TypeError('type %s not supported' % str(type(other)))
 
     def __mul__(self, other):
-        hret = _new_empty_handle()
         if isinstance(other, NArray):
-            NArray._op.mul.invoke_with_handle_((other.handle, self.handle), (), (hret,))
+            return NArray._mul(self, other)
         else:
-            raise MXNetError('type %s not supported' % str(type(other)))
-        return NArray(handle=hret)
+            raise TypeError('type %s not supported' % str(type(other)))
 
     def __rmul__(self, other):
         return self.__mul__(other)
 
     def __div__(self, other):
-        hret = _new_empty_handle()
         if isinstance(other, NArray):
-            NArray._op.div.invoke_with_handle_((other.handle, self.handle), (), (hret,))
+            return NArray._div(self, other)
         else:
-            raise MXNetError('type %s not supported' % str(type(other)))
-        return NArray(handle=hret)
+            raise TypeError('type %s not supported' % str(type(other)))
 
     def wait(self):
         """Wait until the data on current NArray is available."""
@@ -166,7 +140,7 @@ def numpy(self):
         return ctypes2numpy_shared(pdata, self.shape)
 
     def copyto(self, other):
-        """copy the content of current array to other.
+        """Copy the content of current array to other.
 
         When other is NArray, the content is copied over.
         When other is a Context, a new NArray in the context
@@ -175,22 +149,21 @@
         Parameters
         ----------
         other : NArray or Context
-            another narray we want to copy to,
-            or target context we want copy the data to
+            Target NArray or context we want to copy data to.
 
         Returns
         -------
-        the copy target NArray
+        dst : NArray
+            The copy target NArray
         """
         if isinstance(other, NArray):
-            NArray._op.copy.invoke_with_handle_((self.handle,), (), (other.handle,))
-            return other
+            return NArray._copyto(self, out=other)
         elif isinstance(other, Context):
-            hret = _new_alloc_handle(self.shape, other, True)
-            NArray._op.copy.invoke_with_handle_((self.handle,), (), (hret,))
-            return NArray(handle=hret)
+            hret = NArray(_new_alloc_handle(self.shape, other, True))
+            return NArray._copyto(self, out=hret)
         else:
-            raise MXNetError('copyto do not support type ' + type(other))
+            raise TypeError('copyto does not support type ' + str(type(other)))
+
 
 def create(shape, ctx=Context.default_ctx):
     """Create a new NArray, with specified shape.
@@ -205,3 +178,170 @@ def create(shape, ctx=Context.default_ctx):
         a new NArray
     """
     return NArray(handle=_new_alloc_handle(shape, ctx, False))
+
+
+def _make_narray_function(handle):
+    """Create a NArray function from the FunctionHandle."""
+    # Constants for type masks.
+    NARRAY_ARG_BEFORE_SCALAR = 1
+    SCALAR_ARG_BEFORE_NARRAY = 1 << 1
+    ACCEPT_EMPTY_MUTATE_TARGET = 1 << 2
+    # Get the property of NArray
+    n_mutate_vars = 0
+    n_used_vars = mx_uint()
+    n_scalars = mx_uint()
+    n_mutate_vars = mx_uint()
+    type_mask = ctypes.c_int()
+    check_call(_LIB.MXFuncDescribe(
+        handle,
+        ctypes.byref(n_used_vars),
+        ctypes.byref(n_scalars),
+        ctypes.byref(n_mutate_vars),
+        ctypes.byref(type_mask)))
+    n_mutate_vars = n_mutate_vars.value
+    n_used_vars = n_used_vars.value
+    n_scalars = n_scalars.value
+    type_mask = type_mask.value
+    accept_empty_mutate = (type_mask & ACCEPT_EMPTY_MUTATE_TARGET) != 0
+    # infer type of the function
+    if (type_mask & NARRAY_ARG_BEFORE_SCALAR) != 0:
+        use_vars_range = range(0, n_used_vars)
+        scalar_range = range(n_used_vars, n_used_vars + n_scalars)
+    else:
+        scalar_range = range(0, n_scalars)
+        use_vars_range = range(n_scalars, n_used_vars + n_scalars)
+
+    # Get the information from the function
+    name = ctypes.c_char_p()
+    desc = ctypes.c_char_p()
+    num_args = mx_uint()
+    arg_names = ctypes.POINTER(ctypes.c_char_p)()
+    arg_types = ctypes.POINTER(ctypes.c_char_p)()
+    arg_descs = ctypes.POINTER(ctypes.c_char_p)()
+
+    check_call(_LIB.MXFuncGetInfo(
+        handle, ctypes.byref(name), ctypes.byref(desc),
+        ctypes.byref(num_args),
+        ctypes.byref(arg_names),
+        ctypes.byref(arg_types),
+        ctypes.byref(arg_descs)))
+    func_name = name.value
+
+    param_str = []
+    for i in range(num_args.value):
+        ret = '%s : %s' % (arg_names[i], arg_types[i])
+        if len(arg_descs[i]) != 0:
+            ret += '\n    ' + arg_descs[i]
+        param_str.append(ret)
+
+    doc_str = ('%s\n\n' +
+               'Parameters\n' +
+               '----------\n' +
+               '%s\n' +
+               'out : NArray, optional\n' +
+               '    The output NArray to hold the result.\n\n'+
+               'Returns\n' +
+               '-------\n' +
+               'out : NArray\n'+
+               '    The output of binary function.')
+    doc_str = doc_str % (desc.value, '\n'.join(param_str))
+
+    # Definition of internal functions.
+    def binary_narray_function(lhs, rhs, out=None):
+        """Internal binary function
+        """
+        if out:
+            if not isinstance(out, NArray):
+                raise TypeError('out must be NArray')
+        else:
+            if not accept_empty_mutate:
+                raise TypeError('argument out is required to call %s' % func_name)
+            out = NArray(_new_empty_handle())
+        check_call(_LIB.MXFuncInvoke(
+            handle,
+            c_array(NArrayHandle, (lhs.handle, rhs.handle)),
+            c_array(mx_float, ()),
+            c_array(NArrayHandle, (out.handle,))))
+        return out
+
+    def unary_narray_function(src, out=None):
+        """internal NArray function"""
+        if out:
+            if not isinstance(out, NArray):
+                raise TypeError('out must be NArray')
+        else:
+            if not accept_empty_mutate:
+                raise TypeError('argument out is required to call %s' % func_name)
+            out = NArray(_new_empty_handle())
+        check_call(_LIB.MXFuncInvoke(
+            handle,
+            c_array(NArrayHandle, (src.handle,)),
+            c_array(mx_float, ()),
+            c_array(NArrayHandle, (out.handle,))))
+        return out
+
+    def generic_narray_function(*args, **kwargs):
+        """Invoke this function by passing in parameters
+
+        Parameters
+        ----------
+        *args
+            Positional arguments of input scalars and NArray
+        out : NArray or tuple of NArray, optional
+            Output NArray, used to hold the output result.
+
+        Returns
+        -------
+        out : NArray
+            The result NArray(tuple) of result of computation.
+        """
+        if 'out' in kwargs:
+            mutate_vars = kwargs['out']
+            if isinstance(mutate_vars, NArray):
+                mutate_vars = (mutate_vars,)
+            if len(mutate_vars) != n_mutate_vars:
+                raise TypeError('expect %d out in %s' % (n_mutate_vars, func_name))
+        else:
+            if accept_empty_mutate:
+                mutate_vars = tuple(
+                    NArray(_new_empty_handle()) for i in range(n_mutate_vars))
+            else:
+                raise TypeError('argument out is required to call %s' % func_name)
+        check_call(_LIB.MXFuncInvoke(
+            handle,
+            c_array(NArrayHandle, [args[i].handle for i in use_vars_range]),
+            c_array(mx_float, [args[i] for i in scalar_range]),
+            c_array(NArrayHandle, [v.handle for v in mutate_vars])))
+        if n_mutate_vars == 1:
+            return mutate_vars[0]
+        else:
+            return mutate_vars
+    # End of function declaration
+    if n_mutate_vars == 1 and n_used_vars ==2 and n_scalars == 0:
+        ret_function = binary_narray_function
+    elif n_mutate_vars == 1 and n_used_vars ==1 and n_scalars == 0:
+        ret_function = unary_narray_function
+    else:
+        ret_function = generic_narray_function
+    ret_function.__name__ = func_name
+    ret_function.__doc__ = doc_str
+    return ret_function
+
+
+def _init_narray_module():
+    """List and add all the narray functions to current module."""
+    plist = ctypes.POINTER(FunctionHandle)()
+    size = ctypes.c_uint()
+    check_call(_LIB.MXListFunctions(ctypes.byref(size),
+                                    ctypes.byref(plist)))
+    for i in range(size.value):
+        hdl = FunctionHandle(plist[i])
+        function = _make_narray_function(hdl)
+        # if function name starts with underscore, register as static method of NArray
+        if function.__name__.startswith('_'):
+            setattr(NArray, function.__name__, staticmethod(function))
+        else:
+            setattr(module_obj, function.__name__, function)
+
+# Initialize the NArray module
+_init_narray_module()
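
For orientation, a minimal usage sketch of what the code above wires up once
_init_narray_module() runs at import time. This is a sketch only: it assumes a
built libmxnet, and _plus/_mul are the underscore-named entries registered in
src/narray/narray.cc later in this patch, which therefore land on NArray as
static methods.

    import mxnet as mx

    # Contents are uninitialized here; the sketch shows dispatch, not arithmetic.
    a = mx.narray.create((2, 3))
    b = mx.narray.create((2, 3))

    c = a + b                          # __add__ above resolves to NArray._plus(a, b)
    d = mx.narray.NArray._mul(a, b)    # direct call to a generated static method

    # A function registered with kAcceptEmptyMutateTarget may allocate its own
    # output, as above; an existing NArray can instead be passed through out=.
    mx.narray.NArray._mul(a, b, out=d)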
+ """ + if 'out' in kwargs: + mutate_vars = kwargs['out'] + if isinstance(mutate_vars, NArray): + mutate_vars = (mutate_vars,) + if len(mutate_vars) != n_mutate_vars: + raise TypeError('expect %d out in %s', n_mutate_vars, func_name) + else: + if accept_empty_mutate: + mutate_vars = tuple( + NArray(_new_empty_handle()) for i in range(n_mutate_vars)) + else: + raise TypeError('argument out is required to call %s' % func_name) + check_call(_LIB.MXFuncInvoke( + handle, + c_array(NArrayHandle, [args[i].handle for i in use_vars_range]), + c_array(mx_float, [args[i] for i in scalar_range]), + c_array(NArrayHandle, [v.handle for v in mutate_vars]))) + if n_mutate_vars == 1: + return mutate_vars[0] + else: + return mutate_vars + # End of function declaration + if n_mutate_vars == 1 and n_used_vars ==2 and n_scalars == 0: + ret_function = binary_narray_function + elif n_mutate_vars == 1 and n_used_vars ==2 and n_scalars == 0: + ret_function = unary_narray_function + else: + ret_function = generic_narray_function + ret_function.__name__ = func_name + ret_function.__doc__ = doc_str + return ret_function + + +def _init_narray_module(): + """List and add all the narray functions to current module.""" + plist = ctypes.POINTER(FunctionHandle)() + size = ctypes.c_uint() + check_call(_LIB.MXListFunctions(ctypes.byref(size), + ctypes.byref(plist))) + for i in range(size.value): + hdl = FunctionHandle(plist[i]) + function = _make_narray_function(hdl) + # if function name starts with underscore, register as static method of NArray + if function.__name__.startswith('_'): + setattr(NArray, function.__name__, staticmethod(function)) + else: + setattr(module_obj, function.__name__, function) + +# Initialize the NArray module +_init_narray_module() diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py index 6fb5eda53912..087fdb182e26 100644 --- a/python/mxnet/symbol.py +++ b/python/mxnet/symbol.py @@ -226,7 +226,6 @@ def bind(self, ctx, args, args_grad, reqs): """ # TODO(bing): consider a more friendly interface # For example, pass in args_grad by dict - enum = {"null" : 0, "write_to" : 1, "in_place":2, "add_to" : 3} if not isinstance(ctx, Context): raise TypeError("Context type error") @@ -322,7 +321,6 @@ def _make_atomic_symbol_function(handle): '-------\n' + 'symbol: Symbol\n'+ ' The result symbol.') - doc_str = doc_str % (desc.value, '\n'.join(param_str)) def creator(*args, **kwargs): @@ -372,7 +370,7 @@ def creator(*args, **kwargs): return creator -def _init_module_functions(): +def _init_symbol_module(): """List and add all the atomic symbol functions to current module.""" plist = ctypes.POINTER(ctypes.c_void_p)() size = ctypes.c_uint() @@ -386,5 +384,4 @@ def _init_module_functions(): setattr(module_obj, function.__name__, function) # Initialize the atomic symbo in startups -_init_module_functions() - +_init_symbol_module() diff --git a/python/test_python.py b/python/test_python.py index 7aa4c432f1db..905d16c283f8 100644 --- a/python/test_python.py +++ b/python/test_python.py @@ -9,7 +9,7 @@ c = b * a -cc = mx.op.mul(b, a) +cc = mx.narray.NArray._mul(b, a) print(c.context) print(cc.numpy) diff --git a/src/c_api.cc b/src/c_api.cc index 9b2a72a94688..a5ed648469e1 100644 --- a/src/c_api.cc +++ b/src/c_api.cc @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include @@ -120,13 +119,13 @@ class MXAPIThreadLocalStore { #define API_BEGIN() try { /*! 
diff --git a/src/c_api.cc b/src/c_api.cc
index 9b2a72a94688..a5ed648469e1 100644
--- a/src/c_api.cc
+++ b/src/c_api.cc
@@ -8,7 +8,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -120,13 +119,13 @@ class MXAPIThreadLocalStore {
 #define API_BEGIN() try {
 /*! \brief every function starts with API_BEGIN(); and finishes with API_END() or API_END_HANDLE_ERROR */
-#define API_END() } catch(dmlc::Error &e) { return MXHandleException(e); } return 0;
+#define API_END() } catch(dmlc::Error &_except_) { return MXHandleException(_except_); } return 0;
 /*!
  * \brief every function starts with API_BEGIN();
  * and finishes with API_END() or API_END_HANDLE_ERROR
  * The finally clause contains procedure to cleanup states when an error happens.
  */
-#define API_END_HANDLE_ERROR(Finalize) } catch(dmlc::Error &e) { Finalize; return MXHandleException(e); } return 0; // NOLINT(*)
+#define API_END_HANDLE_ERROR(Finalize) } catch(dmlc::Error &_except_) { Finalize; return MXHandleException(_except_); } return 0; // NOLINT(*)
 
 /*! \brief return str message of the last error */
 const char *MXGetLastError() {
@@ -143,6 +142,39 @@ int MXHandleException(const dmlc::Error &e) {
   return -1;
 }
 
+// Internal function to get the information
+// from function registry
+// Used to implement MXSymbolGetAtomicSymbolInfo and MXFuncGetInfo
+template<typename FunRegType>
+inline int MXAPIGetFunctionRegInfo(const FunRegType *e,
+                                   const char **name,
+                                   const char **description,
+                                   mx_uint *num_args,
+                                   const char ***arg_names,
+                                   const char ***arg_type_infos,
+                                   const char ***arg_descriptions) {
+  MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
+
+  API_BEGIN();
+  *name = e->name.c_str();
+  *description = e->description.c_str();
+  *num_args = static_cast<mx_uint>(e->arguments.size());
+  ret->ret_vec_charp.clear();
+  for (size_t i = 0; i < e->arguments.size(); ++i) {
+    ret->ret_vec_charp.push_back(e->arguments[i].name.c_str());
+  }
+  for (size_t i = 0; i < e->arguments.size(); ++i) {
+    ret->ret_vec_charp.push_back(e->arguments[i].type_info_str.c_str());
+  }
+  for (size_t i = 0; i < e->arguments.size(); ++i) {
+    ret->ret_vec_charp.push_back(e->arguments[i].description.c_str());
+  }
+  *arg_names = dmlc::BeginPtr(ret->ret_vec_charp);
+  *arg_type_infos = dmlc::BeginPtr(ret->ret_vec_charp) + e->arguments.size();
+  *arg_descriptions = dmlc::BeginPtr(ret->ret_vec_charp) + (e->arguments.size() * 2);
+  API_END();
+}
+
 // NOTE: return value is added in API_END
 int MXNArrayCreateNone(NArrayHandle *out) {
   API_BEGIN();
@@ -254,12 +286,16 @@ int MXGetFunction(const char *name,
   API_END();
 }
 
-int MXFuncGetName(FunctionHandle fun,
-                  const char **out_name) {
-  API_BEGIN();
-  auto *f = static_cast(fun);
-  *out_name = f->name.c_str();
-  API_END();
+int MXFuncGetInfo(FunctionHandle fun,
+                  const char **name,
+                  const char **description,
+                  mx_uint *num_args,
+                  const char ***arg_names,
+                  const char ***arg_type_infos,
+                  const char ***arg_descriptions) {
+  return MXAPIGetFunctionRegInfo(static_cast<const NArrayFunctionReg*>(fun),
+                                 name, description, num_args,
+                                 arg_names, arg_type_infos, arg_descriptions);
 }
 
 int MXFuncDescribe(FunctionHandle fun,
@@ -316,28 +352,9 @@ int MXSymbolGetAtomicSymbolInfo(AtomicSymbolCreator creator,
                                 const char ***arg_names,
                                 const char ***arg_type_infos,
                                 const char ***arg_descriptions) {
-  MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
   OperatorPropertyReg *e = static_cast<OperatorPropertyReg*>(creator);
-
-  API_BEGIN();
-
-  *name = e->name.c_str();
-  *description = e->description.c_str();
-  *num_args = static_cast<mx_uint>(e->arguments.size());
-  ret->ret_vec_charp.clear();
-  for (size_t i = 0; i < e->arguments.size(); ++i) {
-    ret->ret_vec_charp.push_back(e->arguments[i].name.c_str());
-  }
-  for (size_t i = 0; i < e->arguments.size(); ++i) {
-    ret->ret_vec_charp.push_back(e->arguments[i].type_info_str.c_str());
-  }
-  for (size_t i = 0; i < e->arguments.size(); ++i) {
-    ret->ret_vec_charp.push_back(e->arguments[i].description.c_str());
-  }
-  *arg_names = dmlc::BeginPtr(ret->ret_vec_charp);
-  *arg_type_infos = dmlc::BeginPtr(ret->ret_vec_charp) + e->arguments.size();
-  *arg_descriptions = dmlc::BeginPtr(ret->ret_vec_charp) + (e->arguments.size() * 2);
-  API_END();
+  return MXAPIGetFunctionRegInfo(e, name, description, num_args,
+                                 arg_names, arg_type_infos, arg_descriptions);
 }
 
 int MXSymbolCreateAtomicSymbol(AtomicSymbolCreator creator,
diff --git a/src/narray/narray.cc b/src/narray/narray.cc
index 2faf0789c607..f595acc0c44b 100644
--- a/src/narray/narray.cc
+++ b/src/narray/narray.cc
@@ -1,14 +1,18 @@
 /*!
  * Copyright (c) 2015 by Contributors
  * \file narray.cc
- * \brief
+ * \brief narray module of mxnet
  */
 #include
+#include
 #include
-#include
 #include
 #include "./narray_function.h"
 
+namespace dmlc {
+DMLC_REGISTRY_ENABLE(::mxnet::NArrayFunctionReg);
+}  // namespace dmlc
+
 namespace mxnet {
 /*!
  * \brief run a binary operation
@@ -150,14 +154,15 @@ NArray &NArray::operator/=(const NArray &src) {
 }
 
 // register API function
-MXNET_REGISTER_NARRAY_FUN(plus).set_function(BinaryOp);
-MXNET_REGISTER_NARRAY_FUN(minus).set_function(BinaryOp);
-MXNET_REGISTER_NARRAY_FUN(mul).set_function(BinaryOp);
-MXNET_REGISTER_NARRAY_FUN(div).set_function(BinaryOp);
+// those with underscore will be registered at NArray
+MXNET_REGISTER_NARRAY_FUN(_plus).set_function(BinaryOp);
+MXNET_REGISTER_NARRAY_FUN(_minus).set_function(BinaryOp);
+MXNET_REGISTER_NARRAY_FUN(_mul).set_function(BinaryOp);
+MXNET_REGISTER_NARRAY_FUN(_div).set_function(BinaryOp);
 
 // copy function is special
 // that we need to remove kAcceptEmptyMutateTarget from it
-MXNET_REGISTER_NARRAY_FUN(copy)
+MXNET_REGISTER_NARRAY_FUN(_copyto)
 .set_function(CopyFromTo)
 .set_type_mask(kNArrayArgBeforeScalar);
 
diff --git a/src/operator/activation.cc b/src/operator/activation.cc
index 1872327fcd7b..53a48f321b59 100644
--- a/src/operator/activation.cc
+++ b/src/operator/activation.cc
@@ -4,8 +4,6 @@
  * \brief activation op
  * \author Bing Xu
 */
-
-#include
 #include "./activation-inl.h"
 #include "./mshadow_op.h"
 
diff --git a/src/operator/elementwise_sum.cc b/src/operator/elementwise_sum.cc
index 840e2e179868..e8c3968c94ab 100644
--- a/src/operator/elementwise_sum.cc
+++ b/src/operator/elementwise_sum.cc
@@ -3,7 +3,6 @@
  * \file elementwise_sum.cc
  * \brief elementwise sum operator
 */
-#include
 #include "./elementwise_sum-inl.h"
 namespace mxnet {
 namespace op {
diff --git a/src/operator/fully_connected.cc b/src/operator/fully_connected.cc
index 69524800a916..0ea327cf5df5 100644
--- a/src/operator/fully_connected.cc
+++ b/src/operator/fully_connected.cc
@@ -3,7 +3,6 @@
  * \file fully_connected.cc
  * \brief fully connect operator
 */
-#include
 #include "./fully_connected-inl.h"
 namespace mxnet {
 namespace op {
diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc
index 883948e27981..23ec136e5059 100644
--- a/src/operator/pooling.cc
+++ b/src/operator/pooling.cc
@@ -4,8 +4,6 @@
  * \brief
  * \author Bing Xu
 */
-
-#include
 #include "./pooling-inl.h"
 
 namespace mxnet {
diff --git a/src/operator/softmax.cc b/src/operator/softmax.cc
index 8cab48d6f28a..2c2516ba9bc9 100644
--- a/src/operator/softmax.cc
+++ b/src/operator/softmax.cc
@@ -4,8 +4,6 @@
  * \brief
  * \author Bing Xu
 */
-
-#include
 #include "./softmax-inl.h"
 
 namespace mxnet {
diff --git a/src/registry.cc b/src/registry.cc
deleted file mode 100644
index 8587f4666aab..000000000000
--- a/src/registry.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-/*!
- * Copyright (c) 2015 by Contributors - * \file registry.cc - * \brief central place for registry definition in mxnet. - */ -#include -#include -#include -#include - -// enable the registries -namespace dmlc { -DMLC_REGISTRY_ENABLE(::mxnet::NArrayFunctionReg); -DMLC_REGISTRY_ENABLE(::mxnet::OperatorPropertyReg); -} // namespace dmlc - -namespace mxnet { -// implementation of all factory functions -OperatorProperty *OperatorProperty::Create(const char* type_name) { - auto *creator = dmlc::Registry::Find(type_name); - CHECK_NE(creator, nullptr) - << "Cannot find Operator " << type_name << " in registry"; - return creator->body(); -} -} // namespace mxnet From 25bc1effc7df61aaba96ff5f56f3bc9b9ea71bda Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sat, 22 Aug 2015 20:03:38 -0600 Subject: [PATCH 16/20] remove funciton.py --- python/mxnet/function.py | 131 --------------------------------------- 1 file changed, 131 deletions(-) delete mode 100644 python/mxnet/function.py diff --git a/python/mxnet/function.py b/python/mxnet/function.py deleted file mode 100644 index 9903be604d6c..000000000000 --- a/python/mxnet/function.py +++ /dev/null @@ -1,131 +0,0 @@ -# coding: utf-8 -"""NArray functions support of mxnet""" -from __future__ import absolute_import - -import ctypes -from .base import _LIB -from .base import c_array -from .base import mx_uint, mx_float, NArrayHandle -from .base import check_call, MXNetError -from .narray import NArray, _new_empty_handle - -class _Function(object): - """Function Object.""" - # constants for type masks - NARRAY_ARG_BEFORE_SCALAR = 1 - SCALAR_ARG_BEFORE_NARRAY = 1 << 1 - ACCEPT_EMPTY_MUTATE_TARGET = 1 << 2 - - def __init__(self, handle, name): - """Initialize the function with handle - - Parameters - ---------- - handle : FunctionHandle - the function handle of the function - - name : string - the name of the function - """ - self.handle = handle - self.name = name - n_used_vars = mx_uint() - n_scalars = mx_uint() - n_mutate_vars = mx_uint() - type_mask = ctypes.c_int() - check_call(_LIB.MXFuncDescribe( - self.handle, - ctypes.byref(n_used_vars), - ctypes.byref(n_scalars), - ctypes.byref(n_mutate_vars), - ctypes.byref(type_mask))) - self.n_used_vars = n_used_vars.value - self.n_scalars = n_scalars.value - self.n_mutate_vars = n_mutate_vars.value - self.type_mask = type_mask.value - # infer type of the function - if (self.type_mask & _Function.NARRAY_ARG_BEFORE_SCALAR) != 0: - self.use_vars_range = range(0, self.n_used_vars) - self.scalar_range = range(self.n_used_vars, - self.n_used_vars + self.n_scalars) - else: - self.scalar_range = range(0, self.n_scalars) - self.use_vars_range = range(self.n_scalars, - self.n_scalars + self.n_used_vars) - self.accept_empty_mutate = (self.type_mask & - _Function.ACCEPT_EMPTY_MUTATE_TARGET) != 0 - - def __call__(self, *args, **kwargs): - """Invoke this function by passing in parameters - - Parameters - ---------- - *args: positional arguments - positional arguments of input scalars and NArray - mutate_vars: kwarg(optional) - provide the NArray to store the result of the operation - Returns - ------- - the result NArrays of mutated result - """ - if 'mutate_vars' in kwargs: - mutate_vars = kwargs['mutate_vars'] - if isinstance(mutate_vars, NArray): - mutate_vars = (mutate_vars,) - if len(mutate_vars) != self.n_mutate_vars: - raise MXNetError('expect %d mutate_vars in op.%s', self.n_mutate_vars, self.name) - else: - if self.accept_empty_mutate: - mutate_vars = tuple( - NArray(_new_empty_handle()) for i in range(self.n_mutate_vars)) 
-                else:
-                    raise MXNetError('mutate_vars argument is required to call op.%s' % self.name)
-
-        self.invoke_with_handle_([args[i].handle for i in self.use_vars_range],
-                                 [args[i] for i in self.scalar_range],
-                                 [v.handle for v in mutate_vars])
-        if self.n_mutate_vars == 1:
-            return mutate_vars[0]
-        else:
-            return mutate_vars
-
-    def invoke_with_handle_(self, use_vars, scalars, mutate_vars):
-        """Invoke this function by passing in arguments as tuples
-
-        This is a very primitive call to the function handle that
-        involves passing in a C handle
-
-        Parameters
-        ----------
-        fhandle : FunctionHandle
-            function handle of C API
-
-        use_vars : tuple
-            tuple of NArray handles
-
-        scalars : tuple
-            tuple of real number arguments
-
-        mutate_vars : tuple
-            tuple of NArray handles to mutate
-        """
-        check_call(_LIB.MXFuncInvoke(
-            self.handle,
-            c_array(NArrayHandle, use_vars),
-            c_array(mx_float, scalars),
-            c_array(NArrayHandle, mutate_vars)))
-
-class _FunctionRegistry(object):
-    """Function Registry"""
-    def __init__(self):
-        plist = ctypes.POINTER(ctypes.c_void_p)()
-        size = ctypes.c_uint()
-        check_call(_LIB.MXListFunctions(ctypes.byref(size),
-                                        ctypes.byref(plist)))
-        hmap = {}
-        for i in range(size.value):
-            hdl = ctypes.c_void_p(plist[i])
-            name = ctypes.c_char_p()
-            check_call(_LIB.MXFuncGetName(hdl, ctypes.byref(name)))
-            hmap[name.value] = _Function(hdl, name.value)
-        self.__dict__.update(hmap)

From c2570ec0ecaebe1a99844dcc82fba1463245bfca Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Sat, 22 Aug 2015 20:36:22 -0600
Subject: [PATCH 17/20] add

---
 doc/python/narray.md       | 12 ++++++++++++
 doc/python/python_guide.md |  8 ++++++++
 doc/python/symbol.md       | 21 +++++++++++++++++++++
 src/operator/operator.cc   | 22 ++++++++++++++++++++++
 4 files changed, 63 insertions(+)
 create mode 100644 doc/python/narray.md
 create mode 100644 doc/python/python_guide.md
 create mode 100644 doc/python/symbol.md
 create mode 100644 src/operator/operator.cc

diff --git a/doc/python/narray.md b/doc/python/narray.md
new file mode 100644
index 000000000000..e4befbb908ba
--- /dev/null
+++ b/doc/python/narray.md
@@ -0,0 +1,12 @@
+Python NArray API
+=================
+NArray is the basic computation element in mxnet.
+It is like numpy.ndarray, but comes with two unique features: *gpu execution* and *dependency scheduling*.
+
+
+NArray API Reference
+--------------------
+```eval_rst
+.. automodule:: mxnet.narray
+    :members:
+```
diff --git a/doc/python/python_guide.md b/doc/python/python_guide.md
new file mode 100644
index 000000000000..5d3acec884be
--- /dev/null
+++ b/doc/python/python_guide.md
@@ -0,0 +1,8 @@
+MXNet Python Guide
+==================
+This page gives the user guide of the mxnet python package.
+
+Contents
+--------
+* [NArray API](narray.md)
+* [Symbolic API](symbol.md)
diff --git a/doc/python/symbol.md b/doc/python/symbol.md
new file mode 100644
index 000000000000..f9bb0585c4b2
--- /dev/null
+++ b/doc/python/symbol.md
@@ -0,0 +1,21 @@
+Python Symbolic API
+===================
+The symbolic part of mxnet allows you to describe a computational graph in a declarative way.
+The Symbol object is a lightweight object that contains the head of the computation graph.
+The Symbol can be bound to an Executor, where the computation resources are actually allocated and the computation actually happens.
+
+
+Symbolic API Reference
+----------------------
+```eval_rst
+.. automodule:: mxnet.symbol
+    :members:
+```
+
+
+Executor API Reference
+----------------------
+```eval_rst
+.. 
automodule:: mxnet.executor + :members: +``` diff --git a/src/operator/operator.cc b/src/operator/operator.cc new file mode 100644 index 000000000000..b189a3f22a89 --- /dev/null +++ b/src/operator/operator.cc @@ -0,0 +1,22 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file operator.cc + * \brief operator module of mxnet + */ +#include +#include +#include + +namespace dmlc { +DMLC_REGISTRY_ENABLE(::mxnet::OperatorPropertyReg); +} // namespace dmlc + +namespace mxnet { +// implementation of all factory functions +OperatorProperty *OperatorProperty::Create(const char* type_name) { + auto *creator = dmlc::Registry::Find(type_name); + CHECK_NE(creator, nullptr) + << "Cannot find Operator " << type_name << " in registry"; + return creator->body(); +} +} // namespace mxnet From c993bd005ee8b6066fa704a6cbff74981963be82 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sat, 22 Aug 2015 20:26:31 -0600 Subject: [PATCH 18/20] change --- include/mxnet/operator.h | 3 +- src/operator/activation-inl.h | 4 +- src/operator/activation.cc | 2 +- src/operator/activation.cu | 2 +- src/operator/pooling-inl.h | 90 +++++++++++++++++------------------ src/operator/pooling.cc | 2 +- src/operator/pooling.cu | 2 +- src/operator/softmax-inl.h | 7 +-- 8 files changed, 56 insertions(+), 56 deletions(-) diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h index 247f6ae5b2c9..76b6721f5622 100644 --- a/include/mxnet/operator.h +++ b/include/mxnet/operator.h @@ -363,8 +363,6 @@ class OperatorProperty { */ static OperatorProperty *Create(const char* type_name); }; -#endif - /*! \brief typedef the factory function of operator property */ typedef OperatorProperty *(*OperatorPropertyFactory)(); @@ -395,5 +393,6 @@ struct OperatorPropertyReg } \ DMLC_REGISTRY_REGISTER(::mxnet::OperatorPropertyReg, OperatorPropertyReg, name) \ .set_body(__create__ ## OperatorPropertyType ## __) +#endif // DMLC_USE_CXX11 } // namespace mxnet #endif // MXNET_OPERATOR_H_ diff --git a/src/operator/activation-inl.h b/src/operator/activation-inl.h index c6ade088c545..7315d908aa0d 100644 --- a/src/operator/activation-inl.h +++ b/src/operator/activation-inl.h @@ -27,9 +27,9 @@ enum ActivationOpType {kReLU, kSigmoid, kTanh}; struct ActivationParam : public dmlc::Parameter { // use int for enumeration - int type; + int act_type; DMLC_DECLARE_PARAMETER(ActivationParam) { - DMLC_DECLARE_FIELD(type).set_default(kReLU) + DMLC_DECLARE_FIELD(act_type).set_default(kReLU) .add_enum("relu", kReLU) .add_enum("sigmoid", kSigmoid) .add_enum("tanh", kTanh) diff --git a/src/operator/activation.cc b/src/operator/activation.cc index 53a48f321b59..a3a246723171 100644 --- a/src/operator/activation.cc +++ b/src/operator/activation.cc @@ -11,7 +11,7 @@ namespace mxnet { namespace op { template<> Operator *CreateOp(ActivationParam param) { - switch (param.type) { + switch (param.act_type) { case kReLU: return new ActivationOp(); case kSigmoid: return new ActivationOp(); case kTanh: return new ActivationOp(); diff --git a/src/operator/activation.cu b/src/operator/activation.cu index 5b7b576e59d7..b1b8fc4fb8b0 100644 --- a/src/operator/activation.cu +++ b/src/operator/activation.cu @@ -11,7 +11,7 @@ namespace mxnet { namespace op { template<> Operator *CreateOp(ActivationParam param) { - switch(param.type) { + switch(param.act_type) { case kReLU: return new ActivationOp(); case kSigmoid: return new ActivationOp(); case kTanh: return new ActivationOp(); diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h index 7b577c51f944..5a299a6fd017 100644 --- 
a/src/operator/pooling-inl.h +++ b/src/operator/pooling-inl.h @@ -25,25 +25,24 @@ enum PoolingOpOutputs {kOut}; enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling}; struct PoolingParam : public dmlc::Parameter { - int kernel_x; - int kernel_y; - int stride_x; - int stride_y; - int pad_x; - int pad_y; - int type; + TShape kernel; + TShape stride; + TShape pad; + int pool_type; DMLC_DECLARE_PARAMETER(PoolingParam) { // TODO(bing) change to only set lower bound - DMLC_DECLARE_FIELD(kernel_x).set_range(1, 10000); - DMLC_DECLARE_FIELD(kernel_y).set_range(1, 10000); - DMLC_DECLARE_FIELD(stride_x).set_range(1, 10000); - DMLC_DECLARE_FIELD(stride_y).set_range(1, 10000); - DMLC_DECLARE_FIELD(pad_x).set_default(0).set_range(0, 10000); - DMLC_DECLARE_FIELD(pad_y).set_default(0).set_range(0, 10000); - DMLC_DECLARE_FIELD(type).set_default(kMaxPooling) + int shape[] = {0,0}; + DMLC_DECLARE_FIELD(kernel).describe("pooling kernel size: (y, x)"); + DMLC_DECLARE_FIELD(pool_type).set_default(kMaxPooling) .add_enum("max", kMaxPooling) .add_enum("avg", kAvgPooling) - .add_enum("sum", kSumPooling); + .add_enum("sum", kSumPooling) + .describe("Pooling type to be applied."); + DMLC_DECLARE_FIELD(pad).set_default(TShape(shape, shape + 2)) + .describe("pad for pooling: (y, x)"); + shape[0] = shape[1] = 1; + DMLC_DECLARE_FIELD(stride).set_default(TShape(shape, shape + 2)) + .describe("stride: for pooling (y, x)"); } }; @@ -52,6 +51,7 @@ class PoolingOp : public Operator { public: explicit PoolingOp(PoolingParam p) { this->param_ = p; + std::cout << param_.kernel << std::endl; } virtual void Forward(const OpContext &ctx, @@ -68,19 +68,19 @@ class PoolingOp : public Operator { Tensor out = out_data[kOut].get(s); mshadow::Shape<2> out_shape = Shape2(out.shape_[2], out.shape_[3]); // TODO(bing): dual stride in mshadow - if (param_.type == kMaxPooling || param_.type == kSumPooling) { - out = pool(pad(data, param_.pad_y, param_.pad_x), + if (param_.pool_type == kMaxPooling || param_.pool_type == kSumPooling) { + out = pool(pad(data, param_.pad[0], param_.pad[1]), out_shape, - param_.kernel_y, - param_.kernel_x, - param_.kernel_y); - } else if (param_.type == kAvgPooling) { - out = (1.0f / (param_.kernel_y * param_.kernel_x)) * \ - pool(pad(data, param_.pad_y, param_.pad_x), + param_.kernel[0], + param_.kernel[1], + param_.kernel[0]); + } else if (param_.pool_type == kAvgPooling) { + out = (1.0f / (param_.kernel[0] * param_.kernel[1])) * \ + pool(pad(data, param_.pad[0], param_.pad[1]), out_shape, - param_.kernel_y, - param_.kernel_x, - param_.kernel_y); + param_.kernel[0], + param_.kernel[1], + param_.kernel[0]); } } @@ -106,29 +106,29 @@ class PoolingOp : public Operator { mshadow::Shape<2> in_shape = Shape2(data.shape_[2], data.shape_[3]); - if (param_.type == kMaxPooling || param_.type == kSumPooling) { + if (param_.pool_type == kMaxPooling || param_.pool_type == kSumPooling) { Assign(input_grad, req[kData], - crop(unpool(pad(data, param_.pad_y, param_.pad_x), + crop(unpool(pad(data, param_.pad[0], param_.pad[1]), pad(output_data, 0, 0), pad(grad, 0, 0), - param_.kernel_y, - param_.kernel_x, - param_.stride_y), + param_.kernel[0], + param_.kernel[1], + param_.stride[0]), in_shape, - param_.pad_y, - param_.pad_x)); - } else if (param_.type == kAvgPooling) { + param_.pad[0], + param_.pad[1])); + } else if (param_.pool_type == kAvgPooling) { Assign(input_grad, req[kData], - (1.0f / param_.kernel_y / param_.kernel_x) *\ - crop(unpool(pad(data, param_.pad_y, param_.pad_x), + (1.0f / param_.kernel[0] / 
param_.kernel[1]) *\ + crop(unpool(pad(data, param_.pad[0], param_.pad[1]), pad(output_data, 0, 0), pad(grad, 0, 0), - param_.kernel_y, - param_.kernel_x, - param_.stride_y), + param_.kernel[0], + param_.kernel[1], + param_.stride[0]), in_shape, - param_.pad_y, - param_.pad_x)); + param_.pad[0], + param_.pad[1])); } } @@ -155,10 +155,10 @@ class PoolingProp : public OperatorProperty { "Pooling: Input data should be 4D in (batch, channel, y, x)"; TShape oshape = dshape; if (dshape.ndim() == 0) return false; - oshape[2] = std::min(dshape[2] + 2 * param_.pad_y - param_.kernel_y + param_.stride_y - 1, - dshape[2] + 2 * param_.pad_y - 1) / param_.stride_y + 1; - oshape[3] = std::min(dshape[3] + 2 * param_.pad_x - param_.kernel_x + param_.stride_x - 1, - dshape[3] + 2 * param_.pad_x - 1) / param_.stride_x + 1; + oshape[2] = std::min(dshape[2] + 2 * param_.pad[0] - param_.kernel[0] + param_.stride[0] - 1, + dshape[2] + 2 * param_.pad[0] - 1) / param_.stride[0] + 1; + oshape[3] = std::min(dshape[3] + 2 * param_.pad[1] - param_.kernel[1] + param_.stride[1] - 1, + dshape[3] + 2 * param_.pad[1] - 1) / param_.stride[1] + 1; CHECK(oshape[2] > 0 && oshape[3] > 0) << "Pooling: kernel size exceed input"; out_shape->clear(); out_shape->push_back(oshape); diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc index 23ec136e5059..cf7e965a802f 100644 --- a/src/operator/pooling.cc +++ b/src/operator/pooling.cc @@ -10,7 +10,7 @@ namespace mxnet { namespace op { template<> Operator *CreateOp(PoolingParam param) { - switch (param.type) { + switch (param.pool_type) { case kMaxPooling: return new PoolingOp(param); case kAvgPooling: return new PoolingOp(param); case kSumPooling: return new PoolingOp(param); diff --git a/src/operator/pooling.cu b/src/operator/pooling.cu index 2db6d9ea549a..5037050ccd6f 100644 --- a/src/operator/pooling.cu +++ b/src/operator/pooling.cu @@ -11,7 +11,7 @@ namespace mxnet { namespace op { template<> Operator *CreateOp(PoolingParam param) { - switch (param.type) { + switch (param.pool_type) { case kMaxPooling: return new PoolingOp(param); case kAvgPooling: return new PoolingOp(param); case kSumPooling: return new PoolingOp(param); diff --git a/src/operator/softmax-inl.h b/src/operator/softmax-inl.h index b0486cdb9a9e..c62d4c980941 100644 --- a/src/operator/softmax-inl.h +++ b/src/operator/softmax-inl.h @@ -26,7 +26,8 @@ enum SoftmaxOpOutputs {kOut}; struct SoftmaxParam : public dmlc::Parameter { float grad_scale; DMLC_DECLARE_PARAMETER(SoftmaxParam) { - DMLC_DECLARE_FIELD(grad_scale).set_default(1.0f); + DMLC_DECLARE_FIELD(grad_scale).set_default(1.0f) + .describe("Scale the gradient by a float factor"); }; }; @@ -63,7 +64,7 @@ class SoftmaxOp : public Operator { CHECK_GE(req.size(), 1); Stream *s = ctx.get_stream(); Tensor label = in_data[kLabel].get(s); - Tensor out = out_grad[kOut].FlatTo2D(s); + Tensor out = out_data[kOut].FlatTo2D(s); Tensor grad = in_grad[kData].FlatTo2D(s); SoftmaxGrad(grad, out, label); if (param_.grad_scale < 1.0) { @@ -124,7 +125,7 @@ class SoftmaxProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const { - return {{out_grad[kOut], in_grad[kData]}}; + return {{out_data[kOut], in_grad[kData]}}; } virtual std::vector > ForwardInplaceOption( From 2cbe804dc9cbe52672e38c7861dcc7b2754fd903 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sat, 22 Aug 2015 21:00:56 -0600 Subject: [PATCH 19/20] wtf --- include/mxnet/narray.h | 2 +- python/mxnet/narray.py | 49 ++++++++++++++++++------------------ 
 python/mxnet/symbol.py       | 10 ++++----
 src/narray/narray_function.h |  6 ++---
 src/operator/pooling-inl.h   |  2 +-
 src/operator/softmax-inl.h   |  4 +--
 6 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/include/mxnet/narray.h b/include/mxnet/narray.h
index 7bcce54aafef..3400a541d0e8 100644
--- a/include/mxnet/narray.h
+++ b/include/mxnet/narray.h
@@ -312,7 +312,7 @@ struct NArrayFunctionReg
   /*!
    * \brief constructor
    */
-  explicit NArrayFunctionReg()
+  NArrayFunctionReg()
       : num_use_vars(0),
         num_mutate_vars(0),
         num_scalars(0),
diff --git a/python/mxnet/narray.py b/python/mxnet/narray.py
index df762623bbad..8b826e7b31ae 100644
--- a/python/mxnet/narray.py
+++ b/python/mxnet/narray.py
@@ -1,4 +1,5 @@
 # coding: utf-8
+# pylint: disable=invalid-name, protected-access, too-many-locals, fixme, no-member
 """NArray interface of mxnet"""
 from __future__ import absolute_import
 
@@ -192,11 +193,11 @@ def _make_narray_function(handle):
     n_scalars = mx_uint()
     n_mutate_vars = mx_uint()
     type_mask = ctypes.c_int()
-    check_call(_LIB.MXFuncDescribe(
-        handle,
-        ctypes.byref(n_used_vars),
-        ctypes.byref(n_scalars),
-        ctypes.byref(n_mutate_vars),
+    check_call(_LIB.MXFuncDescribe( \
+        handle, \
+        ctypes.byref(n_used_vars), \
+        ctypes.byref(n_scalars), \
+        ctypes.byref(n_mutate_vars), \
         ctypes.byref(type_mask)))
     n_mutate_vars = n_mutate_vars.value
     n_used_vars = n_used_vars.value
@@ -219,11 +220,11 @@ def _make_narray_function(handle):
     arg_types = ctypes.POINTER(ctypes.c_char_p)()
     arg_descs = ctypes.POINTER(ctypes.c_char_p)()
 
-    check_call(_LIB.MXFuncGetInfo(
-        handle, ctypes.byref(name), ctypes.byref(desc),
-        ctypes.byref(num_args),
-        ctypes.byref(arg_names),
-        ctypes.byref(arg_types),
+    check_call(_LIB.MXFuncGetInfo( \
+        handle, ctypes.byref(name), ctypes.byref(desc), \
+        ctypes.byref(num_args), \
+        ctypes.byref(arg_names), \
+        ctypes.byref(arg_types), \
         ctypes.byref(arg_descs)))
     func_name = name.value
 
@@ -257,10 +258,10 @@ def binary_narray_function(lhs, rhs, out=None):
             if not accept_empty_mutate:
                 raise TypeError('argument out is required to call %s' % func_name)
             out = NArray(_new_empty_handle())
-        check_call(_LIB.MXFuncInvoke(
-            handle,
-            c_array(NArrayHandle, (lhs.handle, rhs.handle)),
-            c_array(mx_float, ()),
+        check_call(_LIB.MXFuncInvoke( \
+            handle, \
+            c_array(NArrayHandle, (lhs.handle, rhs.handle)), \
+            c_array(mx_float, ()), \
             c_array(NArrayHandle, (out.handle,))))
         return out
 
@@ -273,10 +274,10 @@ def unary_narray_function(src, out=None):
             if not accept_empty_mutate:
                 raise TypeError('argument out is required to call %s' % func_name)
             out = NArray(_new_empty_handle())
-        check_call(_LIB.MXFuncInvoke(
-            handle,
-            c_array(NArrayHandle, (src.handle,)),
-            c_array(mx_float, ()),
+        check_call(_LIB.MXFuncInvoke( \
+            handle, \
+            c_array(NArrayHandle, (src.handle,)), \
+            c_array(mx_float, ()), \
             c_array(NArrayHandle, (out.handle,))))
         return out
 
@@ -307,19 +308,19 @@ def generic_narray_function(*args, **kwargs):
                     NArray(_new_empty_handle()) for i in range(n_mutate_vars))
             else:
                 raise TypeError('argument out is required to call %s' % func_name)
-        check_call(_LIB.MXFuncInvoke(
-            handle,
-            c_array(NArrayHandle, [args[i].handle for i in use_vars_range]),
-            c_array(mx_float, [args[i] for i in scalar_range]),
+        check_call(_LIB.MXFuncInvoke( \
+            handle, \
+            c_array(NArrayHandle, [args[i].handle for i in use_vars_range]), \
+            c_array(mx_float, [args[i] for i in scalar_range]), \
             c_array(NArrayHandle, [v.handle for v in mutate_vars])))
         if n_mutate_vars == 1:
             return mutate_vars[0]
         else:
             return mutate_vars
     # End of function declaration
-    if n_mutate_vars == 1 and n_used_vars ==2 and n_scalars == 0:
+    if n_mutate_vars == 1 and n_used_vars == 2 and n_scalars == 0:
         ret_function = binary_narray_function
-    elif n_mutate_vars == 1 and n_used_vars ==1 and n_scalars == 0:
+    elif n_mutate_vars == 1 and n_used_vars == 1 and n_scalars == 0:
         ret_function = unary_narray_function
     else:
         ret_function = generic_narray_function
diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py
index 087fdb182e26..5818d0f53305 100644
--- a/python/mxnet/symbol.py
+++ b/python/mxnet/symbol.py
@@ -297,11 +297,11 @@ def _make_atomic_symbol_function(handle):
     arg_types = ctypes.POINTER(ctypes.c_char_p)()
     arg_descs = ctypes.POINTER(ctypes.c_char_p)()
 
-    check_call(_LIB.MXSymbolGetAtomicSymbolInfo(
-        handle, ctypes.byref(name), ctypes.byref(desc),
-        ctypes.byref(num_args),
-        ctypes.byref(arg_names),
-        ctypes.byref(arg_types),
+    check_call(_LIB.MXSymbolGetAtomicSymbolInfo( \
+        handle, ctypes.byref(name), ctypes.byref(desc), \
+        ctypes.byref(num_args), \
+        ctypes.byref(arg_names), \
+        ctypes.byref(arg_types), \
         ctypes.byref(arg_descs)))
     func_name = name.value
     param_str = []
diff --git a/src/narray/narray_function.h b/src/narray/narray_function.h
index 21a8da782972..50e86aeed9ed 100644
--- a/src/narray/narray_function.h
+++ b/src/narray/narray_function.h
@@ -3,8 +3,8 @@
  * \file narray_op.h
  * \brief the real execution functions of narray operations
 */
-#ifndef MXNET_NARRAY_NARRAY_OP_H_
-#define MXNET_NARRAY_NARRAY_OP_H_
+#ifndef MXNET_NARRAY_NARRAY_FUNCTION_H_
+#define MXNET_NARRAY_NARRAY_FUNCTION_H_
 #include
 #include
 #include
@@ -44,4 +44,4 @@ void Copy(const TBlob &from, TBlob *to,
 }  // namespace narray
 }  // namespace mxnet
-#endif  // MXNET_NARRAY_NARRAY_OP_H_
+#endif  // MXNET_NARRAY_NARRAY_FUNCTION_H_
diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h
index 5a299a6fd017..6ccae18abf22 100644
--- a/src/operator/pooling-inl.h
+++ b/src/operator/pooling-inl.h
@@ -31,7 +31,7 @@ struct PoolingParam : public dmlc::Parameter {
   int pool_type;
   DMLC_DECLARE_PARAMETER(PoolingParam) {
     // TODO(bing) change to only set lower bound
-    int shape[] = {0,0};
+    int shape[] = {0, 0};
     DMLC_DECLARE_FIELD(kernel).describe("pooling kernel size: (y, x)");
diff --git a/src/operator/softmax-inl.h b/src/operator/softmax-inl.h
index c62d4c980941..7997b3b107b3 100644
--- a/src/operator/softmax-inl.h
+++ b/src/operator/softmax-inl.h
@@ -83,7 +83,7 @@ Operator* CreateOp(SoftmaxParam param);
 #if DMLC_USE_CXX11
 class SoftmaxProp : public OperatorProperty {
  public:
-  virtual std::vector ListArguments() const override {
+  virtual std::vector ListArguments() const {
     return {"data", "label"};
   }
 
@@ -135,9 +135,9 @@ class SoftmaxProp : public OperatorProperty {
   }
 
   Operator* CreateOperator(Context ctx) const;
+
  private:
   SoftmaxParam param_;
-
 };  // class SoftmaxProp
 #endif  // DMLC_USE_CXX11
 

From a8e73a833ed86efc300c6458bfb985ff0d920c99 Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Sat, 22 Aug 2015 21:21:03 -0600
Subject: [PATCH 20/20] fix lint

---
 python/mxnet/narray.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/mxnet/narray.py b/python/mxnet/narray.py
index 8b826e7b31ae..0693c2fa2cb4 100644
--- a/python/mxnet/narray.py
+++ b/python/mxnet/narray.py
@@ -4,6 +4,7 @@
 from __future__ import absolute_import
 
 import ctypes
+import sys
 from .base import _LIB
 from .base import c_array
 from .base import mx_uint, mx_float, NArrayHandle, FunctionHandle
@@ -185,7 +186,6 @@ def _make_narray_function(handle):
     """Create a NArray function from the FunctionHandle."""
     # Constants for type masks.
     NARRAY_ARG_BEFORE_SCALAR = 1
-    SCALAR_ARG_BEFORE_NARRAY = 1 << 1
     ACCEPT_EMPTY_MUTATE_TARGET = 1 << 2
     # Get the property of NArray
@@ -335,6 +335,8 @@ def _init_narray_module():
     size = ctypes.c_uint()
     check_call(_LIB.MXListFunctions(ctypes.byref(size),
                                     ctypes.byref(plist)))
+
+    module_obj = sys.modules[__name__]
     for i in range(size.value):
         hdl = FunctionHandle(plist[i])
         function = _make_narray_function(hdl)
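
To close the series, a sketch of how the reworked operator parameters surface
in Python. Per pooling-inl.h and activation-inl.h above: kernel, stride, and
pad are TShape fields declared as (y, x) pairs (stride defaulting to (1, 1),
pad to (0, 0)), and pool_type/act_type are the renamed enum fields; tuple
values reach the C++ parameter structs in their str() form. The creator names
Pooling and Activation are assumptions about the corresponding
MXNET_REGISTER_OP_PROPERTY entries, which this series does not show.

    import mxnet as mx

    data = mx.symbol.Variable('data')
    net = mx.symbol.Pooling(data=data, name='pool1',
                            kernel=(2, 2), stride=(2, 2), pool_type='max')
    net = mx.symbol.Activation(data=net, name='act1', act_type='relu')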