From f664f9ddd78a2530772b763dec40141e9c366f5d Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Sun, 11 Oct 2015 22:17:09 -0600
Subject: [PATCH 001/122] LSTM init

---
 dmlc-core                     |   2 +-
 example/LSTM/PennTree.ipynb   | 643 ++++++++++++++++++++++++++++++++++
 mshadow                       |   2 +-
 ps-lite                       |   2 +-
 src/operator/block_grad-inl.h | 110 ++++++
 src/operator/block_grad.cc    |  26 ++
 src/operator/block_grad.cu    |  18 +
 src/operator/reshape-inl.h    |   4 +-
 src/operator/reshape.cc       |   6 +-
 src/operator/reshape.cu       |   4 +-
 10 files changed, 808 insertions(+), 9 deletions(-)
 create mode 100644 example/LSTM/PennTree.ipynb
 create mode 100644 src/operator/block_grad-inl.h
 create mode 100644 src/operator/block_grad.cc
 create mode 100644 src/operator/block_grad.cu

diff --git a/dmlc-core b/dmlc-core
index df27b04189ac..046a4a77e74d 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit df27b04189ac0f4cfc06eafe026c76856af2d84d
+Subproject commit 046a4a77e74d45e5ac16f2a598c31d56d5ccce3d
diff --git a/example/LSTM/PennTree.ipynb b/example/LSTM/PennTree.ipynb
new file mode 100644
index 000000000000..798525284b8b
--- /dev/null
+++ b/example/LSTM/PennTree.ipynb
@@ -0,0 +1,643 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# LSTM on PennTreeBank\n",
+    "-----\n",
+    "This is an example showing how to use MXNet's low-level symbol API to build an LSTM network.\n",
+    "\n",
+    "We would like to thank Wojciech Zaremba for his LSTM implementation in Torch. The data is the same as that used by the Torch LSTM: https://github.com/wojzaremba/lstm\n",
+    "\n",
+    "To get the data, please download directly from:\n",
+    "\n",
+    "Training text: https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt\n",
+    "\n",
+    "Validation text: https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt\n",
+    "\n",
+    "Test text: https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import mxnet as mx\n",
+    "import numpy as np\n",
+    "import time"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    " Build LSTM Symbol\n",
+    "\n",
+    " Parameters:\n",
+    " ----------\n",
+    " num_hidden: int\n",
+    "     number of hidden units in the LSTM\n",
+    " x: symbol\n",
+    "     input x\n",
+    " prev_c: symbol\n",
+    "     previous cell state\n",
+    " prev_h: symbol\n",
+    "     previous hidden state\n",
+    " layer_prefix: str\n",
+    "     name prefix for the layer\n",
+    " t_prefix: str\n",
+    "     name prefix for the time step\n",
+    " arg_param: dict: str->symbol\n",
+    "     argument symbols for the lstm symbol\n",
+    " aux_param: dict: str->symbol\n",
+    "     auxiliary state symbols for the lstm symbol\n",
+    "\n",
+    " Returns:\n",
+    " --------\n",
+    " output: symbol\n",
+    "     grouped lstm output [c, h]\n",
+    "\n",
+    " arg_param: dict: str->symbol\n",
+    "     argument symbols of the lstm symbol\n",
+    "\n",
+    " aux_param: dict: str->symbol\n",
+    "     auxiliary state symbols of the lstm symbol\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def lstm_symbol(num_hidden,\n",
+    "                x, prev_c, prev_h,\n",
+    "                layer_prefix, t_prefix,\n",
+    "                arg_param=None, aux_param=None,\n",
+    "                **kwargs):\n",
+    "    # name and variable\n",
+    "    i2h_name = \"%s_i2h\" % layer_prefix\n",
+    "    h2h_name = \"%s_h2h\" % layer_prefix\n",
+    "    exist_flag = True\n",
+    "    if arg_param == None or i2h_name + \"_weight\" not in arg_param:\n",
+    "        exist_flag = False\n",
+    "\n",
+    "    if not exist_flag:\n",
+    "        if arg_param == None:\n",
+    "            arg_param = {}\n",
+    "        arg_param[i2h_name + \"_weight\"] = mx.sym.Variable(i2h_name + \"_weight\")\n",
+    "        arg_param[i2h_name + \"_bias\"] = mx.sym.Variable(i2h_name + \"_bias\")\n",
+    "        arg_param[h2h_name + \"_weight\"] = mx.sym.Variable(h2h_name + \"_weight\")\n",
+    "        arg_param[h2h_name + \"_bias\"] = mx.sym.Variable(h2h_name + \"_bias\")\n",
+    "    if not exist_flag:\n",
+    "        if aux_param == None:\n",
+    "            aux_param = {}\n",
+    "        aux_param[i2h_name + \"_moving_mean\"] = mx.sym.Variable(i2h_name + \"_moving_mean\")\n",
+    "        aux_param[i2h_name + \"_moving_var\"] = mx.sym.Variable(i2h_name + \"_moving_var\")\n",
+    "        aux_param[h2h_name + \"_moving_mean\"] = mx.sym.Variable(h2h_name + \"_moving_mean\")\n",
+    "        aux_param[h2h_name + \"_moving_var\"] = mx.sym.Variable(h2h_name + \"_moving_var\")\n",
+    "\n",
+    "    # transform \n",
+    "    i2h = mx.sym.FullyConnected(*[x,\n",
+    "                                  arg_param[i2h_name + \"_weight\"],\n",
+    "                                  arg_param[i2h_name + \"_bias\"]],\n",
+    "                                num_hidden=num_hidden * 4,\n",
+    "                                name=i2h_name)\n",
+    "    h2h = mx.sym.FullyConnected(*[prev_h,\n",
+    "                                  arg_param[h2h_name + \"_weight\"],\n",
+    "                                  arg_param[h2h_name + \"_bias\"]],\n",
+    "                                num_hidden=num_hidden * 4,\n",
+    "                                name=h2h_name)\n",
+    "    gates = i2h + h2h\n",
+    "\n",
+    "    # gates\n",
+    "    slice_gates = mx.sym.SliceChannel(data=gates, num_outputs=4)\n",
+    "    in_gate = mx.sym.Activation(data=slice_gates[0], act_type=\"sigmoid\")\n",
+    "    in_transform = mx.sym.Activation(data=slice_gates[1], act_type=\"tanh\")\n",
+    "    forget_gate = mx.sym.Activation(data=slice_gates[2], act_type=\"sigmoid\")\n",
+    "    out_gate = mx.sym.Activation(data=slice_gates[3], act_type=\"sigmoid\")\n",
+    "\n",
+    "    # cal states\n",
+    "    next_c = (forget_gate * prev_c) + (in_gate * in_transform)\n",
+    "    next_h = out_gate * mx.sym.Activation(data=next_c, act_type=\"tanh\")\n",
+    "    # We need to block gradient to set 0 gradient back automatically\n",
+    "    next_c = mx.sym.BlockGrad(data=next_c, name=\"%s_%s_c\" % (t_prefix, layer_prefix))\n",
+    "    next_h = mx.sym.BlockGrad(data=next_h, name=\"%s_%s_h\" % (t_prefix, layer_prefix))\n",
+    "    # if you like you can add a dropout symbol here\n",
+    "    # next_h = mx.sym.Dropout(data=next_h, p=0.5)\n",
+    "    output = mx.symbol.Group([next_c, next_h])\n",
+    "    return (output, arg_param, aux_param)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    " Build a multi-layer LSTM model for a single component in unrolled RNN\n",
+    "\n",
+    " Parameters:\n",
+    " -----------\n",
+    " num_layer: int\n",
+    "     number of layers in the LSTM network\n",
+    " num_hidden: int\n",
+    "     number of hidden units in each LSTM layer\n",
+    " num_embed: int\n",
+    "     dimension of the word embedding\n",
+    " num_label: int\n",
+    "     dimension of the output label space\n",
+    " prev_states: list of tuple (prev_c, prev_h)\n",
+    "     prev_states for each LSTM layer\n",
+    " t_prefix: str\n",
+    "     prefix name of the time step\n",
+    " embed_var: list of symbol\n",
+    "     variables for the embedding layer\n",
+    " cls_var: list of symbol\n",
+    "     variables for the linear classifier\n",
+    " arg_param: dict: str->symbol\n",
+    "     argument symbols of the lstm symbol\n",
+    " aux_param: dict: str->symbol\n",
+    "     auxiliary state symbols of the lstm symbol\n",
+    "\n",
+    " Returns:\n",
+    " layers : list of symbol\n",
+    "     layers of the current component\n",
+    " arg_param: dict: str->symbol\n",
+    "     argument symbols of the lstm symbol\n",
+    "\n",
+    " aux_param: dict: str->symbol\n",
+    "     auxiliary state symbols of the lstm symbol\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def create_model(num_layer, num_hidden, num_embed, num_label,\n",
+    "                 prev_states,\n",
+    "                 t_prefix,\n",
+    "                 embed_var, cls_var, arg_param=None, aux_param=None,\n",
+    "                 **kwargs):\n",
+    "    layers = []\n",
+    "    data = mx.sym.Variable(\"%s_data\" % t_prefix)\n",
+    "    embed_layer = mx.sym.FullyConnected(*[data, embed_var[0], embed_var[1]],\n",
+    "                                        num_hidden=num_embed, name=\"embedding\")\n",
+    "    for i in range(num_layer):\n",
+    "        layer_prefix = \"layer_%d\" % i\n",
+    "        prev_c, prev_h = prev_states[i]\n",
+    "        if i == 0:\n",
+    "            data = embed_layer\n",
+    "        else:\n",
+    "            data = layers[-1][1]\n",
+    "        args = None\n",
+    "        auxs = None\n",
+    "        if arg_param != None:\n",
+    "            args = arg_param\n",
+    "        if aux_param != None:\n",
+    "            auxs = aux_param\n",
+    "        lstm, arg_param, aux_param = lstm_symbol(num_hidden,\n",
+    "                                                 data, prev_c, prev_h,\n",
+    "                                                 layer_prefix, t_prefix,\n",
+    "                                                 args, auxs,\n",
+    "                                                 **kwargs)\n",
+    "        layers.append(lstm)\n",
+    "    fc = mx.sym.FullyConnected(*[layers[-1][1], cls_var[0], cls_var[1]],\n",
+    "                               num_hidden=num_label, name=\"cls\")\n",
+    "    sm = mx.sym.Softmax(data=fc, name=\"%s\" % t_prefix)\n",
+    "    layers.append(sm)\n",
+    "    return layers, arg_param, aux_param"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    " Setup Recurrent Network Symbol\n",
+    "\n",
+    " Parameters:\n",
+    " -----------\n",
+    " seq_len: int\n",
+    "     length of the sequence\n",
+    " num_layer: int\n",
+    "     number of hidden lstm layers\n",
+    " num_embed: int\n",
+    "     dimension of the embedding layer\n",
+    " num_label: int\n",
+    "     dimension of the output space\n",
+    "\n",
+    " Returns:\n",
+    " --------\n",
+    " rnn: symbol\n",
+    "     A final symbol of the RNN network\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "def setup_rnn_symbol(seq_len, num_layer, num_hidden, num_embed, num_label, **kwargs):\n",
+    "    models = []\n",
+    "    arg_param = None\n",
+    "    aux_param = None\n",
+    "    embed_var = [mx.sym.Variable(\"embed_weight\"), mx.sym.Variable(\"embed_bias\")]\n",
+    "    cls_var = [mx.sym.Variable(\"cls_weight\"), mx.sym.Variable(\"cls_bias\")]\n",
+    "    init_states = []\n",
+    "\n",
+    "    for i in range(num_layer):\n",
+    "        init_c = mx.sym.Variable(\"init_c_%d\" % i)\n",
+    "        init_h = mx.sym.Variable(\"init_h_%d\" % i)\n",
+    "        init_states.append([init_c, init_h])\n",
+    "\n",
+    "    for i in range(seq_len):\n",
+    "        t_prefix = \"t_%d\" % i\n",
+    "        if i == 0:\n",
+    "            states = init_states\n",
+    "        else:\n",
+    "            states = [(models[-1][j][0], models[-1][j][1]) for j in range(num_layer)]\n",
+    "        model, arg_param, aux_param = create_model(num_layer, num_hidden, num_embed, num_label,\n",
+    "                                                   states, t_prefix,\n",
+    "                                                   embed_var, cls_var,\n",
+    "                                                   arg_param, aux_param,\n",
+    "                                                   **kwargs)\n",
+    "        models.append(model)\n",
+    "    prob = mx.sym.Group([md[-1] for md in models])\n",
+    "    state = mx.sym.Group([models[-1][i] for i in range(num_layer)])\n",
+    "    rnn = mx.sym.Group([prob, state])\n",
+    "    return rnn"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    " Setup Recurrent Network Executor\n",
+    "\n",
+    " Parameters:\n",
+    " -----------\n",
+    " ctx: Context\n",
+    "     running context\n",
+    " seq_len: int\n",
+    "     length of the sequence\n",
+    " num_layer: int\n",
+    "     number of hidden lstm layers\n",
+    " num_embed: int\n",
+    "     dimension of the embedding layer\n",
+    " num_label: int\n",
+    "     dimension of the output space\n",
+    " batch_size: int\n",
+    "     the batch size\n",
+    " Returns:\n",
+    " --------\n",
+    " rnn: executor\n",
+    "     A final RNN network\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "def setup_rnn(ctx, seq_len, num_layer, num_hidden, num_embed, num_label, batch_size,\n",
+    "              initializer=mx.init.Uniform(0.05)):\n",
+    "\n",
+    "    # get symbol\n",
+    "    rnn_sym = setup_rnn_symbol(seq_len, num_layer, num_hidden, num_embed, num_label)\n",
+    "    input_shapes = {}\n",
+    "    for name in rnn_sym.list_arguments():\n",
+    "        if \"init\" in name:\n",
+    "            input_shapes[name] = (batch_size, num_hidden)\n",
+    "        if \"data\" in name:\n",
+    "            input_shapes[name] = (batch_size, num_label)\n",
+    "    # bind symbol\n",
+    "    rnn_model = rnn_sym.simple_bind(ctx=ctx, **input_shapes)\n",
+    "    # init weight\n",
+    "    names = rnn_sym.list_arguments()\n",
+    "    args = dict(zip(names, rnn_model.arg_arrays))\n",
+    "    grad = dict(zip(names, rnn_model.grad_arrays))\n",
+    "    for name, arr in args.items():\n",
+    "        if name.endswith(\"weight\") or name.endswith(\"bias\") or \\\n",
+    "           name.endswith(\"gamma\") or name.endswith(\"beta\"):\n",
+    "            initializer(name, arr)\n",
+    "    # structure for later use\n",
+    "    param_array = []\n",
+    "    for i in range(len(names)):\n",
+    "        name = names[i]\n",
+    "        if name.endswith(\"weight\") or name.endswith(\"bias\") or \\\n",
+    "           name.endswith(\"gamma\") or name.endswith(\"beta\"):\n",
+    "            param_array.append((i, args[name], grad[name]))\n",
+    "    \n",
+    "    init_states = [(args[\"init_c_%d\" % i], args[\"init_h_%d\" % i]) for i in range(num_layer)]\n",
+    "    last_states = [(rnn_model.outputs[seq_len + i * 2], rnn_model.outputs[seq_len + i * 2 + 1]) for i in range(num_layer)]\n",
+    "    return (rnn_sym, rnn_model, param_array, init_states, last_states)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def Logloss(y, prob):\n",
+    "    #eps = 1e-6\n",
+    "    #return -np.sum(np.log(np.maximum(np.choose(y.astype(\"int32\"), prob.T), eps)))\n",
+    "    loss = 0.0\n",
+    "    for i in range(prob.shape[0]):\n",
+    "        loss += -np.log(np.maximum(prob[i, y[i]], 1e-8))\n",
+    "    loss /= prob.shape[0]\n",
+    "    return loss\n",
+    "\n",
+    "def set_onehot_input(onehot, xidx):\n",
+    "    onehot[:] = 0.\n",
+    "    onehot[np.arange(onehot.shape[0]), xidx.astype(\"int32\")] = 1.\n",
+    "\n",
+    "def load_data(path, dic=None):\n",
+    "    fi = open(path)\n",
+    "    content = fi.read()\n",
+    "    content = content.replace('\\n', '<eos>')\n",
+    "    content = content.split(' ')\n",
+    "    print(\"Loading %s, size of data = %d\" % (path, len(content)))\n",
+    "    x = np.zeros(len(content))\n",
+    "    if dic == None:\n",
+    "        dic = {}\n",
+    "    idx = 0\n",
+    "    for i in range(len(content)):\n",
+    "        word = content[i]\n",
+    "        if len(word) == 0:\n",
+    "            continue\n",
+    "        if not word in dic:\n",
+    "            dic[word] = idx\n",
+    "            idx += 1\n",
+    "        x[i] = dic[word]\n",
+    "    print(\"Unique token: %d\" % len(dic))\n",
+    "    return x, dic\n",
+    "\n",
+    "def replicate_data(x, batch_size):\n",
+    "    nbatch = int(x.shape[0] / batch_size)\n",
+    "    x_cut = x[:nbatch * batch_size]\n",
+    "    data = x_cut.reshape((nbatch, batch_size), order='F')\n",
+    "    return data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loading ./data/ptb.train.txt, size of data = 929590\n",
+      "Unique token: 10000\n",
+      "Loading ./data/ptb.valid.txt, size of data = 73761\n",
+      "Unique token: 10000\n"
] + } + ], + "source": [ + "batch_size = 20\n", + "seq_len = 20\n", + "vocab = 10000\n", + "rnn_hidden = 200\n", + "embed = 200\n", + "num_layer = 2\n", + "num_round = 4\n", + "ctx = mx.cpu()\n", + "optimizer = mx.optimizer.SGD(learning_rate=0.01, wd=0.0001)\n", + "# rnn model\n", + "rnn_sym, rnn, param_array, init_states, last_states, = setup_rnn(ctx=ctx, \n", + " seq_len=seq_len, \n", + " num_layer=num_layer, \n", + " num_hidden=rnn_hidden, \n", + " num_embed=embed, \n", + " num_label=vocab, \n", + " batch_size=batch_size)\n", + "seq_prob = [mx.nd.zeros(ctx=mx.cpu(), shape=rnn.outputs[i].shape) for i in range(seq_len)]\n", + "param_dict = dict(zip(rnn_sym.list_arguments(), rnn.arg_arrays))\n", + "# load data\n", + "X_train, dic = load_data(\"./data/ptb.train.txt\")\n", + "X_val, _ = load_data(\"./data/ptb.valid.txt\", dic)\n", + "X_train_batch = replicate_data(X_train, batch_size)\n", + "X_val_batch = replicate_data(X_val, batch_size)\n", + "onehot = np.zeros((batch_size, vocab), dtype='float32')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch [0], Batch [20]: NLL=8.503, Prep=4931.846\n", + "Epoch [0], Batch [40]: NLL=8.511, Prep=4971.079\n", + "Epoch [0], Batch [60]: NLL=8.366, Prep=4300.328\n", + "Epoch [0], Batch [80]: NLL=8.273, Prep=3917.564\n", + "Epoch [0], Batch [100]: NLL=8.241, Prep=3793.372\n", + "Epoch [0], Batch [120]: NLL=8.146, Prep=3448.532\n", + "Epoch [0], Batch [140]: NLL=8.062, Prep=3172.689\n", + "Epoch [0], Batch [160]: NLL=8.041, Prep=3105.142\n", + "Epoch [0], Batch [180]: NLL=8.107, Prep=3318.143\n", + "Epoch [0], Batch [200]: NLL=8.091, Prep=3264.713\n", + "Epoch [0], Batch [220]: NLL=8.025, Prep=3055.690\n", + "Epoch [0], Batch [240]: NLL=8.020, Prep=3040.329\n", + "Epoch [0], Batch [260]: NLL=7.993, Prep=2960.196\n", + "Epoch [0], Batch [280]: NLL=7.970, Prep=2892.389\n", + "Epoch [0], Batch [300]: NLL=8.021, Prep=3042.987\n", + "Epoch [0], Batch [320]: NLL=7.979, Prep=2918.540\n", + "Epoch [0], Batch [340]: NLL=7.951, Prep=2839.064\n", + "Epoch [0], Batch [360]: NLL=7.982, Prep=2927.875\n", + "Epoch [0], Batch [380]: NLL=7.989, Prep=2948.181\n", + "Epoch [0], Batch [400]: NLL=7.966, Prep=2880.162\n", + "Epoch [0], Batch [420]: NLL=7.942, Prep=2813.045\n", + "Epoch [0], Batch [440]: NLL=7.954, Prep=2847.630\n", + "Epoch [0], Batch [460]: NLL=7.914, Prep=2735.021\n", + "Epoch [0], Batch [480]: NLL=7.878, Prep=2637.867\n", + "Epoch [0], Batch [500]: NLL=7.872, Prep=2624.014\n", + "Epoch [0], Batch [520]: NLL=7.845, Prep=2552.158\n", + "Epoch [0], Batch [540]: NLL=7.813, Prep=2472.874\n", + "Epoch [0], Batch [560]: NLL=7.801, Prep=2443.447\n", + "Epoch [0], Batch [580]: NLL=7.772, Prep=2372.508\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py:6: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n", + "/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py:7: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n", + "/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py:21: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + 
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 34\u001b[0m \u001b[1;31m# train\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 35\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[0mnbatch\u001b[0m \u001b[1;33m<\u001b[0m \u001b[0mX_train_batch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 36\u001b[1;33m \u001b[0mset_rnn_inputs\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mseq_len\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnbatch\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0monehot\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX_train_batch\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparam_dict\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 37\u001b[0m \u001b[0mrnn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mis_train\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 38\u001b[0m \u001b[0mget_rnn_outputs\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mseq_len\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrnn\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mseq_prob\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m\u001b[0m in \u001b[0;36mset_rnn_inputs\u001b[1;34m(seq_len, idx, onehot, X, param_dict)\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mnext_idx\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[0mset_onehot_input\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0monehot\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m \u001b[0mparam_dict\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mdata_key\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0monehot\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 10\u001b[0m \u001b[0mparam_dict\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mlabel_key\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[0midx\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m/home/bing/github/mxnet/python/mxnet/ndarray.py\u001b[0m in \u001b[0;36m__setitem__\u001b[1;34m(self, in_slice, value)\u001b[0m\n\u001b[0;32m 191\u001b[0m \u001b[0mNDArray\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_set_value\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfloat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 192\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mndarray\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgeneric\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 193\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sync_copyfrom\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 194\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 195\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'type %s not supported'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m/home/bing/github/mxnet/python/mxnet/ndarray.py\u001b[0m in \u001b[0;36m_sync_copyfrom\u001b[1;34m(self, source_array)\u001b[0m\n\u001b[0;32m 224\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhandle\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 225\u001b[0m \u001b[0msource_array\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mctypes\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdata_as\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmx_float_p\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 226\u001b[1;33m ctypes.c_size_t(source_array.size)))\n\u001b[0m\u001b[0;32m 227\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 228\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_slice\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstart\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstop\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "def set_rnn_inputs(seq_len, idx, onehot, X, param_dict):\n", + " for j in range(seq_len):\n", + " data_key = \"t_%d_data\" % j\n", + " label_key = \"t_%d_label\" % j\n", + " next_idx = (idx + 1) % X.shape[0]\n", + " x = X[idx, :]\n", + " y = X[next_idx, :]\n", + " set_onehot_input(onehot, x)\n", + " param_dict[data_key][:] = onehot\n", + " param_dict[label_key][:] = y\n", + " idx += 1\n", + "\n", + "def get_rnn_outputs(seq_len, rnn, seq_prob):\n", + " for j in range(seq_len):\n", + " seq_prob[j][:] = rnn.outputs[j]\n", + "\n", + "def get_nll(seq_len, idx, X, seq_prob):\n", + " nll = 0.\n", + " for j in range(seq_len):\n", + " next_idx = (idx + 1) % X.shape[0]\n", + " y = X[next_idx, :]\n", + " nll += Logloss(y, seq_prob[j].asnumpy())\n", + " return nll\n", + " \n", + "\n", + "for i in range(num_round):\n", + " nbatch = 0.\n", + " nll = 0.\n", + " # reset states\n", + " for init_c, init_h in init_states:\n", + " init_c[:] = 0.\n", + " init_h[:] = 0.\n", + " tic = time.time()\n", + " # train\n", + " while nbatch < X_train_batch.shape[0]:\n", + " set_rnn_inputs(seq_len, nbatch, onehot, X_train_batch, param_dict)\n", + " rnn.forward(is_train=True)\n", + " get_rnn_outputs(seq_len, rnn, seq_prob)\n", + " rnn.backward()\n", + " for ind, weight, grad in param_array:\n", + " optimizer.update(ind, weight, grad, None)\n", + " for j in range(num_layer):\n", + " init_states[j][0][:] = last_states[j][0]\n", + " init_states[j][1][:] = last_states[j][1]\n", + " nll += get_nll(seq_len, nbatch, X_train_batch, seq_prob)\n", + " nbatch += seq_len\n", + " if nbatch % 1000 == 0:\n", + " print(\"Epoch [%d], Batch 
[%d]: NLL=%.3f, Prep=%.3f\" % (i, nbatch, nll / nbatch, np.exp(nll / nbatch)))\n", + " toc = time.time()\n", + " print(\"Epoch [%d] Train: Time: %.3f sec, NLL=%.3f, Prep=%.3f\" % (i, toc - tic, nll / nbatch, np.exp(nll / nbatch)))\n", + " nbatch = 0\n", + " nll = 0.\n", + " for init_c, init_h in init_states:\n", + " init_c[:] = 0.\n", + " init_h[:] = 0.\n", + " while nbatch < X_val_batch.shape[0]:\n", + " set_rnn_inputs(seq_len, nbatch, onehot, X_val_batch, param_dict)\n", + " rnn.forward(is_train=False)\n", + " get_rnn_outputs(seq_len, rnn, seq_prob)\n", + " nll += get_nll(seq_len, nbatch, X_val_batch, seq_prob)\n", + " nbatch += seq_len\n", + " print(\"Epoch [%d] Val: NLL=%.3f, Prep=%.3f\" % (i, nll / nbatch, np.exp(nll / nbatch)))\n", + " \n", + " \n", + " \n", + " \n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 154, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 154, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.0" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/mshadow b/mshadow index 87d7c88aa41f..3fb5e7365388 160000 --- a/mshadow +++ b/mshadow @@ -1 +1 @@ -Subproject commit 87d7c88aa41f241e7ea5890201e124fcc1231a6e +Subproject commit 3fb5e7365388293d5e75077fa068e9fdea02d4fb diff --git a/ps-lite b/ps-lite index 43385e25ef63..504faa73a826 160000 --- a/ps-lite +++ b/ps-lite @@ -1 +1 @@ -Subproject commit 43385e25ef63521bc0aa7e6493104106aafb3038 +Subproject commit 504faa73a82638c4b2fe66f5696330da38637c96 diff --git a/src/operator/block_grad-inl.h b/src/operator/block_grad-inl.h new file mode 100644 index 000000000000..4ac7f174dc82 --- /dev/null +++ b/src/operator/block_grad-inl.h @@ -0,0 +1,110 @@ +/*! 
+ * Copyright (c) 2015 by Contributors
+ * \file block_grad-inl.h
+ * \brief
+ * \author Bing Xu
+*/
+#ifndef MXNET_OPERATOR_BLOCK_GRAD_INL_H_
+#define MXNET_OPERATOR_BLOCK_GRAD_INL_H_
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <map>
+#include <string>
+#include <vector>
+#include <utility>
+#include "./mshadow_op.h"
+#include "./operator_common.h"
+
+namespace mxnet {
+namespace op {
+
+enum BlockGradientOpInputs {kData};
+enum BlockGradientOpOutputs {kOut};
+
+template<typename xpu>
+class BlockGradientOp : public Operator {
+ public:
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 1);
+    CHECK_EQ(out_data.size(), 1);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 2> data = in_data[kData].FlatTo2D<xpu, real_t>(s);
+    Tensor<xpu, 2> out = out_data[kOut].FlatTo2D<xpu, real_t>(s);
+    out = F<mshadow_op::identity>(data);
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 2> grad = in_grad[kData].FlatTo2D<xpu, real_t>(s);
+    grad = 0.f;
+  }
+};  // class BlockGradientOp
+
+template<typename xpu>
+Operator *CreateOp();
+
+#if DMLC_USE_CXX11
+class BlockGradientProp : public OperatorProperty {
+ public:
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {}
+
+  std::map<std::string, std::string> GetParams() const override {
+    return {};
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 1);
+    const TShape &dshape = in_shape->at(kData);
+    if (dshape.ndim() == 0) return false;
+    out_shape->clear();
+    out_shape->push_back(dshape);
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    return new BlockGradientProp();
+  }
+
+  std::string TypeString() const override {
+    return "BlockGrad";
+  }
+
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return {};
+  }
+
+  std::vector<std::pair<int, void*> > ForwardInplaceOption(
+    const std::vector<int> &in_data,
+    const std::vector<void*> &out_data) const override {
+    return {{in_data[kData], out_data[kOut]}};
+  }
+
+  Operator* CreateOperator(Context ctx) const;
+};  // class BlockGradientProperty
+
+#endif  // DMLC_USE_CXX11
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_BLOCK_GRAD_INL_H_
diff --git a/src/operator/block_grad.cc b/src/operator/block_grad.cc
new file mode 100644
index 000000000000..67256f79f268
--- /dev/null
+++ b/src/operator/block_grad.cc
@@ -0,0 +1,26 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file block_grad.cc
+ * \brief
+ * \author Bing Xu
+*/
+#include "./block_grad-inl.h"
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<cpu>() {
+  return new BlockGradientOp<cpu>();
+}
+
+Operator *BlockGradientProp::CreateOperator(Context ctx) const {
+  DO_BIND_DISPATCH(CreateOp);
+}
+
+MXNET_REGISTER_OP_PROPERTY(BlockGrad, BlockGradientProp)
+.describe("Get output from a symbol and pass 0 gradient back")
+.add_argument("data", "Symbol", "Input data.");
+
+}  // namespace op
+}  // namespace mxnet
+
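Before the GPU registration below, it is worth pinning down what the new operator does, since the mshadow expressions obscure it: the forward pass is the identity map and the backward pass always emits zeros. A minimal NumPy sketch of that contract (the function names here are illustrative, not part of the patch):

```python
import numpy as np

def block_grad_forward(x):
    # forward: BlockGrad copies its input through unchanged
    return x.copy()

def block_grad_backward(out_grad):
    # backward: the incoming gradient is discarded and zeros are propagated,
    # so nothing upstream of this node receives a gradient
    return np.zeros_like(out_grad)
```

This is what lets the notebook above pass `next_c`/`next_h` forward between unrolled time steps without back-propagating through them a second time.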
diff --git a/src/operator/block_grad.cu b/src/operator/block_grad.cu
new file mode 100644
index 000000000000..22707e940b7e
--- /dev/null
+++ b/src/operator/block_grad.cu
@@ -0,0 +1,18 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file block_grad.cc
+ * \brief
+ * \author Bing Xu
+*/
+#include "./block_grad-inl.h"
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<gpu>() {
+  return new BlockGradientOp<gpu>();
+}
+
+}  // namespace op
+}  // namespace mxnet
+
diff --git a/src/operator/reshape-inl.h b/src/operator/reshape-inl.h
index 730751b1a594..a2a7e58cb3e7 100644
--- a/src/operator/reshape-inl.h
+++ b/src/operator/reshape-inl.h
@@ -33,6 +33,8 @@ struct ReshapeParam : public dmlc::Parameter<ReshapeParam> {
 template<typename xpu>
 class ReshapeOp : public Operator {
  public:
+  explicit ReshapeOp(ReshapeParam param) {}  // Do nothing, just make a special factory
+
   virtual void Forward(const OpContext &ctx,
                        const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &out_data,
@@ -80,7 +82,7 @@
 };  // class ReshapeOp
 
 template<typename xpu>
-Operator* CreateOp();
+Operator* CreateOp(ReshapeParam);
 
 #if DMLC_USE_CXX11
 class ReshapeProp : public OperatorProperty {
diff --git a/src/operator/reshape.cc b/src/operator/reshape.cc
index 6bd077172d4a..bc4375b136ce 100644
--- a/src/operator/reshape.cc
+++ b/src/operator/reshape.cc
@@ -11,12 +11,12 @@
 namespace mxnet {
 namespace op {
 template<>
-Operator *CreateOp<cpu>() {
-  return new ReshapeOp<cpu>();
+Operator *CreateOp<cpu>(ReshapeParam param) {
+  return new ReshapeOp<cpu>(param);
 }
 
 Operator* ReshapeProp::CreateOperator(Context ctx) const {
-  DO_BIND_DISPATCH(CreateOp);
+  DO_BIND_DISPATCH(CreateOp, param_);
 }
 
 DMLC_REGISTER_PARAMETER(ReshapeParam);
diff --git a/src/operator/reshape.cu b/src/operator/reshape.cu
index b810862f3c73..06bbaec1fdfd 100644
--- a/src/operator/reshape.cu
+++ b/src/operator/reshape.cu
@@ -11,8 +11,8 @@
 namespace mxnet {
 namespace op {
 template<>
-Operator *CreateOp<gpu>() {
-  return new ReshapeOp<gpu>();
+Operator *CreateOp<gpu>(ReshapeParam param) {
+  return new ReshapeOp<gpu>(param);
 }
 
 }  // namespace op
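For reference, the gate arithmetic that `lstm_symbol` assembles from `FullyConnected`, `SliceChannel`, and `Activation` nodes in the notebook above is the standard LSTM cell update. Here is a NumPy sketch of a single step, assuming the weight matrices stack the four gate blocks in the same order that the `num_hidden * 4` output is sliced (a sketch for intuition, not code from the patch):

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step(x, prev_c, prev_h, W_i2h, b_i2h, W_h2h, b_h2h):
    # one fused affine transform, then a split into the four gate blocks,
    # mirroring FullyConnected(num_hidden * 4) followed by SliceChannel
    gates = x.dot(W_i2h.T) + b_i2h + prev_h.dot(W_h2h.T) + b_h2h
    i, t, f, o = np.split(gates, 4, axis=1)
    in_gate, in_transform = sigmoid(i), np.tanh(t)
    forget_gate, out_gate = sigmoid(f), sigmoid(o)
    next_c = forget_gate * prev_c + in_gate * in_transform
    next_h = out_gate * np.tanh(next_c)
    return next_c, next_h
```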
From ad70deee19fb32d05299e97a9cb3b5b1254fcf70 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Sun, 18 Oct 2015 16:31:49 -0700
Subject: [PATCH 002/122] Update index.md

---
 doc/R-package/index.md | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/doc/R-package/index.md b/doc/R-package/index.md
index 68bc97aed699..744fcd6247f8 100644
--- a/doc/R-package/index.md
+++ b/doc/R-package/index.md
@@ -11,12 +11,7 @@ Sounds exciting? This page contains links to all the related documents on R pack
 
 Get Started
 -----------
-There are several information to get you started
-* [Installation Guide](../build.md) contains instructions to install mxnet.
-* [Tutorials](#tutorials) contains various examples how how mxnet can be applied to different cool tasks :)
-* [Contributor Guide](http://mxnet.readthedocs.org/en/latest/contribute.html#r-package)
-  - The R package section gives various guidelines on how to contribute code, tutorial, rmarkdown examples to mxnet.
-  - Your contribution is always welcomed!
+Check out the [Installation Guide](../build.md) for instructions to install mxnet, and the [Tutorials](#tutorials) for examples of how to use mxnet for various tasks.
 
 Tutorials
 ---------
@@ -25,3 +20,10 @@ Tutorials
 * [Handwritten Digits Classification Competition](mnistCompetition.md)
 * [Tutorial on NDArray and Symbol](ndarrayAndSymbolTutorial.md)
 
+Resources
+---------
+There are several resources to get you started
+* [Installation Guide](../build.md) contains instructions to install mxnet.
+* [Contributor Guide](http://mxnet.readthedocs.org/en/latest/contribute.html#r-package)
+  - The R package section gives various guidelines on how to contribute code, tutorial, rmarkdown examples to mxnet.
+  - Your contribution is always welcomed!
From bc0dd8d5edb45e1d762f5f1f161a70dbc325d9ae Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Sun, 18 Oct 2015 16:41:46 -0700
Subject: [PATCH 003/122] Update README.md

Make R Package visible
---
 README.md | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 664dfac89f88..4175604eced5 100644
--- a/README.md
+++ b/README.md
@@ -12,9 +12,7 @@ deep learning programs together to maximize the efficiency and your productivity
 
 What's New
 ----------
-* [Note on Programming Models for Deep Learning](http://mxnet.readthedocs.org/en/latest/program_model.html)
-* [Pretrained Inception BatchNorm Network](example/notebooks/predict-with-pretrained-model.ipynb)
-* [Working with Numpy](example/mnist/mlp_numpy.py)
+* [MXNet R Package brings Deep learning for R!](https://github.com/dmlc/mxnet/tree/master/R-package)
 * [Note on Dependency Engine for Deep Learning](http://mxnet.readthedocs.org/en/latest/developer-guide/note_engine.html)
 
 Contents
From ddb7b4b7d9bea744e038ceffeff126e0b992a352 Mon Sep 17 00:00:00 2001
From: Xiaodong
Date: Mon, 19 Oct 2015 08:29:31 +0800
Subject: [PATCH 004/122] Update README.md

---
 R-package/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R-package/README.md b/R-package/README.md
index 3c46288fb8c8..859cf95e4551 100644
--- a/R-package/README.md
+++ b/R-package/README.md
@@ -3,11 +3,11 @@
 [![Build Status](https://travis-ci.org/dmlc/mxnet.svg?branch=master)](https://travis-ci.org/dmlc/mxnet)
 [![Documentation Status](https://readthedocs.org/projects/mxnet/badge/?version=latest)](http://mxnet.readthedocs.org/en/latest/R-package/index.html)
 
-You have find MXNet R Package! The MXNet R packages brings flexible and efficient GPU
+You have found MXNet R Package! The MXNet R package brings flexible and efficient GPU
 computing and state-of-art deep learning to R.
 
 - It enables you to write seamless tensor/matrix computation with multiple GPUs in R.
-- It also enables you construct and customize the state-of-art deep learning models in R,
+- It also enables you to construct and customize the state-of-art deep learning models in R,
   and apply them to tasks such as image classification and data science challenges.
 
 Sounds exciting? This page contains links to all the related documents on R package.
From 016e049fee799bbc1bb0c1e3e3d3ff858603840b Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Sun, 18 Oct 2015 17:34:16 -0700
Subject: [PATCH 005/122] Update CONTRIBUTORS.md

Update contributors
---
 CONTRIBUTORS.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 63685d9af1fd..d562d9b200c8 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -40,14 +40,16 @@ List of Contributors
 * [Full List of Contributors](https://github.com/dmlc/mxnet/graphs/contributors)
   - To contributors: please add your name to the list when you submit a patch to the project:)
 * [Qiang Kou](https://github.com/thirdwing)
-  - KK is a R ninja, he will make mxnet available for R users.
+  - KK is an R ninja, he makes mxnet available for R users.
+* [Tong He](https://github.com/hetong007)
+  - Tong is the major maintainer of MXNetR, he designed the mxnet interface and wrote many of the tutorials on R.
 * [Feng Wang](https://github.com/happynear)
   - Feng makes mxnet compatible with Windows Visual Studio.
 * [Li Dong](https://github.com/donglixp)
 * [Piji Li](https://github.com/lipiji)
 * [Hu Shiwen](https://github.com/yajiedesign)
 * [Boyuan Deng](https://github.com/bryandeng)
-* [Tong He](https://github.com/hetong007)
 * [Junran He](https://github.com/junranhe)
   - Junran makes device kvstore allocation strategy smarter
 * [Shuzhe Wu](https://github.com/II-Matto)
+* [Xiaodong](https://github.com/XD-DENG)
From dd0d70347b2599c253b7f5be4744787709f6662a Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Sun, 18 Oct 2015 23:38:02 -0700
Subject: [PATCH 006/122] [R] windows compatibility

---
 R-package/src/Makevars.win | 13 +++++++++++++
 R-package/src/io.cc        |  2 +-
 R-package/src/io.h         |  2 +-
 R-package/src/ndarray.cc   |  2 +-
 include/mxnet/c_api.h      |  6 +++---
 5 files changed, 19 insertions(+), 6 deletions(-)
 create mode 100644 R-package/src/Makevars.win

diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
new file mode 100644
index 000000000000..67452b8f634a
--- /dev/null
+++ b/R-package/src/Makevars.win
@@ -0,0 +1,13 @@
+# _*_ mode: makefile; _*_
+PKGROOT=../../
+
+# This file is only used for compilation from github
+# It will be replaced by more formal Rpackage structure
+# Where PKGROOT moved to root directory
+
+.PHONY: all mxnet
+all: $(SHLIB)
+
+
+PKG_CPPFLAGS = -I$(PKGROOT)/include -I$(PKGROOT)/dmlc-core/include
+PKG_LIBS = -L../inst/libs/x64/ -llibmxnet
diff --git a/R-package/src/io.cc b/R-package/src/io.cc
index d0916b25b5ec..2ae5cf832a43 100644
--- a/R-package/src/io.cc
+++ b/R-package/src/io.cc
@@ -123,7 +123,7 @@ int ArrayDataIter::NumPad() const {
 Rcpp::RObject ArrayDataIter::Create(const Rcpp::NumericVector& data,
                                     const Rcpp::NumericVector& label,
                                     const Rcpp::NumericVector& unif_rnds,
-                                    size_t batch_size,
+                                    int batch_size,
                                     bool shuffle) {
   return Rcpp::internal::make_new_object<ArrayDataIter>(
      new ArrayDataIter(data, label, unif_rnds, batch_size, shuffle));
diff --git a/R-package/src/io.h b/R-package/src/io.h
index e643b958e944..8a68ec7d30df 100644
--- a/R-package/src/io.h
+++ b/R-package/src/io.h
@@ -118,7 +118,7 @@ class ArrayDataIter : public DataIter {
   static Rcpp::RObject Create(const Rcpp::NumericVector& data,
                               const Rcpp::NumericVector& label,
                               const Rcpp::NumericVector& unif_rnds,
-                              size_t batch_size,
+                              int batch_size,
                               bool shuffle);
 
  private:
diff --git a/R-package/src/ndarray.cc b/R-package/src/ndarray.cc
index a15f05aac0e1..f944384f065a 100644
--- a/R-package/src/ndarray.cc
+++ b/R-package/src/ndarray.cc
@@ -586,7 +586,7 @@ Context::RObjectType ctx(const NDArray::RObjectType& src) {
   return NDArray(src).ctx().RObject();
 }
 
-size_t Size(const NDArray::RObjectType& src) {
+unsigned long Size(const NDArray::RObjectType& src) {  // NOLINT(*)
   return NDArray(src).Size();
 }
 
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index b0cf64323afe..82ab2ccb1239 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -11,11 +11,11 @@
 #endif
 /*!
\brief MXNET_DLL prefix for windows" */ -#ifdef _MSC_VER +#ifdef _WIN32 #ifdef MXNET_EXPORTS -#define MXNET_DLL MXNET_EXTERN_C __declspec(dllexport) +#define MXNET_DLL MXNET_EXTERN_C __cdecl __declspec(dllexport) #else -#define MXNET_DLL MXNET_EXTERN_C __declspec(dllimport) +#define MXNET_DLL MXNET_EXTERN_C __cdecl __declspec(dllimport) #endif #else #define MXNET_DLL MXNET_EXTERN_C From 59e64d4286459c49df442124548db9b80c8a29e9 Mon Sep 17 00:00:00 2001 From: Xiao Nan Date: Mon, 19 Oct 2015 02:00:21 -0500 Subject: [PATCH 007/122] add regression metrics MAE and RMSLE --- R-package/NAMESPACE | 2 ++ R-package/R/metric.R | 26 +++++++++++++++++++++----- R-package/man/mx.metric.accuracy.Rd | 4 ++-- R-package/man/mx.metric.mae.Rd | 20 ++++++++++++++++++++ R-package/man/mx.metric.rmse.Rd | 4 ++-- R-package/man/mx.metric.rmsle.Rd | 20 ++++++++++++++++++++ 6 files changed, 67 insertions(+), 9 deletions(-) create mode 100644 R-package/man/mx.metric.mae.Rd create mode 100644 R-package/man/mx.metric.rmsle.Rd diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index ab141f4abdc6..e1bb9570fb40 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -33,7 +33,9 @@ export(mx.io.extract) export(mx.kv.create) export(mx.metric.accuracy) export(mx.metric.custom) +export(mx.metric.mae) export(mx.metric.rmse) +export(mx.metric.rmsle) export(mx.model.FeedForward.create) export(mx.model.load) export(mx.model.save) diff --git a/R-package/R/metric.R b/R-package/R/metric.R index 97cc7314977d..7d0b09fca922 100644 --- a/R-package/R/metric.R +++ b/R-package/R/metric.R @@ -1,7 +1,7 @@ #' Helper function to create a customized metric -#' +#' #' @export -mx.metric.custom <-function(name, feval) { +mx.metric.custom <- function(name, feval) { init <- function() { c(0, 0) } @@ -18,7 +18,7 @@ mx.metric.custom <-function(name, feval) { return(ret) } -#' Accuracy metric +#' Accuracy metric for classification #' #' @export mx.metric.accuracy <- mx.metric.custom("accuracy", function(label, pred) { @@ -26,10 +26,26 @@ mx.metric.accuracy <- mx.metric.custom("accuracy", function(label, pred) { return(sum((label + 1) == ypred) / length(label)) }) -#' RMSE metric -#' +#' RMSE (Root Mean Squared Error) metric for regression +#' #' @export mx.metric.rmse <- mx.metric.custom("rmse", function(label, pred) { res <- sqrt(mean((label-pred)^2)) return(res) }) + +#' MAE (Mean Absolute Error) metric for regression +#' +#' @export +mx.metric.mae <- mx.metric.custom("mae", function(label, pred) { + res <- mean(abs(label-pred)) + return(res) +}) + +#' RMSLE (Root Mean Squared Logarithmic Error) metric for regression +#' +#' @export +mx.metric.rmsle <- mx.metric.custom("rmsle", function(label, pred) { + res <- sqrt(mean((log(pred + 1) - log(label + 1))^2)) + return(res) +}) diff --git a/R-package/man/mx.metric.accuracy.Rd b/R-package/man/mx.metric.accuracy.Rd index c8f4049a6ea6..174d77fed8f9 100644 --- a/R-package/man/mx.metric.accuracy.Rd +++ b/R-package/man/mx.metric.accuracy.Rd @@ -3,7 +3,7 @@ \docType{data} \name{mx.metric.accuracy} \alias{mx.metric.accuracy} -\title{Accuracy metric} +\title{Accuracy metric for classification} \format{\preformatted{List of 3 $ init :function () $ update:function (label, pred, state) @@ -14,7 +14,7 @@ mx.metric.accuracy } \description{ -Accuracy metric +Accuracy metric for classification } \keyword{datasets} diff --git a/R-package/man/mx.metric.mae.Rd b/R-package/man/mx.metric.mae.Rd new file mode 100644 index 000000000000..a98df21f7d7f --- /dev/null +++ b/R-package/man/mx.metric.mae.Rd @@ -0,0 +1,20 @@ +% 
Generated by roxygen2 (4.1.1): do not edit by hand
+% Please edit documentation in R/metric.R
+\docType{data}
+\name{mx.metric.mae}
+\alias{mx.metric.mae}
+\title{MAE (Mean Absolute Error) metric for regression}
+\format{\preformatted{List of 3
+ $ init  :function ()
+ $ update:function (label, pred, state)
+ $ get   :function (state)
+ - attr(*, "class")= chr "mx.metric"
+}}
+\usage{
+mx.metric.mae
+}
+\description{
+MAE (Mean Absolute Error) metric for regression
+}
+\keyword{datasets}
+
diff --git a/R-package/man/mx.metric.rmse.Rd b/R-package/man/mx.metric.rmse.Rd
index f6f4cc2d1d87..76b4696a910b 100644
--- a/R-package/man/mx.metric.rmse.Rd
+++ b/R-package/man/mx.metric.rmse.Rd
@@ -3,7 +3,7 @@
 \docType{data}
 \name{mx.metric.rmse}
 \alias{mx.metric.rmse}
-\title{RMSE metric}
+\title{RMSE (Root Mean Squared Error) metric for regression}
 \format{\preformatted{List of 3
  $ init  :function ()
  $ update:function (label, pred, state)
@@ -14,7 +14,7 @@
 mx.metric.rmse
 }
 \description{
-RMSE metric
+RMSE (Root Mean Squared Error) metric for regression
 }
 \keyword{datasets}
 
diff --git a/R-package/man/mx.metric.rmsle.Rd b/R-package/man/mx.metric.rmsle.Rd
new file mode 100644
index 000000000000..3e2737fe07b7
--- /dev/null
+++ b/R-package/man/mx.metric.rmsle.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2 (4.1.1): do not edit by hand
+% Please edit documentation in R/metric.R
+\docType{data}
+\name{mx.metric.rmsle}
+\alias{mx.metric.rmsle}
+\title{RMSLE (Root Mean Squared Logarithmic Error) metric for regression}
+\format{\preformatted{List of 3
+ $ init  :function ()
+ $ update:function (label, pred, state)
+ $ get   :function (state)
+ - attr(*, "class")= chr "mx.metric"
+}}
+\usage{
+mx.metric.rmsle
+}
+\description{
+RMSLE (Root Mean Squared Logarithmic Error) metric for regression
+}
+\keyword{datasets}
+
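The three regression metrics added above are one-liners; for readers who want a language-neutral reference to check the R versions against, here is the same arithmetic in NumPy (my own sketch, not part of the patch):

```python
import numpy as np

def rmse(label, pred):
    return np.sqrt(np.mean((label - pred) ** 2))

def mae(label, pred):
    return np.mean(np.abs(label - pred))

def rmsle(label, pred):
    # np.log1p(x) == log(x + 1), matching the R definition
    return np.sqrt(np.mean((np.log1p(pred) - np.log1p(label)) ** 2))
```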
From 29c7b1cffbaee5ffea1f61824cc522d0c7301f01 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Mon, 19 Oct 2015 10:01:35 -0700
Subject: [PATCH 008/122] Update CONTRIBUTORS.md

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index d562d9b200c8..47d0de5078a7 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -53,3 +53,4 @@ List of Contributors
   - Junran makes device kvstore allocation strategy smarter
 * [Shuzhe Wu](https://github.com/II-Matto)
 * [Xiaodong](https://github.com/XD-DENG)
+* [Nan Xiao](https://github.com/road2stat)
From 31fb96639e208923a60e4e26a5fc19a8404c1dab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C3=ABl=20Benesty?=
Date: Mon, 19 Oct 2015 20:51:22 +0200
Subject: [PATCH 009/122] small typo

---
 doc/build.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/build.md b/doc/build.md
index db8b4c585504..98c94344b33d 100644
--- a/doc/build.md
+++ b/doc/build.md
@@ -120,7 +120,7 @@ cd python; python setup.py develop --user
 
 R Package Installation
 ----------------------
-To install the python package. First finish the [Build MXNet Library](#build-mxnet-library) step.
+To install the R package. First finish the [Build MXNet Library](#build-mxnet-library) step.
 Then use the following command to install mxnet at root folder
 
 ```bash
@@ -131,4 +131,4 @@ Hopefully, we will now have mxnet on R!
 
 ## Note on Library Build
 We isolate the library build with Rcpp end to maximize the portability
-  - MSVC is needed on windows to build the mxnet library, because of CUDA compatiblity issue of toolchains.
\ No newline at end of file
+  - MSVC is needed on windows to build the mxnet library, because of CUDA compatibility issue of toolchains.
From b969aadeea38bee190f58043cf7ae2a1553ba138 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C3=ABl=20Benesty?=
Date: Mon, 19 Oct 2015 20:56:41 +0200
Subject: [PATCH 010/122] Error in R install command

---
 doc/build.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/build.md b/doc/build.md
index 98c94344b33d..6907d78bf154 100644
--- a/doc/build.md
+++ b/doc/build.md
@@ -124,7 +124,7 @@ R Package Installation
 Then use the following command to install mxnet at root folder
 
 ```bash
-R CMD INSTALL R-Package
+R CMD INSTALL R-package
 ```
 
 Hopefully, we will now have mxnet on R!
From ac3cd267b55bc9269dcc4dcb3fb6cbd27b4fef41 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C3=ABl=20Benesty?=
Date: Mon, 19 Oct 2015 21:01:57 +0200
Subject: [PATCH 011/122] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4175604eced5..0b0fd543e7f2 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,7 @@ Features
 * Auto parallelization
   - Write numpy-style ndarray GPU programs, which will be automatically parallelized.
 * Language agnostic
-  - With support for python, c++, more to come.
+  - With support for python, c++, R, more to come.
 * Cloud friendly
   - Directly load/save from S3, HDFS, AZure
 * Easy extensibility
From cf0e6fb2000368d1b74f9f68e52c9052447fb198 Mon Sep 17 00:00:00 2001
From: muli
Date: Mon, 19 Oct 2015 16:57:55 -0400
Subject: [PATCH 012/122] [kvstore] switch to mli/ps-lite

---
 .gitmodules | 2 +-
 ps-lite     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index c1084105e4aa..6505cc089282 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -6,4 +6,4 @@
   url = https://github.com/dmlc/dmlc-core.git
 [submodule "ps-lite"]
   path = ps-lite
-  url = https://github.com/dmlc/ps-lite.git
+  url = https://github.com/mli/ps-lite.git
diff --git a/ps-lite b/ps-lite
index 504faa73a826..1955d3dd0217 160000
--- a/ps-lite
+++ b/ps-lite
@@ -1 +1 @@
-Subproject commit 504faa73a82638c4b2fe66f5696330da38637c96
+Subproject commit 1955d3dd021794e207231c408b4f020f0191a33a
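The next patch rebuilds the distributed KVStore on top of the rewritten ps-lite API (`KVWorker::ZPush`/`ZPull` instead of the old `KVCache`). The user-facing contract is unchanged; a rough sketch of how a worker script exercises it, assuming a launcher has already started the scheduler and server processes:

```python
import mxnet as mx

# 'dist_sync' runs the servers in synchronous mode; 'local' works for single-machine tests
kv = mx.kv.create('dist_sync')

shape = (2, 3)
kv.init(3, mx.nd.ones(shape))        # key 3 -> initial value on the servers
kv.push(3, mx.nd.ones(shape) * 8)    # workers push updates, servers aggregate
out = mx.nd.zeros(shape)
kv.pull(3, out=out)                  # pull the aggregated value back
print(out.asnumpy())
```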
From ce42f1688fca82e9de8019301c96cf3f5d6c36c6 Mon Sep 17 00:00:00 2001
From: muli
Date: Mon, 19 Oct 2015 21:32:33 -0400
Subject: [PATCH 013/122] [kvstore] refactor to new ps-lite

---
 src/kvstore/kvstore.cc            |  25 +-
 src/kvstore/kvstore_dist.h        | 157 ++++------
 src/kvstore/kvstore_dist_server.h | 232 ++++++++++++++++
 src/kvstore/mxnet_ps_node.h       | 431 ------------------------------
 4 files changed, 283 insertions(+), 562 deletions(-)
 create mode 100644 src/kvstore/kvstore_dist_server.h
 delete mode 100644 src/kvstore/mxnet_ps_node.h

diff --git a/src/kvstore/kvstore.cc b/src/kvstore/kvstore.cc
index b4044d0a1a0c..0de025ba9a35 100644
--- a/src/kvstore/kvstore.cc
+++ b/src/kvstore/kvstore.cc
@@ -10,7 +10,6 @@
 #include "./kvstore_device.h"
 #if MXNET_USE_DIST_KVSTORE
 #include "./kvstore_dist.h"
-#include "./mxnet_ps_node.h"
 #endif  // MXNET_USE_DIST_KVSTORE
 
 namespace mxnet {
@@ -36,7 +35,7 @@ KVStore* KVStore::Create(const char *type_name) {
         kv->IsWorkerNode() &&
         kv->get_rank() == 0) {
       // configure the server to be the sync mode
-      kv->SendCommandToServers(kvstore::CommandID::kSyncMode, "");
+      kv->SendCommandToServers(kvstore::kSyncMode, "");
     }
 #else
     LOG(FATAL) << "compile with USE_DIST_KVSTORE=1 to use " << tname;
@@ -50,25 +49,3 @@ KVStore* KVStore::Create(const char *type_name) {
 }
 
 }  // namespace mxnet
-
-#if MXNET_USE_DIST_KVSTORE
-
-namespace ps {
-
-App* App::Create(int argc, char *argv[]) {
-  NodeInfo n;
-  if (n.IsWorker()) {
-    return new ::mxnet::kvstore::MXNetWorker();
-  } else if (n.IsServer()) {
-    return new ::mxnet::kvstore::MXNetServer();
-  } else if (n.IsScheduler()) {
-    return new ::mxnet::kvstore::MXNetScheduler();
-  } else {
-    LOG(FATAL) << "unknown node";
-  }
-  return NULL;
-}
-
-}  // namespace ps
-
-#endif  // MXNET_USE_DIST_KVSTORE
diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h
index ef5cb6b999d7..fe6176ce9c9a 100644
--- a/src/kvstore/kvstore_dist.h
+++ b/src/kvstore/kvstore_dist.h
@@ -8,11 +8,9 @@
 #include <string>
 #include <vector>
 #include "./kvstore_local.h"
-#include "./mxnet_ps_node.h"
 #include "mxnet/engine.h"
-// #include "dmlc/parameter.h"
-#include "ps.h"
-#include "base/range.h"
+#include "ps/ps.h"
+#include "./kvstore_dist_server.h"
 
 namespace mxnet {
 namespace kvstore {
@@ -29,27 +27,22 @@ namespace kvstore {
  */
 class KVStoreDist : public KVStoreLocal {
 public:
-  KVStoreDist()
-      : server_(NULL),
-        cache_(NULL),
-        barrier_count_(0) {
+  KVStoreDist() : ps_worker_(nullptr), server_(nullptr) {
     if (IsWorkerNode()) {
-      cache_ = new ps::KVCache<ps::Key, real_t>(PS_KV_ID);
-      StartPS();
+      ps_worker_ = new ps::KVWorker<real_t>(0);
+      ps::Start("mxnet\0");
    }
  }

  virtual ~KVStoreDist() {
    Engine::Get()->WaitForAll();
-    delete cache_;
-
    if (IsWorkerNode()) {
      if (get_rank() == 0) {
        // stop the executor at servers
-        SendCommandToServers(CommandID::kStop, "");
+        SendCommandToServers(kStopServer, "");
      }
-      Barrier();
-      ps::StopSystem();
+      ps::Finalize();
+      delete ps_worker_;
    }
  }
@@ -62,8 +55,6 @@ class KVStoreDist : public KVStoreLocal {
      Wait(keys);
    } else {
      // do nothing
-      // // simply increase the clock. it's necessary for BSP
-      // cache_->executor()->IncrClock(keys.size());
    }
    Barrier();
  }
@@ -90,15 +81,9 @@ class KVStoreDist : public KVStoreLocal {

      // do push
      real_t* data = static_cast<real_t*>(merged.data().dptr_);
-      ps::SArray<real_t> vals(data, size, ps::EmptyDel<real_t>());
-      ps::SyncOpts opts;
-      opts.callback = [cb]() { cb(); };
-      CHECK_NOTNULL(cache_)->Push(
-          opts.GetTask(),
-          pskv.keys,
-          vals,
-          pskv.vals_size,
-          opts.callback);
+      ps::SArray<real_t> vals(data, size, false);  // false means no delete
+      CHECK_NOTNULL(ps_worker_)->ZPush(
+          pskv.keys, vals, pskv.lens, 0, [cb]() { cb(); });
    };
    Engine::Get()->PushAsync(
        push_to_servers,
@@ -134,18 +119,10 @@ class KVStoreDist : public KVStoreLocal {
      // convert to ps keys
      PSKV& pskv = EncodeKey(key, size);

-      // pull opts
-      ps::SyncOpts opts;
-      opts.callback = [cb]() { cb(); };
-      // issue pull
-      CHECK_NOTNULL(cache_)->Pull(
-          opts.GetTask(),
-          pskv.keys,
-          opts.callback,
-          data,
-          size,
-          pskv.vals_size.data());
+      // issue pull
+      ps::SArray<real_t> vals(data, size, false);  // false means no delete
+      CHECK_NOTNULL(ps_worker_)->ZPull(
+          pskv.keys, &vals, &pskv.lens, 0, [cb](){ cb(); });
    };

    CHECK_NOTNULL(Engine::Get())->PushAsync(
@@ -172,36 +149,30 @@ class KVStoreDist : public KVStoreLocal {
  }

  void Barrier() override {
-    ps::Task task;
-    task.set_cmd(CommandID::SetBarrier(barrier_count_++));
-    auto node = CHECK_NOTNULL(ps::NodeInfo::MyApp());
-    node->Wait(node->Submit(task, ps::NodeInfo::SchedulerID()));
+    ps::Postoffice::Get()->Barrier(ps::kWorkerGroup);
  }

  void SendCommandToServers(int cmd_id, const std::string& cmd_body) override {
-    ps::Task task;
-    task.set_cmd(cmd_id);
-    task.set_msg(cmd_body);
-    auto node = CHECK_NOTNULL(ps::NodeInfo::MyApp());
-    node->Wait(node->Submit(task, ps::kServerGroup));
+    CHECK_NOTNULL(ps_worker_);
+    ps_worker_->Wait(ps_worker_->Request(cmd_id, cmd_body, ps::kServerGroup));
  }

-  int get_group_size() const override { return ps::NodeInfo::RankSize(); }
+  int get_group_size() const override { return ps::NumWorkers(); }

-  int get_rank() const override { return ps::NodeInfo::MyRank(); }
+  int get_rank() const override { return ps::MyRank(); }

  void RunServer(const Controller& controller) override {
    CHECK(!IsWorkerNode());
-    StartPS();
    if (IsServerNode()) {
-      server_ = new KVStoreDistServer(controller);
-      server_->Run();
-      delete server_;
-      server_ = nullptr;
+      server_ = new KVStoreDistServer();
+      server_->set_controller(controller);
    }
-    ps::StopSystem();
+    ps::Start("mxnet_server\0");
+    if (server_) server_->Run();
+    ps::Finalize();
+    delete server_; server_ = nullptr;
  }

 private:
@@ -232,25 +203,11 @@ class KVStoreDist : public KVStoreLocal {
  }

-  /**
-   * \brief start the network threads in ps-lite
-   */
-  void StartPS() {
-    // hack argc argv
-    int argc = 1;
-    char** argv = new char*[1];
-    char name[] = "mxnet";
-    argv[0] = new char[strlen(name)+1];
-    memcpy(argv[0], name, strlen(name));
-    argv[0][strlen(name)] = '\0';
-    ps::StartSystem(&argc, &argv);
-  }
-
  /**
-   * \brief struct for ps keys and vals_size
+   * \brief struct for ps keys and lens
   */
  struct PSKV {
    ps::SArray<ps::Key> keys;  // n keys
-    ps::SArray<int> vals_size;  // the length of the i-th value
+    ps::SArray<int> lens;  // the length of the i-th value
    int size;
  };

  /**
   * \brief cache all key partitions
   */
  std::unordered_map<int, PSKV> ps_kv_;

  /**
   * \brief serialize EncodeKey
   */
  std::mutex mu_;

-  /**
-   * \brief key partition of server nodes in ps
-   */
-  std::vector<ps::Key> server_key_partition_;
-
  /**
   * \brief convert to keys in ps
   */
  inline PSKV& EncodeKey(int key, size_t size) {
-    CHECK_EQ(sizeof(ps::Key), 8) << "Do not use USE_KEY32=1 to compile ps-lite";
-    int num_servers = ps::NodeInfo::NumServers();
-    CHECK_GT(num_servers, 0);
-
    mu_.lock();
-    // init key parititon
-    if (server_key_partition_.empty()) {
-      auto all = ps::Range<ps::Key>::All();
-      for (int i = 0; i < num_servers; ++i) {
-        ps::Key key = all.EvenDivide(num_servers, i).begin();
-        server_key_partition_.push_back(
-            ((key >> CommandID::kIndexBits)+1) << CommandID::kIndexBits);
-      }
-    }
-
    PSKV& pskv = ps_kv_[key];
    mu_.unlock();

    if (!pskv.keys.empty()) {
      CHECK_EQ(pskv.size, size) << "The value size cannot be changed";
    } else {
+      auto krs = ps::Postoffice::Get()->GetServerKeyRanges();
+      int num_servers = krs.size();
+      CHECK_GT(num_servers, 0);
+
+      // a simple heuristic for load balance
      if (size < bigarray_bound_) {
        // send it to a single random picked server
        int server = (key * 9973) % num_servers;
-        pskv.keys.push_back(server_key_partition_[server] | key);
-        pskv.vals_size.push_back(size);
+        ps::Key ps_key = krs[server].begin() + key;
+        CHECK_LT(ps_key, krs[server].end());
+        pskv.keys.push_back(ps_key);
+        pskv.lens.push_back(size);
+        pskv.size = size;
      } else {
-        // divide it to all servers
-        auto all = ps::Range<size_t>(0, size);
+        // partition it to all servers
+        pskv.size = 0;
        for (int i = 0; i < num_servers; ++i) {
-          pskv.keys.push_back(server_key_partition_[i] | key);
-          pskv.vals_size.push_back(all.EvenDivide(num_servers, i).size());
+          size_t part_size =
+              static_cast<size_t>(static_cast<double>(size)/num_servers*(i+1)) -
+              static_cast<size_t>(static_cast<double>(size)/num_servers*i);
+          ps::Key ps_key = krs[i].begin() + key;
+          CHECK_LT(ps_key, krs[i].end());
+          pskv.keys.push_back(ps_key);
+          pskv.lens.push_back(part_size);
+          pskv.size += part_size;
        }
+        CHECK_EQ(pskv.size, size);
      }
-      pskv.size = size;
    }
    return pskv;
  }

-  /**
-   * \brief a server node
-   */
-  KVStoreDistServer* server_;
-
  /**
   * \brief for worker to push and pull data
-   * use KVCache rather than KVWorker for the c-style pull
   */
-  ps::KVCache<ps::Key, real_t>* cache_;
-
-
+  ps::KVWorker<real_t>* ps_worker_;
  /**
-   * \brief the count for barrier
+   * \brief the server handle
   */
-  int barrier_count_;
+  KVStoreDistServer* server_;
 };

 }  // namespace kvstore
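The load-balancing heuristic in `EncodeKey` above is easy to restate outside the C++: small values go whole to one pseudo-randomly chosen server, while large values are split evenly across all of them. A direct Python transcription, for illustration only (the function name is mine):

```python
def encode_key(key, size, num_servers, bigarray_bound):
    # returns a list of (server, length) parts for one mxnet key
    if size < bigarray_bound:
        # small array: send it whole to a single, pseudo-randomly picked server
        return [((key * 9973) % num_servers, size)]
    # big array: partition it across all servers, same rounding as the C++
    parts = []
    for i in range(num_servers):
        part = int(size / num_servers * (i + 1)) - int(size / num_servers * i)
        parts.append((i, part))
    return parts
```

The part lengths necessarily sum to `size`, which is exactly what the `CHECK_EQ(pskv.size, size)` in the C++ asserts.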
\brief the count for barrier + * \brief the server handle */ - int barrier_count_; + KVStoreDistServer* server_; }; } // namespace kvstore diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h new file mode 100644 index 000000000000..50ef2d07fedc --- /dev/null +++ b/src/kvstore/kvstore_dist_server.h @@ -0,0 +1,232 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file kvstore_dist_server.h + * \brief implements the mxnet kvstore server node + */ +#ifndef MXNET_KVSTORE_KVSTORE_DIST_SERVER_H_ +#define MXNET_KVSTORE_KVSTORE_DIST_SERVER_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include "ps/ps.h" +#include "mxnet/kvstore.h" + +namespace mxnet { +namespace kvstore { + +static const int kStopServer = -1; +static const int kSyncMode = -2; + +/** + * \brief an executor that runs functions on the thread that called \ref Start + */ +class Executor { + public: + /** + * \brief start the executor + */ + void Start() { + std::unique_lock lk(mu_); + while (true) { + cond_.wait(lk, [this]{return !queue_.empty();}); + Block blk = std::move(queue_.front()); + queue_.pop(); + lk.unlock(); + + if (blk.f) { + blk.f(); blk.p.set_value(); + } else { + blk.p.set_value(); break; + } + lk.lock(); + } + } + + /** + * \brief function + */ + typedef std::function Func; + + /** + * \brief let the thread that called \ref Start execute a function; threadsafe + */ + void Exec(const Func& func) { + Block blk(func); + auto fut = blk.p.get_future(); + { + std::lock_guard lk(mu_); + queue_.push(std::move(blk)); + cond_.notify_one(); + } + fut.wait(); + } + + /** + * \brief stop the executor thread; threadsafe + */ + void Stop() { + Exec(Func()); + } + + private: + struct Block { + explicit Block(const Func& func) : f(func) { } + Func f; + std::promise p; + }; + std::queue queue_; + std::mutex mu_; + std::condition_variable cond_; }; + +class KVStoreDistServer { + public: + KVStoreDistServer() { + using namespace std::placeholders; + ps_server_ = new ps::KVServer(0); + static_cast(ps_server_)->set_request_handle( + std::bind(&KVStoreDistServer::CommandHandle, this, _1, _2)); + ps_server_->set_request_handle( + std::bind(&KVStoreDistServer::DataHandle, this, _1, _2, _3)); + sync_mode_ = false; + } + + ~KVStoreDistServer() { + delete ps_server_; + } + + void set_controller(const KVStore::Controller& controller) { + CHECK(controller); + controller_ = controller; + } + + void set_updater(const KVStore::Updater& updater) { + CHECK(updater); + updater_ = updater; + } + + /** + * \brief block until the command \a kStopServer is received + */ + void Run() { + exec_.Start(); + } + + private: + void CommandHandle(const ps::SimpleData& recved, ps::SimpleApp* app) { + if (recved.head == kStopServer) { + exec_.Stop(); + } else if (recved.head == kSyncMode) { + sync_mode_ = true; + } else { + // let the main thread execute ctrl, which is necessary for python + exec_.Exec([this, recved]() { + CHECK(controller_); + controller_(recved.head, recved.body); + }); + } + app->Response(recved); + } + + void DataHandle(const ps::KVMeta& req_meta, + const ps::KVPairs& req_data, + ps::KVServer* server) { + // do some check + CHECK_EQ(req_data.keys.size(), (size_t)1); + if (req_meta.push) { + CHECK_EQ(req_data.lens.size(), (size_t)1); + CHECK_EQ(req_data.vals.size(), (size_t)req_data.lens[0]); + } + + int key = DecodeKey(req_data.keys[0]); + auto& stored = store_[key]; + if (req_meta.push) { + size_t ds[] = {(size_t)req_data.lens[0]}; + TShape dshape(ds, ds + 1); + TBlob recv_blob((real_t*)req_data.vals.data(), // NOLINT(*) 
dshape, cpu::kDevMask); + NDArray recved = NDArray(recv_blob, 0); + if (stored.is_none()) { + // initialization + stored = NDArray(dshape, Context()); + } else if (sync_mode_) { + // synced push + auto& merged = merge_buf_[key]; + if (merged.array.is_none()) { + merged.array = NDArray(dshape, Context()); + } + + if (merged.request.size() == 0) { + CopyFromTo(recved, &merged.array, 0); + } else { + merged.array += recved; + } + merged.request.push_back(req_meta); + + if (merged.request.size() == (size_t)ps::NumWorkers()) { + // let the main thread to execute updater_, which is necessary for + // python + merged.array.WaitToRead(); + exec_.Exec([this, key, &merged, &stored](){ + CHECK(updater_); + updater_(key, merged.array, &stored); + }); + for (const auto& req : merged.request) { + server->Response(req); + } + merged.request.clear(); + } + } else { + // async push + exec_.Exec([this, key, &recved, &stored](){ + CHECK(updater_); + updater_(key, recved, &stored); + }); + } + stored.WaitToRead(); + } else { + // pull + ps::KVPairs response; + response.keys = req_data.keys; + response.lens = req_data.lens; + response.vals.CopyFrom(static_cast(stored.data().dptr_), + stored.shape()[0]); + } + } + + int DecodeKey(ps::Key key) { + auto kr = ps::Postoffice::Get()->GetServerKeyRanges()[ps::MyRank()]; + return key - kr.begin(); + } + + /** + * \brief user defined + */ + bool sync_mode_; + KVStore::Controller controller_; + KVStore::Updater updater_; + + std::unordered_map store_; + + struct MergeBuf { + std::vector request; + NDArray array; + }; + std::unordered_map merge_buf_; + + Executor exec_; + + ps::KVServer* ps_server_; +}; + + + +} // namespace kvstore +} // namespace mxnet + +#endif // MXNET_KVSTORE_KVSTORE_DIST_SERVER_H_ diff --git a/src/kvstore/mxnet_ps_node.h b/src/kvstore/mxnet_ps_node.h deleted file mode 100644 index 569a4caac962..000000000000 --- a/src/kvstore/mxnet_ps_node.h +++ /dev/null @@ -1,431 +0,0 @@ -/*! - * Copyright (c) 2015 by Contributors - * \file mxnet_node.h - * \brief implement mxnet nodes - */ -#ifndef MXNET_KVSTORE_MXNET_PS_NODE_H_ -#define MXNET_KVSTORE_MXNET_PS_NODE_H_ -#include -#include -#include -#include -#include -#include -#include -#include "ps.h" -#include "mxnet/kvstore.h" - -namespace mxnet { -namespace kvstore { - -/** - * \brief encode/decode a command id - */ -struct CommandID { - /** - * \brief commmand id for stoping - */ - static const int kStop = -1; - /** - * \brief command id to set the server to the sync mode - */ - static const int kSyncMode = -2; - /** - * \brief returns the commmand id given a barrier count - */ - static int SetBarrier(int count) { - return - count - 10; - } - /** - * \brief returns true if it is a barrier command - */ - static bool GetBarrier(int cmd_id, int* count) { - if (cmd_id <= 10) { - *count = - cmd_id - 10; - return true; - } - return false; - } - - /** - * \brief number of bits used to encode the key in mxnet - */ - static const int kIndexBits = 32; -}; - -/** - * \brief a simple aggregator over time. 
- */ -class Aggregator { - public: - /** - * \param num number of nodes for aggregation - */ - Aggregator(int num, ps::Customer* obj) { - num_ = num; - obj_ = obj; - } - - using Message = std::shared_ptr; - - bool Has(int time) { - return msgs_.find(time) != msgs_.end(); - } - - void Add(int time, const Message& msg) { - msgs_[time].push_back(msg); - msg->replied = true; - } - - size_t Size() { - return msgs_.size(); - } - - size_t Count(int time) { - return msgs_[time].size(); - } - - bool Done(int time) { - return Count(time) == (size_t)num_; - } - - void Remove(int time) { - for (auto& m : msgs_[time]) { - CHECK_NOTNULL(obj_)->Reply(m.get()); - } - msgs_.erase(time); - } - - private: - std::unordered_map> msgs_; - int num_; - ps::Customer* obj_; -}; - -/** \brief to match worker/server's app id */ -#define PS_KV_ID 9 - -/** \brief to match worker/server's app id */ -#define PS_APP_ID 10 - -/** - * \brief a server node on ps - */ -class MXNetServer : public ps::App { - public: - MXNetServer() : App(PS_APP_ID) { } - virtual ~MXNetServer() { } - - void set_controller(const KVStore::Controller& ctrl) { - controller_ = ctrl; - } - - void ProcessRequest(ps::Message* request) override { - // wait for one second if controller_ is not inited - for (int i = 0; i < 100; ++i) { - if (!controller_) usleep(10000); - } - CHECK(controller_); - controller_(request->task.cmd(), request->task.msg()); - } - - private: - KVStore::Controller controller_; -}; - -/** - * \brief a worker node on ps - */ -class MXNetWorker : public ps::App { - public: - MXNetWorker() : App(PS_APP_ID) { } - virtual ~MXNetWorker() { } -}; - -/** - * \brief a scheduler node on ps - */ -class MXNetScheduler : public ps::App { - public: - MXNetScheduler() - : App(PS_APP_ID), - barrier_(ps::NodeInfo::NumWorkers(), this) { - } - virtual ~MXNetScheduler() { } - - void ProcessRequest(ps::Message* request) override { - int count; - if (CommandID::GetBarrier(request->task.cmd(), &count)) { - barrier_.Add(count, LastRequest()); - CHECK_EQ(barrier_.Size(), 1); - - if (barrier_.Done(count)) { - barrier_.Remove(count); - } - } - } - - private: - Aggregator barrier_; -}; - -/** - * \brief executor runs a function using it's own thread - */ -class Executor { - public: - /** - * \brief start the executor - */ - void Start() { - std::unique_lock lk(mu_); - while (true) { - cond_.wait(lk, [this]{return !queue_.empty();}); - Block blk = std::move(queue_.front()); - queue_.pop(); - lk.unlock(); - - if (blk.f) { - blk.f(); blk.p.set_value(); - } else { - blk.p.set_value(); break; - } - - lk.lock(); - } - } - - /** - * \brief function - */ - typedef std::function Func; - - /** - * \brief exec a function. 
threadsafe - */ - void Exec(const Func& func) { - Block blk(func); - auto fut = blk.p.get_future(); - { - std::lock_guard lk(mu_); - queue_.push(std::move(blk)); - cond_.notify_one(); - } - fut.wait(); - } - - /** - * \brief stop, threadsafe - */ - void Stop() { - Exec(Func()); - } - - private: - struct Block { - explicit Block(const Func& func) : f(func) { } - Func f; - std::promise p; - }; - std::queue queue_; - std::mutex mu_; - std::condition_variable cond_; -}; - -/** - * \brief distributed kvstore for servers - */ -class KVStoreDistServer { - public: - explicit KVStoreDistServer(const KVStore::Controller& user_ctrl) - // set updater - : store_(ServerHandle(this), 1, 1, PS_KV_ID) { - // set controller - sync_mode_ = false; - auto controller - = [user_ctrl, this](int cmd_id, const std::string& cmd_body) { - if (cmd_id == CommandID::kStop) { - exec_.Stop(); - } else if (cmd_id == CommandID::kSyncMode) { - sync_mode_ = true; - } else { - // let the main thread to execute ctrl, which is necessary for python - exec_.Exec([user_ctrl, cmd_id, cmd_body]() { - CHECK(user_ctrl); - user_ctrl(cmd_id, cmd_body); - }); - } - }; - auto node = CHECK_NOTNULL(ps::NodeInfo::MyApp()); - static_cast(node)->set_controller(controller); - } - - // ~KVStoreDistServer() { - // // clear all ndarrays before Engine is shutting down. - // store_.server()->Clear(); - // } - - void set_updater(const KVStore::Updater& updater) { - CHECK(updater); - updater_ = updater; - } - - void Run() { - exec_.Start(); - } - - private: - /** - * \brief value type stored at server - */ - struct ServerVal { - std::vector data; - inline void Load(dmlc::Stream *fi) { fi->Read(&data); } - inline void Save(dmlc::Stream *fo) const { fo->Write(data); } - inline bool Empty() const { return data.empty(); } - }; - - /** - * \brief server handle - */ - class ServerHandle { - public: - explicit ServerHandle(KVStoreDistServer* kvstore) - : kvstore_(kvstore), - ps_obj_(nullptr), - aggregator_(nullptr) { - } - - ~ServerHandle() { - delete aggregator_; - } - - /** - * \brief get a cpu ndarray from a c-array without data copy - */ - inline NDArray GetNDArray(real_t* data, size_t size) { - size_t ds[] = {size}; - TShape dshape(ds, ds + 1); - TBlob data_blob(data, dshape, cpu::kDevMask); - return NDArray(data_blob, 0); - } - - inline void Start(bool push, int timestamp, int cmd_id, void* msg) { } - inline void Finish() { } - inline void Load(dmlc::Stream *fi) { } - inline void Save(dmlc::Stream *fo) const { } - - inline void Push(ps::Key recv_key, - ps::Blob recv_val, - ServerVal& my_val) { // NOLINT(*) - // initialization - if (my_val.Empty()) { - my_val.data.resize(recv_val.size); - memcpy(my_val.data.data(), recv_val.data, - recv_val.size * sizeof(real_t)); - return; - } - - int key = DecodeKey(recv_key); - NDArray recv_array = GetNDArray((real_t*)recv_val.data, // NOLINT(*) - recv_val.size); - NDArray my_array = GetNDArray(my_val.data.data(), my_val.data.size()); - - if (kvstore_->sync_mode_) { - // create aggregator - if (aggregator_ == nullptr) { - ps_obj_ = CHECK_NOTNULL(kvstore_)->store_.server(); - aggregator_ = new Aggregator( - ps::NodeInfo::NumWorkers(), ps_obj_); - } - - // init merge buf - std::vector& buf = merge_buf_[key]; - if (!aggregator_->Has(key)) { - if (buf.empty()) { - buf.resize(recv_val.size); - } - memset(buf.data(), 0, buf.size() * sizeof(real_t)); - } - - // add recved data into merge - NDArray merge = GetNDArray(buf.data(), buf.size()); - merge += recv_array; - - // update if aggregation is done - aggregator_->Add(key, 
ps_obj_->LastRequest()); - if (aggregator_->Done(key)) { - // let the main thread to execute updater_, which is necessary for - // python - merge.WaitToRead(); - kvstore_->exec_.Exec([this, key, &merge, &my_array](){ - CHECK(kvstore_->updater_); - kvstore_->updater_(key, merge, &my_array); - }); - aggregator_->Remove(key); - } - } else { - // runs eventual consistency model. so update immediately - - // let the main thread to execute updater_, which is necessary for - // python - kvstore_->exec_.Exec([this, key, &recv_array, &my_array](){ - CHECK(kvstore_->updater_); - kvstore_->updater_(key, recv_array, &my_array); - }); - } - // place waittoread here rather than the beginning of pull. - my_array.WaitToRead(); - } - - inline void Pull(ps::Key recv_key, - const ServerVal& my_val, - ps::Blob& send_val) { // NOLINT(*) - CHECK(!my_val.Empty()) - << DecodeKey(recv_key) << " is not inited"; - - send_val.data = (real_t*) my_val.data.data(); // NOLINT(*) - send_val.size = my_val.data.size(); - } - - private: - /** - * \brief convert from a key in ps - */ - inline int DecodeKey(ps::Key key) { - return static_cast( - (key << CommandID::kIndexBits) >> CommandID::kIndexBits); - } - /** - * \brief for BSP model - */ - std::unordered_map> merge_buf_; - /** - * \brief the current timestamp - */ - // int curr_timestamp_; - - KVStoreDistServer* kvstore_; - - ps::Customer* ps_obj_; - Aggregator* aggregator_; - }; - - - /** - * \brief let the main thread execute python codes - */ - Executor exec_; - - bool sync_mode_; - - KVStore::Updater updater_; - - ps::OnlineServer store_; -}; - - -} // namespace kvstore -} // namespace mxnet - -#endif // MXNET_KVSTORE_MXNET_PS_NODE_H_ From 55694dd0cb1f61562e0a162ad9a3420908ff7e58 Mon Sep 17 00:00:00 2001 From: muli Date: Tue, 20 Oct 2015 00:02:13 -0400 Subject: [PATCH 014/122] [kvstore] bug fix --- Makefile | 6 +++--- include/mxnet/kvstore.h | 8 +------- ps-lite | 2 +- src/kvstore/kvstore_dist.h | 17 +++++++++-------- src/kvstore/kvstore_dist_server.h | 23 ++++++++++++++++++----- tests/python/multi-node/README.md | 2 +- 6 files changed, 33 insertions(+), 25 deletions(-) diff --git a/Makefile b/Makefile index 5ee09d05ff34..8cd116b57800 100644 --- a/Makefile +++ b/Makefile @@ -80,7 +80,7 @@ PS_PATH=./ps-lite DEPS_PATH=$(shell pwd)/deps include $(PS_PATH)/make/ps.mk ifeq ($(USE_DIST_KVSTORE), 1) - CFLAGS += -DMXNET_USE_DIST_KVSTORE -I$(PS_PATH)/src + CFLAGS += -DMXNET_USE_DIST_KVSTORE -I$(PS_PATH)/include LIB_DEP += $(PS_PATH)/build/libps.a LDFLAGS += -Wl,-rpath,$(DEPS_PATH)/lib $(PS_LDFLAGS_SO) endif @@ -100,12 +100,12 @@ ifeq ($(USE_CUDA), 1) ALL_DEP += $(CUOBJ) endif -build/%.o: src/%.cc $(LIB_DEP) +build/%.o: src/%.cc @mkdir -p $(@D) $(CXX) -std=c++0x $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d $(CXX) -std=c++0x -c $(CFLAGS) -c $< -o $@ -build/%_gpu.o: src/%.cu $(LIB_DEP) +build/%_gpu.o: src/%.cu @mkdir -p $(@D) $(NVCC) $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" -M build/$*_gpu.o $< >build/$*_gpu.d $(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $< diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index b84d1f62436f..8f547df0831a 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -188,13 +188,7 @@ class KVStore { } /*! - * \return The number of nodes in this group. - * - * Always returns 1 when type == "local". 
Otherwise, returns - * - * - number of workers if if `IsWorkerNode() == true`, - * - number of servers if if `IsServerNode() == true`, - * - 1 if `IsSchedulerNode() == true`, + * \return The number of worker nodes */ virtual int get_group_size() const { return 1; diff --git a/ps-lite b/ps-lite index 1955d3dd0217..eb47c4d46c99 160000 --- a/ps-lite +++ b/ps-lite @@ -1 +1 @@ -Subproject commit 1955d3dd021794e207231c408b4f020f0191a33a +Subproject commit eb47c4d46c9923783b38824fc15427b6d8760fde diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index fe6176ce9c9a..adab10322ffb 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -61,7 +61,7 @@ class KVStoreDist : public KVStoreLocal { void Push(const std::vector& keys, const std::vector& values, - int priority) override { + int priority) override { // first aggregate the values over keys std::vector uniq_keys; std::vector > grouped_vals; @@ -73,17 +73,17 @@ class KVStoreDist : public KVStoreLocal { const NDArray& merged = MergePushValue(key, grouped_vals[i], priority); // push to servers - auto push_to_servers = +auto push_to_servers = [this, key, merged](RunContext rctx, Engine::CallbackOnComplete cb) { - // convert to ps keys +// convert to ps keys size_t size = merged.shape().Size(); PSKV& pskv = EncodeKey(key, size); // do push real_t* data = static_cast(merged.data().dptr_); ps::SArray vals(data, size, false); // false means no delete - CHECK_NOTNULL(ps_worker_)->ZPush( - pskv.keys, vals, pskv.lens, 0, [cb]() { cb(); }); +CHECK_NOTNULL(ps_worker_)->ZPush( + pskv.keys, vals, pskv.lens, 0, [cb]() { cb(); }); }; Engine::Get()->PushAsync( push_to_servers, @@ -119,10 +119,10 @@ class KVStoreDist : public KVStoreLocal { // convert to ps keys PSKV& pskv = EncodeKey(key, size); - // issue pull - ps::SArray vals(data, size, false); // false means no delete + // issue pull, false means no delete + auto vals = new ps::SArray(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPull( - pskv.keys, &vals, &pskv.lens, 0, [cb](){ cb(); }); + pskv.keys, vals, &pskv.lens, 0, [vals, cb](){ delete vals; cb(); }); }; CHECK_NOTNULL(Engine::Get())->PushAsync( @@ -169,6 +169,7 @@ class KVStoreDist : public KVStoreLocal { server_ = new KVStoreDistServer(); server_->set_controller(controller); } + ps::Start("mxnet_server\0"); if (server_) server_->Run(); ps::Finalize(); diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 50ef2d07fedc..62b251fa42ff 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -145,6 +145,10 @@ class KVStoreDistServer { int key = DecodeKey(req_data.keys[0]); auto& stored = store_[key]; + + // there are several WaitToRead here, this is because \a recved's memory + // could be deallocated when this function returns. 
so we need to make sure + // the operators with \a NDArray are actually finished if (req_meta.push) { size_t ds[] = {(size_t)req_data.lens[0]}; TShape dshape(ds, ds + 1); @@ -154,6 +158,9 @@ class KVStoreDistServer { if (stored.is_none()) { // initialization stored = NDArray(dshape, Context()); + CopyFromTo(recved, &stored, 0); + server->Response(req_meta); + stored.WaitToRead(); } else if (sync_mode_) { // synced push auto& merged = merge_buf_[key]; @@ -166,12 +173,12 @@ class KVStoreDistServer { } else { merged.array += recved; } + merged.request.push_back(req_meta); if (merged.request.size() == (size_t)ps::NumWorkers()) { // let the main thread to execute updater_, which is necessary for // python - merged.array.WaitToRead(); exec_.Exec([this, key, &merged, &stored](){ CHECK(updater_); updater_(key, merged.array, &stored); @@ -180,6 +187,9 @@ class KVStoreDistServer { server->Response(req); } merged.request.clear(); + stored.WaitToRead(); + } else { + merged.array.WaitToRead(); } } else { // async push @@ -187,15 +197,18 @@ class KVStoreDistServer { CHECK(updater_); updater_(key, recved, &stored); }); + server->Response(req_meta); + stored.WaitToRead(); } - stored.WaitToRead(); } else { // pull ps::KVPairs response; + CHECK(!stored.is_none()) << "init " << key << " first"; + int len = stored.shape()[0]; response.keys = req_data.keys; - response.lens = req_data.lens; - response.vals.CopyFrom(static_cast(stored.data().dptr_), - stored.shape()[0]); + response.lens = {len}; + response.vals.CopyFrom(static_cast(stored.data().dptr_), len); + server->Response(req_meta, response); } } diff --git a/tests/python/multi-node/README.md b/tests/python/multi-node/README.md index 0101e66186ed..32d308017c5f 100644 --- a/tests/python/multi-node/README.md +++ b/tests/python/multi-node/README.md @@ -10,6 +10,6 @@ least two gpus) and 2 servers. ``` -ln -s ../../../dmlc-core/tracker/dmlc_local.py +ln -s ../../../dmlc-core/tracker/dmlc_local.py . 
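+# launch the demo with 2 workers (-n) and 2 servers (-s) on the local machine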
./dmlc_local.py -n 2 -s 2 ./dist_sync_mlp.py ``` From a2f47507d0409b6d27735c1d4cf49eb494a765be Mon Sep 17 00:00:00 2001 From: muli Date: Tue, 20 Oct 2015 01:14:25 -0400 Subject: [PATCH 015/122] [kvstore] bug fix --- ps-lite | 2 +- src/kvstore/kvstore_dist.h | 11 ++++++----- src/kvstore/kvstore_dist_server.h | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/ps-lite b/ps-lite index eb47c4d46c99..a956db01bf52 160000 --- a/ps-lite +++ b/ps-lite @@ -1 +1 @@ -Subproject commit eb47c4d46c9923783b38824fc15427b6d8760fde +Subproject commit a956db01bf529a2969cda77c2944bac505226c92 diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index adab10322ffb..18d50ffa5e61 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -73,16 +73,17 @@ class KVStoreDist : public KVStoreLocal { const NDArray& merged = MergePushValue(key, grouped_vals[i], priority); // push to servers -auto push_to_servers = + auto push_to_servers = [this, key, merged](RunContext rctx, Engine::CallbackOnComplete cb) { -// convert to ps keys + // convert to ps keys size_t size = merged.shape().Size(); PSKV& pskv = EncodeKey(key, size); // do push real_t* data = static_cast(merged.data().dptr_); - ps::SArray vals(data, size, false); // false means no delete -CHECK_NOTNULL(ps_worker_)->ZPush( + // false means no delete + ps::SArray vals(data, size, false); + CHECK_NOTNULL(ps_worker_)->ZPush( pskv.keys, vals, pskv.lens, 0, [cb]() { cb(); }); }; Engine::Get()->PushAsync( @@ -122,7 +123,7 @@ CHECK_NOTNULL(ps_worker_)->ZPush( // issue pull, false means no delete auto vals = new ps::SArray(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPull( - pskv.keys, vals, &pskv.lens, 0, [vals, cb](){ delete vals; cb(); }); + pskv.keys, vals, &pskv.lens, 0, [vals, cb](){ delete vals; cb(); }); }; CHECK_NOTNULL(Engine::Get())->PushAsync( diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 62b251fa42ff..d25d6d95d989 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -146,7 +146,7 @@ class KVStoreDistServer { int key = DecodeKey(req_data.keys[0]); auto& stored = store_[key]; - // there are several WaitToRead here, this is because \a recved's memory + // we use several WaitToRead calls here because \a recved's memory // could be deallocated when this function returns. 
so we need to make sure // the operators with \a NDArray are actually finished if (req_meta.push) { From 11f692260138d90e3282d99fd358f484b9ee6599 Mon Sep 17 00:00:00 2001 From: tqchen Date: Mon, 19 Oct 2015 22:13:08 -0700 Subject: [PATCH 016/122] [R] switch mem layout, and update examples --- R-package/R/io.R | 3 +- R-package/R/metric.R | 4 +- R-package/R/model.R | 14 +- R-package/demo/basic_model.R | 4 +- R-package/src/base.h | 12 +- R-package/src/executor.cc | 4 +- R-package/src/io.cc | 23 +- R-package/src/ndarray.cc | 58 +++-- R-package/src/ndarray.h | 14 +- R-package/src/symbol.cc | 6 +- .../classifyRealImageWithPretrainedModel.Rmd | 12 +- R-package/vignettes/mnistCompetition.Rmd | 25 +- .../classifyRealImageWithPretrainedModel.md | 20 +- doc/R-package/fiveMinutesNeuralNetwork.md | 215 +++++++++--------- doc/R-package/mnistCompetition.md | 97 ++++---- 15 files changed, 263 insertions(+), 248 deletions(-) diff --git a/R-package/R/io.R b/R-package/R/io.R index 5fe51c0eb70e..cde2b4c032f1 100644 --- a/R-package/R/io.R +++ b/R-package/R/io.R @@ -21,7 +21,8 @@ mx.io.extract <- function(iter, field) { padded <- iter$num.pad() data <- dlist[[field]] oshape <- dim(data) - packer$push(mx.nd.slice(data, 0, oshape[[1]] - padded)) + ndim <- length(oshape) + packer$push(mx.nd.slice(data, 0, oshape[[ndim]] - padded)) } iter$reset() return(packer$get()) diff --git a/R-package/R/metric.R b/R-package/R/metric.R index 7d0b09fca922..923fec996af5 100644 --- a/R-package/R/metric.R +++ b/R-package/R/metric.R @@ -22,8 +22,8 @@ mx.metric.custom <- function(name, feval) { #' #' @export mx.metric.accuracy <- mx.metric.custom("accuracy", function(label, pred) { - ypred = max.col(pred, tie="first") - return(sum((label + 1) == ypred) / length(label)) + ypred = max.col(t(as.array(pred)), tie="first") + return(sum((as.array(label) + 1) == ypred) / length(label)) }) #' RMSE (Root Mean Squared Error) metric for regression diff --git a/R-package/R/model.R b/R-package/R/model.R index e3011bd54c1f..e9febf3bd24d 100644 --- a/R-package/R/model.R +++ b/R-package/R/model.R @@ -1,12 +1,13 @@ # slice the shape on the highest dimension mx.model.slice.shape <- function(shape, nsplit) { - batchsize <- shape[[1]] + ndim <- length(shape) + batchsize <- shape[[ndim]] step <- as.integer((batchsize + nsplit - 1) / nsplit) lapply(0:(nsplit - 1), function(k) { begin = min(k * step, batchsize) end = min((k + 1) * step, batchsize) s <- shape - s[[1]] = end - begin + s[[ndim]] = end - begin return(list(begin=begin, end=end, shape=s)) }) } @@ -266,7 +267,8 @@ mx.model.init.iter <- function(X, y, batch.size, is.train) { if (is.null(y)) { if (is.train) stop("Need to provide parameter y for training with R arrays.") shape <- dim(X) - y <- c(1:shape[[1]]) * 0 + ndim <- length(shape) + y <- c(1:shape[[ndim]]) * 0 } batch.size <- min(length(y), batch.size) return(mx.io.arrayiter(X, y, batch.size=batch.size, shuffle=is.train)) @@ -324,7 +326,8 @@ function(symbol, X, y=NULL, ctx=NULL, } if (!is.list(ctx)) stop("ctx must be mx.context or list of mx.context") if (is.character(optimizer)) { - batchsize = input.shape[[1]] + ndim <- length(input.shape) + batchsize = input.shape[[ndim]] optimizer <- mx.opt.create(optimizer, rescale.grad=(1/batchsize), ...) 
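+    # note: rescale.grad=1/batchsize makes the optimizer average, rather than
+    # sum, the gradients over the batch (the last dimension of the input)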
} @@ -366,7 +369,8 @@ predict.MXFeedForwardModel <- function(model, X, ctx=NULL, array.batch.size=128) out.pred <- mx.nd.copyto(pexec$ref.outputs[[1]], mx.cpu()) padded <- X$num.pad() oshape <- dim(out.pred) - packer$push(mx.nd.slice(out.pred, 0, oshape[[1]] - padded)) + ndim <- length(oshape) + packer$push(mx.nd.slice(out.pred, 0, oshape[[ndim]] - padded)) } X$reset() return(packer$get()) diff --git a/R-package/demo/basic_model.R b/R-package/demo/basic_model.R index d849c43974dc..1bf40d647c98 100644 --- a/R-package/demo/basic_model.R +++ b/R-package/demo/basic_model.R @@ -42,7 +42,6 @@ model <- mx.model.FeedForward.create(softmax, X=dtrain, eval.data=dtest, iter.end.callback=mx.callback.save.checkpoint("chkpt"), epoch.end.callback=mx.callback.log.train.metric(100)) - # do prediction pred <- predict(model, dtest) label <- mx.io.extract(dtest, "label") @@ -51,11 +50,10 @@ dataX <- mx.io.extract(dtest, "data") pred2 <- predict(model, X=dataX) accuracy <- function(label, pred) { - ypred = max.col(as.array(pred)) + ypred = max.col(t(as.array(pred))) return(sum((as.array(label) + 1) == ypred) / length(label)) } print(paste0("Finish prediction... accuracy=", accuracy(label, pred))) print(paste0("Finish prediction... accuracy2=", accuracy(label, pred2))) - diff --git a/R-package/src/base.h b/R-package/src/base.h index fae6b005958e..a9763cc022be 100644 --- a/R-package/src/base.h +++ b/R-package/src/base.h @@ -268,10 +268,11 @@ inline std::string toPyString(const std::string &key, const Rcpp::RObject& val) if (len != 1) { RCHECK(TYPEOF(val) == INTSXP || TYPEOF(val) == REALSXP) << "Only accept integer vectors or simple types"; + // Do shape conversion back to the reversed shape. Rcpp::IntegerVector vec(val); os << "("; for (size_t i = 0; i < vec.size(); ++i) { - int value = vec[i]; + int value = vec[vec.size() - i - 1]; if (i != 0) os << ", "; os << value; } @@ -327,14 +328,15 @@ inline std::vector SafeGetListNames(const Rcpp::List& src) { } /*! - * \brief convert Rcpp's Dimension to shape vector + * \brief convert Rcpp's Dimension to internal shape vector + * This will reverse the shape layout internally * \param rshape The dimension in R - * \return A vector representation in R. + * \return An internal vector representation of shapes in mxnet. 
*/ -inline std::vector Dim2Vec(const Rcpp::Dimension &rshape) { +inline std::vector Dim2InternalShape(const Rcpp::Dimension &rshape) { std::vector shape(rshape.size()); for (size_t i = 0; i < rshape.size(); ++i) { - shape[i] = rshape[i]; + shape[rshape.size() - i - 1] = rshape[i]; } return shape; } diff --git a/R-package/src/executor.cc b/R-package/src/executor.cc index 33a28218f103..3b4fd9becf50 100644 --- a/R-package/src/executor.cc +++ b/R-package/src/executor.cc @@ -118,7 +118,7 @@ inline Rcpp::List* CreateArrayList(const Rcpp::List& source_array, RCHECK(Rcpp::is(source_array[i])) << "Expect input " << key << " to be list of " << NDArray::TypeName(); NDArray src = NDArray::FromRObject(source_array[i]); - ret->at(i) = NDArray::Empty(src.shape(), ctx); + ret->at(i) = NDArray::Empty(src.dim(), ctx); NDArray dst = NDArray::FromRObject(ret->at(i)); handles->at(i) = dst->handle; NDArray::CopyFromTo(src, &dst); @@ -146,7 +146,7 @@ inline Rcpp::List* CreateGradList(const Rcpp::List& source_array, RCHECK(Rcpp::is(grad_reqs[i])) << "Expect input grad_reqs to be list of booleans"; if (Rcpp::as(grad_reqs[i])) { - ret->at(i) = NDArray::Empty(NDArray::FromRObject(source_array[i]).shape(), ctx); + ret->at(i) = NDArray::Empty(NDArray::FromRObject(source_array[i]).dim(), ctx); handles->at(i) = NDArray::FromRObject(ret->at(i))->handle; grad_req_type->at(i) = 1; } diff --git a/R-package/src/io.cc b/R-package/src/io.cc index 2ae5cf832a43..f84fd2159aea 100644 --- a/R-package/src/io.cc +++ b/R-package/src/io.cc @@ -43,6 +43,18 @@ ArrayDataIter::ArrayDataIter(const Rcpp::NumericVector& data, const Rcpp::NumericVector& unif_rnds, int batch_size, bool shuffle) : counter_(0) { + Rcpp::IntegerVector dshape = data.attr("dim"); + Rcpp::IntegerVector lshape = label.attr("dim"); + if (dshape[dshape.size() - 1] != lshape[lshape.size() - 1]) { + if (dshape[0] == lshape[0]) { + RLOG_FATAL << "It seems X and y were passed in a row major way, " + << "MXNetR adopts a column major convention.\n" + << "Please pass in the transpose of X instead"; + } else { + RLOG_FATAL << "Data and label shapes are inconsistent"; + } + } + std::vector order(label.size()); for (size_t i = 0; i < order.size(); ++i) { order[i] = i; } @@ -70,15 +82,16 @@ void ArrayDataIter::Convert(const Rcpp::NumericVector& src, std::vector *out) { Rcpp::RObject dim = src.attr("dim"); Rcpp::Dimension rshape(dim); - std::vector temp, batch; - ConvertToRowMajor(src, &temp); + size_t ndim = rshape.size(); + std::vector temp(src.size()), batch; + std::copy(src.begin(), src.end(), temp.begin()); out->clear(); - out->reserve(rshape[0] / batch_size + 1); + out->reserve(rshape[ndim - 1] / batch_size + 1); size_t line_size = 1; - for (size_t i = 1; i < rshape.size(); ++i) { + for (size_t i = 0; i < rshape.size() - 1; ++i) { line_size *= rshape[i]; } - rshape[0] = batch_size; + rshape[ndim - 1] = batch_size; batch.resize(batch_size * line_size, 0.0f); for (size_t begin = 0; begin < order.size(); begin += batch_size) { diff --git a/R-package/src/ndarray.cc b/R-package/src/ndarray.cc index f944384f065a..227237e7ac5a 100644 --- a/R-package/src/ndarray.cc +++ b/R-package/src/ndarray.cc @@ -121,30 +121,25 @@ inline void RowToColMajor(const mx_float *in_data, } } -void ConvertToRowMajor(const Rcpp::NumericVector& rdata, std::vector* out) { - Rcpp::RObject dim = rdata.attr("dim"); - Rcpp::Dimension rshape(dim); - out->resize(rdata.size()); - ColToRowMajor(rdata.begin(), Dim2Vec(rshape), - out->size(), dmlc::BeginPtr(*out)); -} - void NDArrayPacker::Push(const NDArray::RObjectType& nd) { 
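+  // R arrays are column-major, so the last dimension is the slowest-changing
+  // one; Push therefore accumulates the incoming arrays along that dimension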
NDArray arr(nd); - Rcpp::Dimension rshape = arr.shape(); + Rcpp::Dimension rshape = arr.dim(); if (shape_.size() == 0) { shape_.resize(rshape.size()); for (size_t i = 0; i < shape_.size(); ++i) { shape_[i] = rshape[i]; } } else { - for (size_t i = 1; i < shape_.size(); ++i) { + RCHECK(shape_.size() == rshape.size()) + << "The number of dimensions needs to match"; + for (size_t i = 0; i < shape_.size() - 1; ++i) { RCHECK(shape_[i] == rshape[i]) - << "The dimension besides 0 need to be consistent for arrays pushed"; + << "The dimensions besides the last need to be consistent for arrays pushed"; } - shape_[0] += rshape[0]; + shape_.back() += rshape[shape_.size() - 1]; } - size_t begin = data_.size(), size = rshape.prod(); + size_t begin = data_.size(); + size_t size = rshape.prod(); data_.resize(begin + size); MX_CALL(MXNDArraySyncCopyToCPU( arr->handle, dmlc::BeginPtr(data_) + begin, size)); @@ -156,8 +151,7 @@ Rcpp::NumericVector NDArrayPacker::Get() const { Rcpp::Dimension dim(sexp); Rcpp::NumericVector ret(dim); RCHECK(ret.size() == data_.size()); - RowToColMajor(dmlc::BeginPtr(data_), shape_, - data_.size(), ret.begin()); + std::copy(data_.begin(), data_.end(), ret.begin()); return ret; } @@ -165,18 +159,19 @@ Rcpp::RObject NDArrayPacker::CreateNDArrayPacker() { return Rcpp::internal::make_new_object(new NDArrayPacker()); } -Rcpp::Dimension NDArray::shape() const { +Rcpp::Dimension NDArray::dim() const { mx_uint ndim; const mx_uint *pshape; MX_CALL(MXNDArrayGetShape( ptr_->handle, &ndim, &pshape)); Rcpp::IntegerVector dat(pshape, pshape + ndim); + std::reverse(dat.begin(), dat.end()); Rcpp::RObject ret = dat; return Rcpp::Dimension(ret); } NDArray NDArray::Clone() const { - std::vector shape = Dim2Vec(this->shape()); + std::vector shape = Dim2InternalShape(this->dim()); Context ctx = this->ctx(); NDArrayHandle handle; MX_CALL(MXNDArrayCreate(dmlc::BeginPtr(shape), @@ -194,7 +189,7 @@ Context NDArray::ctx() const { } size_t NDArray::Size() const { - Rcpp::Dimension dim = this->shape(); + Rcpp::Dimension dim = this->dim(); size_t sz = 1; for (size_t i = 0; i < dim.size(); ++i) { sz *= dim[i]; @@ -209,13 +204,12 @@ NDArray NDArray::Slice(mx_uint begin, mx_uint end) const { } Rcpp::NumericVector NDArray::AsNumericVector() const { - Rcpp::Dimension rshape = this->shape(); + Rcpp::Dimension rshape = this->dim(); std::vector temp(rshape.prod()); MX_CALL(MXNDArraySyncCopyToCPU( ptr_->handle, dmlc::BeginPtr(temp), temp.size())); Rcpp::NumericVector ret(rshape); - RowToColMajor(dmlc::BeginPtr(temp), Dim2Vec(rshape), - temp.size(), ret.begin()); + std::copy(temp.begin(), temp.end(), ret.begin()); return ret; } @@ -263,7 +257,7 @@ Rcpp::List NDArray::Load(const std::string& filename) { NDArray::RObjectType NDArray::Empty( const Rcpp::Dimension& rshape, const Context::RObjectType& rctx) { - std::vector shape = Dim2Vec(rshape); + std::vector shape = Dim2InternalShape(rshape); Context ctx(rctx); NDArrayHandle handle; MX_CALL(MXNDArrayCreate(dmlc::BeginPtr(shape), @@ -310,13 +304,12 @@ NDArray::RObjectType NDArray::Array( Rcpp::NumericVector rdata(src); Rcpp::RObject dim = rdata.attr("dim"); Rcpp::Dimension rshape(dim); - std::vector temp(rdata.size()); - ColToRowMajor(rdata.begin(), Dim2Vec(rshape), - temp.size(), dmlc::BeginPtr(temp)); RObjectType ret = NDArray::Empty(rshape, ctx); + std::vector temp(rdata.size()); + std::copy(rdata.begin(), rdata.end(), temp.begin()); MX_CALL(MXNDArraySyncCopyFromCPU( NDArray(ret)->handle, - dmlc::BeginPtr(temp), temp.size())); + dmlc::BeginPtr(temp), rdata.size())); 
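+  // the values are copied in R's own element order; the layout flip is
+  // handled by Dim2InternalShape reversing the shape, so no transpose is done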
return ret; } @@ -578,8 +571,8 @@ NDArray::RObjectType DispatchOps(SEXP op, SEXP lhs, SEXP rhs) { return NDArray::RObject(out, true); } -Rcpp::Dimension shape(const NDArray::RObjectType& src) { - return NDArray(src).shape(); +Rcpp::Dimension dim(const NDArray::RObjectType& src) { + return NDArray(src).dim(); } Context::RObjectType ctx(const NDArray::RObjectType& src) { @@ -596,7 +589,12 @@ Rcpp::NumericVector AsNumericVector(const NDArray::RObjectType& src) { NDArray::RObjectType Slice(const NDArray::RObjectType& src, mx_uint begin, mx_uint end) { - return NDArray(src).Slice(begin, end).RObject(); + NDArray nd(src); + Rcpp::Dimension dim = nd.dim(); + size_t ndim = dim.size(); + RCHECK(dim[ndim - 1] >= end) + << "end=" << end << ", max-dim=" << dim[ndim - 1]; + return nd.Slice(begin, end).RObject(); } } // namespace ndarray @@ -610,7 +608,7 @@ void NDArray::InitRcppModule() { function("mx.nd.internal.empty.array", &NDArray::Empty); function("mx.nd.internal.dispatch.Ops", &ndarray::DispatchOps); // exposing members - function("mx.nd.internal.dim", &ndarray::shape); + function("mx.nd.internal.dim", &ndarray::dim); function("mx.nd.internal.ctx", &ndarray::ctx); function("mx.nd.internal.length", &ndarray::Size); function("mx.nd.internal.as.array", &ndarray::AsNumericVector); diff --git a/R-package/src/ndarray.h b/R-package/src/ndarray.h index deb0d27af881..131aae5433ad 100644 --- a/R-package/src/ndarray.h +++ b/R-package/src/ndarray.h @@ -101,9 +101,9 @@ class NDArray { } /*! * \param src The source array. - * \return The shape of the array + * \return The dimension of the array */ - Rcpp::Dimension shape() const; + Rcpp::Dimension dim() const; /*! * \brief Return a clone of NDArray. * Do not expose this to R side. @@ -269,14 +269,8 @@ class NDArrayFunction : public ::Rcpp::CppFunction { }; /*! - * \brief Convert the src into row major layout into out - * \param src The source vector - * \param out The output memory. - */ -void ConvertToRowMajor(const Rcpp::NumericVector& src, std::vector* out); - -/*! - * \brief An array packer that packs NDArray array together on dimension 0. + * \brief An array packer that packs NDArray array together on + * slowest changing dimension. 
*/ class NDArrayPacker { public: diff --git a/R-package/src/symbol.cc b/R-package/src/symbol.cc index 26b5088bbd1b..82cd2cb86696 100644 --- a/R-package/src/symbol.cc +++ b/R-package/src/symbol.cc @@ -120,7 +120,9 @@ inline Rcpp::List BuildShapeData(mx_uint shape_size, const std::vector &names) { Rcpp::List ret(shape_size); for (mx_uint i = 0; i < shape_size; ++i) { - ret[i] = Rcpp::IntegerVector(shape_data[i], shape_data[i] + shape_ndim[i]); + Rcpp::IntegerVector dim(shape_data[i], shape_data[i] + shape_ndim[i]); + std::reverse(dim.begin(), dim.end()); + ret[i] = dim; } ret.names() = names; return ret; @@ -136,7 +138,7 @@ SEXP Symbol::InferShape(const Rcpp::List& kwargs) const { for (size_t i = 0; i < kwargs.size(); ++i) { RCHECK(keys[i].length() != 0) << "Need to pass parameters in key=value style.\n"; - std::vector dim = Dim2Vec(kwargs[i]); + std::vector dim = Dim2InternalShape(kwargs[i]); arg_shape_data.insert(arg_shape_data.end(), dim.begin(), dim.end()); arg_ind_ptr.push_back(static_cast(arg_shape_data.size())); } diff --git a/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd b/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd index 8a0e79c739c4..c1f63a164a5e 100644 --- a/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd +++ b/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd @@ -77,12 +77,10 @@ preproc.image <-function(im, mean.image) { # convert to array (x, y, channel) arr <- as.array(resized) dim(arr) = c(224, 224, 3) - # Change to the format of mxnet (channel, height, width) - sample <- aperm(arr, c(3, 2, 1)) # substract the mean - normed <- sample - mean.img - # Reshape to format needed by mxnet - dim(normed) <- c(1, 3, 224, 224) + normed <- arr - mean.img + # Reshape to format needed by mxnet (width, height, channel, num) + dim(normed) <- c(224, 224, 3, 1) return(normed) } ``` @@ -106,9 +104,9 @@ dim(prob) As you can see ```prob``` is a 1 times 1000 array, which gives the probability over the 1000 image classes of the input. -We can use the ```max.col``` to get the class index. +We can use ```max.col``` on the transpose of ```prob``` to get the class index. ```{r} -max.idx <- max.col(prob) +max.idx <- max.col(t(prob)) max.idx ``` diff --git a/R-package/vignettes/mnistCompetition.Rmd b/R-package/vignettes/mnistCompetition.Rmd index ebfbc505907a..1913887426cf 100644 --- a/R-package/vignettes/mnistCompetition.Rmd +++ b/R-package/vignettes/mnistCompetition.Rmd @@ -27,9 +27,10 @@ train.y <- train[,1] Here every image is represented as a single row in train/test. The greyscale of each image falls in the range [0, 255], we can linearly transform it into [0,1] by ```{r} -train.x <- train.x/255 -test <- test/255 +train.x <- t(train.x/255) +test <- t(test/255) ``` +We also transpose the input matrix to npixel x nexamples, which is the column-major format accepted by mxnet (and the convention of R). In the label part, we see the number of each digit is fairly even: @@ -59,7 +60,7 @@ softmax <- mx.symbol.Softmax(fc3, name="sm") 6. Here comes the output layer. Since there's only 10 digits, we set the number of neurons to 10. 7. Finally we set the activation to softmax to get a probabilistic prediction. -## Training +## Training We are almost ready for the training process. Before we start the computation, let's decide what device should we use. @@ -90,14 +91,14 @@ dim(preds) It is a matrix with 28000 rows and 10 cols, containing the desired classification probabilities from the output layer. 
To extract the maximum label for each row, we can use the `max.col` in R: ```{r} -pred.label <- max.col(preds) - 1 +pred.label <- max.col(t(preds)) - 1 table(pred.label) ``` With a little extra effort in the csv format, we can have our submission to the competition! ```{r} -submission <- data.frame(ImageId=1:nrow(test), Label=pred.label) +submission <- data.frame(ImageId=1:ncol(test), Label=pred.label) write.csv(submission, file='submission.csv', row.names=FALSE, quote=FALSE) ``` @@ -133,12 +134,10 @@ lenet <- mx.symbol.Softmax(data=fc2) Then let us reshape the matrices into arrays: ```{r} -train.array <- t(train.x) -dim(train.array) <- c(1,28,28,nrow(train.x)) -train.array <- aperm(train.array, c(4,1,2,3)) -test.array <- t(test) -dim(test.array) <- c(1,28,28,nrow(test)) -test.array <- aperm(test.array, c(4,1,2,3)) +train.array <- train.x +dim(train.array) <- c(28, 28, 1, ncol(train.x)) +test.array <- test +dim(test.array) <- c(28, 28, 1, ncol(test)) ``` Next we are going to compare the training speed on different devices, so the definition of the devices goes first: @@ -185,8 +184,8 @@ Finally we can submit the result to Kaggle again to see the improvement of our r ```{r} preds <- predict(model, test.array) -pred.label <- max.col(preds) - 1 +pred.label <- max.col(t(preds)) - 1 -submission <- data.frame(ImageId=1:nrow(test), Label=pred.label) +submission <- data.frame(ImageId=1:ncol(test), Label=pred.label) write.csv(submission, file='submission.csv', row.names=FALSE, quote=FALSE) ``` diff --git a/doc/R-package/classifyRealImageWithPretrainedModel.md b/doc/R-package/classifyRealImageWithPretrainedModel.md index 16d96f9abbd2..f5c88fed24ba 100644 --- a/doc/R-package/classifyRealImageWithPretrainedModel.md +++ b/doc/R-package/classifyRealImageWithPretrainedModel.md @@ -9,6 +9,12 @@ real world image. The network architecture is decribed in [1]. The pre-trained Inception-BatchNorm network is able to be downloaded from [this link](http://webdocs.cs.ualberta.ca/~bx3/data/Inception.zip) This model gives the recent state-of-art prediction accuracy on image net dataset. +Preface +------- +This tutorial is written in Rmarkdown. +- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/R-package/classifyRealImageWithPretrainedModel.html) +- You can download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd) + Pacakge Loading --------------- To get started, we load the mxnet package by require mxnet. @@ -115,12 +121,10 @@ preproc.image <-function(im, mean.image) { # convert to array (x, y, channel) arr <- as.array(resized) dim(arr) = c(224, 224, 3) - # Change to the format of mxnet (channel, height, width) - sample <- aperm(arr, c(3, 2, 1)) # substract the mean - normed <- sample - mean.img - # Reshape to format needed by mxnet - dim(normed) <- c(1, 3, 224, 224) + normed <- arr - mean.img + # Reshape to format needed by mxnet (width, height, channel, num) + dim(normed) <- c(224, 224, 3, 1) return(normed) } ``` @@ -144,16 +148,16 @@ dim(prob) ``` ``` -## [1] 1 1000 +## [1] 1000 1 ``` As you can see ```prob``` is a 1 times 1000 array, which gives the probability over the 1000 image classes of the input. -We can use the ```max.col``` to get the class index. +We can use ```max.col``` on the transpose of ```prob``` to get the class index. 
```r -max.idx <- max.col(prob) +max.idx <- max.col(t(prob)) max.idx ``` diff --git a/doc/R-package/fiveMinutesNeuralNetwork.md b/doc/R-package/fiveMinutesNeuralNetwork.md index 1d6dd0eca3e8..d1bbf3b9ca59 100644 --- a/doc/R-package/fiveMinutesNeuralNetwork.md +++ b/doc/R-package/fiveMinutesNeuralNetwork.md @@ -1,7 +1,7 @@ Neural Network with MXNet in Five Minutes ============================================= -This is the first tutorial for new users of the R package `mxnet`. You will learn to construct a neural network to do regression in 5 minutes. +This is the first tutorial for new users of the R package `mxnet`. You will learn to construct a neural network to do regression in 5 minutes. We will show you how to do classification and regression tasks respectively. The data we use comes from the package `mlbench`. @@ -38,19 +38,22 @@ data(Sonar, package="mlbench") Sonar[,61] = as.numeric(Sonar[,61])-1 train.ind = c(1:50, 100:150) -train.x = data.matrix(Sonar[train.ind, 1:60]) +train.x = t(data.matrix(Sonar[train.ind, 1:60])) train.y = Sonar[train.ind, 61] -test.x = data.matrix(Sonar[-train.ind, 1:60]) +test.x = t(data.matrix(Sonar[-train.ind, 1:60])) test.y = Sonar[-train.ind, 61] ``` +MXNet accepts a column-major input convention, as in R. +So we need to transpose the data matrix to nfeature x nexample before feeding it into the network. + The next step is to define the structure of the neural network. ```r # Define the input data data <- mx.symbol.Variable("data") -# A fully connected hidden layer +# A fully connected hidden layer # data: input source # num_hidden: number of neurons in this hidden layer fc1 <- mx.symbol.FullyConnected(data, num_hidden=20) @@ -116,7 +119,7 @@ Note that `mx.set.seed` is the correct function to control the random process in ```r preds = predict(model, test.x) -pred.label = max.col(preds)-1 +pred.label = max.col(t(preds))-1 table(pred.label, test.y) ``` @@ -136,9 +139,9 @@ Again, let us preprocess the data first. data(BostonHousing, package="mlbench") train.ind = seq(1, 506, 3) -train.x = data.matrix(BostonHousing[train.ind, -14]) +train.x = t(data.matrix(BostonHousing[train.ind, -14])) train.y = BostonHousing[train.ind, 14] -test.x = data.matrix(BostonHousing[-train.ind, -14]) +test.x = t(data.matrix(BostonHousing[-train.ind, -14])) test.y = BostonHousing[-train.ind, 14] ``` We can configure another network as what we have done above. 
The main difference ```r # Define the input data data <- mx.symbol.Variable("data") -# A fully connected hidden layer +# A fully connected hidden layer # data: input source # num_hidden: number of neurons in this hidden layer fc1 <- mx.symbol.FullyConnected(data, num_hidden=1) @@ -170,56 +173,56 @@ model <- mx.model.FeedForward.create(lro, X=train.x, y=train.y, ``` ## Start training with 1 devices -## [1] Train-rmse=16.063282524034 -## [2] Train-rmse=12.2792375712573 -## [3] Train-rmse=11.1984634005885 +## [1] Train-rmse=16.0632825223292 +## [2] Train-rmse=12.2792375527391 +## [3] Train-rmse=11.1984634148088 ## [4] Train-rmse=10.2645236892904 -## [5] Train-rmse=9.49711005504284 -## [6] Train-rmse=9.07733734175182 -## [7] Train-rmse=9.07884450847991 -## [8] Train-rmse=9.10463850277417 -## [9] Train-rmse=9.03977049028532 -## [10] Train-rmse=8.96870685004475 -## [11] Train-rmse=8.93113287361574 -## [12] Train-rmse=8.89937257821847 -## [13] Train-rmse=8.87182096922953 -## [14] Train-rmse=8.84476075083586 -## [15] Train-rmse=8.81464673014974 -## [16] Train-rmse=8.78672567900196 -## [17] Train-rmse=8.76265872846474 -## [18] Train-rmse=8.73946101419974 -## [19] Train-rmse=8.71651926303267 -## [20] Train-rmse=8.69457600919277 -## [21] Train-rmse=8.67354928674563 -## [22] Train-rmse=8.65328755392436 -## [23] Train-rmse=8.63378039680078 -## [24] Train-rmse=8.61488162586984 -## [25] Train-rmse=8.5965105183022 -## [26] Train-rmse=8.57868133563275 -## [27] Train-rmse=8.56135851937663 -## [28] Train-rmse=8.5444819772098 -## [29] Train-rmse=8.52802114610432 -## [30] Train-rmse=8.5119504512622 -## [31] Train-rmse=8.49624261719241 -## [32] Train-rmse=8.48087453238701 -## [33] Train-rmse=8.46582689119887 -## [34] Train-rmse=8.45107881002491 -## [35] Train-rmse=8.43661331401712 -## [36] Train-rmse=8.42241575909639 -## [37] Train-rmse=8.40847217331365 -## [38] Train-rmse=8.39476931796395 -## [39] Train-rmse=8.38129658373974 -## [40] Train-rmse=8.36804269059018 -## [41] Train-rmse=8.35499817678397 -## [42] Train-rmse=8.34215505742154 -## [43] Train-rmse=8.32950441908131 -## [44] Train-rmse=8.31703985777311 -## [45] Train-rmse=8.30475363906755 -## [46] Train-rmse=8.29264031506106 -## [47] Train-rmse=8.28069372820073 -## [48] Train-rmse=8.26890902770415 -## [49] Train-rmse=8.25728089053853 -## [50] Train-rmse=8.24580511500735 +## [5] Train-rmse=9.49711003902655 +## [6] Train-rmse=9.07733735504537 +## [7] Train-rmse=9.07884447337348 +## [8] Train-rmse=9.10463849901276 +## [9] Train-rmse=9.03977048081203 +## [10] Train-rmse=8.96870681959898 +## [11] Train-rmse=8.93113268945833 +## [12] Train-rmse=8.89937250031474 +## [13] Train-rmse=8.87182124831547 +## [14] Train-rmse=8.84476111567396 +## [15] Train-rmse=8.81464687265692 +## [16] Train-rmse=8.78672579209995 +## [17] Train-rmse=8.76265895056591 +## [18] Train-rmse=8.73946101364483 +## [19] Train-rmse=8.7165194446551 +## [20] Train-rmse=8.69457580107095 +## [21] Train-rmse=8.67354933875898 +## [22] Train-rmse=8.65328764760528 +## [23] Train-rmse=8.63378016812285 +## [24] Train-rmse=8.61488175856399 +## [25] Train-rmse=8.59651041652324 +## [26] Train-rmse=8.57868122898644 +## [27] Train-rmse=8.56135865255391 +## [28] Train-rmse=8.54448212525355 +## [29] Train-rmse=8.52802110389574 +## [30] Train-rmse=8.51195043845808 +## [31] Train-rmse=8.49624250344235 +## [32] Train-rmse=8.48087452797975 +## [33] Train-rmse=8.46582681750595 +## [34] Train-rmse=8.45107900842757 +## [35] Train-rmse=8.43661347614512 +## [36] Train-rmse=8.42241598595198 +## [37] 
Train-rmse=8.40847223745159 +## [38] Train-rmse=8.39476934189048 +## [39] Train-rmse=8.38129658669852 +## [40] Train-rmse=8.36804245552321 +## [41] Train-rmse=8.35499814305568 +## [42] Train-rmse=8.34215500774088 +## [43] Train-rmse=8.3295045517182 +## [44] Train-rmse=8.31703965839842 +## [45] Train-rmse=8.30475372106883 +## [46] Train-rmse=8.2926402584762 +## [47] Train-rmse=8.2806936364631 +## [48] Train-rmse=8.26890890119326 +## [49] Train-rmse=8.25728092677924 +## [50] Train-rmse=8.24580513680541 ``` It is also easy to make prediction and evaluate @@ -257,56 +260,56 @@ model <- mx.model.FeedForward.create(lro, X=train.x, y=train.y, ``` ## Start training with 1 devices -## [1] Train-mae=13.1889538083225 -## [2] Train-mae=9.81431959337658 -## [3] Train-mae=9.21576419870059 +## [1] Train-mae=13.1889538090676 +## [2] Train-mae=9.81431958410475 +## [3] Train-mae=9.21576420929697 ## [4] Train-mae=8.38071537613869 -## [5] Train-mae=7.45462437611487 -## [6] Train-mae=6.93423301743136 -## [7] Train-mae=6.91432357016537 -## [8] Train-mae=7.02742733055105 -## [9] Train-mae=7.00618194618469 -## [10] Train-mae=6.92541576984028 -## [11] Train-mae=6.87530243690643 -## [12] Train-mae=6.84757369098564 -## [13] Train-mae=6.82966501611388 -## [14] Train-mae=6.81151759574811 -## [15] Train-mae=6.78394182841811 -## [16] Train-mae=6.75914719419347 -## [17] Train-mae=6.74180388773481 -## [18] Train-mae=6.725853071279 -## [19] Train-mae=6.70932178215848 -## [20] Train-mae=6.6928868798746 -## [21] Train-mae=6.6769521329138 -## [22] Train-mae=6.66184809505939 -## [23] Train-mae=6.64754504809777 -## [24] Train-mae=6.63358514060577 -## [25] Train-mae=6.62027640889088 -## [26] Train-mae=6.60738245232238 -## [27] Train-mae=6.59505546771818 -## [28] Train-mae=6.58346195800437 -## [29] Train-mae=6.57285477783945 -## [30] Train-mae=6.56259003960424 -## [31] Train-mae=6.5527790788975 -## [32] Train-mae=6.54353428422991 -## [33] Train-mae=6.5344172368447 -## [34] Train-mae=6.52557652526432 -## [35] Train-mae=6.51697905850079 -## [36] Train-mae=6.50847898812758 -## [37] Train-mae=6.50014844106303 -## [38] Train-mae=6.49207674844397 -## [39] Train-mae=6.48412070125341 -## [40] Train-mae=6.47650500999557 -## [41] Train-mae=6.46893867486053 -## [42] Train-mae=6.46142131653097 -## [43] Train-mae=6.45395035048326 -## [44] Train-mae=6.44652914123403 -## [45] Train-mae=6.43916216409869 -## [46] Train-mae=6.43183777381976 -## [47] Train-mae=6.42455544223388 -## [48] Train-mae=6.41731406417158 -## [49] Train-mae=6.41011292926139 -## [50] Train-mae=6.40312503493494 +## [5] Train-mae=7.45462434962392 +## [6] Train-mae=6.93423304392232 +## [7] Train-mae=6.91432355824444 +## [8] Train-mae=7.02742730538464 +## [9] Train-mae=7.00618193757513 +## [10] Train-mae=6.92541587183045 +## [11] Train-mae=6.87530209053722 +## [12] Train-mae=6.847573687012 +## [13] Train-mae=6.82966502538572 +## [14] Train-mae=6.81151769575146 +## [15] Train-mae=6.78394197610517 +## [16] Train-mae=6.75914737499422 +## [17] Train-mae=6.74180429437094 +## [18] Train-mae=6.72585320373376 +## [19] Train-mae=6.70932160268227 +## [20] Train-mae=6.69288677523534 +## [21] Train-mae=6.67695207827621 +## [22] Train-mae=6.66184799075127 +## [23] Train-mae=6.64754500372542 +## [24] Train-mae=6.63358518299129 +## [25] Train-mae=6.62027624067333 +## [26] Train-mae=6.60738218476375 +## [27] Train-mae=6.59505565381712 +## [28] Train-mae=6.58346203284131 +## [29] Train-mae=6.57285475134849 +## [30] Train-mae=6.56259016940991 +## [31] Train-mae=6.55277890273266 +## [32] 
Train-mae=6.54353418886248 +## [33] Train-mae=6.53441721167829 +## [34] Train-mae=6.52557678090202 +## [35] Train-mae=6.51697915651732 +## [36] Train-mae=6.50847910601232 +## [37] Train-mae=6.50014858543873 +## [38] Train-mae=6.49207666102383 +## [39] Train-mae=6.48412067078882 +## [40] Train-mae=6.47650481263797 +## [41] Train-mae=6.46893873314063 +## [42] Train-mae=6.46142139865292 +## [43] Train-mae=6.45395037829876 +## [44] Train-mae=6.44652904189295 +## [45] Train-mae=6.43916221575605 +## [46] Train-mae=6.43183771024148 +## [47] Train-mae=6.42455528063907 +## [48] Train-mae=6.41731397675143 +## [49] Train-mae=6.41011299813787 +## [50] Train-mae=6.40312501904037 ``` Congratulations! Now you have learnt the basic for using `mxnet`. diff --git a/doc/R-package/mnistCompetition.md b/doc/R-package/mnistCompetition.md index 0e73f7700486..16a7ca761146 100644 --- a/doc/R-package/mnistCompetition.md +++ b/doc/R-package/mnistCompetition.md @@ -1,12 +1,12 @@ Handwritten Digits Classification Competition -====================================================== +============================================= -[MNIST](http://yann.lecun.com/exdb/mnist/) is a handwritten digits image data set created by Yann LeCun. Every digit is represented by a 28x28 image. It has become a standard data set to test classifiers on simple image input. Neural network is no doubt a strong model for image classification tasks. There's a [long-term hosted competition](https://www.kaggle.com/c/digit-recognizer) on Kaggle using this data set. We will present the basic usage of [mxnet](https://github.com/dmlc/mxnet/tree/master/R-package) to compete in this challenge. +[MNIST](http://yann.lecun.com/exdb/mnist/) is a handwritten digits image data set created by Yann LeCun. Every digit is represented by a 28x28 image. It has become a standard data set to test classifiers on simple image input. Neural network is no doubt a strong model for image classification tasks. There's a [long-term hosted competition](https://www.kaggle.com/c/digit-recognizer) on Kaggle using this data set. +We will present the basic usage of [mxnet](https://github.com/dmlc/mxnet/tree/master/R-package) to compete in this challenge. This tutorial is written in Rmarkdown. You can download the source [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/mnistCompetition.Rmd) and view a hosted version of tutorial [here](http://mxnet.readthedocs.org/en/latest/R-package/mnistCompetition.html). - ## Data Loading First, let us download the data from [here](https://www.kaggle.com/c/digit-recognizer/data), and put them under the `data/` folder in your working directory. @@ -37,9 +37,10 @@ Here every image is represented as a single row in train/test. The greyscale of ```r -train.x <- train.x/255 -test <- test/255 +train.x <- t(train.x/255) +test <- t(test/255) ``` +We also transpose the input matrix to npixel x nexamples, which is the column major format accepted by mxnet (and the convention of R). In the label part, we see the number of each digit is fairly even: @@ -77,7 +78,7 @@ softmax <- mx.symbol.Softmax(fc3, name="sm") 6. Here comes the output layer. Since there's only 10 digits, we set the number of neurons to 10. 7. Finally we set the activation to softmax to get a probabilistic prediction. -## Training +## Training We are almost ready for the training process. Before we start the computation, let's decide what device should we use. 
@@ -163,14 +164,14 @@ dim(preds)
```

```
-## [1] 28000 10
+## [1] 10 28000
```

It is a matrix with 10 rows and 28000 columns, containing the desired classification probabilities from the output layer; each column holds the probabilities for one test example. To extract the predicted label for each example, we can transpose the matrix and use `max.col` in R:


```r
-pred.label <- max.col(preds) - 1
+pred.label <- max.col(t(preds)) - 1
table(pred.label)
```

@@ -184,7 +185,7 @@ With a little extra effort in the csv format, we can have our submission to the


```r
-submission <- data.frame(ImageId=1:nrow(test), Label=pred.label)
+submission <- data.frame(ImageId=1:ncol(test), Label=pred.label)
write.csv(submission, file='submission.csv', row.names=FALSE, quote=FALSE)
```

@@ -222,12 +223,10 @@ Then let us reshape the matrices into arrays:


```r
-train.array <- t(train.x)
-dim(train.array) <- c(1,28,28,nrow(train.x))
-train.array <- aperm(train.array, c(4,1,2,3))
-test.array <- t(test)
-dim(test.array) <- c(1,28,28,nrow(test))
-test.array <- aperm(test.array, c(4,1,2,3))
+train.array <- train.x
+dim(train.array) <- c(28, 28, 1, ncol(train.x))
+test.array <- test
+dim(test.array) <- c(28, 28, 1, ncol(test))
```

Next we are going to compare the training speed on different devices, so the definition of the devices goes first:
@@ -259,11 +258,11 @@ model <- mx.model.FeedForward.create(lenet, X=train.array, y=train.y,
```

```
## Start training with 1 devices
-## Batch [100] Train-accuracy=0.1054
-## Batch [200] Train-accuracy=0.1237
-## Batch [300] Train-accuracy=0.352766666666667
-## Batch [400] Train-accuracy=0.498824999999999
-## [1] Train-accuracy=0.519546539379474
+## Batch [100] Train-accuracy=0.1066
+## Batch [200] Train-accuracy=0.16495
+## Batch [300] Train-accuracy=0.401766666666667
+## Batch [400] Train-accuracy=0.537675
+## [1] Train-accuracy=0.557136038186157
```

```r
@@ -272,7 +271,7 @@ print(proc.time() - tic)
```

```
## user system elapsed
-## 132.340 203.621 84.825
+## 130.030 204.976 83.821
```

Training on GPU:

@@ -290,31 +289,31 @@ model <- mx.model.FeedForward.create(lenet, X=train.array, y=train.y,
```

```
## Start training with 1 devices
-## Batch [100] Train-accuracy=0.1055
-## Batch [200] Train-accuracy=0.1197
-## Batch [300] Train-accuracy=0.346266666666667
-## Batch [400] Train-accuracy=0.4925
-## [1] Train-accuracy=0.513699284009546
-## Batch [100] Train-accuracy=0.9577
-## Batch [200] Train-accuracy=0.961849999999999
-## Batch [300] Train-accuracy=0.966
-## Batch [400] Train-accuracy=0.968750000000003
-## [2] Train-accuracy=0.969404761904765
-## Batch [100] Train-accuracy=0.977399999999999
-## Batch [200] Train-accuracy=0.97815
-## Batch [300] Train-accuracy=0.980033333333335
-## Batch [400] Train-accuracy=0.981400000000003
-## [3] Train-accuracy=0.981761904761908
-## Batch [100] Train-accuracy=0.985799999999999
-## Batch [200] Train-accuracy=0.98575
-## Batch [300] Train-accuracy=0.986666666666668
-## Batch [400] Train-accuracy=0.987550000000003
-## [4] Train-accuracy=0.987880952380955
-## Batch [100] Train-accuracy=0.9918
-## Batch [200] Train-accuracy=0.9908
-## Batch [300] Train-accuracy=0.991566666666668
-## Batch [400] Train-accuracy=0.992175000000002
-## [5] Train-accuracy=0.992380952380955
+## Batch [100] Train-accuracy=0.1066
+## Batch [200] Train-accuracy=0.1596
+## Batch [300] Train-accuracy=0.3983
+## Batch [400] Train-accuracy=0.533975
+## [1] Train-accuracy=0.553532219570405
+## Batch [100] Train-accuracy=0.958
+## Batch [200] Train-accuracy=0.96155
+## Batch [300] Train-accuracy=0.966100000000001
+## Batch [400] Train-accuracy=0.968550000000003
+## [2] 
Train-accuracy=0.969071428571432 +## Batch [100] Train-accuracy=0.977 +## Batch [200] Train-accuracy=0.97715 +## Batch [300] Train-accuracy=0.979566666666668 +## Batch [400] Train-accuracy=0.980900000000003 +## [3] Train-accuracy=0.981309523809527 +## Batch [100] Train-accuracy=0.9853 +## Batch [200] Train-accuracy=0.985899999999999 +## Batch [300] Train-accuracy=0.986966666666668 +## Batch [400] Train-accuracy=0.988150000000002 +## [4] Train-accuracy=0.988452380952384 +## Batch [100] Train-accuracy=0.990199999999999 +## Batch [200] Train-accuracy=0.98995 +## Batch [300] Train-accuracy=0.990600000000001 +## Batch [400] Train-accuracy=0.991325000000002 +## [5] Train-accuracy=0.991523809523812 ``` ```r @@ -323,7 +322,7 @@ print(proc.time() - tic) ``` ## user system elapsed -## 10.176 1.608 7.743 +## 9.288 1.680 6.889 ``` As you can see by using GPU, we can get a much faster speedup in training! @@ -332,8 +331,8 @@ Finally we can submit the result to Kaggle again to see the improvement of our r ```r preds <- predict(model, test.array) -pred.label <- max.col(preds) - 1 -submission <- data.frame(ImageId=1:nrow(test), Label=pred.label) +pred.label <- max.col(t(preds)) - 1 +submission <- data.frame(ImageId=1:ncol(test), Label=pred.label) write.csv(submission, file='submission.csv', row.names=FALSE, quote=FALSE) ``` From 598795a33deeee12c13f2f68e6da62e05b03db11 Mon Sep 17 00:00:00 2001 From: muli Date: Tue, 20 Oct 2015 01:43:17 -0400 Subject: [PATCH 017/122] [kvstore] refer to the dev branch of ps-lite --- .gitmodules | 2 +- ps-lite | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index 6505cc089282..0de60d4c80e7 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,4 +6,4 @@ url = https://github.com/dmlc/dmlc-core.git [submodule "ps-lite"] path = ps-lite - url = https://github.com/mli/ps-lite.git + url = https://github.com/dmlc/ps-lite diff --git a/ps-lite b/ps-lite index a956db01bf52..5fbee1ffa014 160000 --- a/ps-lite +++ b/ps-lite @@ -1 +1 @@ -Subproject commit a956db01bf529a2969cda77c2944bac505226c92 +Subproject commit 5fbee1ffa0140a922fc400a806cbdba4dfb0e653 From 71f09068e8c92eb2277f5216713ecab08b041fa1 Mon Sep 17 00:00:00 2001 From: tqchen Date: Tue, 20 Oct 2015 10:53:12 -0600 Subject: [PATCH 018/122] [R] Add auto layout selection to detect layout for rowmajor in simple MLP case. --- R-package/R/callback.R | 2 +- R-package/R/model.R | 94 ++++++++++++++++++- .../vignettes/fiveMinutesNeuralNetwork.Rmd | 12 ++- doc/R-package/fiveMinutesNeuralNetwork.md | 32 +++++-- 4 files changed, 124 insertions(+), 16 deletions(-) diff --git a/R-package/R/callback.R b/R-package/R/callback.R index 36fb3571a9b8..784a2e31a9d3 100644 --- a/R-package/R/callback.R +++ b/R-package/R/callback.R @@ -2,7 +2,7 @@ #' @export mx.callback.log.train.metric <- function(period) { function(iteration, nbatch, env) { - if (nbatch %% period == 0) { + if (nbatch %% period == 0 && !is.null(env$metric)) { result <- env$metric$get(env$train.metric) cat(paste0("Batch [", nbatch, "] Train-", result$name, "=", result$value, "\n")) } diff --git a/R-package/R/model.R b/R-package/R/model.R index e9febf3bd24d..f3a9c9964409 100644 --- a/R-package/R/model.R +++ b/R-package/R/model.R @@ -274,6 +274,68 @@ mx.model.init.iter <- function(X, y, batch.size, is.train) { return(mx.io.arrayiter(X, y, batch.size=batch.size, shuffle=is.train)) } +# select layout by matching shape, report error if nothing matches up. 
+mx.model.select.layout.train <- function(X, y) {
+  if (is.null(y)) stop("Need to provide y for training")
+  y <- as.array(y)
+  dimX <- dim(X)
+  dimy <- dim(y)
+  if (length(dimX) != 2) return("colmajor")
+  rowmajor <- 0
+  colmajor <- 0
+  if (dimX[[1]] == dimy[[1]]) rowmajor <- 1
+  if (dimX[[length(dimX)]] == dimy[[length(dimy)]]) colmajor <- 1
+  if (rowmajor + colmajor != 1) {
+    stop("Cannot auto select array.layout, please specify this parameter")
+  }
+  if (rowmajor == 1) {
+    cat("Auto detect layout of input matrix, use rowmajor..\n")
+    return("rowmajor")
+  } else {
+    cat("Auto detect layout of input matrix, use colmajor..\n")
+    return("colmajor")
+  }
+}
+
+# select layout by matching shape, report error if nothing matches up.
+mx.model.select.layout.predict <- function(X, model) {
+  dimX <- dim(X)
+  if (length(dimX) != 2) return("colmajor")
+  rowmajor <- 1
+  colmajor <- 1
+  # try row major
+  ret <- mx.symbol.infer.shape(model$symbol, data=c(dimX[[2]], 1))
+  if (!is.null(ret)) {
+    names = names(model$arg.params)
+    for (i in 1:length(names)) {
+      if (any(ret$arg.shapes[[names[i]]] != dim(model$arg.params[[i]]))) {
+        rowmajor <- 0
+      }
+    }
+  }
+  # try col major
+  ret <- mx.symbol.infer.shape(model$symbol, data=c(dimX[[1]], 1))
+  if (!is.null(ret)) {
+    names = names(model$arg.params)
+    for (i in 1:length(names)) {
+      if (any(ret$arg.shapes[[names[i]]] != dim(model$arg.params[[i]]))) {
+        colmajor <- 0
+      }
+    }
+  }
+  if (rowmajor + colmajor != 1) {
+    stop("Cannot auto select array.layout, please specify this parameter")
+  }
+  if (rowmajor == 1) {
+    cat("Auto detect layout of input matrix, use rowmajor..\n")
+    return("rowmajor")
+  } else {
+    cat("Auto detect layout of input matrix, use colmajor..\n")
+    return("colmajor")
+  }
+}
+
+
 #' Create a MXNet Feedforward neural net model with the specified training.
 #'
 #' @param symbol The symbolic configuration of the neural network.
@@ -299,6 +361,12 @@ mx.model.init.iter <- function(X, y, batch.size, is.train) {
 #'     The callback when one mini-batch iteration ends.
 #' @param array.batch.size integer (default=128)
 #'     The batch size used for R array training.
+#' @param array.layout can be "auto", "colmajor", "rowmajor" (default=auto)
+#'     The layout of array. "rowmajor" is only supported for two dimensional array.
+#'     For matrix, "rowmajor" means dim(X) = c(nexample, nfeatures),
+#'     "colmajor" means dim(X) = c(nfeatures, nexample).
+#'     "auto" will auto detect the layout by matching the feature size,
+#'     and will report an error when X is a square matrix, asking the user to explicitly specify the layout.
 #' @param kvstore string (default="local")
 #'     The parameter synchronization scheme in multiple devices.
 #' @return model A trained mxnet model.
@@ -310,9 +378,17 @@ function(symbol, X, y=NULL, ctx=NULL,
                              initializer=mx.init.uniform(0.01),
                              eval.data=NULL, eval.metric=NULL,
                              iter.end.callback=NULL, epoch.end.callback=NULL,
-                             array.batch.size=128,
+                             array.batch.size=128, array.layout="auto",
                              kvstore="local",
                              ...) {
+  if (is.array(X) || is.matrix(X)) {
+    if (array.layout == "auto") {
+      array.layout <- mx.model.select.layout.train(X, y)
+    }
+    if (array.layout == "rowmajor") {
+      X <- t(X)
+    }
+  }
   X <- mx.model.init.iter(X, y, batch.size=array.batch.size, is.train=TRUE)
   if (!X$iter.next()) {
    x$reset()
@@ -349,10 +425,24 @@ function(symbol, X, y=NULL, ctx=NULL,
 #' @param X The dataset to predict.
 #' @param ctx mx.cpu() or mx.gpu(i) The device used to generate the prediction.
 #' @param array.batch.size The batch size used in batching. Only used when X is R's array.
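+#' @examples
+#' \dontrun{
+#' ## Illustrative sketch only; model and test.x are placeholders from the
+#' ## tutorials, not part of this patch. With array.layout="auto" the layout
+#' ## is inferred by matching shapes, so for an nexamples x nfeatures matrix
+#' ## the two calls below are equivalent:
+#' preds <- predict(model, test.x)
+#' preds <- predict(model, test.x, array.layout="rowmajor")
+#' }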
+#' @param array.layout can be "auto", "colmajor", "rowmajor" (default=auto)
+#'     The layout of array. "rowmajor" is only supported for two dimensional array.
+#'     For matrix, "rowmajor" means dim(X) = c(nexample, nfeatures),
+#'     "colmajor" means dim(X) = c(nfeatures, nexample).
+#'     "auto" will auto detect the layout by matching the feature size,
+#'     and will report an error when X is a square matrix, asking the user to explicitly specify the layout.
 #'
 #' @export
-predict.MXFeedForwardModel <- function(model, X, ctx=NULL, array.batch.size=128) {
+predict.MXFeedForwardModel <- function(model, X, ctx=NULL, array.batch.size=128, array.layout="auto") {
   if (is.null(ctx)) ctx <- mx.ctx.default()
+  if (is.array(X) || is.matrix(X)) {
+    if (array.layout == "auto") {
+      array.layout <- mx.model.select.layout.predict(X, model)
+    }
+    if (array.layout == "rowmajor") {
+      X <- t(X)
+    }
+  }
   X <- mx.model.init.iter(X, NULL, batch.size=array.batch.size, is.train=FALSE)
   X$reset()
   if (!X$iter.next()) stop("Cannot predict on empty iterator")
diff --git a/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd b/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd
index 23c8107c01ee..82ad3cd4515a 100644
--- a/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd
+++ b/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd
@@ -1,7 +1,7 @@
 Neural Network with MXNet in Five Minutes
 =============================================
-This is the first tutorial for new users of the R package `mxnet`. You will learn to construct a neural network to do regression in 5 minutes.
+This is the first tutorial for new users of the R package `mxnet`. You will learn to construct a neural network to do regression in 5 minutes.
 We will show you how to do classification and regression tasks respectively. The data we use comes from the package `mlbench`.
@@ -34,7 +34,7 @@ The next step is to define the structure of the neural network.
 ```{r}
 # Define the input data
 data <- mx.symbol.Variable("data")
-# A fully connected hidden layer
+# A fully connected hidden layer
 # data: input source
 # num_hidden: number of neurons in this hidden layer
 fc1 <- mx.symbol.FullyConnected(data, num_hidden=20)
@@ -69,14 +69,16 @@ model <- mx.model.FeedForward.create(softmax, X=train.x, y=train.y,
                                      epoch.end.callback=mx.callback.log.train.metric(100))
 ```
-Note that `mx.set.seed` is the correct function to control the random process in `mxnet`. You can see the accuracy in each round during training. It is also easy to make prediction and evaluate
+Note that `mx.set.seed` is the correct function to control the random process in `mxnet`. You can see the accuracy in each round during training. It is also easy to make prediction and evaluate.
 ```{r}
 preds = predict(model, test.x)
-pred.label = max.col(preds)-1
+pred.label = max.col(t(preds))-1
 table(pred.label, test.y)
 ```
+Note for multi-class prediction, mxnet outputs nclass x nexamples, with each row corresponding to the probability of that class.
+
 ## Regression
 
 Again, let us preprocess the data first.
@@ -96,7 +98,7 @@ We can configure another network as what we have done above. 
The main difference
 ```{r}
 # Define the input data
 data <- mx.symbol.Variable("data")
-# A fully connected hidden layer
+# A fully connected hidden layer
 # data: input source
 # num_hidden: number of neurons in this hidden layer
 fc1 <- mx.symbol.FullyConnected(data, num_hidden=1)
diff --git a/doc/R-package/fiveMinutesNeuralNetwork.md b/doc/R-package/fiveMinutesNeuralNetwork.md
index d1bbf3b9ca59..a58eafa62474 100644
--- a/doc/R-package/fiveMinutesNeuralNetwork.md
+++ b/doc/R-package/fiveMinutesNeuralNetwork.md
@@ -38,15 +38,12 @@ data(Sonar, package="mlbench")
 Sonar[,61] = as.numeric(Sonar[,61])-1
 train.ind = c(1:50, 100:150)
-train.x = t(data.matrix(Sonar[train.ind, 1:60]))
+train.x = data.matrix(Sonar[train.ind, 1:60])
 train.y = Sonar[train.ind, 61]
-test.x = t(data.matrix(Sonar[-train.ind, 1:60]))
+test.x = data.matrix(Sonar[-train.ind, 1:60])
 test.y = Sonar[-train.ind, 61]
 ```
-MXNet accepts a column major input convention as in R.
-So we need to transpose the data matrix to nfeature x nexample before feed into the network.
-
 The next step is to define the structure of the neural network.
@@ -91,6 +88,7 @@ model <- mx.model.FeedForward.create(softmax, X=train.x, y=train.y,
 ```
 
 ```
+## Auto detect layout of input matrix, use rowmajor..
 ## Start training with 1 devices
 ## [1] Train-accuracy=0.5
 ## [2] Train-accuracy=0.514285714285714
@@ -114,11 +112,18 @@ model <- mx.model.FeedForward.create(softmax, X=train.x, y=train.y,
 ## [20] Train-accuracy=0.857142857142857
 ```
-Note that `mx.set.seed` is the correct function to control the random process in `mxnet`. You can see the accuracy in each round during training. It is also easy to make prediction and evaluate
+Note that `mx.set.seed` is the correct function to control the random process in `mxnet`. You can see the accuracy in each round during training. It is also easy to make prediction and evaluate.
 ```r
 preds = predict(model, test.x)
+```
+
+```
+## Auto detect layout of input matrix, use rowmajor..
+```
+
+```r
 pred.label = max.col(t(preds))-1
 table(pred.label, test.y)
 ```
@@ -130,6 +135,8 @@ table(pred.label, test.y)
 ## 1 36 33
 ```
+Note for multi-class prediction, mxnet outputs nclass x nexamples, with each row corresponding to the probability of that class.
+
 ## Regression
 
 Again, let us preprocess the data first.
@@ -139,9 +146,9 @@ Again, let us preprocess the data first.
 ```r
 data(BostonHousing, package="mlbench")
 train.ind = seq(1, 506, 3)
-train.x = t(data.matrix(BostonHousing[train.ind, -14]))
+train.x = data.matrix(BostonHousing[train.ind, -14])
 train.y = BostonHousing[train.ind, 14]
-test.x = t(data.matrix(BostonHousing[-train.ind, -14]))
+test.x = data.matrix(BostonHousing[-train.ind, -14])
 test.y = BostonHousing[-train.ind, 14]
 ```
@@ -172,6 +179,7 @@ model <- mx.model.FeedForward.create(lro, X=train.x, y=train.y,
 ```
 
 ```
+## Auto detect layout of input matrix, use rowmajor..
 ## Start training with 1 devices
 ## [1] Train-rmse=16.0632825223292
 ## [2] Train-rmse=12.2792375527391
@@ -230,6 +238,13 @@ It is also easy to make prediction and evaluate
 ```r
 preds = predict(model, test.x)
+```
+
+```
+## Auto detect layout of input matrix, use rowmajor..
+```
+
+```r
 sqrt(mean((preds-test.y)^2))
 ```
@@ -259,6 +274,7 @@ model <- mx.model.FeedForward.create(lro, X=train.x, y=train.y,
 ```
 
 ```
+## Auto detect layout of input matrix, use rowmajor..
## Start training with 1 devices ## [1] Train-mae=13.1889538090676 ## [2] Train-mae=9.81431958410475 From 6c66a76bd8aec0936a9a73d3925c8c777082c90b Mon Sep 17 00:00:00 2001 From: muli Date: Tue, 20 Oct 2015 16:47:57 -0400 Subject: [PATCH 019/122] [kvstore] async test --- tests/python/multi-node/dist_async_mlp.py | 27 +++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100755 tests/python/multi-node/dist_async_mlp.py diff --git a/tests/python/multi-node/dist_async_mlp.py b/tests/python/multi-node/dist_async_mlp.py new file mode 100755 index 000000000000..98abdca797ca --- /dev/null +++ b/tests/python/multi-node/dist_async_mlp.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +import mxnet as mx +import logging +import common + +mx.random.seed(0) +logging.basicConfig(level=logging.DEBUG) + +kv = mx.kvstore.create('dist_async') + +(train, val) = common.mnist(num_parts = kv.num_workers, + part_index = kv.rank, + batch_size = 100, + input_shape = (784,)) + +# train +model = mx.model.FeedForward.create( + symbol = common.mlp(), + ctx = mx.cpu(), + X = train, + num_round = 4, + learning_rate = 0.05, + wd = 0.0004, + momentum = 0.9, + kvstore = kv) + +common.accuracy(model, val) From bc89f4da6155ac6f0fb8b4376f74e1e35340490b Mon Sep 17 00:00:00 2001 From: muli Date: Tue, 20 Oct 2015 22:06:23 +0000 Subject: [PATCH 020/122] [kvstore] update ps-lite --- Makefile | 4 ++-- ps-lite | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 8cd116b57800..08d745a5c784 100644 --- a/Makefile +++ b/Makefile @@ -80,7 +80,7 @@ PS_PATH=./ps-lite DEPS_PATH=$(shell pwd)/deps include $(PS_PATH)/make/ps.mk ifeq ($(USE_DIST_KVSTORE), 1) - CFLAGS += -DMXNET_USE_DIST_KVSTORE -I$(PS_PATH)/include + CFLAGS += -DMXNET_USE_DIST_KVSTORE -I$(PS_PATH)/include -I$(DEPS_PATH)/include LIB_DEP += $(PS_PATH)/build/libps.a LDFLAGS += -Wl,-rpath,$(DEPS_PATH)/lib $(PS_LDFLAGS_SO) endif @@ -120,7 +120,7 @@ lib/libmxnet.so: $(ALL_DEP) # ps-lite $(PS_PATH)/build/libps.a: - $(MAKE) CXX=$(CXX) DEPS_PATH=$(DEPS_PATH) -C $(PS_PATH) deps + $(MAKE) CXX=$(CXX) DEPS_PATH=$(DEPS_PATH) -C $(PS_PATH) protobuf zmq $(MAKE) CXX=$(CXX) DEPS_PATH=$(DEPS_PATH) -C $(PS_PATH) ps $(DMLC_CORE)/libdmlc.a: diff --git a/ps-lite b/ps-lite index 5fbee1ffa014..9534631b2b03 160000 --- a/ps-lite +++ b/ps-lite @@ -1 +1 @@ -Subproject commit 5fbee1ffa0140a922fc400a806cbdba4dfb0e653 +Subproject commit 9534631b2b0384abe1cb00be5a3ed60de177b951 From 8e4a7d51fd2aad83cf3be35192087fcfd417914e Mon Sep 17 00:00:00 2001 From: tqchen Date: Tue, 20 Oct 2015 19:32:52 -0700 Subject: [PATCH 021/122] [NDARRAY] Add sqrt, square, choose_element operator --- R-package/R/mxnet_generated.R | 32 ++++++++++++++ mshadow | 2 +- src/ndarray/ndarray.cc | 60 +++++++++++++++++++++++++++ src/ndarray/ndarray_function-inl.h | 34 ++++++++++++++- src/ndarray/ndarray_function.h | 28 +++++++++++++ tests/python/unittest/test_ndarray.py | 25 +++++++++-- 6 files changed, 175 insertions(+), 6 deletions(-) diff --git a/R-package/R/mxnet_generated.R b/R-package/R/mxnet_generated.R index 14334d5da376..6110dd51eac8 100644 --- a/R-package/R/mxnet_generated.R +++ b/R-package/R/mxnet_generated.R @@ -2,6 +2,18 @@ # Generated by mxnet.export, do not edit by hand. ###### +#' Choose one element from each line(row for python, column for R/Julia) in lhs according to index indicated by rhs +#' +#' @param lhs NDArray +#' Left operand to the function. +#' @param rhs NDArray +#' Right operand to the function. 
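+#'
+#' @details Illustrative sketch (hypothetical values; assumes 0-based indices,
+#' as in the underlying C++ kernel): in R's column-major view, one element is
+#' picked from each column of lhs.
+#' \preformatted{
+#' lhs <- mx.nd.array(matrix(1:6, nrow = 3))  # columns (1,2,3) and (4,5,6)
+#' rhs <- mx.nd.array(c(0, 2))
+#' out <- mx.nd.choose.element(lhs, rhs)      # picks 1 and 6
+#' }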
+#' @return out The result mx.ndarray
+#'
+#' @export
+#' @name mx.nd.choose.element
+NULL
+
 #' Clip ndarray elements to range (a_min, a_max)
 #'
 #' @param src NDArray
@@ -28,6 +40,26 @@ NULL
 #' @name mx.nd.dot
 NULL
+#' Take square root of the src
+#'
+#' @param src NDArray
+#'     Source input to the function.
+#' @return out The result mx.ndarray
+#'
+#' @export
+#' @name mx.nd.sqrt
+NULL
+
+#' Take square of the src
+#'
+#' @param src NDArray
+#'     Source input to the function.
+#' @return out The result mx.ndarray
+#'
+#' @export
+#' @name mx.nd.square
+NULL
+
 #' Create iterator for dataset packed in recordio.
 #'
 #' @param path.imglist string, optional, default=''
diff --git a/mshadow b/mshadow
index d2c27549571f..bcc19fc9b6ad 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit d2c27549571fb6a71e81d8b860b1484809d8922f
+Subproject commit bcc19fc9b6ad1d2028d2f79c397d9bea23a94bf7
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index 1aecbd39508b..8f3f23eb6988 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -155,6 +155,53 @@ void ScalarOp(const NDArray &lhs,
   }
 }
 
+/*!
+ * \brief run a unary operation.
+ * \param src source operand
+ * \param out the output ndarray
+ * \param unary_op the real
+ */
+template<typename OP>
+void UnaryOp(const NDArray &src,
+             NDArray *out) {
+  if (out->is_none()) {
+    *out = NDArray(OP::GetShape(src.shape()), src.ctx(), true);
+  } else {
+    CHECK(out->ctx() == src.ctx()) << "target context mismatch";
+    CHECK(out->shape() == OP::GetShape(src.shape())) << "target shape mismatch";
+  }
+  // important: callback must always capture by value
+  NDArray ret = *out;
+  // get the const variables
+  std::vector<Engine::VarHandle> const_vars;
+  if (src.var() != ret.var()) const_vars.push_back(src.var());
+
+  // redirect everything to mshadow operations
+  switch (src.ctx().dev_mask()) {
+    case cpu::kDevMask: {
+      Engine::Get()->PushSync([src, ret](RunContext ctx) {
+          ret.CheckAndAlloc();
+          TBlob tmp = ret.data();
+          ndarray::Eval<cpu, OP>(src.data(), &tmp, ctx);
+        }, src.ctx(), const_vars, {ret.var()});
+      break;
+    }
+#if MXNET_USE_CUDA
+    case gpu::kDevMask: {
+      Engine::Get()->PushSync([src, ret](RunContext ctx) {
+          ret.CheckAndAlloc();
+          TBlob tmp = ret.data();
+          ndarray::Eval<gpu, OP>(src.data(), &tmp, ctx);
+          // Wait GPU kernel to complete
+          ctx.get_stream<gpu>()->Wait();
+        }, src.ctx(), const_vars, {ret.var()});
+      break;
+    }
+#endif
+    default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+  }
+}
+
 void CopyFromTo(const NDArray &from, NDArray *to, int priority) {
   CHECK(from.shape() == to->shape())
       << "operands shape mismatch";
@@ -601,6 +648,13 @@ void NDArray::SyncCopyToCPU(real_t *data, size_t size) const {
 // those with underscore will be registered at NDArray
 MXNET_REGISTER_NDARRAY_FUN(_set_value).set_function(SetValueOp);
+
+MXNET_REGISTER_NDARRAY_FUN(square).set_function(UnaryOp<ndarray::Square>)
+.describe("Take square of the src");
+
+MXNET_REGISTER_NDARRAY_FUN(sqrt).set_function(UnaryOp<ndarray::SquareRoot>)
+.describe("Take square root of the src");
+
 MXNET_REGISTER_NDARRAY_FUN(_plus).set_function(BinaryOp<ndarray::Plus>);
 MXNET_REGISTER_NDARRAY_FUN(_minus).set_function(BinaryOp<ndarray::Minus>);
 MXNET_REGISTER_NDARRAY_FUN(_mul).set_function(BinaryOp<ndarray::Mul>);
@@ -608,6 +662,12 @@ MXNET_REGISTER_NDARRAY_FUN(_div).set_function(BinaryOp<ndarray::Div>);
 
 MXNET_REGISTER_NDARRAY_FUN(dot).set_function(BinaryOp<ndarray::Dot>)
 .describe("Calculate 2D matrix multiplication");
+
+MXNET_REGISTER_NDARRAY_FUN(choose_element)
+.set_function(BinaryOp<ndarray::MatChooseRowElem>)
+.describe("Choose one element from each line (row for python, column for R/Julia)"
+          " in lhs according to index indicated by rhs");
+
 // register API 
function // those with underscore will be registered at NDArray MXNET_REGISTER_NDARRAY_FUN(_plus_scalar).set_function(ScalarOp); diff --git a/src/ndarray/ndarray_function-inl.h b/src/ndarray/ndarray_function-inl.h index 487152b2bd0f..ed75fb35af2d 100644 --- a/src/ndarray/ndarray_function-inl.h +++ b/src/ndarray/ndarray_function-inl.h @@ -18,6 +18,14 @@ } #endif +#ifndef DECL_UNARY +#define DECL_UNARY(XPU, OP, FUN) \ + template<> \ + void Eval(const TBlob &src, TBlob *ret, RunContext ctx) { \ + FUN(src, ret, ctx); \ + } +#endif + #ifndef DECL_SCALAR #define DECL_SCALAR(XPU, OP, FUN, REVERSE) \ template<> \ @@ -45,9 +53,18 @@ inline void EvalBinary_(const TBlob &lhs, const TBlob &rhs, rhs.FlatTo2D(s)); } +template +inline void EvalUnary_(const TBlob &src, + TBlob *ret, RunContext ctx) { + using namespace mshadow::expr; + mshadow::Stream *s = ctx.get_stream(); + ret->FlatTo2D(s) + = F(src.FlatTo2D(s)); +} + template inline void EvalDot_(const TBlob &lhs, const TBlob &rhs, - TBlob *ret, RunContext ctx) { + TBlob *ret, RunContext ctx) { using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); ret->FlatTo2D(s) @@ -55,6 +72,16 @@ inline void EvalDot_(const TBlob &lhs, const TBlob &rhs, rhs.FlatTo2D(s)); } +template +inline void EvalMatChooseRowElem_(const TBlob &lhs, const TBlob &rhs, + TBlob *ret, RunContext ctx) { + using namespace mshadow::expr; + mshadow::Stream *s = ctx.get_stream(); + ret->get(s) + = mat_choose_row_element(lhs.get(s), + rhs.get(s)); +} + template inline void EvalScalar_(const TBlob &lhs, const real_t &rhs, TBlob *ret, RunContext ctx) { @@ -150,7 +177,7 @@ void ElementwiseSum(const std::vector source, } default: { Tensor in_0 = source[0].FlatTo2D(s); - out = F(in_0); + out = F(in_0); for (size_t i = 1; i < source.size(); ++i) { out += source[i].FlatTo2D(s); } @@ -160,6 +187,9 @@ void ElementwiseSum(const std::vector source, } // declarations +DECL_UNARY(DEVICE, Square, EvalUnary_) +DECL_UNARY(DEVICE, SquareRoot, EvalUnary_) +DECL_BINARY(DEVICE, MatChooseRowElem, EvalMatChooseRowElem_) DECL_BINARY(DEVICE, Dot, EvalDot_) DECL_BINARY(DEVICE, Plus, EvalBinary_) DECL_BINARY(DEVICE, Minus, EvalBinary_) diff --git a/src/ndarray/ndarray_function.h b/src/ndarray/ndarray_function.h index f23b696bf5eb..1dc689d9e5a2 100644 --- a/src/ndarray/ndarray_function.h +++ b/src/ndarray/ndarray_function.h @@ -11,6 +11,7 @@ #include #include #include +#include "../operator/mshadow_op.h" namespace mxnet { /*! 
\brief namespace to support all possible Ndarray operator */ @@ -22,6 +23,13 @@ struct BinaryBase { return lshape; } }; + +struct UnaryBase { + inline static TShape GetShape(const TShape &shape) { + return shape; + } +}; + // operators struct Plus : public BinaryBase { typedef mshadow::op::plus mshadow_op; @@ -39,6 +47,14 @@ struct Div : public BinaryBase { typedef mshadow::op::div mshadow_op; }; +struct Square : public UnaryBase { + typedef op::mshadow_op::square mshadow_op; +}; + +struct SquareRoot : public UnaryBase { + typedef op::mshadow_op::square_root mshadow_op; +}; + struct ClipMin : public BinaryBase { struct mshadow_op { MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { @@ -72,6 +88,15 @@ struct Dot { } }; +struct MatChooseRowElem { + inline static TShape GetShape(const TShape &lshape, const TShape &rshape) { + CHECK(lshape.ndim() == 2 && rshape.ndim() == 1) + << "choose_row_element only support 2D Matrix and 1D index"; + CHECK_EQ(lshape[0], rshape[0]) << "choose_row_element index and matrix shape mismatch"; + return rshape; + } +}; + // type holder for random number generators struct UniformDistribution {}; @@ -84,6 +109,9 @@ void EvalClip(const TBlob &src, const real_t &a_min, const real_t &a_max, template void Eval(const TBlob &lhs, const TBlob &rhs, TBlob *ret, RunContext ctx); +template +void Eval(const TBlob &src, TBlob *ret, RunContext ctx); + template void Eval(const TBlob &lhs, const real_t &rhs, TBlob *ret, RunContext ctx); diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py index 36a1672bc636..9f3da8a53e4d 100644 --- a/tests/python/unittest/test_ndarray.py +++ b/tests/python/unittest/test_ndarray.py @@ -14,7 +14,7 @@ def same(a, b): return np.sum(a != b) == 0 -def check_with_uniform(uf, arg_shapes, dim=None): +def check_with_uniform(uf, arg_shapes, dim=None, npuf=None, rmin=-10): """check function consistency with uniform random numbers""" if isinstance(arg_shapes, int): assert dim @@ -23,13 +23,17 @@ def check_with_uniform(uf, arg_shapes, dim=None): ndarray_arg = [] numpy_arg = [] for s in arg_shapes: - npy = np.random.uniform(-10, 10, s) + npy = np.random.uniform(rmin, 10, s) narr = mx.nd.array(npy) ndarray_arg.append(narr) numpy_arg.append(npy) out1 = uf(*ndarray_arg) - out2 = uf(*numpy_arg) + if npuf is None: + out2 = uf(*numpy_arg) + else: + out2 = npuf(*numpy_arg) assert out1.shape == out2.shape + dif = reldiff(out1.asnumpy(), out2) assert reldiff(out1.asnumpy(), out2) < 1e-6 @@ -48,6 +52,9 @@ def test_ndarray_elementwise(): check_with_uniform(lambda x, y: x - y, 2, dim) check_with_uniform(lambda x, y: x * y, 2, dim) check_with_uniform(lambda x, y: x / y, 2, dim) + check_with_uniform(mx.nd.sqrt, 2, dim, np.sqrt, rmin=0) + check_with_uniform(mx.nd.square, 2, dim, np.square, rmin=0) + def test_ndarray_negate(): npy = np.random.uniform(-10, 10, (2,3,4)) @@ -61,6 +68,17 @@ def test_ndarray_negate(): assert reldiff(npy, arr.asnumpy()) < 1e-6 +def test_ndarray_choose(): + shape = (100, 20) + npy = np.arange(np.prod(shape)).reshape(shape) + arr = mx.nd.array(npy) + nrepeat = 3 + for repeat in range(nrepeat): + indices = np.random.randint(shape[1], size=shape[0]) + assert same(npy[np.arange(shape[0]), indices], + mx.nd.choose_element(arr, mx.nd.array(indices)).asnumpy()) + + def test_ndarray_copy(): c = mx.nd.array(np.random.uniform(-10, 10, (10, 10))) d = c.copyto(mx.Context('cpu', 0)) @@ -158,3 +176,4 @@ def test_dot(): test_ndarray_scalar() test_clip() test_dot() + test_ndarray_choose() From 
349474d5260256332f7cbbbacde77825d03d0f46 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Tue, 20 Oct 2015 20:22:41 -0600 Subject: [PATCH 022/122] LSTM base code --- example/LSTM/PennTree.ipynb | 643 ---------------------------------- example/rnn/README.md | 6 + example/rnn/lstm.py | 287 +++++++++++++++ example/rnn/lstm_ptb.py | 90 +++++ ps-lite | 2 +- src/operator/block_grad-inl.h | 2 +- 6 files changed, 385 insertions(+), 645 deletions(-) delete mode 100644 example/LSTM/PennTree.ipynb create mode 100644 example/rnn/README.md create mode 100644 example/rnn/lstm.py create mode 100644 example/rnn/lstm_ptb.py diff --git a/example/LSTM/PennTree.ipynb b/example/LSTM/PennTree.ipynb deleted file mode 100644 index 798525284b8b..000000000000 --- a/example/LSTM/PennTree.ipynb +++ /dev/null @@ -1,643 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# LSTM on PennTreeBank\n", - "-----\n", - "This is an example to show how to use MXNet low-level symbol to make a LSTM network.\n", - "\n", - "We would like to thank Wojciech Zaremba for his work LSTM in Torch. The data is same to Wojciech used in Torch LSTM. https://github.com/wojzaremba/lstm\n", - "\n", - "To get the data, please download directly from:\n", - "\n", - "Training text: https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt\n", - "\n", - "Validation text: https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt\n", - "\n", - "Test text: https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import mxnet as mx\n", - "import numpy as np\n", - "import time" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " Build LSTM Symbol\n", - "\n", - " Parameters:\n", - " ----------\n", - " num_hidden: int\n", - " hidden unit in LSTM\n", - " x: symbol\n", - " input x\n", - " prev_c: symbol\n", - " previous cell\n", - " prev_h: symbol\n", - " previous hidden\n", - " layer_prefix: str\n", - " name prefix for layer\n", - " t_prefix: str\n", - " name prefix for time\n", - " arg_param: dict: str->symbol\n", - " arguments symbol for the lstm symbol\n", - " aux_param: dict: str->symbol\n", - " auxiliary states symbol for the lstm symbol\n", - "\n", - " Returns:\n", - " --------\n", - " output: symbol\n", - " grouped lstm output [c, h]\n", - "\n", - " arg_param: dict: str->symbol\n", - " arguments symbol of the lstm symbol\n", - "\n", - " aux_param: dict: str->symbol\n", - " auxiliary states symbol of the lstm symbol\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def lstm_symbol(num_hidden,\n", - " x, prev_c, prev_h,\n", - " layer_prefix, t_prefix,\n", - " arg_param=None, aux_param=None,\n", - " **kwargs):\n", - " # name and variable\n", - " i2h_name = \"%s_i2h\" % layer_prefix\n", - " h2h_name = \"%s_h2h\" % layer_prefix\n", - " exist_flag = True\n", - " if arg_param == None or i2h_name + \"_weight\" not in arg_param:\n", - " exist_flag = False\n", - "\n", - " if not exist_flag:\n", - " if arg_param == None:\n", - " arg_param = {}\n", - " arg_param[i2h_name + \"_weight\"] = mx.sym.Variable(i2h_name + \"_weight\")\n", - " arg_param[i2h_name + \"_bias\"] = mx.sym.Variable(i2h_name + \"_bias\")\n", - " arg_param[h2h_name + \"_weight\"] = mx.sym.Variable(h2h_name + \"_weight\")\n", - " arg_param[h2h_name + 
\"_bias\"] = mx.sym.Variable(h2h_name + \"_bias\")\n", - " if not exist_flag:\n", - " if aux_param == None:\n", - " aux_param = {}\n", - " aux_param[i2h_name + \"_moving_mean\"] = mx.sym.Variable(i2h_name + \"_moving_mean\")\n", - " aux_param[i2h_name + \"_moving_var\"] = mx.sym.Variable(i2h_name + \"_moving_var\")\n", - " aux_param[h2h_name + \"_moving_mean\"] = mx.sym.Variable(h2h_name + \"_moving_mean\")\n", - " aux_param[h2h_name + \"_moving_var\"] = mx.sym.Variable(h2h_name + \"_moving_var\")\n", - "\n", - " # transform \n", - " i2h = mx.sym.FullyConnected(*[x,\n", - " arg_param[i2h_name + \"_weight\"],\n", - " arg_param[i2h_name + \"_bias\"]],\n", - " num_hidden=num_hidden * 4,\n", - " name=i2h_name)\n", - " h2h = mx.sym.FullyConnected(*[prev_h,\n", - " arg_param[h2h_name + \"_weight\"],\n", - " arg_param[h2h_name + \"_bias\"]],\n", - " num_hidden=num_hidden * 4,\n", - " name=h2h_name)\n", - " gates = i2h + h2h\n", - "\n", - " # gates\n", - " slice_gates = mx.sym.SliceChannel(data=gates, num_outputs=4)\n", - " in_gate = mx.sym.Activation(data=slice_gates[0], act_type=\"sigmoid\")\n", - " in_transform = mx.sym.Activation(data=slice_gates[1], act_type=\"tanh\")\n", - " forget_gate = mx.sym.Activation(data=slice_gates[2], act_type=\"sigmoid\")\n", - " out_gate = mx.sym.Activation(data=slice_gates[3], act_type=\"sigmoid\")\n", - "\n", - " # cal states\n", - " next_c = (forget_gate * prev_c) + (in_gate * in_transform)\n", - " next_h = out_gate * mx.sym.Activation(data=next_c, act_type=\"tanh\")\n", - " # We need to block gradient to set 0 gradient back automatically\n", - " next_c = mx.sym.BlockGrad(data=next_c, name=\"%s_%s_c\" % (t_prefix, layer_prefix))\n", - " next_h = mx.sym.BlockGrad(data=next_h, name=\"%s_%s_h\" % (t_prefix, layer_prefix))\n", - " # if you like you can add a dropout symbol here\n", - " # next_h = mx.sym.Dropout(data=next_h, p=0.5)\n", - " output = mx.symbol.Group([next_c, next_h])\n", - " return (output, arg_param, aux_param)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " Build a multi-layer LSTM model for a single component in unrolled RNN\n", - "\n", - " Parameters:\n", - " -----------\n", - " num_layer: int\n", - " layers of LSTM network\n", - " num_hidden: int\n", - " hidden unit in each LSTM layer\n", - " num_embed: int\n", - " dimention of word embedding\n", - " num_label: int\n", - " output label dimention\n", - " prev_states: list of tuple (prev_c, prev_h)\n", - " prev_states for each LSTM layer\n", - " t_prefix: str\n", - " prefix name of time\n", - " embed_var: list of symbol\n", - " vairable for embedding layer\n", - " cls_var: list of symbol\n", - " variable for linear classifier\n", - " arg_param: dict: str->symbol\n", - " arguments symbol of the lstm symbol\n", - " aux_param: dict: str->symbol\n", - " auxiliary states symbol of the lstm symbol\n", - "\n", - " Returns:\n", - " layers : list of symbol\n", - " layers of current component\n", - " arg_param: dict: str->symbol\n", - " arguments symbol of the lstm symbol\n", - "\n", - " aux_param: dict: str->symbol\n", - " auxiliary states symbol of the lstm symbol\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def create_model(num_layer, num_hidden, num_embed, num_label,\n", - " prev_states,\n", - " t_prefix,\n", - " embed_var, cls_var, arg_param=None, aux_param=None,\n", - " **kwargs):\n", - " layers = []\n", - " data = mx.sym.Variable(\"%s_data\" % t_prefix)\n", - " embed_layer 
= mx.sym.FullyConnected(*[data, embed_var[0], embed_var[1]],\n", - " num_hidden=num_embed, name=\"embedding\")\n", - " for i in range(num_layer):\n", - " layer_prefix = \"layer_%d\" % i\n", - " prev_c, prev_h = prev_states[i]\n", - " if i == 0:\n", - " data = embed_layer\n", - " else:\n", - " data = layers[-1][1]\n", - " args = None\n", - " auxs = None\n", - " if arg_param != None:\n", - " args = arg_param\n", - " if aux_param != None:\n", - " auxs = aux_param\n", - " lstm, arg_param, aux_param = lstm_symbol(num_hidden,\n", - " data, prev_c, prev_h,\n", - " layer_prefix, t_prefix,\n", - " args, auxs,\n", - " **kwargs)\n", - " layers.append(lstm)\n", - " fc = mx.sym.FullyConnected(*[layers[-1][1], cls_var[0], cls_var[1]],\n", - " num_hidden=num_label, name=\"cls\")\n", - " sm = mx.sym.Softmax(data=fc, name=\"%s\" % t_prefix)\n", - " layers.append(sm)\n", - " return layers, arg_param, aux_param" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " Setup Recurrent Network Symbol\n", - "\n", - " Parameters:\n", - " -----------\n", - " seq_len: int\n", - " length of sequence\n", - " num_layer: int\n", - " layer of hidden lstm layers\n", - " num_embed: int\n", - " dimention of embeeding layer\n", - " num_label: int\n", - " dimention of output space\n", - " models = []\n", - "\n", - " Returns:\n", - " --------\n", - " rnn: symbol\n", - " A final symbol of RNN network\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "def setup_rnn_symbol(seq_len, num_layer, num_hidden, num_embed, num_label, **kwargs):\n", - " models = []\n", - " arg_param = None\n", - " aux_param = None\n", - " embed_var = [mx.sym.Variable(\"embed_weight\"), mx.sym.Variable(\"embed_bias\")]\n", - " cls_var = [mx.sym.Variable(\"cls_weight\"), mx.sym.Variable(\"cls_bias\")]\n", - " init_states = []\n", - "\n", - " for i in range(num_layer):\n", - " init_c = mx.sym.Variable(\"init_c_%d\" % i)\n", - " init_h = mx.sym.Variable(\"init_h_%d\" % i)\n", - " init_states.append([init_c, init_h])\n", - "\n", - " for i in range(seq_len):\n", - " t_prefix = \"t_%d\" % i\n", - " if i == 0:\n", - " states = init_states\n", - " else:\n", - " states = [(models[-1][j][0], models[-1][j][1]) for j in range(num_layer)]\n", - " model, arg_param, aux_param = create_model(num_layer, num_hidden, num_embed, num_label,\n", - " states, t_prefix,\n", - " embed_var, cls_var,\n", - " arg_param, aux_param,\n", - " **kwargs)\n", - " models.append(model)\n", - " prob = mx.sym.Group([md[-1] for md in models])\n", - " state = mx.sym.Group([models[-1][i] for i in range(num_layer)])\n", - " rnn = mx.sym.Group([prob, state])\n", - " return rnn" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " Setup Recurrent Network Executor\n", - "\n", - " Parameters:\n", - " -----------\n", - " ctx: Context\n", - " running context\n", - " seq_len: int\n", - " length of sequence\n", - " num_layer: int\n", - " layer of hidden lstm layers\n", - " num_embed: int\n", - " dimention of embeeding layer\n", - " num_label: int\n", - " dimention of output space\n", - " batch_size: int\n", - " number of batch_size\n", - " Returns:\n", - " --------\n", - " rnn: executor\n", - " A final RNN network\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "def setup_rnn(ctx, seq_len, num_layer, num_hidden, num_embed, num_label, batch_size,\n", - " 
initializer=mx.init.Uniform(0.05)):\n", - "\n", - " # get symbol\n", - " rnn_sym = setup_rnn_symbol(seq_len, num_layer, num_hidden, num_embed, num_label)\n", - " input_shapes = {}\n", - " for name in rnn_sym.list_arguments():\n", - " if \"init\" in name:\n", - " input_shapes[name] = (batch_size, num_hidden)\n", - " if \"data\" in name:\n", - " input_shapes[name] = (batch_size, num_label)\n", - " # bind symbol\n", - " rnn_model = rnn_sym.simple_bind(ctx=ctx, **input_shapes)\n", - " # init weight\n", - " names = rnn_sym.list_arguments()\n", - " args = dict(zip(names, rnn_model.arg_arrays))\n", - " grad = dict(zip(names, rnn_model.grad_arrays))\n", - " for name, arr in args.items():\n", - " if name.endswith(\"weight\") or name.endswith(\"bias\") or \\\n", - " name.endswith(\"gamma\") or name.endswith(\"beta\"):\n", - " initializer(name, arr)\n", - " # structure for later use\n", - " param_array = []\n", - " for i in range(len(names)):\n", - " name = names[i]\n", - " if name.endswith(\"weight\") or name.endswith(\"bias\") or \\\n", - " name.endswith(\"gamma\") or name.endswith(\"beta\"):\n", - " param_array.append((i, args[name], grad[name]))\n", - " \n", - " init_states = [(args[\"init_c_%d\" % i], args[\"init_h_%d\" % i]) for i in range(num_layer)]\n", - " last_states = [(rnn_model.outputs[seq_len + i * 2], rnn_model.outputs[seq_len + i *2 + 1]) for i in range(num_layer)]\n", - " return (rnn_sym, rnn_model, param_array, init_states, last_states)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def Logloss(y, prob):\n", - " #eps = 1e-6\n", - " #return -np.sum(np.log(np.maximum(np.choose(y.astype(\"int32\"), prob.T), eps)))\n", - " loss = 0.0\n", - " for i in range(prob.shape[0]):\n", - " loss += -np.log(np.max(prob[i, y[i]], 1e-8))\n", - " loss /= prob.shape[0]\n", - " return loss\n", - "\n", - "def set_onehot_input(onehot, xidx):\n", - " onehot[:] = 0.\n", - " onehot[np.arange(onehot.shape[0]), xidx.astype(\"int32\")] = 1.\n", - "\n", - "def load_data(path, dic=None):\n", - " fi = open(path)\n", - " content = fi.read()\n", - " content = content.replace('\\n', '')\n", - " content = content.split(' ')\n", - " print(\"Loading %s, size of data = %d\" % (path, len(content)))\n", - " x = np.zeros(len(content))\n", - " if dic == None:\n", - " dic = {}\n", - " idx = 0\n", - " for i in range(len(content)):\n", - " word = content[i]\n", - " if len(word) == 0:\n", - " continue\n", - " if not word in dic:\n", - " dic[word] = idx\n", - " idx += 1\n", - " x[i] = dic[word]\n", - " print(\"Unique token: %d\" % len(dic))\n", - " return x, dic\n", - "\n", - "def replicate_data(x, batch_size):\n", - " nbatch = int(x.shape[0] / batch_size)\n", - " x_cut = x[:nbatch * batch_size]\n", - " data = x_cut.reshape((nbatch, batch_size), order='F')\n", - " return data" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading ./data/ptb.train.txt, size of data = 929590\n", - "Unique token: 10000\n", - "Loading ./data/ptb.valid.txt, size of data = 73761\n", - "Unique token: 10000\n" - ] - } - ], - "source": [ - "batch_size = 20\n", - "seq_len = 20\n", - "vocab = 10000\n", - "rnn_hidden = 200\n", - "embed = 200\n", - "num_layer = 2\n", - "num_round = 4\n", - "ctx = mx.cpu()\n", - "optimizer = mx.optimizer.SGD(learning_rate=0.01, wd=0.0001)\n", - "# rnn model\n", - "rnn_sym, rnn, param_array, init_states, 
last_states, = setup_rnn(ctx=ctx, \n", - " seq_len=seq_len, \n", - " num_layer=num_layer, \n", - " num_hidden=rnn_hidden, \n", - " num_embed=embed, \n", - " num_label=vocab, \n", - " batch_size=batch_size)\n", - "seq_prob = [mx.nd.zeros(ctx=mx.cpu(), shape=rnn.outputs[i].shape) for i in range(seq_len)]\n", - "param_dict = dict(zip(rnn_sym.list_arguments(), rnn.arg_arrays))\n", - "# load data\n", - "X_train, dic = load_data(\"./data/ptb.train.txt\")\n", - "X_val, _ = load_data(\"./data/ptb.valid.txt\", dic)\n", - "X_train_batch = replicate_data(X_train, batch_size)\n", - "X_val_batch = replicate_data(X_val, batch_size)\n", - "onehot = np.zeros((batch_size, vocab), dtype='float32')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch [0], Batch [20]: NLL=8.503, Prep=4931.846\n", - "Epoch [0], Batch [40]: NLL=8.511, Prep=4971.079\n", - "Epoch [0], Batch [60]: NLL=8.366, Prep=4300.328\n", - "Epoch [0], Batch [80]: NLL=8.273, Prep=3917.564\n", - "Epoch [0], Batch [100]: NLL=8.241, Prep=3793.372\n", - "Epoch [0], Batch [120]: NLL=8.146, Prep=3448.532\n", - "Epoch [0], Batch [140]: NLL=8.062, Prep=3172.689\n", - "Epoch [0], Batch [160]: NLL=8.041, Prep=3105.142\n", - "Epoch [0], Batch [180]: NLL=8.107, Prep=3318.143\n", - "Epoch [0], Batch [200]: NLL=8.091, Prep=3264.713\n", - "Epoch [0], Batch [220]: NLL=8.025, Prep=3055.690\n", - "Epoch [0], Batch [240]: NLL=8.020, Prep=3040.329\n", - "Epoch [0], Batch [260]: NLL=7.993, Prep=2960.196\n", - "Epoch [0], Batch [280]: NLL=7.970, Prep=2892.389\n", - "Epoch [0], Batch [300]: NLL=8.021, Prep=3042.987\n", - "Epoch [0], Batch [320]: NLL=7.979, Prep=2918.540\n", - "Epoch [0], Batch [340]: NLL=7.951, Prep=2839.064\n", - "Epoch [0], Batch [360]: NLL=7.982, Prep=2927.875\n", - "Epoch [0], Batch [380]: NLL=7.989, Prep=2948.181\n", - "Epoch [0], Batch [400]: NLL=7.966, Prep=2880.162\n", - "Epoch [0], Batch [420]: NLL=7.942, Prep=2813.045\n", - "Epoch [0], Batch [440]: NLL=7.954, Prep=2847.630\n", - "Epoch [0], Batch [460]: NLL=7.914, Prep=2735.021\n", - "Epoch [0], Batch [480]: NLL=7.878, Prep=2637.867\n", - "Epoch [0], Batch [500]: NLL=7.872, Prep=2624.014\n", - "Epoch [0], Batch [520]: NLL=7.845, Prep=2552.158\n", - "Epoch [0], Batch [540]: NLL=7.813, Prep=2472.874\n", - "Epoch [0], Batch [560]: NLL=7.801, Prep=2443.447\n", - "Epoch [0], Batch [580]: NLL=7.772, Prep=2372.508\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py:6: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n", - "/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py:7: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n", - "/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py:21: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 34\u001b[0m \u001b[1;31m# 
train\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 35\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[0mnbatch\u001b[0m \u001b[1;33m<\u001b[0m \u001b[0mX_train_batch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 36\u001b[1;33m \u001b[0mset_rnn_inputs\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mseq_len\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnbatch\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0monehot\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX_train_batch\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparam_dict\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 37\u001b[0m \u001b[0mrnn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mis_train\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 38\u001b[0m \u001b[0mget_rnn_outputs\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mseq_len\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrnn\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mseq_prob\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m\u001b[0m in \u001b[0;36mset_rnn_inputs\u001b[1;34m(seq_len, idx, onehot, X, param_dict)\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mnext_idx\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[0mset_onehot_input\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0monehot\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m \u001b[0mparam_dict\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mdata_key\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0monehot\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 10\u001b[0m \u001b[0mparam_dict\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mlabel_key\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[0midx\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m/home/bing/github/mxnet/python/mxnet/ndarray.py\u001b[0m in \u001b[0;36m__setitem__\u001b[1;34m(self, in_slice, value)\u001b[0m\n\u001b[0;32m 191\u001b[0m \u001b[0mNDArray\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_set_value\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfloat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 192\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mndarray\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgeneric\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 193\u001b[1;33m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sync_copyfrom\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 194\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 195\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'type %s not supported'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m/home/bing/github/mxnet/python/mxnet/ndarray.py\u001b[0m in \u001b[0;36m_sync_copyfrom\u001b[1;34m(self, source_array)\u001b[0m\n\u001b[0;32m 224\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhandle\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 225\u001b[0m \u001b[0msource_array\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mctypes\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdata_as\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmx_float_p\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 226\u001b[1;33m ctypes.c_size_t(source_array.size)))\n\u001b[0m\u001b[0;32m 227\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 228\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_slice\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstart\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstop\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], - "source": [ - "def set_rnn_inputs(seq_len, idx, onehot, X, param_dict):\n", - " for j in range(seq_len):\n", - " data_key = \"t_%d_data\" % j\n", - " label_key = \"t_%d_label\" % j\n", - " next_idx = (idx + 1) % X.shape[0]\n", - " x = X[idx, :]\n", - " y = X[next_idx, :]\n", - " set_onehot_input(onehot, x)\n", - " param_dict[data_key][:] = onehot\n", - " param_dict[label_key][:] = y\n", - " idx += 1\n", - "\n", - "def get_rnn_outputs(seq_len, rnn, seq_prob):\n", - " for j in range(seq_len):\n", - " seq_prob[j][:] = rnn.outputs[j]\n", - "\n", - "def get_nll(seq_len, idx, X, seq_prob):\n", - " nll = 0.\n", - " for j in range(seq_len):\n", - " next_idx = (idx + 1) % X.shape[0]\n", - " y = X[next_idx, :]\n", - " nll += Logloss(y, seq_prob[j].asnumpy())\n", - " return nll\n", - " \n", - "\n", - "for i in range(num_round):\n", - " nbatch = 0.\n", - " nll = 0.\n", - " # reset states\n", - " for init_c, init_h in init_states:\n", - " init_c[:] = 0.\n", - " init_h[:] = 0.\n", - " tic = time.time()\n", - " # train\n", - " while nbatch < X_train_batch.shape[0]:\n", - " set_rnn_inputs(seq_len, nbatch, onehot, X_train_batch, param_dict)\n", - " rnn.forward(is_train=True)\n", - " get_rnn_outputs(seq_len, rnn, seq_prob)\n", - " rnn.backward()\n", - " for ind, weight, grad in param_array:\n", - " optimizer.update(ind, weight, grad, None)\n", - " for j in range(num_layer):\n", - " init_states[j][0][:] = last_states[j][0]\n", - " init_states[j][1][:] = last_states[j][1]\n", - " nll += get_nll(seq_len, nbatch, X_train_batch, seq_prob)\n", - " nbatch += seq_len\n", - " if nbatch % 1000 == 0:\n", - " print(\"Epoch [%d], Batch [%d]: NLL=%.3f, Prep=%.3f\" % (i, nbatch, nll / nbatch, np.exp(nll / nbatch)))\n", - " toc = time.time()\n", - " print(\"Epoch [%d] Train: Time: %.3f sec, NLL=%.3f, Prep=%.3f\" % (i, toc - tic, nll 
/ nbatch, np.exp(nll / nbatch)))\n", - " nbatch = 0\n", - " nll = 0.\n", - " for init_c, init_h in init_states:\n", - " init_c[:] = 0.\n", - " init_h[:] = 0.\n", - " while nbatch < X_val_batch.shape[0]:\n", - " set_rnn_inputs(seq_len, nbatch, onehot, X_val_batch, param_dict)\n", - " rnn.forward(is_train=False)\n", - " get_rnn_outputs(seq_len, rnn, seq_prob)\n", - " nll += get_nll(seq_len, nbatch, X_val_batch, seq_prob)\n", - " nbatch += seq_len\n", - " print(\"Epoch [%d] Val: NLL=%.3f, Prep=%.3f\" % (i, nll / nbatch, np.exp(nll / nbatch)))\n", - " \n", - " \n", - " \n", - " \n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": 154, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 154, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.4.0" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/example/rnn/README.md b/example/rnn/README.md new file mode 100644 index 000000000000..3955e1809f81 --- /dev/null +++ b/example/rnn/README.md @@ -0,0 +1,6 @@ +RNN Example +---- +This folder contains RNN examples using low level symbol interface. + +- [lstm.py](lstm.py) Basic functions for building a LSTM Network +- [lstm_ptb.py](lstm_ptb.py) PennTreeBank language model by using LSTM diff --git a/example/rnn/lstm.py b/example/rnn/lstm.py new file mode 100644 index 000000000000..a33d627db9db --- /dev/null +++ b/example/rnn/lstm.py @@ -0,0 +1,287 @@ +# pylint:skip-file +import sys +sys.path.insert(0, "../../python") +import mxnet as mx +import numpy as np +from collections import namedtuple +import time + +LSTMState = namedtuple("LSTMState", ["c", "h"]) +LSTMParam = namedtuple("LSTMParam", ["i2h_weight", "i2h_bias", + "h2h_weight", "h2h_bias"]) +LSTMModel = namedtuple("LSTMModel", ["rnn_exec", "symbol", + "init_states", "last_states", + "seq_data", "seq_labels", "seq_outputs", + "param_blocks"]) + +def lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0.): + """LSTM Cell symbol""" + if dropout > 0.: + in_data = mx.sym.Dropout(data=in_data, p=dropout) + i2h = mx.sym.FullyConnected(data=indata, + weight=param.i2h_weight, + bias=param.i2h_bias, + num_hidden=num_hidden * 4, + name="t%d_l%d_i2h" % (seqidx, layeridx)) + h2h = mx.sym.FullyConnected(data=prev_state.h, + weight=param.h2h_weight, + bias=param.h2h_bias, + num_hidden=num_hidden * 4, + name="t%d_l%d_h2h" % (seqidx, layeridx)) + gates = i2h + h2h + slice_gates = mx.sym.SliceChannel(gates, num_outputs=4, + name="t%d_l%d_slice" % (seqidx, layeridx)) + in_gate = mx.sym.Activation(slice_gates[0], act_type="sigmoid") + in_transform = mx.sym.Activation(slice_gates[1], act_type="tanh") + forget_gate = mx.sym.Activation(slice_gates[2], act_type="sigmoid") + out_gate = mx.sym.Activation(slice_gates[3], act_type="sigmoid") + next_c = (forget_gate * prev_state.c) + (in_gate * in_transform) + next_h = out_gate * mx.sym.Activation(next_c, act_type="tanh") + return 
+
+
+def lstm_unroll(num_lstm_layer, seq_len,
+                num_hidden, num_embed, num_label, dropout=0.):
+    """Unrolled LSTM network"""
+    # initialize the parameter symbols
+    embed_weight = mx.sym.Variable("embed_weight")
+    cls_weight = mx.sym.Variable("cls_weight")
+    cls_bias = mx.sym.Variable("cls_bias")
+    param_cells = []
+    last_states = []
+    for i in range(num_lstm_layer):
+        param_cells.append(LSTMParam(i2h_weight = mx.sym.Variable("l%d_i2h_weight" % i),
+                                     i2h_bias = mx.sym.Variable("l%d_i2h_bias" % i),
+                                     h2h_weight = mx.sym.Variable("l%d_h2h_weight" % i),
+                                     h2h_bias = mx.sym.Variable("l%d_h2h_bias" % i)))
+        state = LSTMState(c=mx.sym.Variable("l%d_init_c" % i),
+                          h=mx.sym.Variable("l%d_init_h" % i))
+        last_states.append(state)
+    assert(len(last_states) == num_lstm_layer)
+
+    out_prob = []
+    for seqidx in range(seq_len):
+        # embedding layer
+        data = mx.sym.Variable("t%d_data" % seqidx)
+        label = mx.sym.Variable("t%d_label" % seqidx)
+        hidden = mx.sym.FullyConnected(data=data, weight=embed_weight,
+                                       num_hidden=num_embed, no_bias=True,
+                                       name="t%d_embed" % seqidx)
+        # stack LSTM
+        for i in range(num_lstm_layer):
+            next_state = lstm(num_hidden, indata=hidden,
+                              prev_state=last_states[i],
+                              param=param_cells[i],
+                              seqidx=seqidx, layeridx=i, dropout=0.)
+            hidden = next_state.h
+            last_states[i] = next_state
+        # decoder
+        if dropout > 0.:
+            hidden = mx.sym.Dropout(data=hidden, p=dropout)
+        fc = mx.sym.FullyConnected(data=hidden,
+                                   weight=cls_weight,
+                                   bias=cls_bias,
+                                   num_hidden=num_label,
+                                   name="t%d_cls" % seqidx)
+        sm = mx.sym.Softmax(data=fc, label=label, name="t%d_sm" % seqidx)
+        out_prob.append(sm)
+
+    for i in range(num_lstm_layer):
+        state = last_states[i]
+        state = LSTMState(c=mx.sym.BlockGrad(state.c, name="l%d_last_c" % i),
+                          h=mx.sym.BlockGrad(state.h, name="l%d_last_h" % i))
+        last_states[i] = state
+
+    unpack_c = [state.c for state in last_states]
+    unpack_h = [state.h for state in last_states]
+    list_all = out_prob + unpack_c + unpack_h
+    return mx.sym.Group(list_all)
+
+
+def is_param_name(name):
+    return name.endswith("weight") or name.endswith("bias") or\
+           name.endswith("gamma") or name.endswith("beta")
+
+
+def setup_rnn_model(ctx,
+                    num_lstm_layer, seq_len,
+                    num_hidden, num_embed, num_label,
+                    batch_size, input_size,
+                    initializer, dropout=0.):
+    """set up rnn model with lstm cells"""
+    rnn_sym = lstm_unroll(num_lstm_layer=num_lstm_layer,
+                          num_hidden=num_hidden,
+                          seq_len=seq_len,
+                          num_embed=num_embed,
+                          num_label=num_label,
+                          dropout)
+    print(rnn_sym.list_outputs())
+    arg_names = rnn_sym.list_arguments()
+    print sorted(arg_names)
+
+    input_shapes = {}
+    for name in arg_names:
+        if name.endswith("init_c") or name.endswith("init_h"):
+            input_shapes[name] = (batch_size, num_hidden)
+        elif name.endswith("data"):
+            input_shapes[name] = (batch_size, input_size)
+        else:
+            print("ignore %s " % name)
+
+    arg_shape, out_shape, aux_shape = rnn_sym.infer_shape(**input_shapes)
+    arg_arrays = [mx.nd.zeros(s, ctx) for s in arg_shape]
+    args_grad = {}
+    for shape, name in zip(arg_shape, arg_names):
+        if is_param_name(name):
+            args_grad[name] = mx.nd.zeros(shape, ctx)
+        # else:
+        #     print("Do not need gradient for %s" % name)
+
+    rnn_exec = rnn_sym.bind(ctx=ctx, args=arg_arrays,
+                            args_grad=args_grad,
+                            grad_req="add")
+    param_blocks = []
+    arg_dict = dict(zip(arg_names, rnn_exec.arg_arrays))
+    for i, name in enumerate(arg_names):
+        if is_param_name(name):
+            initializer(name, arg_dict[name])
+            param_blocks.append((i, arg_dict[name], args_grad[name], name))
+        else:
+            assert name not in args_grad
+    out_dict = dict(zip(rnn_sym.list_outputs(), rnn_exec.outputs))
+
+    init_states = [LSTMState(c=arg_dict["l%d_init_c" % i],
+                             h=arg_dict["l%d_init_h" % i]) for i in range(num_lstm_layer)]
+    seq_labels = [rnn_exec.arg_dict["t%d_label" % i] for i in range(seq_len)]
+    seq_data = [rnn_exec.arg_dict["t%d_data" % i] for i in range(seq_len)]
+    last_states = [LSTMState(c=out_dict["l%d_last_c_output" % i],
+                             h=out_dict["l%d_last_h_output" % i]) for i in range(num_lstm_layer)]
+    seq_outputs = [out_dict["t%d_sm_output" % i] for i in range(seq_len)]
+
+    return LSTMModel(rnn_exec=rnn_exec, symbol=rnn_sym,
+                     init_states=init_states, last_states=last_states,
+                     seq_data=seq_data, seq_labels=seq_labels, seq_outputs=seq_outputs,
+                     param_blocks=param_blocks)
+
+
+def set_onehot_input(onehot, xidx):
+    """setup onehot input"""
+    onehot[:] = 0.
+    onehot[np.arange(onehot.shape[0]), xidx.astype("int32")] = 1.
+
+def logloss(y, prob):
+    eps = 1e-10
+    assert prob.shape[0] == len(y)
+    py = prob[np.arange(len(y)), y.astype("int32")]
+    return -np.sum(np.log(np.maximum(py, eps))) / len(y)
+
+def set_rnn_inputs(m, X, onehot, begin):
+    seq_len = len(m.seq_data)
+    batch_size, vocab = onehot.shape
+    for seqidx in range(seq_len):
+        idx = (begin + seqidx) % X.shape[0]
+        next_idx = (begin + seqidx + 1) % X.shape[0]
+        x = X[idx, :]
+        y = X[next_idx, :]
+        onehot[:] = 0.
+        onehot[np.arange(batch_size), x.astype("int32")] = 1.
+        m.seq_data[seqidx][:] = onehot
+        m.seq_labels[seqidx][:] = y
+
+def calc_nll(seq_out, X, begin):
+    nll = 0.
+    for seqidx in range(len(seq_out)):
+        next_idx = (begin + seqidx + 1) % X.shape[0]
+        y = X[next_idx, :]
+        nll += logloss(y, seq_out[seqidx].asnumpy())
+    return nll
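Throughout the training code and the logs in this patch series, `Prep` is the perplexity, which is simply the exponential of the average per-symbol negative log-likelihood that `logloss`/`calc_nll` accumulate. A tiny self-contained check (the numbers are made up for illustration):

```python
import math

# Average negative log-likelihood per predicted symbol (hypothetical value).
nll_per_symbol = 1.373

# Perplexity is exp(NLL); this is the quantity printed as "Prep=..." below.
perplexity = math.exp(nll_per_symbol)
print("NLL=%.3f, Prep=%.3f" % (nll_per_symbol, perplexity))  # NLL=1.373, Prep=3.947
```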
+def train_lstm(model, X_train_batch, X_val_batch,
+               num_round, update_period,
+               optimizer='sgd', half_life=2, **kwargs):
+    print("Training with train.shape=%s" % str(X_train_batch.shape))
+    print("Training with val.shape=%s" % str(X_val_batch.shape))
+    m = model
+    onehot = np.zeros(m.seq_data[0].shape, dtype='float32')
+    seq_len = len(m.seq_data)
+    batch_size = m.seq_data[0].shape[0]
+    print("batch_size=%d" % batch_size)
+    print("seq_len=%d" % seq_len)
+    rescale_grad = 1.0 / (seq_len * batch_size * update_period)
+    opt = mx.optimizer.create(optimizer,
+                              rescale_grad=rescale_grad,
+                              **kwargs)
+    updater = mx.optimizer.get_updater(opt)
+    epoch_counter = 0
+    watch_weight = False
+    log_period = max(1000 / seq_len, 1)
+
+    for iteration in range(num_round):
+        nbatch = 0
+        train_nll = 0
+        # reset states
+        for state in m.init_states:
+            state.c[:] = 0.0
+            state.h[:] = 0.0
+        tic = time.time()
+        assert X_train_batch.shape[0] % seq_len == 0
+        assert X_val_batch.shape[0] % seq_len == 0
+        for begin in range(0, X_train_batch.shape[0], seq_len):
+            set_rnn_inputs(m, X_train_batch, onehot, begin=begin)
+            m.rnn_exec.forward(is_train=True)
+            seq_outs = [out.copyto(mx.cpu()) for out in m.seq_outputs]
+            m.rnn_exec.backward()
+            # transfer the states
+            for init, last in zip(m.init_states, m.last_states):
+                last.c.copyto(init.c)
+                last.h.copyto(init.h)
+            # update epoch counter
+            epoch_counter += 1
+            if epoch_counter % update_period == 0:
+                # TODO add gradient clipping here
+                # update parameters
+                for idx, weight, grad, name in m.param_blocks:
+                    if epoch_counter % log_period == 0 and watch_weight:
+                        dw = grad.asnumpy()
+                        w = weight.asnumpy()
+                        dwnorm = np.linalg.norm(dw, 2) * rescale_grad
+                        wnorm = np.linalg.norm(w, 2)
+                        print("dw:norm(%s): %.3f" % (name, dwnorm))
+                        print("w:norm(%s): %.3f" % (name, wnorm))
+                        if name == "cls_bias":
+                            print(len(dw[dw < 0]))
+                    updater(idx, grad, weight)
+                    # reset gradient to zero
+                    grad[:] = 0.0
+            train_nll += calc_nll(seq_outs, X_train_batch, begin=begin)
+
+            nbatch = begin + seq_len
+            if epoch_counter % log_period == 0:
+                print("Epoch [%d] Train: NLL=%.3f, Prep=%.3f" % (
+                    epoch_counter, train_nll / nbatch, np.exp(train_nll / nbatch)))
+        # end of training loop
+        toc = time.time()
+        print("Iter [%d] Train: Time: %.3f sec, NLL=%.3f, Prep=%.3f" % (
+            iteration, toc - tic, train_nll / nbatch, np.exp(train_nll / nbatch)))
+
+        val_nll = 0.0
+        # validation set, reset states
+        for state in m.init_states:
+            state.c[:] = 0.0
+            state.h[:] = 0.0
+        for begin in range(0, X_val_batch.shape[0], seq_len):
+            set_rnn_inputs(m, X_val_batch, onehot, begin=begin)
+            m.rnn_exec.forward(is_train=False)
+            seq_outs = [out.copyto(mx.cpu()) for out in m.seq_outputs]
+            # transfer the states
+            for init, last in zip(m.init_states, m.last_states):
+                last.c.copyto(init.c)
+                last.h.copyto(init.h)
+            val_nll += calc_nll(seq_outs, X_val_batch, begin=begin)
+        nbatch = X_val_batch.shape[0]
+        print("Iter [%d] Val: NLL=%.3f, Prep=%.3f" % (
+            iteration, val_nll / nbatch, np.exp(val_nll / nbatch)))
+        if (iteration + 1) % half_life == 0:
+            opt.lr *= 0.9
+            print("Reset learning rate to %g" % opt.lr)
diff --git a/example/rnn/lstm_ptb.py b/example/rnn/lstm_ptb.py
new file mode 100644
index 000000000000..b1637fb20ed1
--- /dev/null
+++ b/example/rnn/lstm_ptb.py
@@ -0,0 +1,90 @@
+# pylint:skip-file
+import lstm
+import sys
+sys.path.insert(0, "../../python")
+import mxnet as mx
+import numpy as np
+
+"""
+PennTreeBank Language Model
+We would like to thank Wojciech Zaremba for his Torch LSTM code
+
+The data file can be found at:
+https://github.com/wojzaremba/lstm/tree/master/data
+"""
+
+def load_data(path, dic=None):
+    fi = open(path)
+    content = fi.read()
+    content = content.replace('\n', '')
+    content = content.split(' ')
+    print("Loading %s, size of data = %d" % (path, len(content)))
+    x = np.zeros(len(content))
+    if dic is None:
+        dic = {}
+    # continue numbering after any ids already in the dictionary
+    idx = len(dic)
+    for i in range(len(content)):
+        word = content[i]
+        if len(word) == 0:
+            continue
+        if word not in dic:
+            dic[word] = idx
+            idx += 1
+        x[i] = dic[word]
+    print("Unique tokens: %d" % len(dic))
+    return x, dic
+
+def drop_tail(X, seq_len):
+    shape = X.shape
+    return X[0 : shape[0] // seq_len * seq_len, :]
+
+
+def replicate_data(x, batch_size):
+    nbatch = int(x.shape[0] / batch_size)
+    x_cut = x[:nbatch * batch_size]
+    data = x_cut.reshape((nbatch, batch_size), order='F')
+    return data
+
+batch_size = 20
+seq_len = 20
+num_hidden = 200
+num_embed = 200
+num_lstm_layer = 2
+num_round = 20
+learning_rate = 1
+wd = 0.00001
+momentum = 0.0
+clip_gradient = 1
+update_period = 1
+
+
+X_train, dic = load_data("./data/ptb.train.txt")
+X_val, _ = load_data("./data/ptb.valid.txt", dic)
+X_train_batch = replicate_data(X_train, batch_size)
+X_val_batch = replicate_data(X_val, batch_size)
+vocab = len(dic)
+print("Vocab=%d" % vocab)
+
+X_train_batch = drop_tail(X_train_batch, seq_len)
+X_val_batch = drop_tail(X_val_batch, seq_len)
+
+
+model = lstm.setup_rnn_model(mx.gpu(),
+                             num_lstm_layer=num_lstm_layer,
+                             seq_len=seq_len,
+                             num_hidden=num_hidden,
+                             num_embed=num_embed,
+                             num_label=vocab,
+                             batch_size=batch_size,
+                             input_size=vocab,
+                             initializer=mx.initializer.Uniform(0.1))
+
+lstm.train_lstm(model, X_train_batch, X_val_batch,
+                num_round=num_round,
+                half_life=2,
+                update_period=update_period,
+                learning_rate=learning_rate,
+                wd=wd,
+                momentum=momentum,
+                clip_gradient=clip_gradient)
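As a sanity check on the batching scheme, `replicate_data` lays the token stream out column-major, so each column is a contiguous slice of the corpus and row `t + 1` always holds the successors of row `t`. A toy run (illustrative values only, not from the patch):

```python
import numpy as np

x = np.arange(12, dtype="float32")       # pretend token ids 0..11
batch_size = 3
nbatch = int(x.shape[0] / batch_size)    # 4 rows
data = x[:nbatch * batch_size].reshape((nbatch, batch_size), order='F')
print(data)
# [[ 0.  4.  8.]
#  [ 1.  5.  9.]
#  [ 2.  6. 10.]
#  [ 3.  7. 11.]]
```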
diff --git a/ps-lite b/ps-lite
index 5fbee1ffa014..504faa73a826 160000
--- a/ps-lite
+++ b/ps-lite
@@ -1 +1 @@
-Subproject commit 5fbee1ffa0140a922fc400a806cbdba4dfb0e653
+Subproject commit 504faa73a82638c4b2fe66f5696330da38637c96
diff --git a/src/operator/block_grad-inl.h b/src/operator/block_grad-inl.h
index 4ac7f174dc82..f246f2886cb9 100644
--- a/src/operator/block_grad-inl.h
+++ b/src/operator/block_grad-inl.h
@@ -64,7 +64,7 @@ class BlockGradientProp : public OperatorProperty {
   void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {}

   std::map<std::string, std::string> GetParams() const override {
-    return {};
+    return std::map<std::string, std::string>();
   }

   bool InferShape(std::vector<TShape> *in_shape,
From 280a423f8f8c420a1088b465bc8ca5ade01b21c0 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Tue, 20 Oct 2015 21:24:51 -0700
Subject: [PATCH 023/122] [NDArray] Add onehot_encode

---
 mshadow                               |  2 +-
 python/mxnet/ndarray.py               | 22 ++++++++++++++++++++++
 src/ndarray/ndarray.cc                |  2 ++
 src/ndarray/ndarray_function-inl.h    | 11 +++++++++++
 src/ndarray/ndarray_function.h        |  9 +++++++++
 tests/python/unittest/test_ndarray.py | 13 +++++++++++++
 6 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/mshadow b/mshadow
index bcc19fc9b6ad..129e060d76cd 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit bcc19fc9b6ad1d2028d2f79c397d9bea23a94bf7
+Subproject commit 129e060d76cd5d7f42ac4c26cf39d3289a8540a6
diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py
index f08aeb3675d9..a2b5ccadb2d7 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray.py
@@ -335,6 +335,28 @@ def copyto(self, other):
             raise TypeError('copyto do not support type ' + str(type(other)))
     # pylint: enable= no-member

+
+def onehot_encode(indices, out):
+    """One-hot encode indices into the matrix out.
+
+    Parameters
+    ----------
+    indices: NDArray
+        An NDArray containing indices of the categorical features.
+
+    out: NDArray
+        The result holder of the encoding.
+
+    Returns
+    -------
+    out: NDArray
+        Same as out.
+    """
+    # pylint: disable= no-member, protected-access
+    return NDArray._onehot_encode(indices, out, out=out)
+    # pylint: enable= no-member, protected-access
+
+
 def empty(shape, ctx=None):
     """Create an empty uninitialized new NDArray, with specified shape.
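For reference, a minimal sketch of how the new NDArray function is meant to be called (toy shapes; this assumes the 2015-era `mx.nd` API exactly as added by this patch, and the output values follow from the one-hot definition):

```python
import mxnet as mx

indices = mx.nd.array([1, 0, 2])   # one categorical index per row
out = mx.nd.zeros((3, 4))          # pre-allocated (rows, num_categories) holder
mx.nd.onehot_encode(indices, out)  # writes a 1.0 at each (row, index) position
print(out.asnumpy())
# [[ 0.  1.  0.  0.]
#  [ 1.  0.  0.  0.]
#  [ 0.  0.  1.  0.]]
```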
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index 8f3f23eb6988..74ac76c00f66 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -663,6 +663,8 @@ MXNET_REGISTER_NDARRAY_FUN(_div).set_function(BinaryOp<ndarray::Div>);
 MXNET_REGISTER_NDARRAY_FUN(dot).set_function(BinaryOp<ndarray::Dot>)
 .describe("Calcuate 2D matrix multiplication");

+MXNET_REGISTER_NDARRAY_FUN(_onehot_encode).set_function(BinaryOp<ndarray::OneHotEncode>);
+
 MXNET_REGISTER_NDARRAY_FUN(choose_element)
 .set_function(BinaryOp<ndarray::MatChooseRowElem>)
 .describe("Choose one element from each line(row for python, column for R/Julia)"
diff --git a/src/ndarray/ndarray_function-inl.h b/src/ndarray/ndarray_function-inl.h
index ed75fb35af2d..8b5bfc72bcc1 100644
--- a/src/ndarray/ndarray_function-inl.h
+++ b/src/ndarray/ndarray_function-inl.h
@@ -72,6 +72,16 @@ inline void EvalDot_(const TBlob &lhs, const TBlob &rhs,
                      rhs.FlatTo2D<xpu, real_t>(s));
 }

+template<typename xpu>
+inline void EvalOneHot_(const TBlob &index, const TBlob &rhs,
+                        TBlob *ret, RunContext ctx) {
+  using namespace mshadow::expr;
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  ret->get<xpu, 2, real_t>(s)
+      = one_hot_encode(index.get<xpu, 1, real_t>(s),
+                       rhs.shape_[1]);
+}
+
 template<typename xpu>
 inline void EvalMatChooseRowElem_(const TBlob &lhs, const TBlob &rhs,
                                   TBlob *ret, RunContext ctx) {
@@ -191,6 +201,7 @@ DECL_UNARY(DEVICE, Square, EvalUnary_)
 DECL_UNARY(DEVICE, SquareRoot, EvalUnary_)
 DECL_BINARY(DEVICE, MatChooseRowElem, EvalMatChooseRowElem_)
 DECL_BINARY(DEVICE, Dot, EvalDot_)
+DECL_BINARY(DEVICE, OneHotEncode, EvalOneHot_)
 DECL_BINARY(DEVICE, Plus, EvalBinary_)
 DECL_BINARY(DEVICE, Minus, EvalBinary_)
 DECL_BINARY(DEVICE, Mul, EvalBinary_)
diff --git a/src/ndarray/ndarray_function.h b/src/ndarray/ndarray_function.h
index 1dc689d9e5a2..1263f39e5998 100644
--- a/src/ndarray/ndarray_function.h
+++ b/src/ndarray/ndarray_function.h
@@ -88,6 +88,15 @@ struct Dot {
   }
 };

+
+struct OneHotEncode {
+  inline static TShape GetShape(const TShape &index, const TShape &proptype) {
+    CHECK(index.ndim() == 1 && proptype.ndim() == 2) << "OneHotEncode only support 1d index.";
+    CHECK_EQ(index[0], proptype[0]) << "OneHotEncode shape inconsistent";
+    return proptype;
+  }
+};
+
 struct MatChooseRowElem {
   inline static TShape GetShape(const TShape &lshape, const TShape &rshape) {
     CHECK(lshape.ndim() == 2 && rshape.ndim() == 1)
diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py
index 9f3da8a53e4d..d41c47662e2f 100644
--- a/tests/python/unittest/test_ndarray.py
+++ b/tests/python/unittest/test_ndarray.py
@@ -79,6 +79,19 @@ def test_ndarray_choose():
                 mx.nd.choose_element(arr, mx.nd.array(indices)).asnumpy())


+def test_ndarray_onehot():
+    shape = (100, 20)
+    npy = np.arange(np.prod(shape)).reshape(shape)
+    arr = mx.nd.array(npy)
+    nrepeat = 3
+    for repeat in range(nrepeat):
+        indices = np.random.randint(shape[1], size=shape[0])
+        npy[:] = 0.0
+        npy[np.arange(shape[0]), indices] = 1.0
+        mx.nd.onehot_encode(mx.nd.array(indices), out=arr)
+        assert same(npy, arr.asnumpy())
+
+
 def test_ndarray_copy():
     c = mx.nd.array(np.random.uniform(-10, 10, (10, 10)))
     d = c.copyto(mx.Context('cpu', 0))
From 28f7ee4d4b94af914da96d4f89eb87629b540e96 Mon Sep 17 00:00:00 2001
From: muli
Date: Wed, 21 Oct 2015 04:31:06 +0000
Subject: [PATCH 024/122] [kvstore] more test scripts

---
 ps-lite                                       |   2 +-
 tests/python/multi-node/README.md             | 309 +++++++++++++++++-
 tests/python/multi-node/common.py             |   3 +-
 .../python/multi-node/dist_async_inception.py |  31 ++
 tests/python/multi-node/dist_async_lenet.py   |  27 ++
 .../multi-node/dist_imagenet_inception.py     |  30 ++
 tests/python/multi-node/imagenet.py           |
 101 ++++++
 tests/python/multi-node/local_inception.py    |   3 +-
 8 files changed, 497 insertions(+), 9 deletions(-)
 create mode 100755 tests/python/multi-node/dist_async_inception.py
 create mode 100755 tests/python/multi-node/dist_async_lenet.py
 create mode 100755 tests/python/multi-node/dist_imagenet_inception.py
 create mode 100644 tests/python/multi-node/imagenet.py

diff --git a/ps-lite b/ps-lite
index 9534631b2b03..7121aa1bdb67 160000
--- a/ps-lite
+++ b/ps-lite
@@ -1 +1 @@
-Subproject commit 9534631b2b0384abe1cb00be5a3ed60de177b951
+Subproject commit 7121aa1bdb673f047c7600eb4347fd2911021710
diff --git a/tests/python/multi-node/README.md b/tests/python/multi-node/README.md
index 32d308017c5f..9713199ee17c 100644
--- a/tests/python/multi-node/README.md
+++ b/tests/python/multi-node/README.md
@@ -1,15 +1,312 @@
 # Test multi-devices and multi-machines

-must disable `CUDNN`
+Note that `CUDNN` introduces randomness; disable it when comparing results against the baseline.

-`local_*` for multi-devices and single machine. Requires two GPUs.
+- `local_*` for multi-devices and single machine. Requires two GPUs.
+- `dist_sync_*` for multi-machines with BSP synchronization
+- `dist_async_*` for multi-machines with asynchronous SGD

-`dist_*` for multi-machines. Run in local machine with 2 workers (requires at
-least two gpus) and 2 servers.
-
 ```
-ln -s ../../../dmlc-core/tracker/dmlc_local.py .
+ln -s ../../../ps-lite/tracker/dmlc_local.py .
 ./dmlc_local.py -n 2 -s 2 ./dist_sync_mlp.py
 ```
+
+# Results
+
+## cifar10, inceptions
+
+single GTX 980, batch size = 128 and learning rate = 0.1
+
+```
+[03:42:04] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/train.rec, use 4 threads for decoding..
+[03:42:04] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin
+[03:42:04] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/test.rec, use 4 threads for decoding..
+[03:42:04] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +INFO:root:Iteration[0] Train-accuracy=0.523938 +INFO:root:Iteration[0] Time cost=104.396 +INFO:root:Iteration[0] Validation-accuracy=0.665941 +INFO:root:Iteration[1] Train-accuracy=0.721108 +INFO:root:Iteration[1] Time cost=105.245 +INFO:root:Iteration[1] Validation-accuracy=0.755934 +INFO:root:Iteration[2] Train-accuracy=0.793298 +INFO:root:Iteration[2] Time cost=105.101 +INFO:root:Iteration[2] Validation-accuracy=0.784909 +INFO:root:Iteration[3] Train-accuracy=0.835198 +INFO:root:Iteration[3] Time cost=104.816 +INFO:root:Iteration[3] Validation-accuracy=0.799150 +INFO:root:Iteration[4] Train-accuracy=0.869625 +INFO:root:Iteration[4] Time cost=104.571 +INFO:root:Iteration[4] Validation-accuracy=0.809533 +INFO:root:Iteration[5] Train-accuracy=0.895201 +INFO:root:Iteration[5] Time cost=104.357 +INFO:root:Iteration[5] Validation-accuracy=0.811214 +INFO:root:Iteration[6] Train-accuracy=0.911025 +INFO:root:Iteration[6] Time cost=104.347 +INFO:root:Iteration[6] Validation-accuracy=0.799644 +INFO:root:Iteration[7] Train-accuracy=0.923853 +INFO:root:Iteration[7] Time cost=104.108 +INFO:root:Iteration[7] Validation-accuracy=0.806468 +INFO:root:Iteration[8] Train-accuracy=0.936301 +INFO:root:Iteration[8] Time cost=104.178 +INFO:root:Iteration[8] Validation-accuracy=0.813687 +INFO:root:Iteration[9] Train-accuracy=0.950068 +INFO:root:Iteration[9] Time cost=104.522 +INFO:root:Iteration[9] Validation-accuracy=0.820115 +INFO:root:Accuracy = 0.820100 +``` + +using 3x dual gtx 980 machines, async inception with batch size = 128 and +learning rate = .05 + + +``` +[03:23:29] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/train.rec, use 4 threads for decoding.. +[03:23:31] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/train.rec, use 4 threads for decoding.. +[03:23:29] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +[03:23:31] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +[03:23:30] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/train.rec, use 4 threads for decoding.. +[03:23:30] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +[03:23:29] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/test.rec, use 4 threads for decoding.. +[03:23:31] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/test.rec, use 4 threads for decoding.. +[03:23:29] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +[03:23:31] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +[03:23:30] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/test.rec, use 4 threads for decoding.. 
+[03:23:30] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +INFO:root:Start training with [gpu(0), gpu(1)] +INFO:root:Start training with [gpu(0), gpu(1)] +INFO:root:Start training with [gpu(0), gpu(1)] +INFO:root:Iteration[0] Train-accuracy=0.185276 +INFO:root:Iteration[0] Time cost=21.556 +INFO:root:Iteration[0] Train-accuracy=0.184255 +INFO:root:Iteration[0] Time cost=22.021 +INFO:root:Iteration[0] Train-accuracy=0.183834 +INFO:root:Iteration[0] Time cost=22.342 +INFO:root:Iteration[0] Validation-accuracy=0.225079 +INFO:root:Iteration[0] Validation-accuracy=0.236452 +INFO:root:Iteration[0] Validation-accuracy=0.237836 +INFO:root:Iteration[1] Train-accuracy=0.308624 +INFO:root:Iteration[1] Time cost=21.617 +INFO:root:Iteration[1] Train-accuracy=0.312977 +INFO:root:Iteration[1] Time cost=21.603 +INFO:root:Iteration[1] Train-accuracy=0.309637 +INFO:root:Iteration[1] Time cost=21.917 +INFO:root:Iteration[1] Validation-accuracy=0.333169 +INFO:root:Iteration[1] Validation-accuracy=0.382812 +INFO:root:Iteration[1] Validation-accuracy=0.385186 +INFO:root:Iteration[2] Train-accuracy=0.426885 +INFO:root:Iteration[2] Time cost=21.531 +INFO:root:Iteration[2] Train-accuracy=0.420802 +INFO:root:Iteration[2] Time cost=21.469 +INFO:root:Iteration[2] Train-accuracy=0.436844 +INFO:root:Iteration[2] Time cost=22.053 +INFO:root:Iteration[2] Validation-accuracy=0.487935 +INFO:root:Iteration[2] Validation-accuracy=0.491495 +INFO:root:Iteration[2] Validation-accuracy=0.532832 +INFO:root:Iteration[3] Train-accuracy=0.541209 +INFO:root:Iteration[3] Time cost=21.817 +INFO:root:Iteration[3] Train-accuracy=0.544072 +INFO:root:Iteration[3] Time cost=21.759 +INFO:root:Iteration[3] Train-accuracy=0.546458 +INFO:root:Iteration[3] Time cost=22.156 +INFO:root:Iteration[3] Validation-accuracy=0.589102 +INFO:root:Iteration[3] Validation-accuracy=0.559138 +INFO:root:Iteration[3] Validation-accuracy=0.613528 +INFO:root:Iteration[4] Train-accuracy=0.618500 +INFO:root:Iteration[4] Time cost=21.552 +INFO:root:Iteration[4] Train-accuracy=0.614862 +INFO:root:Iteration[4] Time cost=21.544 +INFO:root:Iteration[4] Train-accuracy=0.619573 +INFO:root:Iteration[4] Time cost=21.890 +INFO:root:Iteration[4] Validation-accuracy=0.630241 +INFO:root:Iteration[4] Validation-accuracy=0.618176 +INFO:root:Iteration[4] Validation-accuracy=0.666930 +INFO:root:Iteration[5] Train-accuracy=0.673843 +INFO:root:Iteration[5] Time cost=21.056 +INFO:root:Iteration[5] Train-accuracy=0.675692 +INFO:root:Iteration[5] Time cost=21.120 +INFO:root:Iteration[5] Train-accuracy=0.678912 +INFO:root:Iteration[5] Time cost=21.721 +INFO:root:Iteration[5] Validation-accuracy=0.657634 +INFO:root:Iteration[5] Validation-accuracy=0.677809 +INFO:root:Iteration[5] Validation-accuracy=0.715882 +INFO:root:Iteration[6] Train-accuracy=0.722149 +INFO:root:Iteration[6] Time cost=20.579 +INFO:root:Iteration[6] Train-accuracy=0.724833 +INFO:root:Iteration[6] Time cost=20.548 +INFO:root:Iteration[6] Train-accuracy=0.720241 +INFO:root:Iteration[6] Time cost=20.772 +INFO:root:Iteration[6] Validation-accuracy=0.692939 +INFO:root:Iteration[6] Validation-accuracy=0.714794 +INFO:root:Iteration[6] Validation-accuracy=0.748220 +INFO:root:Iteration[7] Train-accuracy=0.760854 +INFO:root:Iteration[7] Time cost=20.801 +INFO:root:Iteration[7] Train-accuracy=0.757276 +INFO:root:Iteration[7] Time cost=21.080 +INFO:root:Iteration[7] Validation-accuracy=0.735858 +INFO:root:Iteration[7] Train-accuracy=0.758767 +INFO:root:Iteration[7] Time cost=21.353 
+INFO:root:Iteration[7] Validation-accuracy=0.737638 +INFO:root:Iteration[7] Validation-accuracy=0.774328 +INFO:root:Iteration[8] Train-accuracy=0.794967 +INFO:root:Iteration[8] Time cost=21.593 +INFO:root:Iteration[8] Train-accuracy=0.798485 +INFO:root:Iteration[8] Time cost=21.672 +INFO:root:Iteration[8] Validation-accuracy=0.762460 +INFO:root:Iteration[8] Train-accuracy=0.795503 +INFO:root:Iteration[8] Time cost=22.155 +INFO:root:Iteration[8] Validation-accuracy=0.745748 +INFO:root:Iteration[8] Validation-accuracy=0.784513 +INFO:root:Iteration[9] Train-accuracy=0.825561 +INFO:root:Iteration[9] Time cost=21.644 +INFO:root:Iteration[9] Train-accuracy=0.821923 +INFO:root:Iteration[9] Time cost=21.479 +INFO:root:Iteration[9] Validation-accuracy=0.727453 +INFO:root:Iteration[9] Validation-accuracy=0.745253 +INFO:root:Iteration[9] Train-accuracy=0.819716 +INFO:root:Iteration[9] Time cost=21.927 +INFO:root:Iteration[9] Validation-accuracy=0.781151 +INFO:root:Iteration[10] Train-accuracy=0.842975 +INFO:root:Iteration[10] Time cost=21.431 +INFO:root:Iteration[10] Train-accuracy=0.841543 +INFO:root:Iteration[10] Time cost=21.387 +INFO:root:Iteration[10] Validation-accuracy=0.768196 +INFO:root:Iteration[10] Validation-accuracy=0.781448 +INFO:root:Iteration[10] Train-accuracy=0.843989 +INFO:root:Iteration[10] Time cost=21.875 +INFO:root:Iteration[10] Validation-accuracy=0.804391 +INFO:root:Iteration[11] Train-accuracy=0.860329 +INFO:root:Iteration[11] Time cost=20.664 +INFO:root:Iteration[11] Train-accuracy=0.858958 +INFO:root:Iteration[11] Time cost=20.734 +INFO:root:Iteration[11] Validation-accuracy=0.780063 +INFO:root:Iteration[11] Validation-accuracy=0.774426 +INFO:root:Iteration[11] Train-accuracy=0.861104 +INFO:root:Iteration[11] Time cost=21.449 +INFO:root:Iteration[11] Validation-accuracy=0.818335 +INFO:root:Iteration[12] Train-accuracy=0.885973 +INFO:root:Iteration[12] Time cost=21.037 +INFO:root:Iteration[12] Train-accuracy=0.887583 +INFO:root:Iteration[12] Time cost=21.066 +INFO:root:Iteration[12] Validation-accuracy=0.798358 +INFO:root:Iteration[12] Validation-accuracy=0.803204 +INFO:root:Iteration[12] Train-accuracy=0.885914 +INFO:root:Iteration[12] Time cost=21.738 +INFO:root:Iteration[12] Validation-accuracy=0.812203 +INFO:root:Iteration[13] Train-accuracy=0.904103 +INFO:root:Iteration[13] Time cost=21.326 +INFO:root:Iteration[13] Train-accuracy=0.904282 +INFO:root:Iteration[13] Time cost=21.278 +INFO:root:Iteration[13] Validation-accuracy=0.791238 +INFO:root:Iteration[13] Validation-accuracy=0.799842 +INFO:root:Iteration[13] Train-accuracy=0.901002 +INFO:root:Iteration[13] Time cost=21.408 +INFO:root:Iteration[13] Validation-accuracy=0.802116 +INFO:root:Iteration[14] Train-accuracy=0.911140 +INFO:root:Iteration[14] Time cost=21.527 +INFO:root:Iteration[14] Train-accuracy=0.913705 +INFO:root:Iteration[14] Time cost=21.569 +INFO:root:Iteration[14] Validation-accuracy=0.803204 +INFO:root:Iteration[14] Validation-accuracy=0.803303 +INFO:root:Iteration[14] Train-accuracy=0.914182 +INFO:root:Iteration[14] Time cost=22.170 +INFO:root:Iteration[14] Validation-accuracy=0.771460 +INFO:root:Iteration[15] Train-accuracy=0.915852 +INFO:root:Iteration[15] Time cost=21.608 +INFO:root:Iteration[15] Train-accuracy=0.911975 +INFO:root:Iteration[15] Time cost=21.623 +INFO:root:Iteration[15] Validation-accuracy=0.801325 +INFO:root:Iteration[15] Validation-accuracy=0.798259 +INFO:root:Iteration[15] Train-accuracy=0.923008 +INFO:root:Iteration[15] Time cost=21.806 +INFO:root:Iteration[15] 
Validation-accuracy=0.809335
+INFO:root:Iteration[16] Train-accuracy=0.938096
+INFO:root:Iteration[16] Time cost=21.857
+INFO:root:Iteration[16] Train-accuracy=0.944358
+INFO:root:Iteration[16] Time cost=21.954
+INFO:root:Iteration[16] Validation-accuracy=0.790249
+INFO:root:Iteration[16] Validation-accuracy=0.795095
+INFO:root:Iteration[16] Train-accuracy=0.947877
+INFO:root:Iteration[16] Time cost=21.844
+INFO:root:Iteration[16] Validation-accuracy=0.812797
+INFO:root:Iteration[17] Train-accuracy=0.953006
+INFO:root:Iteration[17] Time cost=21.357
+INFO:root:Iteration[17] Train-accuracy=0.957121
+INFO:root:Iteration[17] Time cost=21.431
+INFO:root:Iteration[17] Validation-accuracy=0.793908
+INFO:root:Iteration[17] Validation-accuracy=0.793216
+INFO:root:Iteration[17] Train-accuracy=0.962846
+INFO:root:Iteration[17] Time cost=21.819
+INFO:root:Iteration[17] Validation-accuracy=0.812994
+INFO:root:Iteration[18] Train-accuracy=0.961772
+INFO:root:Iteration[18] Time cost=20.599
+INFO:root:Iteration[18] Train-accuracy=0.963800
+INFO:root:Iteration[18] Time cost=20.569
+INFO:root:Iteration[18] Validation-accuracy=0.815467
+INFO:root:Iteration[18] Validation-accuracy=0.818829
+INFO:root:Iteration[18] Train-accuracy=0.966603
+INFO:root:Iteration[18] Time cost=21.018
+INFO:root:Iteration[18] Validation-accuracy=0.812698
+INFO:root:Iteration[19] Train-accuracy=0.975131
+INFO:root:Iteration[19] Time cost=20.671
+INFO:root:Iteration[19] Train-accuracy=0.975847
+INFO:root:Iteration[19] Time cost=20.758
+INFO:root:Iteration[19] Validation-accuracy=0.822785
+INFO:root:Iteration[19] Validation-accuracy=0.823378
+INFO:root:Iteration[19] Train-accuracy=0.981990
+INFO:root:Iteration[19] Time cost=20.912
+INFO:root:Accuracy = 0.823800
+INFO:root:Iteration[19] Validation-accuracy=0.828521
+INFO:root:Accuracy = 0.829200
+INFO:root:Accuracy = 0.833000
+```
+
+## imagenet
+
+3 x dual 980, with cudnn, 1G ethernet
+
+`dist_sync`:
+
+```
+INFO:root:Start training with [gpu(0), gpu(1)]
+INFO:root:Start training with [gpu(0), gpu(1)]
+INFO:root:Start training with [gpu(0), gpu(1)]
+INFO:root:Iter[0] Batch [5] Speed: 175.98 samples/sec
+INFO:root:Iter[0] Batch [5] Speed: 173.52 samples/sec
+INFO:root:Iter[0] Batch [5] Speed: 171.04 samples/sec
+INFO:root:Iter[0] Batch [10] Speed: 107.82 samples/sec
+INFO:root:Iter[0] Batch [10] Speed: 108.03 samples/sec
+INFO:root:Iter[0] Batch [10] Speed: 107.79 samples/sec
+INFO:root:Iter[0] Batch [15] Speed: 109.53 samples/sec
+INFO:root:Iter[0] Batch [15] Speed: 109.74 samples/sec
+INFO:root:Iter[0] Batch [15] Speed: 110.21 samples/sec
+INFO:root:Iter[0] Batch [20] Speed: 113.19 samples/sec
+INFO:root:Iter[0] Batch [20] Speed: 111.20 samples/sec
+INFO:root:Iter[0] Batch [20] Speed: 110.38 samples/sec
+INFO:root:Iter[0] Batch [25] Speed: 111.24 samples/sec
+INFO:root:Iter[0] Batch [25] Speed: 109.90 samples/sec
+INFO:root:Iter[0] Batch [25] Speed: 107.48 samples/sec
+```
+
+`dist_async`:
+
+```
+INFO:root:Iter[0] Batch [5] Speed: 202.15 samples/sec
+INFO:root:Iter[0] Batch [5] Speed: 181.41 samples/sec
+INFO:root:Iter[0] Batch [5] Speed: 179.61 samples/sec
+INFO:root:Iter[0] Batch [10] Speed: 125.75 samples/sec
+INFO:root:Iter[0] Batch [10] Speed: 108.90 samples/sec
+INFO:root:Iter[0] Batch [10] Speed: 109.25 samples/sec
+INFO:root:Iter[0] Batch [15] Speed: 118.44 samples/sec
+INFO:root:Iter[0] Batch [15] Speed: 112.89 samples/sec
+INFO:root:Iter[0] Batch [15] Speed: 112.83 samples/sec
+INFO:root:Iter[0] Batch [20] Speed: 123.68 samples/sec
+INFO:root:Iter[0] Batch [20] Speed: 115.85
samples/sec +INFO:root:Iter[0] Batch [20] Speed: 105.82 samples/sec +INFO:root:Iter[0] Batch [25] Speed: 124.24 samples/sec +INFO:root:Iter[0] Batch [25] Speed: 115.21 samples/sec +INFO:root:Iter[0] Batch [25] Speed: 106.60 samples/sec +INFO:root:Iter[0] Batch [30] Speed: 120.62 samples/sec +INFO:root:Iter[0] Batch [30] Speed: 121.35 samples/sec +``` diff --git a/tests/python/multi-node/common.py b/tests/python/multi-node/common.py index 2d33a32c7145..0db092462a78 100644 --- a/tests/python/multi-node/common.py +++ b/tests/python/multi-node/common.py @@ -58,10 +58,11 @@ def cifar10(batch_size, input_shape, num_parts=1, part_index=0): rand_mirror = False, shuffle = False, round_batch = False, - data_shape = (3,28,28), + data_shape = input_shape, batch_size = batch_size) return (train, val) + def accuracy(model, data): """evaluate acc""" # predict diff --git a/tests/python/multi-node/dist_async_inception.py b/tests/python/multi-node/dist_async_inception.py new file mode 100755 index 000000000000..cb7fd656471f --- /dev/null +++ b/tests/python/multi-node/dist_async_inception.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +# pylint: skip-file +import mxnet as mx +import common +import logging + +mx.random.seed(0) +logging.basicConfig(level=logging.DEBUG) + +kv = mx.kvstore.create('dist_async') + +(train, val) = common.cifar10(num_parts = kv.num_workers, + part_index = kv.rank, + batch_size = 128, + input_shape=(3,28,28)) + +# assume each worker has two gpus +devs = [mx.gpu(i) for i in range(2)] +model = mx.model.FeedForward.create( + ctx = devs, + kvstore = kv, + symbol = common.inception(), + X = train, + eval_data = val, + num_round = 20, + learning_rate = 0.05, + momentum = 0.9, + wd = 0.00001, + initializer = mx.init.Uniform(0.07)) + +common.accuracy(model, val) diff --git a/tests/python/multi-node/dist_async_lenet.py b/tests/python/multi-node/dist_async_lenet.py new file mode 100755 index 000000000000..866eed3b8f2a --- /dev/null +++ b/tests/python/multi-node/dist_async_lenet.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +import mxnet as mx +import logging +import common + +mx.random.seed(0) +logging.basicConfig(level=logging.DEBUG) + +kv = mx.kvstore.create('dist_async') + +# feed each machine the whole data +(train, val) = common.mnist(num_parts = kv.num_workers, + part_index = kv.rank, + batch_size = 100, + input_shape = (1,28,28)) + +model = mx.model.FeedForward.create( + ctx = mx.gpu(kv.rank), + kvstore = kv, + symbol = common.lenet(), + X = train, + num_round = 10, + learning_rate = 0.05, + momentum = 0.9, + wd = 0.00001) + +common.accuracy(model, val) diff --git a/tests/python/multi-node/dist_imagenet_inception.py b/tests/python/multi-node/dist_imagenet_inception.py new file mode 100755 index 000000000000..978b821f8fa6 --- /dev/null +++ b/tests/python/multi-node/dist_imagenet_inception.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +import mxnet as mx +import logging +import imagenet + +logging.basicConfig(level=logging.DEBUG) + +kv = mx.kvstore.create('dist_sync') + +batch_size = 96 +(train, val) = imagenet.ilsvrc12(num_parts = kv.num_workers, + part_index = kv.rank, + batch_size = batch_size, + input_shape = (3, 224, 224)) + +# assume each worker has two gpus +devs = [mx.gpu(i) for i in range(2)] + +model = mx.model.FeedForward( + ctx = devs, + symbol = imagenet.inception(1000), + num_round = 20, + learning_rate = 0.05, + momentum = 0.9, + wd = 0.00001) + +model.fit(X = train, + eval_data = val, + kvstore = kv, + epoch_end_callback = mx.callback.Speedometer(batch_size, 5)) diff --git 
a/tests/python/multi-node/imagenet.py b/tests/python/multi-node/imagenet.py new file mode 100644 index 000000000000..7663df8d1bad --- /dev/null +++ b/tests/python/multi-node/imagenet.py @@ -0,0 +1,101 @@ +import sys +sys.path.insert(0, "../common/") +sys.path.insert(0, "../../python/") +import mxnet as mx +import get_data +import numpy as np +import logging + +def ilsvrc12(batch_size, input_shape, num_parts=1, part_index=0): + """return ilsvrc12 iterator + """ + data_dir = "../../../../ilsvrc12/" + train = mx.io.ImageRecordIter( + path_imgrec = data_dir + "train.rec", + mean_img = data_dir + "mean.bin", + data_shape = input_shape, + batch_size = batch_size, + rand_crop = True, + rand_mirror = True, + shuffle = True, + round_batch = True, + num_parts = num_parts, + part_index = part_index) + val = mx.io.ImageRecordIter( + path_imgrec = data_dir + "val.rec", + mean_img = data_dir + "mean.bin", + rand_crop = False, + rand_mirror = False, + shuffle = False, + round_batch = False, + data_shape = input_shape, + batch_size = batch_size) + return (train, val) + +def ConvFactory(data, num_filter, kernel, stride=(1,1), pad=(0, 0), name=None, suffix=''): + conv = mx.symbol.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, name='conv_%s%s' %(name, suffix)) + bn = mx.symbol.BatchNorm(data=conv, name='bn_%s%s' %(name, suffix)) + act = mx.symbol.Activation(data=bn, act_type='relu', name='relu_%s%s' %(name, suffix)) + return act + +def InceptionFactoryA(data, num_1x1, num_3x3red, num_3x3, num_d3x3red, num_d3x3, pool, proj, name): + # 1x1 + c1x1 = ConvFactory(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_1x1' % name)) + # 3x3 reduce + 3x3 + c3x3r = ConvFactory(data=data, num_filter=num_3x3red, kernel=(1, 1), name=('%s_3x3' % name), suffix='_reduce') + c3x3 = ConvFactory(data=c3x3r, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_3x3' % name)) + # double 3x3 reduce + double 3x3 + cd3x3r = ConvFactory(data=data, num_filter=num_d3x3red, kernel=(1, 1), name=('%s_double_3x3' % name), suffix='_reduce') + cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), name=('%s_double_3x3_0' % name)) + cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), name=('%s_double_3x3_1' % name)) + # pool + proj + pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) + cproj = ConvFactory(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_proj' % name)) + # concat + concat = mx.symbol.Concat(*[c1x1, c3x3, cd3x3, cproj], name='ch_concat_%s_chconcat' % name) + return concat + +def InceptionFactoryB(data, num_3x3red, num_3x3, num_d3x3red, num_d3x3, name): + # 3x3 reduce + 3x3 + c3x3r = ConvFactory(data=data, num_filter=num_3x3red, kernel=(1, 1), name=('%s_3x3' % name), suffix='_reduce') + c3x3 = ConvFactory(data=c3x3r, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_3x3' % name)) + # double 3x3 reduce + double 3x3 + cd3x3r = ConvFactory(data=data, num_filter=num_d3x3red, kernel=(1, 1), name=('%s_double_3x3' % name), suffix='_reduce') + cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_double_3x3_0' % name)) + cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_double_3x3_1' % name)) + # pool + proj + pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type="max", name=('max_pool_%s_pool' % 
name))
+    # concat
+    concat = mx.symbol.Concat(*[c3x3, cd3x3, pooling], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def inception(nhidden):
+    # data
+    data = mx.symbol.Variable(name="data")
+    # stage 1
+    conv1 = ConvFactory(data=data, num_filter=64, kernel=(7, 7), stride=(2, 2), pad=(3, 3), name='conv1')
+    pool1 = mx.symbol.Pooling(data=conv1, kernel=(3, 3), stride=(2, 2), name='pool1', pool_type='max')
+    # stage 2
+    conv2red = ConvFactory(data=pool1, num_filter=64, kernel=(1, 1), stride=(1, 1), name='conv2red')
+    conv2 = ConvFactory(data=conv2red, num_filter=192, kernel=(3, 3), stride=(1, 1), pad=(1, 1), name='conv2')
+    pool2 = mx.symbol.Pooling(data=conv2, kernel=(3, 3), stride=(2, 2), name='pool2', pool_type='max')
+    # stage 3
+    in3a = InceptionFactoryA(pool2, 64, 64, 64, 64, 96, "avg", 32, '3a')
+    in3b = InceptionFactoryA(in3a, 64, 64, 96, 64, 96, "avg", 64, '3b')
+    in3c = InceptionFactoryB(in3b, 128, 160, 64, 96, '3c')
+    # stage 4
+    in4a = InceptionFactoryA(in3c, 224, 64, 96, 96, 128, "avg", 128, '4a')
+    in4b = InceptionFactoryA(in4a, 192, 96, 128, 96, 128, "avg", 128, '4b')
+    in4c = InceptionFactoryA(in4b, 160, 128, 160, 128, 160, "avg", 128, '4c')
+    in4d = InceptionFactoryA(in4c, 96, 128, 192, 160, 192, "avg", 128, '4d')
+    in4e = InceptionFactoryB(in4d, 128, 192, 192, 256, '4e')
+    # stage 5
+    in5a = InceptionFactoryA(in4e, 352, 192, 320, 160, 224, "avg", 128, '5a')
+    in5b = InceptionFactoryA(in5a, 352, 192, 320, 192, 224, "max", 128, '5b')
+    # global avg pooling
+    avg = mx.symbol.Pooling(data=in5b, kernel=(7, 7), stride=(1, 1), name="global_pool", pool_type='avg')
+    # linear classifier
+    flatten = mx.symbol.Flatten(data=avg, name='flatten')
+    fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=nhidden, name='fc1')
+    softmax = mx.symbol.Softmax(data=fc1, name='softmax')
+    return softmax
diff --git a/tests/python/multi-node/local_inception.py b/tests/python/multi-node/local_inception.py
index 5c5fad3c4da1..fcaa8dc79688 100755
--- a/tests/python/multi-node/local_inception.py
+++ b/tests/python/multi-node/local_inception.py
@@ -15,8 +15,9 @@ def test_inception(devs, kv_type):
         ctx           = devs,
         symbol        = common.inception(),
         X             = train,
+        eval_data     = val,
         kvstore       = kv_type,
-        num_round     = 4,
+        num_round     = 10,
         learning_rate = 0.1,
         momentum      = 0.9,
         wd            = 0.00001,
From d68fe53f2e4a3a64d34ef58e77b3962509dd803d Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Wed, 21 Oct 2015 02:38:21 -0600
Subject: [PATCH 025/122] Update LSTM

---
 dmlc-core                     |   2 +-
 doc/env_var.md                |   4 +-
 example/rnn/README.md         |   9 +-
 example/rnn/char_lstm.ipynb   | 559 ++++++++++++++++++++++++++++++++++
 example/rnn/lstm.py           | 159 +++++++---
 example/rnn/lstm_ptb.py       |   5 +-
 ps-lite                       |   2 +-
 src/common/utils.h            |   4 +-
 src/operator/block_grad-inl.h |   2 +-
 9 files changed, 693 insertions(+), 53 deletions(-)
 create mode 100644 example/rnn/char_lstm.ipynb

diff --git a/dmlc-core b/dmlc-core
index 046a4a77e74d..c30a1a055644 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit 046a4a77e74d45e5ac16f2a598c31d56d5ccce3d
+Subproject commit c30a1a0556442506b4cfe9a4701c8ca77c6d9a38
diff --git a/doc/env_var.md b/doc/env_var.md
index 16a9ee4ff3bc..d274be269c6e 100644
--- a/doc/env_var.md
+++ b/doc/env_var.md
@@ -3,7 +3,7 @@ Environment Variables
 MXNet have several settings that can be changed via environment variable.
 Usually you do not need to change these settings, but they are listed here for reference.

-* MXNET_GPU_WORKER_NTHREADS (default=1)
+* MXNET_GPU_WORKER_NTHREADS (default=2)
   - Maximum number of threads that do the computation job on each GPU.
 * MXNET_GPU_COPY_NTHREADS (default=1)
   - Maximum number of threads that do memory copy job on each GPU.
@@ -16,7 +16,7 @@ Usually you do not need to change these settings, but they are listed here for r
 * MXNET_EXEC_MATCH_RANGE (default=10)
   - The rough matching scale in symbolic execution memory allocator.
   - Set this to 0 if we do not want to enable memory sharing between graph nodes(for debug purpose).
-* MXNET_EXEC_NUM_TEMP (default=4)
+* MXNET_EXEC_NUM_TEMP (default=1)
   - Maximum number of temp workspace we can allocate to each device.
   - Set this to small number can save GPU memory.
   - It will also likely to decrease level of parallelism, which is usually OK.
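The two defaults changed above matter for the LSTM example later in this patch; to use different values, the variables must be set in the environment before MXNet is loaded. One way to do that from Python (a sketch relying only on standard environment-variable semantics; a shell `export` works equally well):

```python
import os

# Must happen before `import mxnet`, because the engine reads these at startup.
os.environ["MXNET_GPU_WORKER_NTHREADS"] = "2"
os.environ["MXNET_EXEC_NUM_TEMP"] = "1"

import mxnet as mx
```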
diff --git a/example/rnn/README.md b/example/rnn/README.md
index 3955e1809f81..74654c08b88c 100644
--- a/example/rnn/README.md
+++ b/example/rnn/README.md
@@ -1,6 +1,11 @@
 RNN Example
-----
+===========
 This folder contains RNN examples using the low-level symbol interface.

-- [lstm.py](lstm.py) Basic functions for building an LSTM network
+- [lstm.py](lstm.py) Functions for building an LSTM network
 - [lstm_ptb.py](lstm_ptb.py) A PennTreeBank language model using LSTM
+- [char_lstm.ipynb](char_lstm.ipynb) Notebook demonstrating how to train a character-level LSTM using ```lstm.py```
+
+
+Performance note:
+A larger ```MXNET_GPU_WORKER_NTHREADS``` may lead to better performance. For how to set ```MXNET_GPU_WORKER_NTHREADS```, please refer to [Environment Variables](https://mxnet.readthedocs.org/en/latest/env_var.html)
diff --git a/example/rnn/char_lstm.ipynb b/example/rnn/char_lstm.ipynb
new file mode 100644
index 000000000000..4e06efb3e8c6
--- /dev/null
+++ b/example/rnn/char_lstm.ipynb
@@ -0,0 +1,559 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Char LSTM Example.\n",
+    "This example shows how to use an LSTM to build a character-level language model and generate text from it.\n",
+    "We use the Tiny Shakespeare text for demo purposes.\n",
+    "Data can be found at [https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from collections import Counter\n",
+    "from collections import defaultdict\n",
+    "import mxnet as mx\n",
+    "import numpy as np\n",
+    "import sys\n",
+    "sys.path.insert(0, \"../rnn\")\n",
+    "import lstm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Set basic network parameters. "
+   ]
+  },
" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "batch_size = 32\n", + "seq_len = 32\n", + "num_hidden = 256\n", + "num_embed = 256\n", + "num_lstm_layer = 2\n", + "num_round = 21\n", + "learning_rate= 1\n", + "wd=0.00001\n", + "momentum=0.0\n", + "clip_gradient=1\n", + "update_period = 1\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make dictionary from text" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def make_dict(text, max_vocab=10000):\n", + " lst = list(text)\n", + " cnt = Counter(lst)\n", + " print(\"Total unique char: %d\" % len(cnt))\n", + " common = cnt.most_common(max_vocab - 1)\n", + " dic = defaultdict(int)\n", + " idx = 0\n", + " for c, _ in common:\n", + " dic[c] = idx\n", + " idx += 1\n", + " if len(dic) == max_vocab - 1:\n", + " dic[\"_UNKNOWN_\"] = idx\n", + " return dic\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Transfer text into data batch" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def make_batch(file_path, batch_size=32, seq_lenth=32, max_vocab=10000, dic=None):\n", + " fi = open(file_path)\n", + " text = fi.read()\n", + " fi.close()\n", + " if dic == None:\n", + " dic = make_dict(text, max_vocab)\n", + " lookup_table = dict((idx, c) for c, idx in dic.items())\n", + " char_lst = list(text)\n", + " num_batch = int(len(char_lst) / batch_size)\n", + " char_lst = char_lst[:num_batch * batch_size]\n", + " data = np.zeros((num_batch, batch_size), dtype=\"float32\")\n", + " idx = 0\n", + " for j in range(batch_size):\n", + " for i in range(num_batch):\n", + " if char_lst[idx] in dic:\n", + " data[i][j] = dic[char_lst[idx]]\n", + " else:\n", + " char_lst[idx] = dic[\"_UNKNOWN_\"]\n", + " idx += 1\n", + " return data, dic, lookup_table\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total unique char: 65\n" + ] + } + ], + "source": [ + "X, dic, lookup_table = make_batch(\"./input.txt\", batch_size=batch_size, seq_lenth=seq_len)\n", + "vocab = len(dic)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Move tail text" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def drop_tail(X, seq_len):\n", + " shape = X.shape\n", + " nstep = int(shape[0] / seq_len)\n", + " return X[0:(nstep * seq_len), :]\n", + "\n", + "train_val_fraction = 0.9\n", + "size = X.shape[0]\n", + "X_train = X[:int(size * train_val_fraction), :]\n", + "X_val = X[int(size * train_val_fraction):, :]\n", + "\n", + "X_train = drop_tail(X_train, seq_len)\n", + "X_val = drop_tail(X_val, seq_len)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set up LSTM model on GPU" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "model = lstm.setup_rnn_model(mx.gpu(),\n", + " num_lstm_layer=num_lstm_layer,\n", + " seq_len=seq_len,\n", + " num_hidden=num_hidden,\n", + " num_embed=num_embed,\n", + " num_label=vocab,\n", + " batch_size=batch_size,\n", + " input_size=vocab,\n", + " initializer=mx.initializer.Uniform(0.1),\n", + " 
dropout=0.5)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train LSTM model" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training swith train.shape=(31360, 32)\n", + "Training swith val.shape=(3456, 32)\n", + "batch_size=32\n", + "seq_len=32\n", + "Epoch [125] Train: NLL=3.368, Prep=29.019\n", + "Epoch [250] Train: NLL=3.289, Prep=26.811\n", + "Epoch [375] Train: NLL=3.180, Prep=24.044\n", + "Epoch [500] Train: NLL=3.070, Prep=21.534\n", + "Epoch [625] Train: NLL=2.971, Prep=19.503\n", + "Epoch [750] Train: NLL=2.891, Prep=18.011\n", + "Epoch [875] Train: NLL=2.824, Prep=16.846\n", + "Iter [0] Train: Time: 40.182 sec, NLL=2.775, Prep=16.041\n", + "Iter [0] Val: NLL=2.288, Prep=9.857\n", + "Epoch [1000] Train: NLL=2.347, Prep=10.451\n", + "Epoch [1125] Train: NLL=2.321, Prep=10.188\n", + "Epoch [1250] Train: NLL=2.298, Prep=9.951\n", + "Epoch [1375] Train: NLL=2.276, Prep=9.741\n", + "Epoch [1500] Train: NLL=2.256, Prep=9.541\n", + "Epoch [1625] Train: NLL=2.234, Prep=9.338\n", + "Epoch [1750] Train: NLL=2.215, Prep=9.160\n", + "Epoch [1875] Train: NLL=2.196, Prep=8.987\n", + "Iter [1] Train: Time: 40.342 sec, NLL=2.184, Prep=8.885\n", + "Iter [1] Val: NLL=1.988, Prep=7.298\n", + "Epoch [2000] Train: NLL=2.050, Prep=7.766\n", + "Epoch [2125] Train: NLL=2.032, Prep=7.631\n", + "Epoch [2250] Train: NLL=2.014, Prep=7.490\n", + "Epoch [2375] Train: NLL=2.002, Prep=7.405\n", + "Epoch [2500] Train: NLL=1.988, Prep=7.297\n", + "Epoch [2625] Train: NLL=1.974, Prep=7.200\n", + "Epoch [2750] Train: NLL=1.961, Prep=7.106\n", + "Epoch [2875] Train: NLL=1.949, Prep=7.024\n", + "Iter [2] Train: Time: 40.377 sec, NLL=1.943, Prep=6.981\n", + "Iter [2] Val: NLL=1.808, Prep=6.101\n", + "Reset learning rate to 0.9\n", + "Epoch [3000] Train: NLL=1.850, Prep=6.359\n", + "Epoch [3125] Train: NLL=1.844, Prep=6.323\n", + "Epoch [3250] Train: NLL=1.831, Prep=6.238\n", + "Epoch [3375] Train: NLL=1.822, Prep=6.185\n", + "Epoch [3500] Train: NLL=1.812, Prep=6.124\n", + "Epoch [3625] Train: NLL=1.805, Prep=6.077\n", + "Epoch [3750] Train: NLL=1.797, Prep=6.033\n", + "Epoch [3875] Train: NLL=1.790, Prep=5.990\n", + "Iter [3] Train: Time: 40.348 sec, NLL=1.787, Prep=5.973\n", + "Iter [3] Val: NLL=1.695, Prep=5.446\n", + "Epoch [4000] Train: NLL=1.736, Prep=5.676\n", + "Epoch [4125] Train: NLL=1.734, Prep=5.663\n", + "Epoch [4250] Train: NLL=1.722, Prep=5.595\n", + "Epoch [4375] Train: NLL=1.715, Prep=5.555\n", + "Epoch [4500] Train: NLL=1.707, Prep=5.514\n", + "Epoch [4625] Train: NLL=1.703, Prep=5.492\n", + "Epoch [4750] Train: NLL=1.697, Prep=5.459\n", + "Epoch [4875] Train: NLL=1.693, Prep=5.434\n", + "Iter [4] Train: Time: 40.372 sec, NLL=1.691, Prep=5.427\n", + "Iter [4] Val: NLL=1.617, Prep=5.039\n", + "Epoch [5000] Train: NLL=1.659, Prep=5.257\n", + "Epoch [5125] Train: NLL=1.653, Prep=5.221\n", + "Epoch [5250] Train: NLL=1.645, Prep=5.179\n", + "Epoch [5375] Train: NLL=1.638, Prep=5.143\n", + "Epoch [5500] Train: NLL=1.633, Prep=5.119\n", + "Epoch [5625] Train: NLL=1.629, Prep=5.101\n", + "Epoch [5750] Train: NLL=1.625, Prep=5.079\n", + "Epoch [5875] Train: NLL=1.621, Prep=5.059\n", + "Iter [5] Train: Time: 40.363 sec, NLL=1.621, Prep=5.059\n", + "Iter [5] Val: NLL=1.569, Prep=4.804\n", + "Reset learning rate to 0.81\n", + "Epoch [6000] Train: NLL=1.603, Prep=4.966\n", + "Epoch [6125] Train: NLL=1.588, Prep=4.895\n", + "Epoch [6250] Train: 
NLL=1.585, Prep=4.879\n", + "Epoch [6375] Train: NLL=1.579, Prep=4.852\n", + "Epoch [6500] Train: NLL=1.574, Prep=4.827\n", + "Epoch [6625] Train: NLL=1.571, Prep=4.812\n", + "Epoch [6750] Train: NLL=1.567, Prep=4.793\n", + "Iter [6] Train: Time: 40.353 sec, NLL=1.565, Prep=4.781\n", + "Iter [6] Val: NLL=1.529, Prep=4.615\n", + "Epoch [6875] Train: NLL=1.574, Prep=4.824\n", + "Epoch [7000] Train: NLL=1.560, Prep=4.760\n", + "Epoch [7125] Train: NLL=1.545, Prep=4.686\n", + "Epoch [7250] Train: NLL=1.544, Prep=4.684\n", + "Epoch [7375] Train: NLL=1.538, Prep=4.654\n", + "Epoch [7500] Train: NLL=1.534, Prep=4.635\n", + "Epoch [7625] Train: NLL=1.530, Prep=4.620\n", + "Epoch [7750] Train: NLL=1.528, Prep=4.607\n", + "Iter [7] Train: Time: 40.353 sec, NLL=1.526, Prep=4.598\n", + "Iter [7] Val: NLL=1.496, Prep=4.463\n", + "Epoch [7875] Train: NLL=1.530, Prep=4.619\n", + "Epoch [8000] Train: NLL=1.522, Prep=4.579\n", + "Epoch [8125] Train: NLL=1.511, Prep=4.533\n", + "Epoch [8250] Train: NLL=1.511, Prep=4.532\n", + "Epoch [8375] Train: NLL=1.506, Prep=4.508\n", + "Epoch [8500] Train: NLL=1.503, Prep=4.494\n", + "Epoch [8625] Train: NLL=1.499, Prep=4.479\n", + "Epoch [8750] Train: NLL=1.497, Prep=4.467\n", + "Iter [8] Train: Time: 40.371 sec, NLL=1.495, Prep=4.461\n", + "Iter [8] Val: NLL=1.481, Prep=4.396\n", + "Reset learning rate to 0.729\n", + "Epoch [8875] Train: NLL=1.478, Prep=4.384\n", + "Epoch [9000] Train: NLL=1.489, Prep=4.434\n", + "Epoch [9125] Train: NLL=1.482, Prep=4.400\n", + "Epoch [9250] Train: NLL=1.480, Prep=4.391\n", + "Epoch [9375] Train: NLL=1.474, Prep=4.368\n", + "Epoch [9500] Train: NLL=1.471, Prep=4.355\n", + "Epoch [9625] Train: NLL=1.469, Prep=4.343\n", + "Epoch [9750] Train: NLL=1.466, Prep=4.333\n", + "Iter [9] Train: Time: 40.344 sec, NLL=1.465, Prep=4.329\n", + "Iter [9] Val: NLL=1.453, Prep=4.278\n", + "Epoch [9875] Train: NLL=1.458, Prep=4.297\n", + "Epoch [10000] Train: NLL=1.466, Prep=4.331\n", + "Epoch [10125] Train: NLL=1.460, Prep=4.305\n", + "Epoch [10250] Train: NLL=1.456, Prep=4.289\n", + "Epoch [10375] Train: NLL=1.452, Prep=4.270\n", + "Epoch [10500] Train: NLL=1.449, Prep=4.260\n", + "Epoch [10625] Train: NLL=1.447, Prep=4.248\n", + "Epoch [10750] Train: NLL=1.445, Prep=4.242\n", + "Iter [10] Train: Time: 40.341 sec, NLL=1.444, Prep=4.240\n", + "Iter [10] Val: NLL=1.438, Prep=4.211\n", + "Epoch [10875] Train: NLL=1.447, Prep=4.250\n", + "Epoch [11000] Train: NLL=1.445, Prep=4.243\n", + "Epoch [11125] Train: NLL=1.440, Prep=4.222\n", + "Epoch [11250] Train: NLL=1.436, Prep=4.205\n", + "Epoch [11375] Train: NLL=1.434, Prep=4.196\n", + "Epoch [11500] Train: NLL=1.432, Prep=4.185\n", + "Epoch [11625] Train: NLL=1.429, Prep=4.175\n", + "Epoch [11750] Train: NLL=1.428, Prep=4.169\n", + "Iter [11] Train: Time: 40.352 sec, NLL=1.427, Prep=4.168\n", + "Iter [11] Val: NLL=1.429, Prep=4.174\n", + "Reset learning rate to 0.6561\n", + "Epoch [11875] Train: NLL=1.431, Prep=4.182\n", + "Epoch [12000] Train: NLL=1.424, Prep=4.154\n", + "Epoch [12125] Train: NLL=1.422, Prep=4.145\n", + "Epoch [12250] Train: NLL=1.418, Prep=4.127\n", + "Epoch [12375] Train: NLL=1.414, Prep=4.113\n", + "Epoch [12500] Train: NLL=1.412, Prep=4.105\n", + "Epoch [12625] Train: NLL=1.410, Prep=4.096\n", + "Iter [12] Train: Time: 40.357 sec, NLL=1.409, Prep=4.091\n", + "Iter [12] Val: NLL=1.417, Prep=4.124\n", + "Epoch [12750] Train: NLL=1.435, Prep=4.201\n", + "Epoch [12875] Train: NLL=1.417, Prep=4.123\n", + "Epoch [13000] Train: NLL=1.408, Prep=4.086\n", + "Epoch [13125] Train: NLL=1.409, 
Prep=4.091\n", + "Epoch [13250] Train: NLL=1.404, Prep=4.073\n", + "Epoch [13375] Train: NLL=1.401, Prep=4.058\n", + "Epoch [13500] Train: NLL=1.398, Prep=4.048\n", + "Epoch [13625] Train: NLL=1.397, Prep=4.041\n", + "Iter [13] Train: Time: 40.356 sec, NLL=1.396, Prep=4.038\n", + "Iter [13] Val: NLL=1.411, Prep=4.102\n", + "Epoch [13750] Train: NLL=1.414, Prep=4.114\n", + "Epoch [13875] Train: NLL=1.402, Prep=4.063\n", + "Epoch [14000] Train: NLL=1.395, Prep=4.036\n", + "Epoch [14125] Train: NLL=1.396, Prep=4.037\n", + "Epoch [14250] Train: NLL=1.392, Prep=4.023\n", + "Epoch [14375] Train: NLL=1.389, Prep=4.010\n", + "Epoch [14500] Train: NLL=1.386, Prep=4.000\n", + "Epoch [14625] Train: NLL=1.385, Prep=3.995\n", + "Iter [14] Train: Time: 40.344 sec, NLL=1.384, Prep=3.992\n", + "Iter [14] Val: NLL=1.400, Prep=4.055\n", + "Reset learning rate to 0.59049\n", + "Epoch [14750] Train: NLL=1.378, Prep=3.966\n", + "Epoch [14875] Train: NLL=1.390, Prep=4.014\n", + "Epoch [15000] Train: NLL=1.383, Prep=3.986\n", + "Epoch [15125] Train: NLL=1.382, Prep=3.982\n", + "Epoch [15250] Train: NLL=1.377, Prep=3.965\n", + "Epoch [15375] Train: NLL=1.375, Prep=3.957\n", + "Epoch [15500] Train: NLL=1.372, Prep=3.945\n", + "Epoch [15625] Train: NLL=1.371, Prep=3.938\n", + "Iter [15] Train: Time: 40.352 sec, NLL=1.370, Prep=3.936\n", + "Iter [15] Val: NLL=1.393, Prep=4.026\n", + "Epoch [15750] Train: NLL=1.368, Prep=3.927\n", + "Epoch [15875] Train: NLL=1.380, Prep=3.974\n", + "Epoch [16000] Train: NLL=1.374, Prep=3.951\n", + "Epoch [16125] Train: NLL=1.371, Prep=3.940\n", + "Epoch [16250] Train: NLL=1.367, Prep=3.922\n", + "Epoch [16375] Train: NLL=1.364, Prep=3.912\n", + "Epoch [16500] Train: NLL=1.362, Prep=3.905\n", + "Epoch [16625] Train: NLL=1.361, Prep=3.900\n", + "Iter [16] Train: Time: 40.358 sec, NLL=1.360, Prep=3.898\n", + "Iter [16] Val: NLL=1.389, Prep=4.012\n", + "Epoch [16750] Train: NLL=1.367, Prep=3.924\n", + "Epoch [16875] Train: NLL=1.367, Prep=3.923\n", + "Epoch [17000] Train: NLL=1.363, Prep=3.907\n", + "Epoch [17125] Train: NLL=1.360, Prep=3.895\n", + "Epoch [17250] Train: NLL=1.357, Prep=3.886\n", + "Epoch [17375] Train: NLL=1.355, Prep=3.878\n", + "Epoch [17500] Train: NLL=1.353, Prep=3.867\n", + "Epoch [17625] Train: NLL=1.352, Prep=3.864\n", + "Iter [17] Train: Time: 40.347 sec, NLL=1.352, Prep=3.864\n", + "Iter [17] Val: NLL=1.384, Prep=3.990\n", + "Reset learning rate to 0.531441\n", + "Epoch [17750] Train: NLL=1.362, Prep=3.903\n", + "Epoch [17875] Train: NLL=1.355, Prep=3.877\n", + "Epoch [18000] Train: NLL=1.353, Prep=3.870\n", + "Epoch [18125] Train: NLL=1.348, Prep=3.851\n", + "Epoch [18250] Train: NLL=1.346, Prep=3.843\n", + "Epoch [18375] Train: NLL=1.344, Prep=3.834\n", + "Epoch [18500] Train: NLL=1.342, Prep=3.827\n", + "Iter [18] Train: Time: 40.354 sec, NLL=1.341, Prep=3.823\n", + "Iter [18] Val: NLL=1.378, Prep=3.967\n", + "Epoch [18625] Train: NLL=1.370, Prep=3.935\n", + "Epoch [18750] Train: NLL=1.352, Prep=3.863\n", + "Epoch [18875] Train: NLL=1.345, Prep=3.838\n", + "Epoch [19000] Train: NLL=1.346, Prep=3.841\n", + "Epoch [19125] Train: NLL=1.341, Prep=3.823\n", + "Epoch [19250] Train: NLL=1.338, Prep=3.811\n", + "Epoch [19375] Train: NLL=1.336, Prep=3.803\n", + "Epoch [19500] Train: NLL=1.334, Prep=3.797\n", + "Iter [19] Train: Time: 40.370 sec, NLL=1.334, Prep=3.795\n", + "Iter [19] Val: NLL=1.377, Prep=3.961\n", + "Epoch [19625] Train: NLL=1.354, Prep=3.874\n", + "Epoch [19750] Train: NLL=1.344, Prep=3.836\n", + "Epoch [19875] Train: NLL=1.338, Prep=3.811\n", + 
"Epoch [20000] Train: NLL=1.338, Prep=3.813\n", + "Epoch [20125] Train: NLL=1.334, Prep=3.797\n", + "Epoch [20250] Train: NLL=1.331, Prep=3.786\n", + "Epoch [20375] Train: NLL=1.329, Prep=3.778\n", + "Epoch [20500] Train: NLL=1.328, Prep=3.774\n", + "Iter [20] Train: Time: 40.363 sec, NLL=1.327, Prep=3.771\n", + "Iter [20] Val: NLL=1.373, Prep=3.946\n", + "Reset learning rate to 0.478297\n" + ] + } + ], + "source": [ + "lstm.train_lstm(model, X_train, X_val,\n", + " num_round=num_round,\n", + " half_life=3,\n", + " update_period=update_period,\n", + " learning_rate=learning_rate,\n", + " wd=wd,\n", + " momentum=momentum,\n", + " clip_gradient=clip_gradient)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get parameter from model" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "args = dict([(name, arr) for i, arr, grad_arr, name in model.param_blocks])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make a sampler use the parameter we trained" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ] + } + ], + "source": [ + "batch_size = 1\n", + "sampler = lstm.setup_rnn_sample_model(mx.cpu(), args, num_lstm_layer, num_hidden, num_embed, vocab, batch_size, vocab)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "start = 'a'\n", + "seq_len = 75\n", + "X_input_batch = np.zeros((1,1), dtype=\"float32\")\n", + "X_input_batch[0][0] = dic[start]\n", + "out = lstm.sample_lstm(sampler, X_input_batch, seq_len)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lookup predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "an'd and dear victories at sound before.\n", + "Sir! 
palient, made me; let it kiss \n" + ] + } + ], + "source": [ + "chars = [lookup_table[int(out[i][0])] for i in range(seq_len)]\n", + "print(start + \"\".join(chars))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/example/rnn/lstm.py b/example/rnn/lstm.py index a33d627db9db..996861a80894 100644 --- a/example/rnn/lstm.py +++ b/example/rnn/lstm.py @@ -114,10 +114,8 @@ def setup_rnn_model(ctx, seq_len=seq_len, num_embed=num_embed, num_label=num_label, - dropout) - print(rnn_sym.list_outputs()) + dropout=dropout) arg_names = rnn_sym.list_arguments() - print sorted(arg_names) input_shapes = {} for name in arg_names: @@ -126,7 +124,7 @@ def setup_rnn_model(ctx, elif name.endswith("data"): input_shapes[name] = (batch_size, input_size) else: - print("ignore %s " % name) + pass arg_shape, out_shape, aux_shape = rnn_sym.infer_shape(**input_shapes) arg_arrays = [mx.nd.zeros(s, ctx) for s in arg_shape] @@ -134,8 +132,6 @@ def setup_rnn_model(ctx, for shape, name in zip(arg_shape, arg_names): if is_param_name(name): args_grad[name] = mx.nd.zeros(shape, ctx) - # else: - # print("Do not need gradient for %s" % name) rnn_exec = rnn_sym.bind(ctx=ctx, args=arg_arrays, args_grad=args_grad, @@ -165,36 +161,27 @@ def setup_rnn_model(ctx, param_blocks=param_blocks) -def set_onehot_input(onehot, xidx): - """setup onehot input""" - onehot[:] = 0. - onehot[np.arange(onehot.shape[0]), xidx.astype("int32")] = 1. -def logloss(y, prob): - eps = 1e-10 - assert prob.shape[0] == len(y) - py = prob[np.arange(len(y)), y.astype("int32")] - return -np.sum(np.log(np.maximum(py, eps))) / len(y) - -def set_rnn_inputs(m, X, onehot, begin): +def set_rnn_inputs(m, X, begin): seq_len = len(m.seq_data) - batch_size, vocab = onehot.shape + batch_size, vocab = m.seq_data[0].shape for seqidx in range(seq_len): idx = (begin + seqidx) % X.shape[0] next_idx = (begin + seqidx + 1) % X.shape[0] x = X[idx, :] y = X[next_idx, :] - onehot[:] = 0. - onehot[np.arange(batch_size), x.astype("int32")] = 1. - m.seq_data[seqidx][:] = onehot + mx.nd.onehot_encode(mx.nd.array(x, ctx=m.seq_data[seqidx].context), + out=m.seq_data[seqidx]) m.seq_labels[seqidx][:] = y -def calc_nll(seq_out, X, begin): +def calc_nll(seq_label_probs, X, begin): + eps = 1e-10 nll = 0. 
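The hunk above swaps the dense `logloss` helper for `mx.nd.choose_element`, which picks out, for each row of the softmax output, the probability assigned to that row's label, so `calc_nll` only has to average the negative logs. A NumPy sketch of the same computation, assuming a `(batch, vocab)` probability matrix (the toy numbers are illustrative only):

```python
import numpy as np

def nll_from_probs(prob, label, eps=1e-10):
    # prob: (batch, vocab) softmax outputs; label: (batch,) integer targets.
    # Equivalent to mx.nd.choose_element followed by calc_nll's -log average.
    py = prob[np.arange(prob.shape[0]), label.astype("int32")]
    return -np.sum(np.log(np.maximum(py, eps))) / len(label)

prob = np.array([[0.7, 0.2, 0.1],
                 [0.1, 0.8, 0.1]])
print(nll_from_probs(prob, np.array([0, 1])))  # ~0.290
```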
- for seqidx in range(len(seq_out)): + for seqidx in range(len(seq_label_probs)): next_idx = (begin + seqidx + 1) % X.shape[0] y = X[next_idx, :] - nll += logloss(y, seq_out[seqidx].asnumpy()) + py = seq_label_probs[seqidx].asnumpy() + nll += -np.sum(np.log(np.maximum(py, eps))) / len(y) return nll def train_lstm(model, X_train_batch, X_val_batch, @@ -203,7 +190,6 @@ def train_lstm(model, X_train_batch, X_val_batch, print("Training swith train.shape=%s" % str(X_train_batch.shape)) print("Training swith val.shape=%s" % str(X_val_batch.shape)) m = model - onehot = np.zeros(m.seq_data[0].shape, dtype='float32') seq_len = len(m.seq_data) batch_size = m.seq_data[0].shape[0] print("batch_size=%d" % batch_size) @@ -214,7 +200,6 @@ def train_lstm(model, X_train_batch, X_val_batch, **kwargs) updater = mx.optimizer.get_updater(opt) epoch_counter = 0 - watch_weight = False log_period = max(1000 / seq_len, 1) for iteration in range(num_round): @@ -228,9 +213,11 @@ def train_lstm(model, X_train_batch, X_val_batch, assert X_train_batch.shape[0] % seq_len == 0 assert X_val_batch.shape[0] % seq_len == 0 for begin in range(0, X_train_batch.shape[0], seq_len): - set_rnn_inputs(m, X_train_batch, onehot, begin=begin) + set_rnn_inputs(m, X_train_batch, begin=begin) m.rnn_exec.forward(is_train=True) - seq_outs = [out.copyto(mx.cpu()) for out in m.seq_outputs] + # probability of each label class, used to evaluate nll + seq_label_probs = [mx.nd.choose_element(out, label).copyto(mx.cpu()) + for out, label in zip(m.seq_outputs, m.seq_labels)] m.rnn_exec.backward() # transfer the states for init, last in zip(m.init_states, m.last_states): @@ -239,22 +226,12 @@ def train_lstm(model, X_train_batch, X_val_batch, # update epoch counter epoch_counter += 1 if epoch_counter % update_period == 0: - # TODO add gradient clip here # updare parameters for idx, weight, grad, name in m.param_blocks: - if epoch_counter % log_period == 0 and watch_weight: - dw = grad.asnumpy() - w = weight.asnumpy() - dwnorm = np.linalg.norm(dw, 2) * rescale_grad - wnorm = np.linalg.norm(w, 2) - print("dw:norm(%s): %.3f" % (name, dwnorm)) - print("w:norm(%s): %.3f" % (name, wnorm)) - if name == "cls_bias": - print len(dw[dw<0]) updater(idx, grad, weight) # reset gradient to zero grad[:] = 0.0 - train_nll += calc_nll(seq_outs, X_train_batch, begin=begin) + train_nll += calc_nll(seq_label_probs, X_train_batch, begin=begin) nbatch = begin + seq_len if epoch_counter % log_period == 0: @@ -271,17 +248,115 @@ def train_lstm(model, X_train_batch, X_val_batch, state.c[:] = 0.0 state.h[:] = 0.0 for begin in range(0, X_val_batch.shape[0], seq_len): - set_rnn_inputs(m, X_val_batch, onehot, begin=begin) + set_rnn_inputs(m, X_val_batch, begin=begin) m.rnn_exec.forward(is_train=False) - seq_outs = [out.copyto(mx.cpu()) for out in m.seq_outputs] + # probability of each label class, used to evaluate nll + seq_label_probs = [mx.nd.choose_element(out, label).copyto(mx.cpu()) + for out, label in zip(m.seq_outputs, m.seq_labels)] # transfer the states for init, last in zip(m.init_states, m.last_states): last.c.copyto(init.c) last.h.copyto(init.h) - val_nll += calc_nll(seq_outs, X_val_batch, begin=begin) + val_nll += calc_nll(seq_label_probs, X_val_batch, begin=begin) nbatch = X_val_batch.shape[0] print("Iter [%d] Val: NLL=%.3f, Prep=%.3f" % ( iteration, val_nll / nbatch, np.exp(val_nll / nbatch))) if (iteration + 1) % half_life == 0: opt.lr *= 0.9 print("Reset learning rate to %g" % opt.lr) + +def setup_rnn_sample_model(ctx, + params, + num_lstm_layer, + num_hidden, 
num_embed, num_label, + batch_size, input_size): + seq_len = 1 + rnn_sym = lstm_unroll(num_lstm_layer=num_lstm_layer, + num_hidden=num_hidden, + seq_len=seq_len, + num_embed=num_embed, + num_label=num_label) + arg_names = rnn_sym.list_arguments() + input_shapes = {} + for name in arg_names: + if name.endswith("init_c") or name.endswith("init_h"): + input_shapes[name] = (batch_size, num_hidden) + elif name.endswith("data"): + input_shapes[name] = (batch_size, input_size) + else: + pass + arg_shape, out_shape, aux_shape = rnn_sym.infer_shape(**input_shapes) + arg_arrays = [mx.nd.zeros(s, ctx) for s in arg_shape] + arg_dict = dict(zip(arg_names, arg_arrays)) + for name, arr in params.items(): + arg_dict[name][:] = arr + rnn_exec = rnn_sym.bind(ctx=ctx, args=arg_arrays, args_grad=None, grad_req="null") + out_dict = dict(zip(rnn_sym.list_outputs(), rnn_exec.outputs)) + param_blocks = [] + params_array = list(params.items()) + for i in range(len(params)): + param_blocks.append((i, params_array[i][1], None, params_array[i][0])) + init_states = [LSTMState(c=arg_dict["l%d_init_c" % i], + h=arg_dict["l%d_init_h" % i]) for i in range(num_lstm_layer)] + seq_labels = [rnn_exec.arg_dict["t%d_label" % i] for i in range(seq_len)] + seq_data = [rnn_exec.arg_dict["t%d_data" % i] for i in range(seq_len)] + last_states = [LSTMState(c=out_dict["l%d_last_c_output" % i], + h=out_dict["l%d_last_h_output" % i]) for i in range(num_lstm_layer)] + seq_outputs = [out_dict["t%d_sm_output" % i] for i in range(seq_len)] + + return LSTMModel(rnn_exec=rnn_exec, symbol=rnn_sym, + init_states=init_states, last_states=last_states, + seq_data=seq_data, seq_labels=seq_labels, seq_outputs=seq_outputs, + param_blocks=param_blocks) + +# Python3 np.random.choice is too strict in eval float probability so we use an alternative +import random +import bisect +import collections + +def _cdf(weights): + total = sum(weights) + result = [] + cumsum = 0 + for w in weights: + cumsum += w + result.append(cumsum / total) + return result + +def _choice(population, weights): + assert len(population) == len(weights) + cdf_vals = _cdf(weights) + x = random.random() + idx = bisect.bisect(cdf_vals, x) + return population[idx] + +def sample_lstm(model, X_input_batch, seq_len, temperature=1., sample=True): + m = model + vocab = m.seq_outputs[0].shape[1] + batch_size = m.seq_data[0].shape[0] + outputs_ndarray = mx.nd.zeros(m.seq_outputs[0].shape) + outputs_batch = [] + tmp = [i for i in range(vocab)] + for i in range(seq_len): + outputs_batch.append(np.zeros(X_input_batch.shape)) + for i in range(seq_len): + set_rnn_inputs(m, X_input_batch, 0) + m.rnn_exec.forward(is_train=False) + outputs_ndarray[:] = m.seq_outputs[0] + for init, last in zip(m.init_states, m.last_states): + last.c.copyto(init.c) + last.h.copyto(init.h) + prob = np.clip(outputs_ndarray.asnumpy(), 1e-6, 1 - 1e-6) + if sample: + rescale = np.exp(np.log(prob) / temperature) + for j in range(batch_size): + p = rescale[j, :] + p[:] /= p.sum() + outputs_batch[i][j] = _choice(tmp, p) + # outputs_batch[i][j] = np.random.choice(vocab, 1, p) + else: + outputs_batch[i][:] = np.argmax(prob, axis=1) + X_input_batch[:] = outputs_batch[i] + return outputs_batch + + diff --git a/example/rnn/lstm_ptb.py b/example/rnn/lstm_ptb.py index b1637fb20ed1..b01a88aa5063 100644 --- a/example/rnn/lstm_ptb.py +++ b/example/rnn/lstm_ptb.py @@ -10,7 +10,7 @@ We would like to thanks Wojciech Zaremba for his Torch LSTM code The data file can be found at: -https://github.com/wojzaremba/lstm/tree/master/data 
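The `_cdf`/`_choice` pair above implements a plain inverse-CDF draw, and `sample_lstm` first rescales the softmax output by `exp(log(p) / temperature)` before drawing. A small sketch of what that rescaling does, with illustrative numbers only:

```python
import numpy as np

prob = np.array([0.5, 0.4, 0.1])
for temperature in (1.0, 0.5, 2.0):
    p = np.exp(np.log(prob) / temperature)
    p /= p.sum()
    print(temperature, p.round(3))
# T=1 reproduces the model's distribution; T<1 sharpens it toward the
# arg-max symbol; T>1 flattens it toward uniform, giving more varied samples.
```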
+https://github.com/dmlc/web-data/tree/master/mxnet/ptb """ def load_data(path, dic=None): @@ -36,7 +36,8 @@ def load_data(path, dic=None): def drop_tail(X, seq_len): shape = X.shape - return X[0 : shape[0]/seq_len *seq_len, :] + nstep = int(shape[0] / seq_len) + return X[0:(nstep * seq_len), :] def replicate_data(x, batch_size): diff --git a/ps-lite b/ps-lite index 504faa73a826..7121aa1bdb67 160000 --- a/ps-lite +++ b/ps-lite @@ -1 +1 @@ -Subproject commit 504faa73a82638c4b2fe66f5696330da38637c96 +Subproject commit 7121aa1bdb673f047c7600eb4347fd2911021710 diff --git a/src/common/utils.h b/src/common/utils.h index fbaf5f4fdb55..574fc242ebd8 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -25,14 +25,14 @@ namespace common { // heuristic to dermine number of threads per GPU inline int GetNumThreadPerGPU() { // This is resource efficient option. - return dmlc::GetEnv("MXNET_GPU_WORKER_NTHREADS", 1); + return dmlc::GetEnv("MXNET_GPU_WORKER_NTHREADS", 2); } // heuristic to get number of matching colors. // this decides how much parallelism we can get in each GPU. inline int GetExecNumMatchColor() { // This is resource efficient option. - int num_match_color = dmlc::GetEnv("MXNET_EXEC_NUM_TEMP", 4); + int num_match_color = dmlc::GetEnv("MXNET_EXEC_NUM_TEMP", 1); return std::min(num_match_color, GetNumThreadPerGPU()); } diff --git a/src/operator/block_grad-inl.h b/src/operator/block_grad-inl.h index f246f2886cb9..0b34d691b244 100644 --- a/src/operator/block_grad-inl.h +++ b/src/operator/block_grad-inl.h @@ -64,7 +64,7 @@ class BlockGradientProp : public OperatorProperty { void Init(const std::vector >& kwargs) override {} std::map GetParams() const override { - return std::map(); + return std::map(); } bool InferShape(std::vector *in_shape, From d96fd9e57f3799482b2fbb1edf147b72ceb77a71 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Wed, 21 Oct 2015 14:27:41 -0600 Subject: [PATCH 026/122] Update char_lstm.ipynb --- example/rnn/char_lstm.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/rnn/char_lstm.ipynb b/example/rnn/char_lstm.ipynb index 4e06efb3e8c6..cac585d05a92 100644 --- a/example/rnn/char_lstm.ipynb +++ b/example/rnn/char_lstm.ipynb @@ -7,6 +7,7 @@ "# Char LSTM Example.\n", "This example aims to show how to use lstm to build a char level language model, and generate text from it. \n", "We use a tiny shakespeare text for demo purpose. \n", + "\n", "Data can be found at [https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare). 
" ] }, @@ -23,7 +24,6 @@ "import mxnet as mx\n", "import numpy as np\n", "import sys\n", - "sys.path.insert(0, \"../rnn\")\n", "import lstm" ] }, From 306246ba9ea3e53f7146b110504cca992c44e47d Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Wed, 21 Oct 2015 14:29:53 -0600 Subject: [PATCH 027/122] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0b0fd543e7f2..1641d2905a68 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ deep learning programs together to maximize the efficiency and your productivity What's New ---------- +* [LSTM Example by using symbolic API](https://github.com/dmlc/mxnet/tree/master/example/rnn) * [MXNet R Package brings Deep learning for R!](https://github.com/dmlc/mxnet/tree/master/R-package) * [Note on Dependency Engine for Deep Learning](http://mxnet.readthedocs.org/en/latest/developer-guide/note_engine.html) From c9d1eab8d352605a2a2c0e46ef2467a65614d57b Mon Sep 17 00:00:00 2001 From: Chiyuan Zhang Date: Wed, 21 Oct 2015 16:34:44 -0400 Subject: [PATCH 028/122] fix doc typo --- doc/python/tutorial.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/python/tutorial.md b/doc/python/tutorial.md index a32442173ba1..09a70df07c04 100644 --- a/doc/python/tutorial.md +++ b/doc/python/tutorial.md @@ -257,7 +257,7 @@ We can also specify the automatic generated names explicitly: ```python >>> net = mx.symbol.Variable('data') >>> w = mx.symbol.Variable('myweight') ->>> net = sym.FullyConnected(data=data, weight=w, name='fc1', num_hidden=128) +>>> net = sym.FullyConnected(data=net, weight=w, name='fc1', num_hidden=128) >>> net.list_arguments() ['data', 'myweight', 'fc1_bias'] ``` From e56e1543ddfae62543f50417b3a3e455955f8d90 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 21 Oct 2015 12:41:03 -0700 Subject: [PATCH 029/122] [OP, Refactor] Enable register mshadow unary op in only one line --- src/common/tblob_op_registry.cc | 132 +++++++++++++++++++++++++++++ src/common/tblob_op_registry.h | 101 ++++++++++++++++++++++ src/ndarray/ndarray.cc | 53 ------------ src/ndarray/ndarray_function-inl.h | 19 ----- src/ndarray/ndarray_function.h | 14 --- src/ndarray/unary_function-inl.h | 46 ++++++++++ src/ndarray/unary_function.cc | 7 ++ src/ndarray/unary_function.cu | 8 ++ 8 files changed, 294 insertions(+), 86 deletions(-) create mode 100644 src/common/tblob_op_registry.cc create mode 100644 src/common/tblob_op_registry.h create mode 100644 src/ndarray/unary_function-inl.h create mode 100644 src/ndarray/unary_function.cc create mode 100644 src/ndarray/unary_function.cu diff --git a/src/common/tblob_op_registry.cc b/src/common/tblob_op_registry.cc new file mode 100644 index 000000000000..e205f29cc42c --- /dev/null +++ b/src/common/tblob_op_registry.cc @@ -0,0 +1,132 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file tblob_op_registry.cc + * Implementation of tblob op registry + */ +#include +#include +#include +#include +#include "./tblob_op_registry.h" + +namespace mxnet { +namespace common { + + +class TBlobOpRegEntryImpl : public TBlobOpRegEntry { + public: + TSelf& set_function(int dev_mask, UnaryFunction funary) override { + std::lock_guard lock(mutex_); + ++reg_counter_; + if (funary_.size() <= static_cast(dev_mask)) { + funary_.resize(dev_mask + 1, nullptr); + } + if (funary_[dev_mask] != nullptr) { + LOG(FATAL) << "Device function " << this->name + << " already registerd for device " << dev_mask; + } + funary_[dev_mask] = funary; + // return if it is already registered. 
+ if (reg_counter_ != 1) return *this; + + // The body to be registered + auto body = [this] (NDArray **used_vars, + real_t *s, + NDArray **mutate_vars) { + NDArray src = *used_vars[0]; + NDArray *out = mutate_vars[0]; + + if (out->is_none()) { + *out = NDArray(src.shape(), src.ctx(), true); + } else { + CHECK(out->ctx() == src.ctx()) << "target context mismatch"; + CHECK(out->shape() == src.shape()) << "target shape mismatch"; + } + // important: callback must always capture by value + NDArray ret = *out; + // get the const variables + std::vector const_vars; + if (src.var() != ret.var()) const_vars.push_back(src.var()); + // check if the function exist + int dev_mask = src.ctx().dev_mask(); + if (static_cast(dev_mask) >= funary_.size() || + funary_[dev_mask] == nullptr) { + if (dev_mask == gpu::kDevMask) LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; + LOG(FATAL) << "Function " << this->name << "not registered for device " << dev_mask; + } + // invoke the function + UnaryFunction fun = funary_[dev_mask]; + Engine::Get()->PushSync([src, ret, fun, dev_mask](RunContext ctx) { + ret.CheckAndAlloc(); + TBlob tmp = ret.data(); + (*fun)(src.data(), &tmp, ctx); +#if MXNET_USE_CUDA + if (dev_mask == gpu::kDevMask) { + ctx.get_stream()->Wait(); + } +#endif + }, src.ctx(), const_vars, {ret.var()}); + }; + // register the function. + NDArrayReg() + .set_body(body) + .set_num_use_vars(1) + .set_num_mutate_vars(1) + .set_type_mask(kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget) + .add_argument("src", "NDArray", "Source input to the function"); + return *this; + } + + TSelf& describe(const std::string &description) override { + std::lock_guard lock(mutex_); + if (reg_counter_ != 1) return *this; + NDArrayReg().describe(description); + return *this; + } + + GenericTBlobOp *GetOp() const override { + return nullptr; + } + + private: + // internal mutex + std::mutex mutex_; + // unary functions on each device mask + std::vector funary_; + // registration counter + int reg_counter_{0}; + // NDArray registry + NDArrayFunctionReg *ndarray_reg_{nullptr}; + // internal function to register NDArray function. + inline NDArrayFunctionReg &NDArrayReg() { + if (ndarray_reg_ == nullptr) { + NDArrayFunctionReg ® = + ::dmlc::Registry::Get()->__REGISTER__(this->name); + ndarray_reg_ = ® + } + return *ndarray_reg_; + } +}; + + +TBlobOpRegEntry& TBlobOpRegistry::__REGISTER_OR_FIND__(const std::string &name) { + if (fmap_.count(name) != 0) return *fmap_.at(name); + TBlobOpRegEntry *e = new TBlobOpRegEntryImpl(); + e->name = name; + fmap_[name] = e; + return *e; +} + +TBlobOpRegistry* TBlobOpRegistry::Get() { + static TBlobOpRegistry inst; + return &inst; +} + +TBlobOpRegistry::~TBlobOpRegistry() { + for (auto kv : fmap_) { + delete kv.second; + } +} + +} // namespace common +} // namespace mxnet diff --git a/src/common/tblob_op_registry.h b/src/common/tblob_op_registry.h new file mode 100644 index 000000000000..910543efacb3 --- /dev/null +++ b/src/common/tblob_op_registry.h @@ -0,0 +1,101 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file tblob_op_registry.h + * \brief Helper registry to make registration of simple unary binary math function easy. + * Register to this registry will enable both symbolic operator and NDArray operator in client. + * + * More complicated operators can be registered in normal way in ndarray and operator modules. 
+ */ +#ifndef MXNET_COMMON_TBLOB_OP_REGISTRY_H_ +#define MXNET_COMMON_TBLOB_OP_REGISTRY_H_ + +#include +#include +#include +#include +#include + +namespace mxnet { +namespace common { + +/*! \brief pre-declare generic TBlob function*/ +struct GenericTBlobOp; + +/*! \brief registry for function entry */ +class TBlobOpRegEntry { + public: + /*! \brief unary tblob function */ + typedef void (*UnaryFunction)(const TBlob &src, + TBlob *ret, + RunContext ctx); + /*! \brief declare self type */ + typedef TBlobOpRegEntry TSelf; + /*! \brief name of the entry */ + std::string name; + /*! + * \brief set function of the function to be funary + * \param dev_mask The device mask of the function can act on. + * \param funary The unary function that peforms the operation. + */ + virtual TSelf& set_function(int dev_mask, UnaryFunction funary) = 0; + /*! + * \brief Describe the function. + * \param description The description of the function. + * \return reference to self. + */ + virtual TSelf& describe(const std::string &description) = 0; + /*! + * \brief get the internal function representation + * \return the internal function representation. + */ + virtual GenericTBlobOp *GetOp() const = 0; + /*! \brief destructor */ + virtual ~TBlobOpRegEntry() {} +}; + +/*! \brief registry for TBlob functions */ +class TBlobOpRegistry { + public: + /*! + * \brief Internal function to register a name function under name. + * \param name name of the function + * \return ref to the registered entry, used to set properties + */ + TBlobOpRegEntry &__REGISTER_OR_FIND__(const std::string& name); + /*! + * \brief Find the entry with corresponding name. + * \param name name of the function + * \return the corresponding function, can be NULL + */ + inline static const TBlobOpRegEntry *Find(const std::string &name) { + return Get()->fmap_.at(name); + } + /*! \return global singleton of the registry */ + static TBlobOpRegistry* Get(); + + private: + // destructor + ~TBlobOpRegistry(); + /*! \brief internal registry map */ + std::map fmap_; +}; + +#if DMLC_USE_CXX11 +struct GenericTBlobOp { + /*! \brief function type of the function */ + typedef std::function &in, + TBlob *out, + RunContext ctx)> OpType; + /*! \brief the real operator */ + OpType op; +}; +#endif + +#define MXNET_REGISTER_TBLOB_FUN(Name, DEV) \ + static ::mxnet::common::TBlobOpRegEntry & \ + __make_ ## TBlobOpRegEntry ## _ ## Name ## __ ## DEV ##__ = \ + ::mxnet::common::TBlobOpRegistry::Get()->__REGISTER_OR_FIND__(#Name) + +} // namespace common +} // namespace mxnet +#endif // MXNET_COMMON_TBLOB_OP_REGISTRY_H_ diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 74ac76c00f66..26a62fb60264 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -155,53 +155,6 @@ void ScalarOp(const NDArray &lhs, } } -/*! - * \brief run a unary operation. 
- * \param src source operand - * \param out the output ndarray - * \param unary_op the real - */ -template -void UnaryOp(const NDArray &src, - NDArray *out) { - if (out->is_none()) { - *out = NDArray(OP::GetShape(src.shape()), src.ctx(), true); - } else { - CHECK(out->ctx() == src.ctx()) << "target context mismatch"; - CHECK(out->shape() == OP::GetShape(src.shape())) << "target shape mismatch"; - } - // important: callback must always capture by value - NDArray ret = *out; - // get the const variables - std::vector const_vars; - if (src.var() != ret.var()) const_vars.push_back(src.var()); - - // redirect everything to mshadow operations - switch (src.ctx().dev_mask()) { - case cpu::kDevMask: { - Engine::Get()->PushSync([src, ret](RunContext ctx) { - ret.CheckAndAlloc(); - TBlob tmp = ret.data(); - ndarray::Eval(src.data(), &tmp, ctx); - }, src.ctx(), const_vars, {ret.var()}); - break; - } -#if MXNET_USE_CUDA - case gpu::kDevMask: { - Engine::Get()->PushSync([src, ret](RunContext ctx) { - ret.CheckAndAlloc(); - TBlob tmp = ret.data(); - ndarray::Eval(src.data(), &tmp, ctx); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); - }, src.ctx(), const_vars, {ret.var()}); - break; - } -#endif - default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; - } -} - void CopyFromTo(const NDArray &from, NDArray *to, int priority) { CHECK(from.shape() == to->shape()) << "operands shape mismatch"; @@ -649,12 +602,6 @@ void NDArray::SyncCopyToCPU(real_t *data, size_t size) const { MXNET_REGISTER_NDARRAY_FUN(_set_value).set_function(SetValueOp); -MXNET_REGISTER_NDARRAY_FUN(square).set_function(UnaryOp) -.describe("Take square of the src"); - -MXNET_REGISTER_NDARRAY_FUN(sqrt).set_function(UnaryOp) -.describe("Take square root of the src"); - MXNET_REGISTER_NDARRAY_FUN(_plus).set_function(BinaryOp); MXNET_REGISTER_NDARRAY_FUN(_minus).set_function(BinaryOp); MXNET_REGISTER_NDARRAY_FUN(_mul).set_function(BinaryOp); diff --git a/src/ndarray/ndarray_function-inl.h b/src/ndarray/ndarray_function-inl.h index 8b5bfc72bcc1..20f9eb8c65a0 100644 --- a/src/ndarray/ndarray_function-inl.h +++ b/src/ndarray/ndarray_function-inl.h @@ -18,14 +18,6 @@ } #endif -#ifndef DECL_UNARY -#define DECL_UNARY(XPU, OP, FUN) \ - template<> \ - void Eval(const TBlob &src, TBlob *ret, RunContext ctx) { \ - FUN(src, ret, ctx); \ - } -#endif - #ifndef DECL_SCALAR #define DECL_SCALAR(XPU, OP, FUN, REVERSE) \ template<> \ @@ -53,15 +45,6 @@ inline void EvalBinary_(const TBlob &lhs, const TBlob &rhs, rhs.FlatTo2D(s)); } -template -inline void EvalUnary_(const TBlob &src, - TBlob *ret, RunContext ctx) { - using namespace mshadow::expr; - mshadow::Stream *s = ctx.get_stream(); - ret->FlatTo2D(s) - = F(src.FlatTo2D(s)); -} - template inline void EvalDot_(const TBlob &lhs, const TBlob &rhs, TBlob *ret, RunContext ctx) { @@ -197,8 +180,6 @@ void ElementwiseSum(const std::vector source, } // declarations -DECL_UNARY(DEVICE, Square, EvalUnary_) -DECL_UNARY(DEVICE, SquareRoot, EvalUnary_) DECL_BINARY(DEVICE, MatChooseRowElem, EvalMatChooseRowElem_) DECL_BINARY(DEVICE, Dot, EvalDot_) DECL_BINARY(DEVICE, OneHotEncode, EvalOneHot_) diff --git a/src/ndarray/ndarray_function.h b/src/ndarray/ndarray_function.h index 1263f39e5998..9f23c1a5c348 100644 --- a/src/ndarray/ndarray_function.h +++ b/src/ndarray/ndarray_function.h @@ -24,12 +24,6 @@ struct BinaryBase { } }; -struct UnaryBase { - inline static TShape GetShape(const TShape &shape) { - return shape; - } -}; - // operators struct Plus : public BinaryBase { typedef mshadow::op::plus mshadow_op; @@ 
-47,14 +41,6 @@ struct Div : public BinaryBase { typedef mshadow::op::div mshadow_op; }; -struct Square : public UnaryBase { - typedef op::mshadow_op::square mshadow_op; -}; - -struct SquareRoot : public UnaryBase { - typedef op::mshadow_op::square_root mshadow_op; -}; - struct ClipMin : public BinaryBase { struct mshadow_op { MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { diff --git a/src/ndarray/unary_function-inl.h b/src/ndarray/unary_function-inl.h new file mode 100644 index 000000000000..7832ce1798cd --- /dev/null +++ b/src/ndarray/unary_function-inl.h @@ -0,0 +1,46 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file unary-function-inl.h + * \brief the real execution functions of ndarray operations + */ +#ifndef MXNET_NDARRAY_UNARY_FUNCTION_INL_H_ +#define MXNET_NDARRAY_UNARY_FUNCTION_INL_H_ + +#include "../common/tblob_op_registry.h" +#include "../operator/mshadow_op.h" + +#if defined(__CUDACC__) +#define DEVICE gpu +#else +#define DEVICE cpu +#endif + +namespace mxnet { +namespace ndarray { + +template +void EvalUnary_(const TBlob &src, + TBlob *ret, RunContext ctx) { + using namespace mshadow::expr; + mshadow::Stream *s = ctx.get_stream(); + ret->FlatTo2D(s) + = F(src.FlatTo2D(s)); +} + +// helper macro to register mshadow element-wise unary opts +// usually you only need to use this to register common operations +#define REGISTER_MSHADOW_UNARY(Name, Op) \ + MXNET_REGISTER_TBLOB_FUN(Name, DEVICE) \ + .set_function(DEVICE::kDevMask, EvalUnary_) + + +// register all unary operations here +REGISTER_MSHADOW_UNARY(square, op::mshadow_op::square) +.describe("Take square of the src"); + +REGISTER_MSHADOW_UNARY(sqrt, op::mshadow_op::square_root) +.describe("Take square root of the src"); + +} // namespace ndarray +} // namespace mxnet +#endif // MXNET_NDARRAY_UNARY_FUNCTION_INL_H_ diff --git a/src/ndarray/unary_function.cc b/src/ndarray/unary_function.cc new file mode 100644 index 000000000000..f77f113e611e --- /dev/null +++ b/src/ndarray/unary_function.cc @@ -0,0 +1,7 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file unary_function.cc + * \brief CPU Implementation of unary function. + */ +// this will be invoked by gcc and compile CPU version +#include "./unary_function-inl.h" diff --git a/src/ndarray/unary_function.cu b/src/ndarray/unary_function.cu new file mode 100644 index 000000000000..0c0d4e64957c --- /dev/null +++ b/src/ndarray/unary_function.cu @@ -0,0 +1,8 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file unary_function.cu + * \brief GPU Implementation of unary function. + */ +// this will be invoked by gcc and compile GPU version +// real common implementation is only in the -inl.h file. 
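With the two `REGISTER_MSHADOW_UNARY` lines above compiled in, `square` and `sqrt` are exposed as ordinary NDArray functions; a quick sanity check from Python, assuming a build that includes this patch:

```python
import mxnet as mx

x = mx.nd.zeros((2, 2))
x[:] = 4.0                        # fill with a constant via NDArray assignment
print(mx.nd.square(x).asnumpy())  # expect all 16s
print(mx.nd.sqrt(x).asnumpy())    # expect all 2s
```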
+#include "./unary_function-inl.h" From 83e46c268f3b0bcd33eed77722580e63b2ef2abc Mon Sep 17 00:00:00 2001 From: Chiyuan Zhang Date: Wed, 21 Oct 2015 19:43:19 -0400 Subject: [PATCH 030/122] fix typo --- example/rnn/lstm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/rnn/lstm.py b/example/rnn/lstm.py index 996861a80894..25245aad18ee 100644 --- a/example/rnn/lstm.py +++ b/example/rnn/lstm.py @@ -17,7 +17,7 @@ def lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0.): """LSTM Cell symbol""" if dropout > 0.: - in_data = mx.sym.Dropout(data=in_data, p=dropout) + indata = mx.sym.Dropout(data=indata, p=dropout) i2h = mx.sym.FullyConnected(data=indata, weight=param.i2h_weight, bias=param.i2h_bias, From 7d7e5c4960edf7d280ae729e70667c417a9cc824 Mon Sep 17 00:00:00 2001 From: Chuntao Hong Date: Thu, 22 Oct 2015 10:33:16 +0800 Subject: [PATCH 031/122] add installation guide for pre-built windows binary --- doc/build.md | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/doc/build.md b/doc/build.md index 6907d78bf154..5be22f62bfb3 100644 --- a/doc/build.md +++ b/doc/build.md @@ -7,9 +7,11 @@ if you have ideas to improve this page, please send a pull request! Contents -------- -- [Build MXNet Library](#build-mxnet-library) - - Introduces how to build the mxnet core library for all packages. - - Supported platforms: linux, windows, osx +- [Building MXNet Library](#build-mxnet-library) + - [Prerequisites](#prerequisites) + - [Building on Linux](#building-on-linux) + - [Building on Windows](#building-on-windows) + - [Installing pre-built packages on Windows](#installing-pre-built-packages-on-windows) - [Advanced Build Configurations](#advanced-build-configuration) - Introduces how to build mxnet with advanced features such as HDFS/S3 support, CUDNN - [Python Package Installation](#python-package-installation) @@ -17,6 +19,9 @@ Contents Build MXNet Library ------------------- + +### Prerequisites + MXNet have a general runtime library that can be used by various packages such as python, R and Julia. This section gives details about how to build the mxnet library. - On Linux/OSX the target library will be ```libmxnet.so``` @@ -36,7 +41,7 @@ The system dependency requirement for mxnet libraries are - BLAS library. - opencv (optional if you do not need image augmentation, you can switch it off in config.mk) -### Linux +### Building on Linux On Ubuntu >= 13.10, one can install the dependencies by @@ -73,7 +78,7 @@ make -j4 Then proceed to package installation instructions for python or R in this page. -### Windows +### Building on Windows Firstly, we should make your Visual Studio 2013 support more C++11 features. @@ -88,6 +93,14 @@ Finally, use CMake to create a Visual Studio solution in `./build/`. During conf Then proceed to package installation instructions for python or R in this page. +### Installing pre-built packages on Windows + +Mxnet also provides pre-built packages on Windows. The pre-built package includes pre-build MxNet library, the dependent thrid-party libraries, a sample C++ solution in Visual Studio and the Python install script. + +You can download the packages from the [Releases tab](https://github.com/dmlc/mxnet/releases) of MxNet. There are two variants provided: one with GPU support (using CUDA and CUDNN v3) and one without GPU support. You can choose one that fits your hardward configuration. 
+ +After download, unpack the package into a folder, say D:\MxNet, then install the package by double-clicking the setupenv.cmd inside the folder. It will set up the environment variables needed by MxNet. After that, you should be able to use the provided VS solution to build C++ programs, or to [install Python package](#python-package-installation). + Advanced Build Configurations ----------------------------- The configuration of mxnet can be modified by ```config.mk```
From cf963c1d23533b958d61fe3e64b7cf62709c4215 Mon Sep 17 00:00:00 2001 From: Chuntao Hong Date: Thu, 22 Oct 2015 10:38:29 +0800 Subject: [PATCH 032/122] fix dll name on windows --- doc/build.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/build.md b/doc/build.md index 5be22f62bfb3..e7cef14fc87f 100644 --- a/doc/build.md +++ b/doc/build.md @@ -10,6 +10,7 @@ Contents - [Building MXNet Library](#build-mxnet-library) - [Prerequisites](#prerequisites) - [Building on Linux](#building-on-linux) + - [Building on OSX](#building-on-osx) - [Building on Windows](#building-on-windows) - [Installing pre-built packages on Windows](#installing-pre-built-packages-on-windows) - [Advanced Build Configurations](#advanced-build-configuration) @@ -25,7 +26,7 @@ Build MXNet Library MXNet have a general runtime library that can be used by various packages such as python, R and Julia. This section gives details about how to build the mxnet library. - On Linux/OSX the target library will be ```libmxnet.so``` -- On Windows the target libary is ```mxnet.dll``` +- On Windows the target libary is ```libmxnet.dll``` Things to do before get started: @@ -56,7 +57,7 @@ make -j4 ``` Then proceed to package installation instructions for python or R in this page. -### OSX +### Building on OSX On OSX, we can install the dependencies by ```bash
From a29137aa32cfeced262fa68d71aafe44a18b2e84 Mon Sep 17 00:00:00 2001 From: Chuntao Hong Date: Thu, 22 Oct 2015 10:42:16 +0800 Subject: [PATCH 033/122] fix windows linkage problem by adding dllexport --- include/mxnet/base.h | 13 +++++++++++++ include/mxnet/engine.h | 2 +- include/mxnet/storage.h | 2 +- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/mxnet/base.h b/include/mxnet/base.h index 1ef9c6bf8450..b3ee2242d182 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -49,6 +49,19 @@ #endif #endif +/*! +* \brief define dllexport for Visual Studio +*/ +#ifdef _MSC_VER +#ifdef MXNET_EXPORTS +#define MXAPI __declspec(dllexport) +#else +#define MXAPI __declspec(dllimport) +#endif +#else +#define MXAPI +#endif + /*! \brief namespace of mxnet */ namespace mxnet { /*! \brief mxnet cpu */ diff --git a/include/mxnet/engine.h b/include/mxnet/engine.h index 03eb45b54de0..9b879ef3b4c2 100644 --- a/include/mxnet/engine.h +++ b/include/mxnet/engine.h @@ -45,7 +45,7 @@ enum class FnProperty { /*! * \brief Dependency engine that schedules operations. */ -class Engine { +class MXAPI Engine { public: /*! * \brief OnComplete Callback to the engine, diff --git a/include/mxnet/storage.h b/include/mxnet/storage.h index da7a8aaa5388..743b4e8b0514 100644 --- a/include/mxnet/storage.h +++ b/include/mxnet/storage.h @@ -14,7 +14,7 @@ namespace mxnet { /*! * \brief Storage manager across multiple devices. */ -class Storage { + class MXAPI Storage { public: /*! * \brief Storage handle.
From 708d4fcf0de4f29f7adf4d1711dcd454837b3c0f Mon Sep 17 00:00:00 2001 From: Chuntao Hong Date: Thu, 22 Oct 2015 10:44:14 +0800 Subject: [PATCH 034/122] disable ps-lite on windows for now --- CMakeLists.txt | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d55086fd197a..fe020b81502b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,9 +33,6 @@ else(MSVC) endif(MSVC) if(USE_OPENCV) - if(MSVC) - set(OpenCV_STATIC OFF) - endif() find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs) if(NOT OpenCV_FOUND) # if not OpenCV 3.x, then imgcodecs are not found find_package(OpenCV REQUIRED COMPONENTS core highgui imgproc) @@ -68,7 +65,9 @@ if(USE_CUDNN) endif() add_subdirectory("dmlc-core") -add_subdirectory("ps-lite") +if(NOT MSVC) + add_subdirectory("ps-lite") +endif() mxnet_source_group("Source" GLOB_RECURSE "src/*.cc") mxnet_source_group("Source\\Cuda" GLOB_RECURSE "src/*.cu") @@ -93,8 +92,10 @@ endif() add_library(mxnet SHARED ${SOURCE}) target_link_libraries(mxnet ${mshadow_LINKER_LIBS}) target_link_libraries(mxnet dmlccore) -target_link_libraries(mxnet pslite) -target_link_libraries(mxnet ${pslite_LINKER_LIBS}) +if(NOT MSVC) + target_link_libraries(mxnet pslite) + target_link_libraries(mxnet ${pslite_LINKER_LIBS}) +endif() set_target_properties(mxnet PROPERTIES OUTPUT_NAME "libmxnet") # ---[ Linter target
From f606d86e83e423f501dd3954176345b2a2ac816a Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Wed, 21 Oct 2015 21:19:11 -0600 Subject: [PATCH 037/122] Update char_lstm.ipynb Due to my typo in https://github.com/dmlc/mxnet/pull/353 This notebook doesn't train with dropout --- example/rnn/char_lstm.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/rnn/char_lstm.ipynb b/example/rnn/char_lstm.ipynb index cac585d05a92..72ba3f18dc41 100644 --- a/example/rnn/char_lstm.ipynb +++ b/example/rnn/char_lstm.ipynb @@ -195,7 +195,7 @@ " batch_size=batch_size,\n", " input_size=vocab,\n", " initializer=mx.initializer.Uniform(0.1),\n", - " dropout=0.5)\n" + " dropout=0.)\n" ] }, {
From 7dcd8c44dde9b531e3245eeae43a155374b74d84 Mon Sep 17 00:00:00 2001 From: Chuntao Hong Date: Thu, 22 Oct 2015 11:27:35 +0800 Subject: [PATCH 038/122] change MXAPI to MXNET_API --- include/mxnet/base.h | 4 ++-- include/mxnet/engine.h | 2 +- include/mxnet/storage.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/mxnet/base.h b/include/mxnet/base.h index b3ee2242d182..962740b4194d 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -54,9 +54,9 @@ */ #ifdef _MSC_VER #ifdef MXNET_EXPORTS -#define MXAPI __declspec(dllexport) +#define MXNET_API __declspec(dllexport) #else -#define MXAPI __declspec(dllimport) +#define MXNET_API __declspec(dllimport) #endif #else #define MXAPI diff --git a/include/mxnet/engine.h b/include/mxnet/engine.h index 9b879ef3b4c2..195f5c05eb20 100644 --- a/include/mxnet/engine.h +++ b/include/mxnet/engine.h @@ -45,7 +45,7 @@ enum class FnProperty { /*! * \brief Dependency engine that schedules operations. */ -class MXAPI Engine { +class MXNET_API Engine { public: /*! * \brief OnComplete Callback to the engine, diff --git a/include/mxnet/storage.h b/include/mxnet/storage.h index 743b4e8b0514..60bca03b0680 100644 --- a/include/mxnet/storage.h +++ b/include/mxnet/storage.h @@ -14,7 +14,7 @@ namespace mxnet { /*! * \brief Storage manager across multiple devices. */ - class MXAPI Storage { +class MXNET_API Storage { public: /*! * \brief Storage handle.
From f672a9dd4f02a9daba301915af66b1b6eb8f4ddc Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 21 Oct 2015 22:01:56 -0700 Subject: [PATCH 042/122] [OP] Allow register symbolic and ndarray unary operator in one place --- include/mxnet/operator.h | 8 +- src/common/tblob_op_registry.cc | 317 +++++++++++++++++++++++++------ src/common/tblob_op_registry.h | 78 +++++--- src/ndarray/unary_function-inl.h | 79 ++++++-- src/operator/mshadow_op.h | 2 +- 5 files changed, 383 insertions(+), 101 deletions(-) diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h index 72c5f6c28823..dc6176fe8b51 100644 --- a/include/mxnet/operator.h +++ b/include/mxnet/operator.h @@ -400,7 +400,7 @@ class OperatorProperty { }; /*! \brief typedef the factory function of operator property */ -typedef OperatorProperty *(*OperatorPropertyFactory)(); +typedef std::function OperatorPropertyFactory; /*! * \brief Registry entry for OperatorProperty factory functions.
*/ @@ -454,12 +454,8 @@ struct OperatorPropertyReg * \endcode */ #define MXNET_REGISTER_OP_PROPERTY(name, OperatorPropertyType) \ - static ::mxnet::OperatorProperty* __create__ ## OperatorProperty ## name ## __() { \ - OperatorProperty* ret = new OperatorPropertyType(); \ - return ret; \ - } \ DMLC_REGISTRY_REGISTER(::mxnet::OperatorPropertyReg, OperatorPropertyReg, name) \ - .set_body(__create__ ## OperatorProperty ## name ## __) \ + .set_body([]() { return new OperatorPropertyType(); }) \ .check_name() #endif // DMLC_USE_CXX11 diff --git a/src/common/tblob_op_registry.cc b/src/common/tblob_op_registry.cc index e205f29cc42c..ae1f54da3c3a 100644 --- a/src/common/tblob_op_registry.cc +++ b/src/common/tblob_op_registry.cc @@ -11,11 +11,14 @@ namespace mxnet { namespace common { - +class TBlobUnaryOpProp; class TBlobOpRegEntryImpl : public TBlobOpRegEntry { public: - TSelf& set_function(int dev_mask, UnaryFunction funary) override { + // functions + TSelf& set_function(int dev_mask, + UnaryFunction funary, + bool inplace_in_out) override { std::lock_guard lock(mutex_); ++reg_counter_; if (funary_.size() <= static_cast(dev_mask)) { @@ -26,54 +29,46 @@ class TBlobOpRegEntryImpl : public TBlobOpRegEntry { << " already registerd for device " << dev_mask; } funary_[dev_mask] = funary; - // return if it is already registered. - if (reg_counter_ != 1) return *this; + inplace_in0_out_forward_ = inplace_in_out; + if (reg_counter_ == 1) this->DoRegisterUnary(); + return *this; + } - // The body to be registered - auto body = [this] (NDArray **used_vars, - real_t *s, - NDArray **mutate_vars) { - NDArray src = *used_vars[0]; - NDArray *out = mutate_vars[0]; - - if (out->is_none()) { - *out = NDArray(src.shape(), src.ctx(), true); - } else { - CHECK(out->ctx() == src.ctx()) << "target context mismatch"; - CHECK(out->shape() == src.shape()) << "target shape mismatch"; - } - // important: callback must always capture by value - NDArray ret = *out; - // get the const variables - std::vector const_vars; - if (src.var() != ret.var()) const_vars.push_back(src.var()); - // check if the function exist - int dev_mask = src.ctx().dev_mask(); - if (static_cast(dev_mask) >= funary_.size() || - funary_[dev_mask] == nullptr) { - if (dev_mask == gpu::kDevMask) LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; - LOG(FATAL) << "Function " << this->name << "not registered for device " << dev_mask; - } - // invoke the function - UnaryFunction fun = funary_[dev_mask]; - Engine::Get()->PushSync([src, ret, fun, dev_mask](RunContext ctx) { - ret.CheckAndAlloc(); - TBlob tmp = ret.data(); - (*fun)(src.data(), &tmp, ctx); -#if MXNET_USE_CUDA - if (dev_mask == gpu::kDevMask) { - ctx.get_stream()->Wait(); - } -#endif - }, src.ctx(), const_vars, {ret.var()}); - }; - // register the function. 
- NDArrayReg() - .set_body(body) - .set_num_use_vars(1) - .set_num_mutate_vars(1) - .set_type_mask(kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget) - .add_argument("src", "NDArray", "Source input to the function"); + TSelf& set_gradient(int dev_mask, + UnaryGradType1 fgrad, + bool inplace_out_in_grad) override { + std::lock_guard lock(mutex_); + if (funary_grad_t1_.size() <= static_cast(dev_mask)) { + funary_grad_t1_.resize(dev_mask + 1, nullptr); + } + if (funary_grad_t1_[dev_mask] != nullptr) { + LOG(FATAL) << "Device gradient function " << this->name + << " already registerd for device " << dev_mask; + } + funary_grad_t1_[dev_mask] = fgrad; + inplace_out_in0_grad_ = inplace_out_in_grad; + return *this; + } + + TSelf& set_gradient(int dev_mask, + UnaryGradType2 fgrad, + bool inplace_out_in_grad) override { + std::lock_guard lock(mutex_); + if (funary_grad_t2_.size() <= static_cast(dev_mask)) { + funary_grad_t2_.resize(dev_mask + 1, nullptr); + } + if (funary_grad_t2_[dev_mask] != nullptr) { + LOG(FATAL) << "Device gradient function " << this->name + << " already registerd for device " << dev_mask; + } + funary_grad_t2_[dev_mask] = fgrad; + inplace_out_in0_grad_ = inplace_out_in_grad; + return *this; + } + + TSelf& set_shape_infer(UnaryShapeInfer fshapeinfer) override { + std::lock_guard lock(mutex_); + unary_infer_ = fshapeinfer; return *this; } @@ -81,22 +76,32 @@ class TBlobOpRegEntryImpl : public TBlobOpRegEntry { std::lock_guard lock(mutex_); if (reg_counter_ != 1) return *this; NDArrayReg().describe(description); + OpReg().describe(description); return *this; } - GenericTBlobOp *GetOp() const override { - return nullptr; - } - private: + // make friend with unary op + friend class TBlobUnaryOpProp; // internal mutex std::mutex mutex_; - // unary functions on each device mask - std::vector funary_; // registration counter int reg_counter_{0}; + // unary shape inferencer + UnaryShapeInfer unary_infer_{nullptr}; + // unary functions on each device mask + std::vector funary_; + // type 1 gradient function + std::vector funary_grad_t1_; + // type 2 gradient function + std::vector funary_grad_t2_; + // whether do inplace optimization of in 0 and output + bool inplace_in0_out_forward_{true}; + // whether do inplace optimization of out_grad and in_grad0 + bool inplace_out_in0_grad_{false}; // NDArray registry NDArrayFunctionReg *ndarray_reg_{nullptr}; + OperatorPropertyReg *op_reg_{nullptr}; // internal function to register NDArray function. inline NDArrayFunctionReg &NDArrayReg() { if (ndarray_reg_ == nullptr) { @@ -106,8 +111,209 @@ class TBlobOpRegEntryImpl : public TBlobOpRegEntry { } return *ndarray_reg_; } + // internal function to register NDArray function. + inline OperatorPropertyReg &OpReg() { + if (op_reg_ == nullptr) { + OperatorPropertyReg ® = + ::dmlc::Registry::Get()->__REGISTER__(this->name); + op_reg_ = ® + } + return *op_reg_; + } + // start registering all stuffs + void DoRegisterUnary(); +}; + +// Unary operator to invoke generic TBlob function. 
+struct TBlobUnaryOperator : public Operator { + TBlobOpRegEntry::UnaryFunction forward; + TBlobOpRegEntry::UnaryGradType1 backward1{nullptr}; + TBlobOpRegEntry::UnaryGradType2 backward2{nullptr}; + + void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) override { + CHECK_EQ(in_data.size(), 1); + CHECK_EQ(out_data.size(), 1); + TBlob out = out_data[0]; + (*forward)(in_data[0], &out, req[0], ctx.run_ctx); + } + + void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) override { + CHECK_EQ(out_grad.size(), 1); + CHECK(in_data.size() == 1 && in_grad.size() == 1); + CHECK_EQ(req.size(), 1); + arg::OutGrad ograd; ograd.data = out_grad[0]; + TBlob igrad = in_grad[0]; + if (backward1 != nullptr) { + arg::OutValue out_value; out_value.data = out_data[0]; + (*backward1)(ograd, out_value, &igrad, req[0], ctx.run_ctx); + } else if (backward2 != nullptr) { + arg::Input0 in0; in0.data = in_data[0]; + (*backward2)(ograd, in0, &igrad, req[0], ctx.run_ctx); + } else { + LOG(FATAL) << "Backward is not supported"; + } + } +}; // class UnaryOperator + +class TBlobUnaryOpProp : public OperatorProperty { + public: + std::string name; + TBlobOpRegEntryImpl* source; + + void Init(const std::vector >& kwargs) override { + } + + std::map GetParams() const override { + return std::map(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; + const TShape &dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + out_shape->clear(); + if (source->unary_infer_ == nullptr) { + out_shape->push_back(dshape); + } else { + out_shape->push_back((*(source->unary_infer_))(dshape)); + } + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new TBlobUnaryOpProp(); + ptr->source = source; + ptr->name = name; + return ptr; + } + + std::string TypeString() const override { + return name; + } + + // decalre dependency and inplace optimization options + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + if (source->funary_grad_t1_.size() != 0) { + return {out_grad[0], out_data[0]}; + } else if (source->funary_grad_t2_.size() != 0) { + return {out_grad[0], in_data[0]}; + } else { + LOG(FATAL) << "Backward of " << name << " is not decalred"; + return {}; + } + } + + std::vector > BackwardInplaceOption( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &in_grad) const override { + if (source->inplace_out_in0_grad_) { + return {{out_grad[0], in_grad[0]}}; + } else { + return {}; + } + } + + std::vector > ForwardInplaceOption( + const std::vector &in_data, + const std::vector &out_data) const override { + if (source->inplace_in0_out_forward_) { + return {{in_data[0], out_data[0]}}; + } else { + return {}; + } + } + + Operator* CreateOperator(Context ctx) const { + size_t dev_mask = ctx.dev_mask(); + TBlobUnaryOperator *op = new TBlobUnaryOperator(); + CHECK(dev_mask < source->funary_.size() && source->funary_[dev_mask] != nullptr); + op->forward = source->funary_[dev_mask]; + if (dev_mask < source->funary_grad_t1_.size()) { + op->backward1 = 
+    if (dev_mask < source->funary_grad_t1_.size()) {
+      op->backward1 = source->funary_grad_t1_[dev_mask];
+    }
+    if (dev_mask < source->funary_grad_t2_.size()) {
+      op->backward2 = source->funary_grad_t2_[dev_mask];
+    }
+    return op;
+  }
 };

+void TBlobOpRegEntryImpl::DoRegisterUnary() {
+  CHECK_EQ(reg_counter_, 1);
+  // The body to be registered
+  auto body = [this] (NDArray **used_vars,
+                      real_t *s,
+                      NDArray **mutate_vars) {
+    NDArray src = *used_vars[0];
+    NDArray *out = mutate_vars[0];
+
+    if (out->is_none()) {
+      *out = NDArray(src.shape(), src.ctx(), true);
+    } else {
+      CHECK(out->ctx() == src.ctx()) << "target context mismatch";
+      CHECK(out->shape() == src.shape()) << "target shape mismatch";
+    }
+    // important: callback must always capture by value
+    NDArray ret = *out;
+    // get the const variables
+    std::vector<Engine::VarHandle> const_vars;
+    if (src.var() != ret.var()) const_vars.push_back(src.var());
+    // check if the function exists
+    int dev_mask = src.ctx().dev_mask();
+    if (static_cast<size_t>(dev_mask) >= funary_.size() ||
+        funary_[dev_mask] == nullptr) {
+      if (dev_mask == gpu::kDevMask) LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+      LOG(FATAL) << "Function " << this->name << " not registered for device " << dev_mask;
+    }
+    // invoke the function
+    UnaryFunction fun = funary_[dev_mask];
+    Engine::Get()->PushSync([src, ret, fun, dev_mask](RunContext ctx) {
+        ret.CheckAndAlloc();
+        TBlob tmp = ret.data();
+        (*fun)(src.data(), &tmp, kWriteTo, ctx);
+#if MXNET_USE_CUDA
+        if (dev_mask == gpu::kDevMask) {
+          ctx.get_stream<gpu>()->Wait();
+        }
+#endif
+      }, src.ctx(), const_vars, {ret.var()});
+  };
+  // register the function.
+  NDArrayReg()
+      .set_body(body)
+      .set_num_use_vars(1)
+      .set_num_mutate_vars(1)
+      .set_type_mask(kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget)
+      .add_argument("src", "NDArray", "Source input to the function");
+  // register the operator
+  auto op_factory = [this]() {
+    TBlobUnaryOpProp *prop = new TBlobUnaryOpProp();
+    prop->name = this->name;
+    prop->source = this;
+    return prop;
+  };
+  OpReg()
+      .set_body(op_factory)
+      .add_argument("src", "Symbol", "Source symbolic input to the function");
+}

 TBlobOpRegEntry& TBlobOpRegistry::__REGISTER_OR_FIND__(const std::string &name) {
   if (fmap_.count(name) != 0) return *fmap_.at(name);
@@ -127,6 +333,5 @@ TBlobOpRegistry::~TBlobOpRegistry() {
     delete kv.second;
   }
 }
-
 }  // namespace common
 }  // namespace mxnet
diff --git a/src/common/tblob_op_registry.h b/src/common/tblob_op_registry.h
index 910543efacb3..495144aa931e 100644
--- a/src/common/tblob_op_registry.h
+++ b/src/common/tblob_op_registry.h
@@ -11,44 +11,90 @@
 #include
 #include
+#include
 #include
 #include
 #include
+#if DMLC_USE_CXX11
+#include <mutex>
+#endif
+
 namespace mxnet {
 namespace common {
+/*! \brief namespace of arguments */
+namespace arg {
+/*! \brief super class of all gradient function argument */
+struct GradFunctionArgument {
+  /*! \brief The real data */
+  TBlob data;
+};
+/*! \brief First input to the function */
+struct Input0 : GradFunctionArgument {};
+/*! \brief Second input to the function */
+struct Input1 : GradFunctionArgument {};

-/*! \brief pre-declare generic TBlob function*/
-struct GenericTBlobOp;
+/*! \brief Output value of the function */
+struct OutValue : GradFunctionArgument {};
+/*! \brief Gradient of output value */
+struct OutGrad : GradFunctionArgument {};
+}  // namespace arg

 /*! \brief registry for function entry */
 class TBlobOpRegEntry {
  public:
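+  // The typedefs below are the pluggable pieces: a forward kernel, an
+  // optional shape-inference rule, and two gradient signatures.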
-  /*! \brief unary tblob function */
   typedef void (*UnaryFunction)(const TBlob &src,
-                                TBlob *ret,
+                                TBlob* ret,
+                                OpReqType req,
                                 RunContext ctx);
+  typedef TShape (*UnaryShapeInfer)(const TShape &src);
+  typedef void (*UnaryGradType1)(const arg::OutGrad& out_grad,
+                                 const arg::OutValue& out_value,
+                                 TBlob* in_grad,
+                                 OpReqType req,
+                                 RunContext ctx);
+  typedef void (*UnaryGradType2)(const arg::OutGrad& out_grad,
+                                 const arg::Input0& in_data0,
+                                 TBlob* in_grad,
+                                 OpReqType req,
+                                 RunContext ctx);
   /*! \brief declare self type */
   typedef TBlobOpRegEntry TSelf;
   /*! \brief name of the entry */
   std::string name;
+  /*!
+   * \brief set shape inference function, by default use same shape.
+   * \param fshapeinfer The function that maps the input shape to the output shape.
+   */
+  virtual TSelf& set_shape_infer(UnaryShapeInfer fshapeinfer) = 0;
   /*!
    * \brief set function of the function to be funary
    * \param dev_mask The device mask of the function can act on.
    * \param funary The unary function that performs the operation.
+   * \param inplace_in_out Whether do inplace optimization on in and out.
+   */
+  virtual TSelf& set_function(int dev_mask,
+                              UnaryFunction funary,
+                              bool inplace_in_out) = 0;
+  /*!
+   * \brief set gradient of the function of this function.
+   * \param dev_mask The device mask of the function can act on.
+   * \param fgrad The gradient function to be set.
+   * \param inplace_out_in_grad whether out_grad and in_grad can share memory.
    */
-  virtual TSelf& set_function(int dev_mask, UnaryFunction funary) = 0;
+  virtual TSelf& set_gradient(int dev_mask,
+                              UnaryGradType1 fgrad,
+                              bool inplace_out_in_grad) = 0;
+  virtual TSelf& set_gradient(int dev_mask,
+                              UnaryGradType2 fgrad,
+                              bool inplace_out_in_grad) = 0;
   /*!
    * \brief Describe the function.
    * \param description The description of the function.
   * \return reference to self.
    */
   virtual TSelf& describe(const std::string &description) = 0;
-  /*!
-   * \brief get the internal function representation
-   * \return the internal function representation.
-   */
-  virtual GenericTBlobOp *GetOp() const = 0;
   /*! \brief destructor */
   virtual ~TBlobOpRegEntry() {}
 };
@@ -80,22 +126,10 @@ class TBlobOpRegistry {
   std::map<std::string, TBlobOpRegEntry*> fmap_;
 };

-#if DMLC_USE_CXX11
-struct GenericTBlobOp {
-  /*! \brief function type of the function */
-  typedef std::function<void (const std::vector<TBlob> &in,
-                              TBlob *out,
-                              RunContext ctx)> OpType;
-  /*! \brief the real operator */
-  OpType op;
-};
-#endif
-
 #define MXNET_REGISTER_TBLOB_FUN(Name, DEV)                             \
   static ::mxnet::common::TBlobOpRegEntry &                             \
   __make_ ## TBlobOpRegEntry ## _ ## Name ## __ ## DEV ##__ =           \
   ::mxnet::common::TBlobOpRegistry::Get()->__REGISTER_OR_FIND__(#Name)
-
 }  // namespace common
 }  // namespace mxnet
 #endif  // MXNET_COMMON_TBLOB_OP_REGISTRY_H_
diff --git a/src/ndarray/unary_function-inl.h b/src/ndarray/unary_function-inl.h
index 7832ce1798cd..45e3e42f2495 100644
--- a/src/ndarray/unary_function-inl.h
+++ b/src/ndarray/unary_function-inl.h
@@ -8,39 +8,86 @@
 #include "../common/tblob_op_registry.h"
 #include "../operator/mshadow_op.h"
-
+#include "../operator/operator_common.h"
 #if defined(__CUDACC__)
-#define DEVICE gpu
+#define XPU gpu
 #else
-#define DEVICE cpu
+#define XPU cpu
 #endif

 namespace mxnet {
 namespace ndarray {
+
+using namespace common;  // NOLINT(*)
+
 template<typename xpu, typename OP>
-void EvalUnary_(const TBlob &src,
-                TBlob *ret, RunContext ctx) {
+void UnaryForward_(const TBlob &src,
+                   TBlob *ret,
+                   OpReqType req,
+                   RunContext ctx) {
+  using namespace mxnet::op;
   using namespace mshadow::expr;
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  ret->FlatTo2D<xpu, real_t>(s)
-      = F<OP>(src.FlatTo2D<xpu, real_t>(s));
+  mshadow::Tensor<xpu, 2> out = ret->FlatTo2D<xpu, real_t>(s);
+  Assign(out, req, F<OP>(src.FlatTo2D<xpu, real_t>(s)));
 }

+// backward function that takes input value of the op
+template<typename xpu, typename OP>
+void UnaryBackwardUseIn_(const arg::OutGrad& out_grad,
+                         const arg::Input0& in_data0,
+                         TBlob *in_grad,
+                         OpReqType req,
+                         RunContext ctx) {
+  using namespace mxnet::op;
+  using namespace mshadow::expr;
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  mshadow::Tensor<xpu, 2> igrad = in_grad->FlatTo2D<xpu, real_t>(s);
+  Assign(igrad, req,
+         F<OP>(in_data0.data.FlatTo2D<xpu, real_t>(s)) *
+         out_grad.data.FlatTo2D<xpu, real_t>(s));
+}
+// backward function that takes output value of the op
+template<typename xpu, typename OP>
+void UnaryBackwardUseOut_(const arg::OutGrad& out_grad,
+                          const arg::OutValue& out_value,
+                          TBlob *in_grad,
+                          OpReqType req,
+                          RunContext ctx) {
+  using namespace mxnet::op;
+  using namespace mshadow::expr;
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  mshadow::Tensor<xpu, 2> igrad = in_grad->FlatTo2D<xpu, real_t>(s);
+  Assign(igrad, req,
+         F<OP>(out_value.data.FlatTo2D<xpu, real_t>(s)) *
+         out_grad.data.FlatTo2D<xpu, real_t>(s));
+}

-// helper macro to register mshadow element-wise unary opts
-// usually you only need to use this to register common operations
-#define REGISTER_MSHADOW_UNARY(Name, Op)                        \
-  MXNET_REGISTER_TBLOB_FUN(Name, DEVICE)                        \
-  .set_function(DEVICE::kDevMask, EvalUnary_<DEVICE, Op>)

-// register all unary operations here
-REGISTER_MSHADOW_UNARY(square, op::mshadow_op::square)
+// Register all unary operations here
+// Square
+struct square_grad {
+  MSHADOW_XINLINE static real_t Map(real_t a) {
+    return 2.0f * a;
+  }
+};
+// The true means inplace can be enabled.
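+// Note on the two backward flavors used below: square registers
+// UnaryBackwardUseIn_ because d(x^2)/dx = 2x needs the saved input, while
+// sqrt registers UnaryBackwardUseOut_ because d(sqrt(x))/dx = 0.5/sqrt(x)
+// can be computed from the saved output alone.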
+MXNET_REGISTER_TBLOB_FUN(square, XPU)
+.set_function(XPU::kDevMask, UnaryForward_<XPU, op::mshadow_op::square>, true)
+.set_gradient(XPU::kDevMask, UnaryBackwardUseIn_<XPU, square_grad>, true)
 .describe("Take square of the src");
-REGISTER_MSHADOW_UNARY(sqrt, op::mshadow_op::square_root)
-.describe("Take square root of the src");
+// Square root
+struct square_root_grad {
+  MSHADOW_XINLINE static real_t Map(real_t a) {
+    return 0.5f / a;
+  }
+};
+MXNET_REGISTER_TBLOB_FUN(sqrt, XPU)
+.set_function(XPU::kDevMask, UnaryForward_<XPU, op::mshadow_op::square_root>, true)
+.set_gradient(XPU::kDevMask, UnaryBackwardUseOut_<XPU, square_root_grad>, true)
+.describe("Take square root of the src");
 }  // namespace ndarray
 }  // namespace mxnet
 #endif  // MXNET_NDARRAY_UNARY_FUNCTION_INL_H_
diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h
index 9238ee049c0b..c8ca495d3349 100644
--- a/src/operator/mshadow_op.h
+++ b/src/operator/mshadow_op.h
@@ -80,7 +80,6 @@ struct tanh_grad {
   }
 };

-
 struct square {
   MSHADOW_XINLINE static real_t Map(real_t a) {
     return a * a;
@@ -107,6 +106,7 @@ struct square_root {
     return sqrt(a);
   }
 };
+
 }  // namespace mshadow_op
 }  // namespace op
 }  // namespace mxnet

From 89dda75a3e6f7187ded2b33064800e49a97d10a4 Mon Sep 17 00:00:00 2001
From: Chuntao Hong
Date: Thu, 22 Oct 2015 13:11:39 +0800
Subject: [PATCH 043/122] minor

---
 include/mxnet/base.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/mxnet/base.h b/include/mxnet/base.h
index 962740b4194d..1eeffc1ab4b9 100644
--- a/include/mxnet/base.h
+++ b/include/mxnet/base.h
@@ -59,7 +59,7 @@
 #define MXNET_API __declspec(dllimport)
 #endif
 #else
-#define MXAPI
+#define MXNET_API
 #endif

 /*! \brief namespace of mxnet */

From 343d5c979577ef9ec768697a1e5d1b0a9f03b18b Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Wed, 21 Oct 2015 23:10:17 -0600
Subject: [PATCH 044/122] [OP] Fix reshape

---
 src/operator/reshape-inl.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/operator/reshape-inl.h b/src/operator/reshape-inl.h
index a2a7e58cb3e7..20411b78c585 100644
--- a/src/operator/reshape-inl.h
+++ b/src/operator/reshape-inl.h
@@ -23,6 +23,7 @@ namespace op {

 enum ReshapeOpInputs {kData};
 enum ReshapeOpOutputs {kOut};
+
 struct ReshapeParam : public dmlc::Parameter<ReshapeParam> {
   TShape target_shape;
   DMLC_DECLARE_PARAMETER(ReshapeParam) {
@@ -33,7 +34,7 @@ struct ReshapeParam : public dmlc::Parameter<ReshapeParam> {
 template<typename xpu>
 class ReshapeOp : public Operator {
  public:
-  explicit ReshapeOp(ReshapeParam param) {}  // Do nothing, just make a special factory
+  explicit ReshapeOp(ReshapeParam param) {}  // Do nothing

   virtual void Forward(const OpContext &ctx,
                        const std::vector<TBlob> &in_data,
@@ -47,9 +48,8 @@ class ReshapeOp : public Operator {
     CHECK_EQ(out_data.size(), 1);
     if (req[kOut] == kNullOp) return;
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    // TODO(bing): potential bug here for non-4D input
-    Tensor<xpu, 4> data = in_data[kData].get<xpu, 4, real_t>(s);
-    Tensor<xpu, 4> out = out_data[kOut].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 2> data = in_data[kData].FlatTo2D<xpu, real_t>(s);
+    Tensor<xpu, 2> out = out_data[kOut].FlatTo2D<xpu, real_t>(s);
     CHECK_EQ(data.CheckContiguous(), true);
     CHECK_EQ(out.CheckContiguous(), true);
     if (data.dptr_ == out.dptr_) return;
@@ -71,8 +71,8 @@ class ReshapeOp : public Operator {
     CHECK_EQ(out_grad.size(), 1);
     CHECK_EQ(in_grad.size(), 1);
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4> grad_out = out_grad[kData].get<xpu, 4, real_t>(s);
-    Tensor<xpu, 4> grad_in = in_grad[kOut].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 2> grad_in = in_grad[kOut].FlatTo2D<xpu, real_t>(s);
+    Tensor<xpu, 2> grad_out = out_grad[kData].FlatTo2D<xpu, real_t>(s);
     CHECK_EQ(grad_out.CheckContiguous(), true);
     CHECK_EQ(grad_in.CheckContiguous(), true);
     if (grad_out.dptr_ == grad_in.dptr_) return;
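A quick way to sanity-check the reshape fix above from Python (a minimal sketch, not part of the patch; it assumes the operator is exposed as `mx.symbol.Reshape` with the `target_shape` parameter declared in the diff):

```
import mxnet as mx
import numpy as np

# a 2-D input: the old get<xpu, 4> path assumed 4-D data, while the new
# FlatTo2D path only needs a contiguous tensor of matching size
data = mx.symbol.Variable('data')
net = mx.symbol.Reshape(data=data, target_shape=(2, 6))
exe = net.simple_bind(ctx=mx.cpu(), data=(4, 3))
exe.arg_arrays[0][:] = np.arange(12).reshape((4, 3)).astype(np.float32)
exe.forward()
print(exe.outputs[0].asnumpy().shape)  # expected: (2, 6)
```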
From b1de0cbe34113adb24b17a5727e68c80ce32f304 Mon Sep 17 00:00:00 2001
From: piiswrong
Date: Thu, 22 Oct 2015 10:03:15 -0700
Subject: [PATCH 045/122] required interface of optimizer should be present in
 the base class to avoid confusion

---
 python/mxnet/optimizer.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index 8c5b54178f31..9fa580d6956c 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -4,7 +4,7 @@
 class Optimizer(object):
     """Base class of all optimizers."""
-    def __init__(self):
+    def __init__(self, rescale_grad=1):
         self.iteration = 0

     def begin_round(self, iteration):
@@ -17,6 +17,13 @@ def begin_round(self, iteration):
         """
         self.iteration = iteration

+    def create_state(self, index, weight):
+        """Create additional optimizer state such as momentum.
+        override in implementations."""
+
+    def update(self, index, weight, grad, state):
+        """Update the parameters. override in implementations"""
+

 class SGD(Optimizer):
     """A very simple SGD optimizer with momentum and weight regularization.
@@ -41,7 +48,7 @@ class SGD(Optimizer):
     def __init__(self, learning_rate=0.01, momentum=0.0,
                  wd=0.0001, rescale_grad=1, clip_gradient=None,
                  lr_scheduler=None):
-        super(SGD, self).__init__()
+        super(SGD, self).__init__(rescale_grad)
         self.lr = learning_rate
         self.momentum = momentum
         self.wd = wd

From 5f18796e7dcf2e6c2d0b9bfb413e0d683cfe2ed2 Mon Sep 17 00:00:00 2001
From: piiswrong
Date: Thu, 22 Oct 2015 10:34:52 -0700
Subject: [PATCH 046/122] Use factory pattern to create optimizers. Creator
 tracks all (direct and indirect) subclasses of Optimizer by class name. We
 currently silently allow overriding existing names, but maybe giving a
 warning is better?

---
 python/mxnet/optimizer.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index 9fa580d6956c..1c708c533ca0 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -4,8 +4,21 @@
 class Optimizer(object):
     """Base class of all optimizers."""
+    class __metaclass__(type):
+        """Meta class for tracking all subclasses(implementations)
+        of Optimizer."""
+        __optimizers__ = {}
+
+        def __new__(meta, name, bases, attrs):
+            cls = type.__new__(meta, name, bases, attrs)
+            #Allow overriding of existing optimizer.
+            #Always keep the last one.
+            meta.__optimizers__[cls.__name__] = cls
+            return cls
+
     def __init__(self, rescale_grad=1):
         self.iteration = 0
+        self.rescale_grad = rescale_grad

     def begin_round(self, iteration):
         """Function called to notify beginning of iteration.
@@ -52,7 +65,6 @@ def __init__(self, learning_rate=0.01, momentum=0.0,
         self.lr = learning_rate
         self.momentum = momentum
         self.wd = wd
-        self.rescale_grad = rescale_grad
         self.clip_gradient = clip_gradient
         self.lr_scheduler = lr_scheduler
         if lr_scheduler != None:
@@ -112,12 +124,11 @@ def update(self, index, weight, grad, state):
             weight[:] += -lr * (grad * self.rescale_grad + self.wd * weight)


-class Test(object):
+class Test(Optimizer):
     """For test use"""
     def __init__(self, rescale_grad=1):
-        self.rescale_grad = rescale_grad
-
-
+        super(Test, self).__init__(rescale_grad)
+
     # pylint: disable=no-self-use
     def create_state(self, index, weight):
         """Create a state to duplicate weight"""
@@ -147,10 +158,16 @@ def create(name, rescale_grad=1, **kwargs):
     opt : Optimizer
         The result optimizer.
     """
+    #TODO(eric): kept for backward compatibility.
+ # remove after all downstream functions move to + # new naming standard. if name == 'sgd' or name == 'SGD': return SGD(rescale_grad=rescale_grad, **kwargs) if name == 'test': return Test(rescale_grad=rescale_grad) + + if name in Optimizer.__optimizers__: + return Optimizer.__optimizers__[name](rescale_grad=rescale_grad, **kwargs) else: raise ValueError('Cannot find optimizer %s' % name) From e4e58ab5a96203f1853f9e48b569959a98c80bd1 Mon Sep 17 00:00:00 2001 From: piiswrong Date: Thu, 22 Oct 2015 10:44:48 -0700 Subject: [PATCH 047/122] Optimizer factory is now case insensitive --- python/mxnet/optimizer.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index 1c708c533ca0..56f2f17aef1c 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -13,7 +13,7 @@ def __new__(meta, name, bases, attrs): cls = type.__new__(meta, name, bases, attrs) #Allow overriding of existing optimizer. #Always keep the last one. - meta.__optimizers__[cls.__name__] = cls + meta.__optimizers__[cls.__name__.lower()] = cls return cls def __init__(self, rescale_grad=1): @@ -128,7 +128,7 @@ class Test(Optimizer): """For test use""" def __init__(self, rescale_grad=1): super(Test, self).__init__(rescale_grad) - + # pylint: disable=no-self-use def create_state(self, index, weight): """Create a state to duplicate weight""" @@ -158,16 +158,10 @@ def create(name, rescale_grad=1, **kwargs): opt : Optimizer The result optimizer. """ - #TODO(eric): kept for backward compatibility. - # remove after all downstream functions move to - # new naming standard. - if name == 'sgd' or name == 'SGD': - return SGD(rescale_grad=rescale_grad, **kwargs) - if name == 'test': - return Test(rescale_grad=rescale_grad) - if name in Optimizer.__optimizers__: - return Optimizer.__optimizers__[name](rescale_grad=rescale_grad, **kwargs) + return Optimizer.__optimizers__[name.lower()]( + rescale_grad=rescale_grad, + **kwargs) else: raise ValueError('Cannot find optimizer %s' % name) From d3feb835a1ac3cc065ee922f6b89199eb19b9f3b Mon Sep 17 00:00:00 2001 From: piiswrong Date: Thu, 22 Oct 2015 11:02:30 -0700 Subject: [PATCH 048/122] fixed lint error --- python/mxnet/optimizer.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index 56f2f17aef1c..d65e38da84c3 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -9,11 +9,18 @@ class __metaclass__(type): of Optimizer.""" __optimizers__ = {} - def __new__(meta, name, bases, attrs): - cls = type.__new__(meta, name, bases, attrs) - #Allow overriding of existing optimizer. + def __new__(mcs, name, bases, attrs): + cls = type.__new__(mcs, name, bases, attrs) + #Allow overriding of existing optimizer, but give a warning. #Always keep the last one. 
- meta.__optimizers__[cls.__name__.lower()] = cls + cls_name = cls.__name__.lower() + if cls_name in mcs.__optimizers__: + print 'WARNING: New optimizer %s.%s is overriding ' \ + 'existing optimizer %s.%s'%( + cls.__module__, cls.__name__, + mcs.__optimizers__[cls_name].__module__, + mcs.__optimizers__[cls_name].__name__) + mcs.__optimizers__[cls_name] = cls return cls def __init__(self, rescale_grad=1): @@ -160,8 +167,8 @@ def create(name, rescale_grad=1, **kwargs): """ if name in Optimizer.__optimizers__: return Optimizer.__optimizers__[name.lower()]( - rescale_grad=rescale_grad, - **kwargs) + rescale_grad=rescale_grad, + **kwargs) else: raise ValueError('Cannot find optimizer %s' % name) From 3273874c2fe0c94725cee20da1282a446a076f52 Mon Sep 17 00:00:00 2001 From: piiswrong Date: Thu, 22 Oct 2015 11:34:18 -0700 Subject: [PATCH 049/122] fixed python3 compatibility --- python/mxnet/optimizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index d65e38da84c3..2a1bcbdde163 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -15,11 +15,11 @@ def __new__(mcs, name, bases, attrs): #Always keep the last one. cls_name = cls.__name__.lower() if cls_name in mcs.__optimizers__: - print 'WARNING: New optimizer %s.%s is overriding ' \ + print('WARNING: New optimizer %s.%s is overriding ' \ 'existing optimizer %s.%s'%( cls.__module__, cls.__name__, mcs.__optimizers__[cls_name].__module__, - mcs.__optimizers__[cls_name].__name__) + mcs.__optimizers__[cls_name].__name__)) mcs.__optimizers__[cls_name] = cls return cls @@ -165,7 +165,7 @@ def create(name, rescale_grad=1, **kwargs): opt : Optimizer The result optimizer. """ - if name in Optimizer.__optimizers__: + if name.lower() in Optimizer.__optimizers__: return Optimizer.__optimizers__[name.lower()]( rescale_grad=rescale_grad, **kwargs) From 30bfd5f1e1570445e71092e4a50b298e5b09bb9d Mon Sep 17 00:00:00 2001 From: piiswrong Date: Thu, 22 Oct 2015 11:56:15 -0700 Subject: [PATCH 050/122] fix more py3 compatibility... --- python/mxnet/optimizer.py | 60 ++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index 2a1bcbdde163..bc89868bb17a 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -1,27 +1,39 @@ # pylint: disable=fixme, invalid-name, unused-argument, too-many-arguments, no-name-in-module """Common Optimization algorithms with regularizations.""" from .ndarray import NDArray, zeros, clip - -class Optimizer(object): +from six import with_metaclass + +class MetaOptimizer(type): + """Meta class for tracking all subclasses(implementations) + of Optimizer.""" + __optimizers__ = {} + + def __new__(mcs, name, bases, attrs): + cls = type.__new__(mcs, name, bases, attrs) + #Allow overriding of existing optimizer, but give a warning. + #Always keep the last one. 
+        cls_name = cls.__name__.lower()
+        if cls_name in mcs.__optimizers__:
+            print('WARNING: New optimizer %s.%s is overriding ' \
+                  'existing optimizer %s.%s'%(
+                      cls.__module__, cls.__name__,
+                      mcs.__optimizers__[cls_name].__module__,
+                      mcs.__optimizers__[cls_name].__name__))
+        mcs.__optimizers__[cls_name] = cls
+        return cls
+
+
+class Optimizer(with_metaclass(MetaOptimizer, object)):
     """Base class of all optimizers."""
-    class __metaclass__(type):
-        """Meta class for tracking all subclasses(implementations)
-        of Optimizer."""
-        __optimizers__ = {}
-
-        def __new__(mcs, name, bases, attrs):
-            cls = type.__new__(mcs, name, bases, attrs)
-            #Allow overriding of existing optimizer, but give a warning.
-            #Always keep the last one.
-            cls_name = cls.__name__.lower()
-            if cls_name in mcs.__optimizers__:
-                print('WARNING: New optimizer %s.%s is overriding ' \
-                      'existing optimizer %s.%s'%(
-                          cls.__module__, cls.__name__,
-                          mcs.__optimizers__[cls_name].__module__,
-                          mcs.__optimizers__[cls_name].__name__))
-            mcs.__optimizers__[cls_name] = cls
-            return cls
+
+    @staticmethod
+    def CreateOptimizer(name, rescale_grad=1, **kwargs):
+        if name.lower() in Optimizer.__optimizers__:
+            return Optimizer.__optimizers__[name.lower()](
+                rescale_grad=rescale_grad,
+                **kwargs)
+        else:
+            raise ValueError('Cannot find optimizer %s' % name)

     def __init__(self, rescale_grad=1):
         self.iteration = 0
@@ -165,12 +177,8 @@ def create(name, rescale_grad=1, **kwargs):
     opt : Optimizer
         The result optimizer.
     """
-    if name.lower() in Optimizer.__optimizers__:
-        return Optimizer.__optimizers__[name.lower()](
-            rescale_grad=rescale_grad,
-            **kwargs)
-    else:
-        raise ValueError('Cannot find optimizer %s' % name)
+    return Optimizer.CreateOptimizer(name,
+                                     rescale_grad=rescale_grad, **kwargs)

 def get_updater(optimizer):
     """Return a closure of the updater needed for kvstore

From ed9cd63be7554639ecf4e71426caa9b34960c181 Mon Sep 17 00:00:00 2001
From: piiswrong
Date: Thu, 22 Oct 2015 12:05:05 -0700
Subject: [PATCH 051/122] fixed more lint

---
 python/mxnet/optimizer.py | 27 ++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index bc89868bb17a..f18eaffdffcf 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -28,6 +28,25 @@ class Optimizer(with_metaclass(MetaOptimizer, object)):

     @staticmethod
     def CreateOptimizer(name, rescale_grad=1, **kwargs):
+        """Create an optimizer with specified name.
+
+        Parameters
+        ----------
+        name: str
+            Name of required optimizer. Should be the name
+            of a subclass of Optimizer. Case insensitive.
+
+        rescale_grad : float
+            Rescaling factor on gradient.
+
+        kwargs: dict
+            Parameters for optimizer
+
+        Returns
+        -------
+        opt : Optimizer
+            The result optimizer.
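+
+        Example
+        -------
+        >>> # illustrative usage, not part of the original patch
+        >>> sgd = Optimizer.CreateOptimizer('sgd', learning_rate=0.1)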
""" - return Optimizer.CreateOptimizer(name, - rescale_grad=rescale_grad, **kwargs) + return Optimizer.CreateOptimizer(name, + rescale_grad=rescale_grad, + **kwargs) def get_updater(optimizer): """Return a clossure of the updater needed for kvstore From 948af9e0dd59ef2c2f53f81841bc586f4e83d765 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Thu, 22 Oct 2015 13:07:53 -0600 Subject: [PATCH 052/122] [OP] update flatten --- src/operator/reshape-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/reshape-inl.h b/src/operator/reshape-inl.h index 20411b78c585..d02d1cedebd1 100644 --- a/src/operator/reshape-inl.h +++ b/src/operator/reshape-inl.h @@ -173,7 +173,7 @@ class FlattenProp : public ReshapeProp { for (uint32_t i = 1; i < dshape.ndim(); ++i) { target_dim *= dshape[i]; } - out_shape->push_back(mshadow::Shape4(dshape[0], 1, 1, target_dim)); + out_shape->push_back(mshadow::Shape2(dshape[0], target_dim)); return true; } From 637a1c29e7e4bba60808b4f9dcfeea87bf4fe4b7 Mon Sep 17 00:00:00 2001 From: piiswrong Date: Thu, 22 Oct 2015 13:44:05 -0700 Subject: [PATCH 053/122] refactored optimizer factory for py3 compatibility --- python/mxnet/optimizer.py | 68 +++++++++++++-------------------------- 1 file changed, 22 insertions(+), 46 deletions(-) diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index f18eaffdffcf..00d7df773c79 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -1,31 +1,25 @@ # pylint: disable=fixme, invalid-name, unused-argument, too-many-arguments, no-name-in-module """Common Optimization algorithms with regularizations.""" from .ndarray import NDArray, zeros, clip -from six import with_metaclass - -class MetaOptimizer(type): - """Meta class for tracking all subclasses(implementations) - of Optimizer.""" - __optimizers__ = {} - - def __new__(mcs, name, bases, attrs): - cls = type.__new__(mcs, name, bases, attrs) - #Allow overriding of existing optimizer, but give a warning. - #Always keep the last one. - cls_name = cls.__name__.lower() - if cls_name in mcs.__optimizers__: + +class Optimizer(object): + """Base class of all optimizers.""" + opt_registry = {} + + @staticmethod + def Register(cls): + """Register optimizers to the optimizer factory""" + assert(isinstance(cls, type)) + name = cls.__name__.lower() + if name in Optimizer.opt_registry: print('WARNING: New optimizer %s.%s is overriding ' \ 'existing optimizer %s.%s'%( cls.__module__, cls.__name__, - mcs.__optimizers__[cls_name].__module__, - mcs.__optimizers__[cls_name].__name__)) - mcs.__optimizers__[cls_name] = cls + Optimizer.opt_registry[name].__module__, + Optimizer.opt_registry[name].__name__)) + Optimizer.opt_registry[name] = cls return cls - -class Optimizer(with_metaclass(MetaOptimizer, object)): - """Base class of all optimizers.""" - @staticmethod def CreateOptimizer(name, rescale_grad=1, **kwargs): """Create an optimizer with specified name. @@ -47,8 +41,8 @@ def CreateOptimizer(name, rescale_grad=1, **kwargs): opt : Optimizer The result optimizer. """ - if name.lower() in Optimizer.__optimizers__: - return Optimizer.__optimizers__[name.lower()]( + if name.lower() in Optimizer.opt_registry: + return Optimizer.opt_registry[name.lower()]( rescale_grad=rescale_grad, **kwargs) else: @@ -75,7 +69,10 @@ def create_state(self, index, weight): def update(self, index, weight, grad, state): """Update the parameters. 
+#convenience wrapper for Optimizer.Register
+register = Optimizer.Register
+
+@register
 class SGD(Optimizer):
     """A very simple SGD optimizer with momentum and weight regularization.
@@ -161,7 +158,7 @@ def update(self, index, weight, grad, state):
             assert self.momentum == 0.0
             weight[:] += -lr * (grad * self.rescale_grad + self.wd * weight)

-
+@register
 class Test(Optimizer):
     """For test use"""
     def __init__(self, rescale_grad=1):
         super(Test, self).__init__(rescale_grad)
@@ -177,29 +174,8 @@ def update(self, index, weight, grad, state):
         weight[:] += grad * self.rescale_grad
         state[:] = weight

-def create(name, rescale_grad=1, **kwargs):
-    """Create an optimizer with specified name.
-
-    Parameters
-    ----------
-    name: str
-        Name of required optimizer. Should be the name
-        of a subclass of Optimizer. Case insensitive.
-
-    rescale_grad : float
-        Rescaling factor on gradient.
-
-    kwargs: dict
-        Parameters for optimizer
-
-    Returns
-    -------
-    opt : Optimizer
-        The result optimizer.
-    """
-    return Optimizer.CreateOptimizer(name,
-                                     rescale_grad=rescale_grad,
-                                     **kwargs)
+#backward compatibility wrapper for Optimizer.CreateOptimizer
+create = Optimizer.CreateOptimizer

 def get_updater(optimizer):
     """Return a closure of the updater needed for kvstore

From bae6bf7ebe8d88bf07ab2ac76f958cb6497fc2e4 Mon Sep 17 00:00:00 2001
From: piiswrong
Date: Thu, 22 Oct 2015 13:52:45 -0700
Subject: [PATCH 054/122] lint

---
 python/mxnet/optimizer.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index 00d7df773c79..7f44a1cdcae1 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -7,18 +7,18 @@ class Optimizer(object):
     opt_registry = {}

     @staticmethod
-    def Register(cls):
+    def Register(klass):
         """Register optimizers to the optimizer factory"""
-        assert(isinstance(cls, type))
-        name = cls.__name__.lower()
+        assert(isinstance(klass, type))
+        name = klass.__name__.lower()
         if name in Optimizer.opt_registry:
             print('WARNING: New optimizer %s.%s is overriding ' \
                   'existing optimizer %s.%s'%(
-                      cls.__module__, cls.__name__,
+                      klass.__module__, klass.__name__,
                       Optimizer.opt_registry[name].__module__,
                       Optimizer.opt_registry[name].__name__))
-        Optimizer.opt_registry[name] = cls
-        return cls
+        Optimizer.opt_registry[name] = klass
+        return klass

     @staticmethod
     def CreateOptimizer(name, rescale_grad=1, **kwargs):

From e4e58ab5a96203f1853f9e48b569959a98c80bd1 Mon Sep 17 00:00:00 2001
From: piiswrong
Date: Thu, 22 Oct 2015 14:43:14 -0700
Subject: [PATCH 055/122] add compatibility for old style caffe prototxt

---
 tools/caffe_converter/convert_symbol.py | 43 +++++++++++++++++--------
 1 file changed, 30 insertions(+), 13 deletions(-)

diff --git a/tools/caffe_converter/convert_symbol.py b/tools/caffe_converter/convert_symbol.py
index ea673c4a7863..9b5bcde99848 100644
--- a/tools/caffe_converter/convert_symbol.py
+++ b/tools/caffe_converter/convert_symbol.py
@@ -2,6 +2,7 @@
 from caffe.proto import caffe_pb2
 from google.protobuf import text_format
 import argparse
+import sys

 def readProtoSolverFile(filepath):
     solver_config = caffe.proto.caffe_pb2.NetParameter()
@@ -22,7 +23,12 @@ def readProtoSolverFile(filepath):
     top = dict()
     flatten_count = 0
     symbol_string = ""
-    layer = proto.layer
+    if len(proto.layer):
+        layer = proto.layer
+    elif len(proto.layers):
+        layer = proto.layers
+    else:
+        raise Exception('Invalid proto file.')

     # We assume the first bottom blob of first layer is the output from data layer
     input_name = layer[0].bottom[0]
@@ -33,7 +39,7 @@ 
def proto2script(proto_file): type_string = '' param_string = '' name = layer[i].name.replace('/', '_') - if layer[i].type == 'Convolution': + if layer[i].type == 'Convolution' or layer[i].type == 4: type_string = 'mx.symbol.Convolution' param = layer[i].convolution_param pad = 0 if len(param.pad) == 0 else param.pad[0] @@ -42,7 +48,7 @@ def proto2script(proto_file): (param.num_output, pad, pad, param.kernel_size[0],\ param.kernel_size[0], stride, stride, not param.bias_term) need_flatten[name] = True - if layer[i].type == 'Pooling': + if layer[i].type == 'Pooling' or layer[i].type == 17: type_string = 'mx.symbol.Pooling' param = layer[i].pooling_param param_string = "pad=(%d,%d), kernel=(%d,%d), stride=(%d,%d)" %\ @@ -55,37 +61,37 @@ def proto2script(proto_file): else: raise Exception("Unknown Pooling Method!") need_flatten[name] = True - if layer[i].type == 'ReLU': + if layer[i].type == 'ReLU' or layer[i].type == 18: type_string = 'mx.symbol.Activation' param_string = "act_type='relu'" - need_flatten[name] = need_flatten[mapping[proto.layer[i].bottom[0]]] - if layer[i].type == 'LRN': + need_flatten[name] = need_flatten[mapping[layer[i].bottom[0]]] + if layer[i].type == 'LRN' or layer[i].type == 15: type_string = 'mx.symbol.LRN' param = layer[i].lrn_param param_string = "alpha=%f, beta=%f, knorm=%f, nsize=%d" %\ (param.alpha, param.beta, param.k, param.local_size) need_flatten[name] = True - if layer[i].type == 'InnerProduct': + if layer[i].type == 'InnerProduct' or layer[i].type == 14: type_string = 'mx.symbol.FullyConnected' param = layer[i].inner_product_param param_string = "num_hidden=%d, no_bias=%s" % (param.num_output, not param.bias_term) need_flatten[name] = False - if layer[i].type == 'Dropout': + if layer[i].type == 'Dropout' or layer[i].type == 6: type_string = 'mx.symbol.Dropout' param = layer[i].dropout_param param_string = "p=%f" % param.dropout_ratio - need_flatten[name] = need_flatten[mapping[proto.layer[i].bottom[0]]] - if layer[i].type == 'Softmax': + need_flatten[name] = need_flatten[mapping[layer[i].bottom[0]]] + if layer[i].type == 'Softmax' or layer[i].type == 20: type_string = 'mx.symbol.Softmax' # We only support single output network for now. 
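+            # (note) the numeric comparisons above exist because old-style
+            # prototxts store layer types as V1 enum values, e.g. 20 == SOFTMAX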
output_name = name - if layer[i].type == 'Flatten': + if layer[i].type == 'Flatten' or layer[i].type == 8: type_string = 'mx.symbol.Flatten' need_flatten[name] = False - if layer[i].type == 'Split': + if layer[i].type == 'Split' or layer[i].type == 22: type_string = 'split' - if layer[i].type == 'Concat': + if layer[i].type == 'Concat' or layer[i].type == 3: type_string = 'mx.symbol.Concat' need_flatten[name] = True if type_string == '': @@ -121,3 +127,14 @@ def proto2symbol(proto_file): exec(sym) exec("ret = " + output_name) return ret + +def main(): + symbol_string, output_name = proto2script(sys.argv[1]) + if len(sys.argv) > 2: + with open(sys.argv[2], 'w') as fout: + fout.write(symbol_string) + else: + print(symbol_string) + +if __name__ == '__main__': + main() \ No newline at end of file From bf6a6043f45681af70d21f1fe579de1f3f8862a0 Mon Sep 17 00:00:00 2001 From: muli Date: Tue, 20 Oct 2015 16:47:57 -0400 Subject: [PATCH 056/122] [kvstore] async test --- tests/python/multi-node/dist_async_mlp.py | 27 +++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100755 tests/python/multi-node/dist_async_mlp.py diff --git a/tests/python/multi-node/dist_async_mlp.py b/tests/python/multi-node/dist_async_mlp.py new file mode 100755 index 000000000000..98abdca797ca --- /dev/null +++ b/tests/python/multi-node/dist_async_mlp.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +import mxnet as mx +import logging +import common + +mx.random.seed(0) +logging.basicConfig(level=logging.DEBUG) + +kv = mx.kvstore.create('dist_async') + +(train, val) = common.mnist(num_parts = kv.num_workers, + part_index = kv.rank, + batch_size = 100, + input_shape = (784,)) + +# train +model = mx.model.FeedForward.create( + symbol = common.mlp(), + ctx = mx.cpu(), + X = train, + num_round = 4, + learning_rate = 0.05, + wd = 0.0004, + momentum = 0.9, + kvstore = kv) + +common.accuracy(model, val) From ef7212abbcc5374c5dd9a60b29b743a646c62a87 Mon Sep 17 00:00:00 2001 From: muli Date: Tue, 20 Oct 2015 22:06:23 +0000 Subject: [PATCH 057/122] [kvstore] update ps-lite --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 8cd116b57800..08d745a5c784 100644 --- a/Makefile +++ b/Makefile @@ -80,7 +80,7 @@ PS_PATH=./ps-lite DEPS_PATH=$(shell pwd)/deps include $(PS_PATH)/make/ps.mk ifeq ($(USE_DIST_KVSTORE), 1) - CFLAGS += -DMXNET_USE_DIST_KVSTORE -I$(PS_PATH)/include + CFLAGS += -DMXNET_USE_DIST_KVSTORE -I$(PS_PATH)/include -I$(DEPS_PATH)/include LIB_DEP += $(PS_PATH)/build/libps.a LDFLAGS += -Wl,-rpath,$(DEPS_PATH)/lib $(PS_LDFLAGS_SO) endif @@ -120,7 +120,7 @@ lib/libmxnet.so: $(ALL_DEP) # ps-lite $(PS_PATH)/build/libps.a: - $(MAKE) CXX=$(CXX) DEPS_PATH=$(DEPS_PATH) -C $(PS_PATH) deps + $(MAKE) CXX=$(CXX) DEPS_PATH=$(DEPS_PATH) -C $(PS_PATH) protobuf zmq $(MAKE) CXX=$(CXX) DEPS_PATH=$(DEPS_PATH) -C $(PS_PATH) ps $(DMLC_CORE)/libdmlc.a: From 3a09872420b8b840dba20449afd19c4e3058e53d Mon Sep 17 00:00:00 2001 From: muli Date: Wed, 21 Oct 2015 04:31:06 +0000 Subject: [PATCH 058/122] [kvstore] more test scripts --- tests/python/multi-node/README.md | 309 +++++++++++++++++- tests/python/multi-node/common.py | 3 +- .../python/multi-node/dist_async_inception.py | 31 ++ tests/python/multi-node/dist_async_lenet.py | 27 ++ .../multi-node/dist_imagenet_inception.py | 30 ++ tests/python/multi-node/imagenet.py | 101 ++++++ tests/python/multi-node/local_inception.py | 3 +- 7 files changed, 496 insertions(+), 8 deletions(-) create mode 100755 
tests/python/multi-node/dist_async_inception.py create mode 100755 tests/python/multi-node/dist_async_lenet.py create mode 100755 tests/python/multi-node/dist_imagenet_inception.py create mode 100644 tests/python/multi-node/imagenet.py diff --git a/tests/python/multi-node/README.md b/tests/python/multi-node/README.md index 32d308017c5f..9713199ee17c 100644 --- a/tests/python/multi-node/README.md +++ b/tests/python/multi-node/README.md @@ -1,15 +1,312 @@ # Test multi-devices and multi-machines -must disable `CUDNN` +Note that `CUDNN` leads to randomness, need to disable if comparing to the baseline -`local_*` for multi-devices and single machine. Requires two GPUs. +- `local_*` for multi-devices and single machine. Requires two GPUs. +- `dist_sync_*` for multi-machines with BSP synchronizations -`dist_*` for multi-machines. Run in local machine with 2 workers (requires at -least two gpus) and 2 servers. - +`dist_async_*` for multi-machines with asynchronous SGD ``` -ln -s ../../../dmlc-core/tracker/dmlc_local.py . +ln -s ../../../ps-lite/tracker/dmlc_local.py . ./dmlc_local.py -n 2 -s 2 ./dist_sync_mlp.py ``` + +# Results + +## cifar10, inceptions + +single gtx 980. batch size = 128 and learning rate = .1 + +``` +[03:42:04] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/train.rec, use 4 threads for decoding.. +[03:42:04] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +[03:42:04] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/test.rec, use 4 threads for decoding.. +[03:42:04] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +INFO:root:Iteration[0] Train-accuracy=0.523938 +INFO:root:Iteration[0] Time cost=104.396 +INFO:root:Iteration[0] Validation-accuracy=0.665941 +INFO:root:Iteration[1] Train-accuracy=0.721108 +INFO:root:Iteration[1] Time cost=105.245 +INFO:root:Iteration[1] Validation-accuracy=0.755934 +INFO:root:Iteration[2] Train-accuracy=0.793298 +INFO:root:Iteration[2] Time cost=105.101 +INFO:root:Iteration[2] Validation-accuracy=0.784909 +INFO:root:Iteration[3] Train-accuracy=0.835198 +INFO:root:Iteration[3] Time cost=104.816 +INFO:root:Iteration[3] Validation-accuracy=0.799150 +INFO:root:Iteration[4] Train-accuracy=0.869625 +INFO:root:Iteration[4] Time cost=104.571 +INFO:root:Iteration[4] Validation-accuracy=0.809533 +INFO:root:Iteration[5] Train-accuracy=0.895201 +INFO:root:Iteration[5] Time cost=104.357 +INFO:root:Iteration[5] Validation-accuracy=0.811214 +INFO:root:Iteration[6] Train-accuracy=0.911025 +INFO:root:Iteration[6] Time cost=104.347 +INFO:root:Iteration[6] Validation-accuracy=0.799644 +INFO:root:Iteration[7] Train-accuracy=0.923853 +INFO:root:Iteration[7] Time cost=104.108 +INFO:root:Iteration[7] Validation-accuracy=0.806468 +INFO:root:Iteration[8] Train-accuracy=0.936301 +INFO:root:Iteration[8] Time cost=104.178 +INFO:root:Iteration[8] Validation-accuracy=0.813687 +INFO:root:Iteration[9] Train-accuracy=0.950068 +INFO:root:Iteration[9] Time cost=104.522 +INFO:root:Iteration[9] Validation-accuracy=0.820115 +INFO:root:Accuracy = 0.820100 +``` + +using 3x dual gtx 980 machines, async inception with batch size = 128 and +learning rate = .05 + + +``` +[03:23:29] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/train.rec, use 4 threads for decoding.. +[03:23:31] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/train.rec, use 4 threads for decoding.. 
+[03:23:29] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +[03:23:31] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +[03:23:30] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/train.rec, use 4 threads for decoding.. +[03:23:30] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +[03:23:29] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/test.rec, use 4 threads for decoding.. +[03:23:31] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/test.rec, use 4 threads for decoding.. +[03:23:29] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +[03:23:31] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +[03:23:30] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/test.rec, use 4 threads for decoding.. +[03:23:30] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +INFO:root:Start training with [gpu(0), gpu(1)] +INFO:root:Start training with [gpu(0), gpu(1)] +INFO:root:Start training with [gpu(0), gpu(1)] +INFO:root:Iteration[0] Train-accuracy=0.185276 +INFO:root:Iteration[0] Time cost=21.556 +INFO:root:Iteration[0] Train-accuracy=0.184255 +INFO:root:Iteration[0] Time cost=22.021 +INFO:root:Iteration[0] Train-accuracy=0.183834 +INFO:root:Iteration[0] Time cost=22.342 +INFO:root:Iteration[0] Validation-accuracy=0.225079 +INFO:root:Iteration[0] Validation-accuracy=0.236452 +INFO:root:Iteration[0] Validation-accuracy=0.237836 +INFO:root:Iteration[1] Train-accuracy=0.308624 +INFO:root:Iteration[1] Time cost=21.617 +INFO:root:Iteration[1] Train-accuracy=0.312977 +INFO:root:Iteration[1] Time cost=21.603 +INFO:root:Iteration[1] Train-accuracy=0.309637 +INFO:root:Iteration[1] Time cost=21.917 +INFO:root:Iteration[1] Validation-accuracy=0.333169 +INFO:root:Iteration[1] Validation-accuracy=0.382812 +INFO:root:Iteration[1] Validation-accuracy=0.385186 +INFO:root:Iteration[2] Train-accuracy=0.426885 +INFO:root:Iteration[2] Time cost=21.531 +INFO:root:Iteration[2] Train-accuracy=0.420802 +INFO:root:Iteration[2] Time cost=21.469 +INFO:root:Iteration[2] Train-accuracy=0.436844 +INFO:root:Iteration[2] Time cost=22.053 +INFO:root:Iteration[2] Validation-accuracy=0.487935 +INFO:root:Iteration[2] Validation-accuracy=0.491495 +INFO:root:Iteration[2] Validation-accuracy=0.532832 +INFO:root:Iteration[3] Train-accuracy=0.541209 +INFO:root:Iteration[3] Time cost=21.817 +INFO:root:Iteration[3] Train-accuracy=0.544072 +INFO:root:Iteration[3] Time cost=21.759 +INFO:root:Iteration[3] Train-accuracy=0.546458 +INFO:root:Iteration[3] Time cost=22.156 +INFO:root:Iteration[3] Validation-accuracy=0.589102 +INFO:root:Iteration[3] Validation-accuracy=0.559138 +INFO:root:Iteration[3] Validation-accuracy=0.613528 +INFO:root:Iteration[4] Train-accuracy=0.618500 +INFO:root:Iteration[4] Time cost=21.552 +INFO:root:Iteration[4] Train-accuracy=0.614862 +INFO:root:Iteration[4] Time cost=21.544 +INFO:root:Iteration[4] Train-accuracy=0.619573 +INFO:root:Iteration[4] Time cost=21.890 +INFO:root:Iteration[4] Validation-accuracy=0.630241 +INFO:root:Iteration[4] Validation-accuracy=0.618176 +INFO:root:Iteration[4] Validation-accuracy=0.666930 +INFO:root:Iteration[5] Train-accuracy=0.673843 +INFO:root:Iteration[5] Time cost=21.056 +INFO:root:Iteration[5] Train-accuracy=0.675692 +INFO:root:Iteration[5] Time cost=21.120 +INFO:root:Iteration[5] Train-accuracy=0.678912 +INFO:root:Iteration[5] Time cost=21.721 
+INFO:root:Iteration[5] Validation-accuracy=0.657634 +INFO:root:Iteration[5] Validation-accuracy=0.677809 +INFO:root:Iteration[5] Validation-accuracy=0.715882 +INFO:root:Iteration[6] Train-accuracy=0.722149 +INFO:root:Iteration[6] Time cost=20.579 +INFO:root:Iteration[6] Train-accuracy=0.724833 +INFO:root:Iteration[6] Time cost=20.548 +INFO:root:Iteration[6] Train-accuracy=0.720241 +INFO:root:Iteration[6] Time cost=20.772 +INFO:root:Iteration[6] Validation-accuracy=0.692939 +INFO:root:Iteration[6] Validation-accuracy=0.714794 +INFO:root:Iteration[6] Validation-accuracy=0.748220 +INFO:root:Iteration[7] Train-accuracy=0.760854 +INFO:root:Iteration[7] Time cost=20.801 +INFO:root:Iteration[7] Train-accuracy=0.757276 +INFO:root:Iteration[7] Time cost=21.080 +INFO:root:Iteration[7] Validation-accuracy=0.735858 +INFO:root:Iteration[7] Train-accuracy=0.758767 +INFO:root:Iteration[7] Time cost=21.353 +INFO:root:Iteration[7] Validation-accuracy=0.737638 +INFO:root:Iteration[7] Validation-accuracy=0.774328 +INFO:root:Iteration[8] Train-accuracy=0.794967 +INFO:root:Iteration[8] Time cost=21.593 +INFO:root:Iteration[8] Train-accuracy=0.798485 +INFO:root:Iteration[8] Time cost=21.672 +INFO:root:Iteration[8] Validation-accuracy=0.762460 +INFO:root:Iteration[8] Train-accuracy=0.795503 +INFO:root:Iteration[8] Time cost=22.155 +INFO:root:Iteration[8] Validation-accuracy=0.745748 +INFO:root:Iteration[8] Validation-accuracy=0.784513 +INFO:root:Iteration[9] Train-accuracy=0.825561 +INFO:root:Iteration[9] Time cost=21.644 +INFO:root:Iteration[9] Train-accuracy=0.821923 +INFO:root:Iteration[9] Time cost=21.479 +INFO:root:Iteration[9] Validation-accuracy=0.727453 +INFO:root:Iteration[9] Validation-accuracy=0.745253 +INFO:root:Iteration[9] Train-accuracy=0.819716 +INFO:root:Iteration[9] Time cost=21.927 +INFO:root:Iteration[9] Validation-accuracy=0.781151 +INFO:root:Iteration[10] Train-accuracy=0.842975 +INFO:root:Iteration[10] Time cost=21.431 +INFO:root:Iteration[10] Train-accuracy=0.841543 +INFO:root:Iteration[10] Time cost=21.387 +INFO:root:Iteration[10] Validation-accuracy=0.768196 +INFO:root:Iteration[10] Validation-accuracy=0.781448 +INFO:root:Iteration[10] Train-accuracy=0.843989 +INFO:root:Iteration[10] Time cost=21.875 +INFO:root:Iteration[10] Validation-accuracy=0.804391 +INFO:root:Iteration[11] Train-accuracy=0.860329 +INFO:root:Iteration[11] Time cost=20.664 +INFO:root:Iteration[11] Train-accuracy=0.858958 +INFO:root:Iteration[11] Time cost=20.734 +INFO:root:Iteration[11] Validation-accuracy=0.780063 +INFO:root:Iteration[11] Validation-accuracy=0.774426 +INFO:root:Iteration[11] Train-accuracy=0.861104 +INFO:root:Iteration[11] Time cost=21.449 +INFO:root:Iteration[11] Validation-accuracy=0.818335 +INFO:root:Iteration[12] Train-accuracy=0.885973 +INFO:root:Iteration[12] Time cost=21.037 +INFO:root:Iteration[12] Train-accuracy=0.887583 +INFO:root:Iteration[12] Time cost=21.066 +INFO:root:Iteration[12] Validation-accuracy=0.798358 +INFO:root:Iteration[12] Validation-accuracy=0.803204 +INFO:root:Iteration[12] Train-accuracy=0.885914 +INFO:root:Iteration[12] Time cost=21.738 +INFO:root:Iteration[12] Validation-accuracy=0.812203 +INFO:root:Iteration[13] Train-accuracy=0.904103 +INFO:root:Iteration[13] Time cost=21.326 +INFO:root:Iteration[13] Train-accuracy=0.904282 +INFO:root:Iteration[13] Time cost=21.278 +INFO:root:Iteration[13] Validation-accuracy=0.791238 +INFO:root:Iteration[13] Validation-accuracy=0.799842 +INFO:root:Iteration[13] Train-accuracy=0.901002 +INFO:root:Iteration[13] Time cost=21.408 
+INFO:root:Iteration[13] Validation-accuracy=0.802116 +INFO:root:Iteration[14] Train-accuracy=0.911140 +INFO:root:Iteration[14] Time cost=21.527 +INFO:root:Iteration[14] Train-accuracy=0.913705 +INFO:root:Iteration[14] Time cost=21.569 +INFO:root:Iteration[14] Validation-accuracy=0.803204 +INFO:root:Iteration[14] Validation-accuracy=0.803303 +INFO:root:Iteration[14] Train-accuracy=0.914182 +INFO:root:Iteration[14] Time cost=22.170 +INFO:root:Iteration[14] Validation-accuracy=0.771460 +INFO:root:Iteration[15] Train-accuracy=0.915852 +INFO:root:Iteration[15] Time cost=21.608 +INFO:root:Iteration[15] Train-accuracy=0.911975 +INFO:root:Iteration[15] Time cost=21.623 +INFO:root:Iteration[15] Validation-accuracy=0.801325 +INFO:root:Iteration[15] Validation-accuracy=0.798259 +INFO:root:Iteration[15] Train-accuracy=0.923008 +INFO:root:Iteration[15] Time cost=21.806 +INFO:root:Iteration[15] Validation-accuracy=0.809335 +INFO:root:Iteration[16] Train-accuracy=0.938096 +INFO:root:Iteration[16] Time cost=21.857 +INFO:root:Iteration[16] Train-accuracy=0.944358 +INFO:root:Iteration[16] Time cost=21.954 +INFO:root:Iteration[16] Validation-accuracy=0.790249 +INFO:root:Iteration[16] Validation-accuracy=0.795095 +INFO:root:Iteration[16] Train-accuracy=0.947877 +INFO:root:Iteration[16] Time cost=21.844 +INFO:root:Iteration[16] Validation-accuracy=0.812797 +INFO:root:Iteration[17] Train-accuracy=0.953006 +INFO:root:Iteration[17] Time cost=21.357 +INFO:root:Iteration[17] Train-accuracy=0.957121 +INFO:root:Iteration[17] Time cost=21.431 +INFO:root:Iteration[17] Validation-accuracy=0.793908 +INFO:root:Iteration[17] Validation-accuracy=0.793216 +INFO:root:Iteration[17] Train-accuracy=0.962846 +INFO:root:Iteration[17] Time cost=21.819 +INFO:root:Iteration[17] Validation-accuracy=0.812994 +INFO:root:Iteration[18] Train-accuracy=0.961772 +INFO:root:Iteration[18] Time cost=20.599 +INFO:root:Iteration[18] Train-accuracy=0.963800 +INFO:root:Iteration[18] Time cost=20.569 +INFO:root:Iteration[18] Validation-accuracy=0.815467 +INFO:root:Iteration[18] Validation-accuracy=0.818829 +INFO:root:Iteration[18] Train-accuracy=0.966603 +INFO:root:Iteration[18] Time cost=21.018 +INFO:root:Iteration[18] Validation-accuracy=0.812698 +INFO:root:Iteration[19] Train-accuracy=0.975131 +INFO:root:Iteration[19] Time cost=20.671 +INFO:root:Iteration[19] Train-accuracy=0.975847 +INFO:root:Iteration[19] Time cost=20.758 +INFO:root:Iteration[19] Validation-accuracy=0.822785 +INFO:root:Iteration[19] Validation-accuracy=0.823378 +INFO:root:Iteration[19] Train-accuracy=0.981990 +INFO:root:Iteration[19] Time cost=20.912 +INFO:root:Accuracy = 0.823800 +INFO:root:Iteration[19] Validation-accuracy=0.828521 +INFO:root:Accuracy = 0.829200 +INFO:root:Accuracy = 0.833000 +``` + +## imagenet + +3 x dual 980, with cudnn, 1G ethernet + +`dist_sync`: + +``` +INFO:root:Start training with [gpu(0), gpu(1)] +INFO:root:Start training with [gpu(0), gpu(1)] +INFO:root:Start training with [gpu(0), gpu(1)] +INFO:root:Iter[0] Batch [5] Speed: 175.98 samples/sec +INFO:root:Iter[0] Batch [5] Speed: 173.52 samples/sec +INFO:root:Iter[0] Batch [5] Speed: 171.04 samples/sec +INFO:root:Iter[0] Batch [10] Speed: 107.82 samples/sec +INFO:root:Iter[0] Batch [10] Speed: 108.03 samples/sec +INFO:root:Iter[0] Batch [10] Speed: 107.79 samples/sec +INFO:root:Iter[0] Batch [15] Speed: 109.53 samples/sec +INFO:root:Iter[0] Batch [15] Speed: 109.74 samples/sec +INFO:root:Iter[0] Batch [15] Speed: 110.21 samples/sec +INFO:root:Iter[0] Batch [20] Speed: 113.19 samples/sec 
+INFO:root:Iter[0] Batch [20] Speed: 111.20 samples/sec +INFO:root:Iter[0] Batch [20] Speed: 110.38 samples/sec +INFO:root:Iter[0] Batch [25] Speed: 111.24 samples/sec +INFO:root:Iter[0] Batch [25] Speed: 109.90 samples/sec +INFO:root:Iter[0] Batch [25] Speed: 107.48 samples/sec +``` + +`dist_aync` + +``` +INFO:root:Iter[0] Batch [5] Speed: 202.15 samples/sec +INFO:root:Iter[0] Batch [5] Speed: 181.41 samples/sec +INFO:root:Iter[0] Batch [5] Speed: 179.61 samples/sec +INFO:root:Iter[0] Batch [10] Speed: 125.75 samples/sec +INFO:root:Iter[0] Batch [10] Speed: 108.90 samples/sec +INFO:root:Iter[0] Batch [10] Speed: 109.25 samples/sec +INFO:root:Iter[0] Batch [15] Speed: 118.44 samples/sec +INFO:root:Iter[0] Batch [15] Speed: 112.89 samples/sec +INFO:root:Iter[0] Batch [15] Speed: 112.83 samples/sec +INFO:root:Iter[0] Batch [20] Speed: 123.68 samples/sec +INFO:root:Iter[0] Batch [20] Speed: 115.85 samples/sec +INFO:root:Iter[0] Batch [20] Speed: 105.82 samples/sec +INFO:root:Iter[0] Batch [25] Speed: 124.24 samples/sec +INFO:root:Iter[0] Batch [25] Speed: 115.21 samples/sec +INFO:root:Iter[0] Batch [25] Speed: 106.60 samples/sec +INFO:root:Iter[0] Batch [30] Speed: 120.62 samples/sec +INFO:root:Iter[0] Batch [30] Speed: 121.35 samples/sec +``` diff --git a/tests/python/multi-node/common.py b/tests/python/multi-node/common.py index 2d33a32c7145..0db092462a78 100644 --- a/tests/python/multi-node/common.py +++ b/tests/python/multi-node/common.py @@ -58,10 +58,11 @@ def cifar10(batch_size, input_shape, num_parts=1, part_index=0): rand_mirror = False, shuffle = False, round_batch = False, - data_shape = (3,28,28), + data_shape = input_shape, batch_size = batch_size) return (train, val) + def accuracy(model, data): """evaluate acc""" # predict diff --git a/tests/python/multi-node/dist_async_inception.py b/tests/python/multi-node/dist_async_inception.py new file mode 100755 index 000000000000..cb7fd656471f --- /dev/null +++ b/tests/python/multi-node/dist_async_inception.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +# pylint: skip-file +import mxnet as mx +import common +import logging + +mx.random.seed(0) +logging.basicConfig(level=logging.DEBUG) + +kv = mx.kvstore.create('dist_async') + +(train, val) = common.cifar10(num_parts = kv.num_workers, + part_index = kv.rank, + batch_size = 128, + input_shape=(3,28,28)) + +# assume each worker has two gpus +devs = [mx.gpu(i) for i in range(2)] +model = mx.model.FeedForward.create( + ctx = devs, + kvstore = kv, + symbol = common.inception(), + X = train, + eval_data = val, + num_round = 20, + learning_rate = 0.05, + momentum = 0.9, + wd = 0.00001, + initializer = mx.init.Uniform(0.07)) + +common.accuracy(model, val) diff --git a/tests/python/multi-node/dist_async_lenet.py b/tests/python/multi-node/dist_async_lenet.py new file mode 100755 index 000000000000..866eed3b8f2a --- /dev/null +++ b/tests/python/multi-node/dist_async_lenet.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +import mxnet as mx +import logging +import common + +mx.random.seed(0) +logging.basicConfig(level=logging.DEBUG) + +kv = mx.kvstore.create('dist_async') + +# feed each machine the whole data +(train, val) = common.mnist(num_parts = kv.num_workers, + part_index = kv.rank, + batch_size = 100, + input_shape = (1,28,28)) + +model = mx.model.FeedForward.create( + ctx = mx.gpu(kv.rank), + kvstore = kv, + symbol = common.lenet(), + X = train, + num_round = 10, + learning_rate = 0.05, + momentum = 0.9, + wd = 0.00001) + +common.accuracy(model, val) diff --git 
a/tests/python/multi-node/dist_imagenet_inception.py b/tests/python/multi-node/dist_imagenet_inception.py new file mode 100755 index 000000000000..978b821f8fa6 --- /dev/null +++ b/tests/python/multi-node/dist_imagenet_inception.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +import mxnet as mx +import logging +import imagenet + +logging.basicConfig(level=logging.DEBUG) + +kv = mx.kvstore.create('dist_sync') + +batch_size = 96 +(train, val) = imagenet.ilsvrc12(num_parts = kv.num_workers, + part_index = kv.rank, + batch_size = batch_size, + input_shape = (3, 224, 224)) + +# assume each worker has two gpus +devs = [mx.gpu(i) for i in range(2)] + +model = mx.model.FeedForward( + ctx = devs, + symbol = imagenet.inception(1000), + num_round = 20, + learning_rate = 0.05, + momentum = 0.9, + wd = 0.00001) + +model.fit(X = train, + eval_data = val, + kvstore = kv, + epoch_end_callback = mx.callback.Speedometer(batch_size, 5)) diff --git a/tests/python/multi-node/imagenet.py b/tests/python/multi-node/imagenet.py new file mode 100644 index 000000000000..7663df8d1bad --- /dev/null +++ b/tests/python/multi-node/imagenet.py @@ -0,0 +1,101 @@ +import sys +sys.path.insert(0, "../common/") +sys.path.insert(0, "../../python/") +import mxnet as mx +import get_data +import numpy as np +import logging + +def ilsvrc12(batch_size, input_shape, num_parts=1, part_index=0): + """return ilsvrc12 iterator + """ + data_dir = "../../../../ilsvrc12/" + train = mx.io.ImageRecordIter( + path_imgrec = data_dir + "train.rec", + mean_img = data_dir + "mean.bin", + data_shape = input_shape, + batch_size = batch_size, + rand_crop = True, + rand_mirror = True, + shuffle = True, + round_batch = True, + num_parts = num_parts, + part_index = part_index) + val = mx.io.ImageRecordIter( + path_imgrec = data_dir + "val.rec", + mean_img = data_dir + "mean.bin", + rand_crop = False, + rand_mirror = False, + shuffle = False, + round_batch = False, + data_shape = input_shape, + batch_size = batch_size) + return (train, val) + +def ConvFactory(data, num_filter, kernel, stride=(1,1), pad=(0, 0), name=None, suffix=''): + conv = mx.symbol.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, name='conv_%s%s' %(name, suffix)) + bn = mx.symbol.BatchNorm(data=conv, name='bn_%s%s' %(name, suffix)) + act = mx.symbol.Activation(data=bn, act_type='relu', name='relu_%s%s' %(name, suffix)) + return act + +def InceptionFactoryA(data, num_1x1, num_3x3red, num_3x3, num_d3x3red, num_d3x3, pool, proj, name): + # 1x1 + c1x1 = ConvFactory(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_1x1' % name)) + # 3x3 reduce + 3x3 + c3x3r = ConvFactory(data=data, num_filter=num_3x3red, kernel=(1, 1), name=('%s_3x3' % name), suffix='_reduce') + c3x3 = ConvFactory(data=c3x3r, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_3x3' % name)) + # double 3x3 reduce + double 3x3 + cd3x3r = ConvFactory(data=data, num_filter=num_d3x3red, kernel=(1, 1), name=('%s_double_3x3' % name), suffix='_reduce') + cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), name=('%s_double_3x3_0' % name)) + cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), name=('%s_double_3x3_1' % name)) + # pool + proj + pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) + cproj = ConvFactory(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_proj' % name)) + # concat + concat = mx.symbol.Concat(*[c1x1, c3x3, cd3x3, cproj], 
name='ch_concat_%s_chconcat' % name) + return concat + +def InceptionFactoryB(data, num_3x3red, num_3x3, num_d3x3red, num_d3x3, name): + # 3x3 reduce + 3x3 + c3x3r = ConvFactory(data=data, num_filter=num_3x3red, kernel=(1, 1), name=('%s_3x3' % name), suffix='_reduce') + c3x3 = ConvFactory(data=c3x3r, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_3x3' % name)) + # double 3x3 reduce + double 3x3 + cd3x3r = ConvFactory(data=data, num_filter=num_d3x3red, kernel=(1, 1), name=('%s_double_3x3' % name), suffix='_reduce') + cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_double_3x3_0' % name)) + cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_double_3x3_1' % name)) + # pool + proj + pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type="max", name=('max_pool_%s_pool' % name)) + # concat + concat = mx.symbol.Concat(*[c3x3, cd3x3, pooling], name='ch_concat_%s_chconcat' % name) + return concat + +def inception(nhidden): + # data + data = mx.symbol.Variable(name="data") + # stage 1 + conv1 = ConvFactory(data=data, num_filter=64, kernel=(7, 7), stride=(2, 2), pad=(3, 3), name='conv1') + pool1 = mx.symbol.Pooling(data=conv1, kernel=(3, 3), stride=(2, 2), name='pool1', pool_type='max') + # stage 2 + conv2red = ConvFactory(data=pool1, num_filter=64, kernel=(1, 1), stride=(1, 1), name='conv2red') + conv2 = ConvFactory(data=conv2red, num_filter=192, kernel=(3, 3), stride=(1, 1), pad=(1, 1), name='conv2') + pool2 = mx.symbol.Pooling(data=conv2, kernel=(3, 3), stride=(2, 2), name='pool2', pool_type='max') + # stage 2 + in3a = InceptionFactoryA(pool2, 64, 64, 64, 64, 96, "avg", 32, '3a') + in3b = InceptionFactoryA(in3a, 64, 64, 96, 64, 96, "avg", 64, '3b') + in3c = InceptionFactoryB(in3b, 128, 160, 64, 96, '3c') + # stage 3 + in4a = InceptionFactoryA(in3c, 224, 64, 96, 96, 128, "avg", 128, '4a') + in4b = InceptionFactoryA(in4a, 192, 96, 128, 96, 128, "avg", 128, '4b') + in4c = InceptionFactoryA(in4b, 160, 128, 160, 128, 160, "avg", 128, '4c') + in4d = InceptionFactoryA(in4c, 96, 128, 192, 160, 192, "avg", 128, '4d') + in4e = InceptionFactoryB(in4d, 128, 192, 192, 256, '4e') + # stage 4 + in5a = InceptionFactoryA(in4e, 352, 192, 320, 160, 224, "avg", 128, '5a') + in5b = InceptionFactoryA(in5a, 352, 192, 320, 192, 224, "max", 128, '5b') + # global avg pooling + avg = mx.symbol.Pooling(data=in5b, kernel=(7, 7), stride=(1, 1), name="global_pool", pool_type='avg') + # linear classifier + flatten = mx.symbol.Flatten(data=avg, name='flatten') + fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=nhidden, name='fc1') + softmax = mx.symbol.Softmax(data=fc1, name='softmax') + return softmax diff --git a/tests/python/multi-node/local_inception.py b/tests/python/multi-node/local_inception.py index 5c5fad3c4da1..fcaa8dc79688 100755 --- a/tests/python/multi-node/local_inception.py +++ b/tests/python/multi-node/local_inception.py @@ -15,8 +15,9 @@ def test_inception(devs, kv_type): ctx = devs, symbol = common.inception(), X = train, + eval_data = val, kvstore = kv_type, - num_round = 4, + num_round = 10, learning_rate = 0.1, momentum = 0.9, wd = 0.00001, From 2ce65edf642804a1abcaaa255a33957a5ccd8ac1 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 21 Oct 2015 12:41:03 -0700 Subject: [PATCH 059/122] [OP, Refactor] Enable register mshadow unary op in only one line --- src/common/tblob_op_registry.cc | 132 +++++++++++++++++++++++++++++ 
src/common/tblob_op_registry.h | 101 ++++++++++++++++++++++ src/ndarray/ndarray.cc | 53 ------------ src/ndarray/ndarray_function-inl.h | 19 ----- src/ndarray/ndarray_function.h | 14 --- src/ndarray/unary_function-inl.h | 46 ++++++++++ src/ndarray/unary_function.cc | 7 ++ src/ndarray/unary_function.cu | 8 ++ 8 files changed, 294 insertions(+), 86 deletions(-) create mode 100644 src/common/tblob_op_registry.cc create mode 100644 src/common/tblob_op_registry.h create mode 100644 src/ndarray/unary_function-inl.h create mode 100644 src/ndarray/unary_function.cc create mode 100644 src/ndarray/unary_function.cu diff --git a/src/common/tblob_op_registry.cc b/src/common/tblob_op_registry.cc new file mode 100644 index 000000000000..e205f29cc42c --- /dev/null +++ b/src/common/tblob_op_registry.cc @@ -0,0 +1,132 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file tblob_op_registry.cc + * Implementation of tblob op registry + */ +#include +#include +#include +#include +#include "./tblob_op_registry.h" + +namespace mxnet { +namespace common { + + +class TBlobOpRegEntryImpl : public TBlobOpRegEntry { + public: + TSelf& set_function(int dev_mask, UnaryFunction funary) override { + std::lock_guard lock(mutex_); + ++reg_counter_; + if (funary_.size() <= static_cast(dev_mask)) { + funary_.resize(dev_mask + 1, nullptr); + } + if (funary_[dev_mask] != nullptr) { + LOG(FATAL) << "Device function " << this->name + << " already registerd for device " << dev_mask; + } + funary_[dev_mask] = funary; + // return if it is already registered. + if (reg_counter_ != 1) return *this; + + // The body to be registered + auto body = [this] (NDArray **used_vars, + real_t *s, + NDArray **mutate_vars) { + NDArray src = *used_vars[0]; + NDArray *out = mutate_vars[0]; + + if (out->is_none()) { + *out = NDArray(src.shape(), src.ctx(), true); + } else { + CHECK(out->ctx() == src.ctx()) << "target context mismatch"; + CHECK(out->shape() == src.shape()) << "target shape mismatch"; + } + // important: callback must always capture by value + NDArray ret = *out; + // get the const variables + std::vector const_vars; + if (src.var() != ret.var()) const_vars.push_back(src.var()); + // check if the function exist + int dev_mask = src.ctx().dev_mask(); + if (static_cast(dev_mask) >= funary_.size() || + funary_[dev_mask] == nullptr) { + if (dev_mask == gpu::kDevMask) LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; + LOG(FATAL) << "Function " << this->name << "not registered for device " << dev_mask; + } + // invoke the function + UnaryFunction fun = funary_[dev_mask]; + Engine::Get()->PushSync([src, ret, fun, dev_mask](RunContext ctx) { + ret.CheckAndAlloc(); + TBlob tmp = ret.data(); + (*fun)(src.data(), &tmp, ctx); +#if MXNET_USE_CUDA + if (dev_mask == gpu::kDevMask) { + ctx.get_stream()->Wait(); + } +#endif + }, src.ctx(), const_vars, {ret.var()}); + }; + // register the function. 
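+  // The chain below declares exactly one input NDArray and one output
+  // NDArray; kAcceptEmptyMutateTarget lets the caller pass an
+  // uninitialized output, which the body above allocates on demand.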
+ NDArrayReg() + .set_body(body) + .set_num_use_vars(1) + .set_num_mutate_vars(1) + .set_type_mask(kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget) + .add_argument("src", "NDArray", "Source input to the function"); + return *this; + } + + TSelf& describe(const std::string &description) override { + std::lock_guard lock(mutex_); + if (reg_counter_ != 1) return *this; + NDArrayReg().describe(description); + return *this; + } + + GenericTBlobOp *GetOp() const override { + return nullptr; + } + + private: + // internal mutex + std::mutex mutex_; + // unary functions on each device mask + std::vector funary_; + // registration counter + int reg_counter_{0}; + // NDArray registry + NDArrayFunctionReg *ndarray_reg_{nullptr}; + // internal function to register NDArray function. + inline NDArrayFunctionReg &NDArrayReg() { + if (ndarray_reg_ == nullptr) { + NDArrayFunctionReg ® = + ::dmlc::Registry::Get()->__REGISTER__(this->name); + ndarray_reg_ = ® + } + return *ndarray_reg_; + } +}; + + +TBlobOpRegEntry& TBlobOpRegistry::__REGISTER_OR_FIND__(const std::string &name) { + if (fmap_.count(name) != 0) return *fmap_.at(name); + TBlobOpRegEntry *e = new TBlobOpRegEntryImpl(); + e->name = name; + fmap_[name] = e; + return *e; +} + +TBlobOpRegistry* TBlobOpRegistry::Get() { + static TBlobOpRegistry inst; + return &inst; +} + +TBlobOpRegistry::~TBlobOpRegistry() { + for (auto kv : fmap_) { + delete kv.second; + } +} + +} // namespace common +} // namespace mxnet diff --git a/src/common/tblob_op_registry.h b/src/common/tblob_op_registry.h new file mode 100644 index 000000000000..910543efacb3 --- /dev/null +++ b/src/common/tblob_op_registry.h @@ -0,0 +1,101 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file tblob_op_registry.h + * \brief Helper registry to make registration of simple unary binary math function easy. + * Register to this registry will enable both symbolic operator and NDArray operator in client. + * + * More complicated operators can be registered in normal way in ndarray and operator modules. + */ +#ifndef MXNET_COMMON_TBLOB_OP_REGISTRY_H_ +#define MXNET_COMMON_TBLOB_OP_REGISTRY_H_ + +#include +#include +#include +#include +#include + +namespace mxnet { +namespace common { + +/*! \brief pre-declare generic TBlob function*/ +struct GenericTBlobOp; + +/*! \brief registry for function entry */ +class TBlobOpRegEntry { + public: + /*! \brief unary tblob function */ + typedef void (*UnaryFunction)(const TBlob &src, + TBlob *ret, + RunContext ctx); + /*! \brief declare self type */ + typedef TBlobOpRegEntry TSelf; + /*! \brief name of the entry */ + std::string name; + /*! + * \brief set function of the function to be funary + * \param dev_mask The device mask of the function can act on. + * \param funary The unary function that peforms the operation. + */ + virtual TSelf& set_function(int dev_mask, UnaryFunction funary) = 0; + /*! + * \brief Describe the function. + * \param description The description of the function. + * \return reference to self. + */ + virtual TSelf& describe(const std::string &description) = 0; + /*! + * \brief get the internal function representation + * \return the internal function representation. + */ + virtual GenericTBlobOp *GetOp() const = 0; + /*! \brief destructor */ + virtual ~TBlobOpRegEntry() {} +}; + +/*! \brief registry for TBlob functions */ +class TBlobOpRegistry { + public: + /*! + * \brief Internal function to register a name function under name. 
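+   * If an entry is already registered under this name, the existing entry is returned instead of creating a duplicate.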
+ * \param name name of the function + * \return ref to the registered entry, used to set properties + */ + TBlobOpRegEntry &__REGISTER_OR_FIND__(const std::string& name); + /*! + * \brief Find the entry with corresponding name. + * \param name name of the function + * \return the corresponding function, can be NULL + */ + inline static const TBlobOpRegEntry *Find(const std::string &name) { + return Get()->fmap_.at(name); + } + /*! \return global singleton of the registry */ + static TBlobOpRegistry* Get(); + + private: + // destructor + ~TBlobOpRegistry(); + /*! \brief internal registry map */ + std::map fmap_; +}; + +#if DMLC_USE_CXX11 +struct GenericTBlobOp { + /*! \brief function type of the function */ + typedef std::function &in, + TBlob *out, + RunContext ctx)> OpType; + /*! \brief the real operator */ + OpType op; +}; +#endif + +#define MXNET_REGISTER_TBLOB_FUN(Name, DEV) \ + static ::mxnet::common::TBlobOpRegEntry & \ + __make_ ## TBlobOpRegEntry ## _ ## Name ## __ ## DEV ##__ = \ + ::mxnet::common::TBlobOpRegistry::Get()->__REGISTER_OR_FIND__(#Name) + +} // namespace common +} // namespace mxnet +#endif // MXNET_COMMON_TBLOB_OP_REGISTRY_H_ diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 74ac76c00f66..26a62fb60264 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -155,53 +155,6 @@ void ScalarOp(const NDArray &lhs, } } -/*! - * \brief run a unary operation. - * \param src source operand - * \param out the output ndarray - * \param unary_op the real - */ -template -void UnaryOp(const NDArray &src, - NDArray *out) { - if (out->is_none()) { - *out = NDArray(OP::GetShape(src.shape()), src.ctx(), true); - } else { - CHECK(out->ctx() == src.ctx()) << "target context mismatch"; - CHECK(out->shape() == OP::GetShape(src.shape())) << "target shape mismatch"; - } - // important: callback must always capture by value - NDArray ret = *out; - // get the const variables - std::vector const_vars; - if (src.var() != ret.var()) const_vars.push_back(src.var()); - - // redirect everything to mshadow operations - switch (src.ctx().dev_mask()) { - case cpu::kDevMask: { - Engine::Get()->PushSync([src, ret](RunContext ctx) { - ret.CheckAndAlloc(); - TBlob tmp = ret.data(); - ndarray::Eval(src.data(), &tmp, ctx); - }, src.ctx(), const_vars, {ret.var()}); - break; - } -#if MXNET_USE_CUDA - case gpu::kDevMask: { - Engine::Get()->PushSync([src, ret](RunContext ctx) { - ret.CheckAndAlloc(); - TBlob tmp = ret.data(); - ndarray::Eval(src.data(), &tmp, ctx); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); - }, src.ctx(), const_vars, {ret.var()}); - break; - } -#endif - default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; - } -} - void CopyFromTo(const NDArray &from, NDArray *to, int priority) { CHECK(from.shape() == to->shape()) << "operands shape mismatch"; @@ -649,12 +602,6 @@ void NDArray::SyncCopyToCPU(real_t *data, size_t size) const { MXNET_REGISTER_NDARRAY_FUN(_set_value).set_function(SetValueOp); -MXNET_REGISTER_NDARRAY_FUN(square).set_function(UnaryOp) -.describe("Take square of the src"); - -MXNET_REGISTER_NDARRAY_FUN(sqrt).set_function(UnaryOp) -.describe("Take square root of the src"); - MXNET_REGISTER_NDARRAY_FUN(_plus).set_function(BinaryOp); MXNET_REGISTER_NDARRAY_FUN(_minus).set_function(BinaryOp); MXNET_REGISTER_NDARRAY_FUN(_mul).set_function(BinaryOp); diff --git a/src/ndarray/ndarray_function-inl.h b/src/ndarray/ndarray_function-inl.h index 8b5bfc72bcc1..20f9eb8c65a0 100644 --- a/src/ndarray/ndarray_function-inl.h +++ 
b/src/ndarray/ndarray_function-inl.h @@ -18,14 +18,6 @@ } #endif -#ifndef DECL_UNARY -#define DECL_UNARY(XPU, OP, FUN) \ - template<> \ - void Eval(const TBlob &src, TBlob *ret, RunContext ctx) { \ - FUN(src, ret, ctx); \ - } -#endif - #ifndef DECL_SCALAR #define DECL_SCALAR(XPU, OP, FUN, REVERSE) \ template<> \ @@ -53,15 +45,6 @@ inline void EvalBinary_(const TBlob &lhs, const TBlob &rhs, rhs.FlatTo2D(s)); } -template -inline void EvalUnary_(const TBlob &src, - TBlob *ret, RunContext ctx) { - using namespace mshadow::expr; - mshadow::Stream *s = ctx.get_stream(); - ret->FlatTo2D(s) - = F(src.FlatTo2D(s)); -} - template inline void EvalDot_(const TBlob &lhs, const TBlob &rhs, TBlob *ret, RunContext ctx) { @@ -197,8 +180,6 @@ void ElementwiseSum(const std::vector source, } // declarations -DECL_UNARY(DEVICE, Square, EvalUnary_) -DECL_UNARY(DEVICE, SquareRoot, EvalUnary_) DECL_BINARY(DEVICE, MatChooseRowElem, EvalMatChooseRowElem_) DECL_BINARY(DEVICE, Dot, EvalDot_) DECL_BINARY(DEVICE, OneHotEncode, EvalOneHot_) diff --git a/src/ndarray/ndarray_function.h b/src/ndarray/ndarray_function.h index 1263f39e5998..9f23c1a5c348 100644 --- a/src/ndarray/ndarray_function.h +++ b/src/ndarray/ndarray_function.h @@ -24,12 +24,6 @@ struct BinaryBase { } }; -struct UnaryBase { - inline static TShape GetShape(const TShape &shape) { - return shape; - } -}; - // operators struct Plus : public BinaryBase { typedef mshadow::op::plus mshadow_op; @@ -47,14 +41,6 @@ struct Div : public BinaryBase { typedef mshadow::op::div mshadow_op; }; -struct Square : public UnaryBase { - typedef op::mshadow_op::square mshadow_op; -}; - -struct SquareRoot : public UnaryBase { - typedef op::mshadow_op::square_root mshadow_op; -}; - struct ClipMin : public BinaryBase { struct mshadow_op { MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { diff --git a/src/ndarray/unary_function-inl.h b/src/ndarray/unary_function-inl.h new file mode 100644 index 000000000000..7832ce1798cd --- /dev/null +++ b/src/ndarray/unary_function-inl.h @@ -0,0 +1,46 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file unary-function-inl.h + * \brief the real execution functions of ndarray operations + */ +#ifndef MXNET_NDARRAY_UNARY_FUNCTION_INL_H_ +#define MXNET_NDARRAY_UNARY_FUNCTION_INL_H_ + +#include "../common/tblob_op_registry.h" +#include "../operator/mshadow_op.h" + +#if defined(__CUDACC__) +#define DEVICE gpu +#else +#define DEVICE cpu +#endif + +namespace mxnet { +namespace ndarray { + +template +void EvalUnary_(const TBlob &src, + TBlob *ret, RunContext ctx) { + using namespace mshadow::expr; + mshadow::Stream *s = ctx.get_stream(); + ret->FlatTo2D(s) + = F(src.FlatTo2D(s)); +} + +// helper macro to register mshadow element-wise unary opts +// usually you only need to use this to register common operations +#define REGISTER_MSHADOW_UNARY(Name, Op) \ + MXNET_REGISTER_TBLOB_FUN(Name, DEVICE) \ + .set_function(DEVICE::kDevMask, EvalUnary_) + + +// register all unary operations here +REGISTER_MSHADOW_UNARY(square, op::mshadow_op::square) +.describe("Take square of the src"); + +REGISTER_MSHADOW_UNARY(sqrt, op::mshadow_op::square_root) +.describe("Take square root of the src"); + +} // namespace ndarray +} // namespace mxnet +#endif // MXNET_NDARRAY_UNARY_FUNCTION_INL_H_ diff --git a/src/ndarray/unary_function.cc b/src/ndarray/unary_function.cc new file mode 100644 index 000000000000..f77f113e611e --- /dev/null +++ b/src/ndarray/unary_function.cc @@ -0,0 +1,7 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file unary_function.cc + * \brief CPU Implementation of unary function. + */ +// this will be invoked by gcc and compile CPU version +#include "./unary_function-inl.h" diff --git a/src/ndarray/unary_function.cu b/src/ndarray/unary_function.cu new file mode 100644 index 000000000000..0c0d4e64957c --- /dev/null +++ b/src/ndarray/unary_function.cu @@ -0,0 +1,8 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file unary_function.cu + * \brief GPU Implementation of unary function. + */ +// this will be invoked by nvcc and compile GPU version +// real common implementation is only in the -inl.h file. +#include "./unary_function-inl.h" From 0a6e7616bb903c1d79813acbd782a8de213df62f Mon Sep 17 00:00:00 2001 From: piiswrong Date: Thu, 22 Oct 2015 10:03:15 -0700 Subject: [PATCH 060/122] required interface of optimizer should be present in the base class to avoid confusion --- python/mxnet/optimizer.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index 8c5b54178f31..9fa580d6956c 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -4,7 +4,7 @@ class Optimizer(object): """Base class of all optimizers.""" - def __init__(self): + def __init__(self, rescale_grad=1): self.iteration = 0 def begin_round(self, iteration): @@ -17,6 +17,13 @@ def begin_round(self, iteration): """ self.iteration = iteration + def create_state(self, index, weight): + """Create additional optimizer state such as momentum. + override in implementations.""" + + def update(self, index, weight, grad, state): + """Update the parameters. override in implementations""" + class SGD(Optimizer): """A very simple SGD optimizer with momentum and weight regularization. @@ -41,7 +48,7 @@ class SGD(Optimizer): def __init__(self, learning_rate=0.01, momentum=0.0, wd=0.0001, rescale_grad=1, clip_gradient=None, lr_scheduler=None): - super(SGD, self).__init__() + super(SGD, self).__init__(rescale_grad) self.lr = learning_rate self.momentum = momentum self.wd = wd From e1d4b6e45d5eff3ac6acf339e9f43a67c40552e1 Mon Sep 17 00:00:00 2001 From: piiswrong Date: Thu, 22 Oct 2015 10:34:52 -0700 Subject: [PATCH 061/122] Use factory pattern to create optimizers. Creator tracks all (direct and indirect) subclasses of Optimizer by class name. We currently silently allow overriding existing names, but maybe giving a warning is better? --- python/mxnet/optimizer.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index 9fa580d6956c..1c708c533ca0 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -4,8 +4,21 @@ class Optimizer(object): """Base class of all optimizers.""" + class __metaclass__(type): + """Meta class for tracking all subclasses (implementations) + of Optimizer.""" + __optimizers__ = {} + + def __new__(meta, name, bases, attrs): + cls = type.__new__(meta, name, bases, attrs) + #Allow overriding of existing optimizer. + #Always keep the last one. + meta.__optimizers__[cls.__name__] = cls + return cls + def __init__(self, rescale_grad=1): self.iteration = 0 + self.rescale_grad = rescale_grad def begin_round(self, iteration): """Function called to notify beginning of iteration. 
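The subclass-tracking mechanism this patch introduces can be reproduced in a minimal standalone sketch (illustrative names only, not MXNet API; the metaclass is invoked explicitly so it behaves the same on Python 2 and Python 3, which is exactly the grammar difference a later patch in this series works around):

```python
class RegistryMeta(type):
    """Record every class created through this metaclass, keyed by class name."""
    registry = {}

    def __new__(meta, name, bases, attrs):
        cls = type.__new__(meta, name, bases, attrs)
        # Later definitions silently override earlier ones, as in the patch.
        RegistryMeta.registry[name] = cls
        return cls

# Equivalent to `__metaclass__ = RegistryMeta` inside a Python 2 class body,
# written explicitly so the sketch also runs on Python 3.
Base = RegistryMeta('Base', (object,), {})

class SGDLike(Base):  # subclasses inherit the metaclass, so they self-register
    pass

assert RegistryMeta.registry['SGDLike'] is SGDLike
```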
@@ -52,7 +65,6 @@ def __init__(self, learning_rate=0.01, momentum=0.0, self.lr = learning_rate self.momentum = momentum self.wd = wd - self.rescale_grad = rescale_grad self.clip_gradient = clip_gradient self.lr_scheduler = lr_scheduler if lr_scheduler != None: @@ -112,12 +124,11 @@ def update(self, index, weight, grad, state): weight[:] += -lr * (grad * self.rescale_grad + self.wd * weight) -class Test(object): +class Test(Optimizer): """For test use""" def __init__(self, rescale_grad=1): - self.rescale_grad = rescale_grad - - + super(Test, self).__init__(rescale_grad) + # pylint: disable=no-self-use def create_state(self, index, weight): """Create a state to duplicate weight""" @@ -147,10 +158,16 @@ def create(name, rescale_grad=1, **kwargs): opt : Optimizer The result optimizer. """ + #TODO(eric): kept for backward compatibility. + # remove after all downstream functions move to + # new naming standard. if name == 'sgd' or name == 'SGD': return SGD(rescale_grad=rescale_grad, **kwargs) if name == 'test': return Test(rescale_grad=rescale_grad) + + if name in Optimizer.__optimizers__: + return Optimizer.__optimizers__[name](rescale_grad=rescale_grad, **kwargs) else: raise ValueError('Cannot find optimizer %s' % name) From 2abcd07d0d83a80d887b4ad6754a1e5d398f2754 Mon Sep 17 00:00:00 2001 From: Chiyuan Zhang Date: Wed, 21 Oct 2015 19:43:19 -0400 Subject: [PATCH 062/122] fix typo --- example/rnn/lstm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/rnn/lstm.py b/example/rnn/lstm.py index 996861a80894..25245aad18ee 100644 --- a/example/rnn/lstm.py +++ b/example/rnn/lstm.py @@ -17,7 +17,7 @@ def lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0.): """LSTM Cell symbol""" if dropout > 0.: - in_data = mx.sym.Dropout(data=in_data, p=dropout) + indata = mx.sym.Dropout(data=indata, p=dropout) i2h = mx.sym.FullyConnected(data=indata, weight=param.i2h_weight, bias=param.i2h_bias, From 30e995b7a62493aa01184a7b2f574c4940c9390b Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Wed, 21 Oct 2015 21:19:11 -0600 Subject: [PATCH 063/122] Update char_lstm.ipynb Due to my typo in https://github.com/dmlc/mxnet/pull/353 This notebook doesn't train with dropout --- example/rnn/char_lstm.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/rnn/char_lstm.ipynb b/example/rnn/char_lstm.ipynb index cac585d05a92..72ba3f18dc41 100644 --- a/example/rnn/char_lstm.ipynb +++ b/example/rnn/char_lstm.ipynb @@ -195,7 +195,7 @@ " batch_size=batch_size,\n", " input_size=vocab,\n", " initializer=mx.initializer.Uniform(0.1),\n", - " dropout=0.5)\n" + " dropout=0.)\n" ] }, { From a030aefbc1b64d34362a42d0e9cf3cffb8f20541 Mon Sep 17 00:00:00 2001 From: Chuntao Hong Date: Thu, 22 Oct 2015 10:42:16 +0800 Subject: [PATCH 064/122] fix windows linkage problem by adding dllexport --- include/mxnet/base.h | 13 +++++++++++++ include/mxnet/engine.h | 2 +- include/mxnet/storage.h | 2 +- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/mxnet/base.h b/include/mxnet/base.h index 1ef9c6bf8450..b3ee2242d182 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -49,6 +49,19 @@ #endif #endif +/*! +* \brief define dllexport for Visual Studio +*/ +#ifdef _MSC_VER +#ifdef MXNET_EXPORTS +#define MXAPI __declspec(dllexport) +#else +#define MXAPI __declspec(dllimport) +#endif +#else +#define MXAPI +#endif + /*! \brief namespace of mxnet */ namespace mxnet { /*! 
\brief mxnet cpu */ diff --git a/include/mxnet/engine.h b/include/mxnet/engine.h index 03eb45b54de0..9b879ef3b4c2 100644 --- a/include/mxnet/engine.h +++ b/include/mxnet/engine.h @@ -45,7 +45,7 @@ enum class FnProperty { /*! * \brief Dependency engine that schedules operations. */ -class Engine { +class MXAPI Engine { public: /*! * \brief OnComplete Callback to the engine, diff --git a/include/mxnet/storage.h b/include/mxnet/storage.h index da7a8aaa5388..743b4e8b0514 100644 --- a/include/mxnet/storage.h +++ b/include/mxnet/storage.h @@ -14,7 +14,7 @@ namespace mxnet { /*! * \brief Storage manager across multiple devices. */ -class Storage { + class MXAPI Storage { public: /*! * \brief Storage handle. From 18adb83fc33bd52fe20326c4fa2dcd324cfd9421 Mon Sep 17 00:00:00 2001 From: Chuntao Hong Date: Thu, 22 Oct 2015 11:27:35 +0800 Subject: [PATCH 065/122] change MXAPI to MXNET_API --- include/mxnet/base.h | 4 ++-- include/mxnet/engine.h | 2 +- include/mxnet/storage.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/mxnet/base.h b/include/mxnet/base.h index b3ee2242d182..962740b4194d 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -54,9 +54,9 @@ */ #ifdef _MSC_VER #ifdef MXNET_EXPORTS -#define MXAPI __declspec(dllexport) +#define MXNET_API __declspec(dllexport) #else -#define MXAPI __declspec(dllimport) +#define MXNET_API __declspec(dllimport) #endif #else #define MXAPI diff --git a/include/mxnet/engine.h b/include/mxnet/engine.h index 9b879ef3b4c2..195f5c05eb20 100644 --- a/include/mxnet/engine.h +++ b/include/mxnet/engine.h @@ -45,7 +45,7 @@ enum class FnProperty { /*! * \brief Dependency engine that schedules operations. */ -class MXAPI Engine { +class MXNET_API Engine { public: /*! * \brief OnComplete Callback to the engine, diff --git a/include/mxnet/storage.h b/include/mxnet/storage.h index 743b4e8b0514..60bca03b0680 100644 --- a/include/mxnet/storage.h +++ b/include/mxnet/storage.h @@ -14,7 +14,7 @@ namespace mxnet { /*! * \brief Storage manager across multiple devices. */ - class MXAPI Storage { +class MXNET_API Storage { public: /*! * \brief Storage handle. From 936915131b7bb1604cba92fee16191e53414c406 Mon Sep 17 00:00:00 2001 From: Chuntao Hong Date: Thu, 22 Oct 2015 13:11:39 +0800 Subject: [PATCH 066/122] minor --- include/mxnet/base.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mxnet/base.h b/include/mxnet/base.h index 962740b4194d..1eeffc1ab4b9 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -59,7 +59,7 @@ #define MXNET_API __declspec(dllimport) #endif #else -#define MXAPI +#define MXNET_API #endif /*! \brief namespace of mxnet */ From 27d67e48661b1bd4bf6747aa18b058f8323de1dc Mon Sep 17 00:00:00 2001 From: Chuntao Hong Date: Thu, 22 Oct 2015 10:33:16 +0800 Subject: [PATCH 067/122] add installation guide for pre-built windows binary --- doc/build.md | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/doc/build.md b/doc/build.md index 6907d78bf154..5be22f62bfb3 100644 --- a/doc/build.md +++ b/doc/build.md @@ -7,9 +7,11 @@ if you have ideas to improve this page, please send a pull request! Contents -------- -- [Build MXNet Library](#build-mxnet-library) - - Introduces how to build the mxnet core library for all packages. 
- - Supported platforms: linux, windows, osx +- [Building MXNet Library](#build-mxnet-library) + - [Prerequisites](#prerequisites) + - [Building on Linux](#building-on-linux) + - [Building on Windows](#building-on-windows) + - [Installing pre-built packages on Windows](#installing-pre-built-packages-on-windows) - [Advanced Build Configurations](#advanced-build-configuration) - Introduces how to build mxnet with advanced features such as HDFS/S3 support, CUDNN - [Python Package Installation](#python-package-installation) @@ -17,6 +19,9 @@ Contents Build MXNet Library ------------------- + +### Prerequisites + MXNet has a general runtime library that can be used by various packages such as python, R and Julia. This section gives details about how to build the mxnet library. - On Linux/OSX the target library will be ```libmxnet.so``` @@ -36,7 +41,7 @@ The system dependency requirement for mxnet libraries are - BLAS library. - opencv (optional if you do not need image augmentation, you can switch it off in config.mk) -### Linux +### Building on Linux On Ubuntu >= 13.10, one can install the dependencies by @@ -73,7 +78,7 @@ make -j4 Then proceed to package installation instructions for python or R in this page. -### Windows +### Building on Windows First, we should make Visual Studio 2013 support more C++11 features. @@ -88,6 +93,14 @@ Finally, use CMake to create a Visual Studio solution in `./build/`. During conf Then proceed to package installation instructions for python or R in this page. +### Installing pre-built packages on Windows + +MXNet also provides pre-built packages on Windows. The pre-built package includes the pre-built MXNet library, the dependent third-party libraries, a sample C++ solution for Visual Studio and the Python install script. + +You can download the packages from the [Releases tab](https://github.com/dmlc/mxnet/releases) of MXNet. There are two variants provided: one with GPU support (using CUDA and CUDNN v3) and one without GPU support. You can choose the one that fits your hardware configuration. + +After downloading, unpack the package into a folder, say D:\MxNet, then install the package by double-clicking the setupenv.cmd inside the folder. It will set up the environment variables needed by MXNet. After that, you should be able to use the provided VS solution to build C++ programs, or to [install the Python package](#python-package-installation). + Advanced Build Configurations ----------------------------- The configuration of mxnet can be modified by ```config.mk``` From 2bde47d71b10ed0d2e06b0463b5f217c36602535 Mon Sep 17 00:00:00 2001 From: Chuntao Hong Date: Thu, 22 Oct 2015 10:38:29 +0800 Subject: [PATCH 068/122] fix dll name on windows --- doc/build.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/build.md b/doc/build.md index 5be22f62bfb3..e7cef14fc87f 100644 --- a/doc/build.md +++ b/doc/build.md @@ -10,6 +10,7 @@ Contents - [Building MXNet Library](#build-mxnet-library) - [Prerequisites](#prerequisites) - [Building on Linux](#building-on-linux) + - [Building on OSX](#building-on-osx) - [Building on Windows](#building-on-windows) - [Installing pre-built packages on Windows](#installing-pre-built-packages-on-windows) - [Advanced Build Configurations](#advanced-build-configuration) - Introduces how to build mxnet with advanced features such as HDFS/S3 support, CUDNN - [Python Package Installation](#python-package-installation) @@ -25,7 +26,7 @@ Build MXNet Library MXNet has a general runtime library that can be used by various packages such as python, R and Julia. This section gives details about how to build the mxnet library. 
- On Linux/OSX the target library will be ```libmxnet.so``` -- On Windows the target library is ```mxnet.dll``` +- On Windows the target library is ```libmxnet.dll``` Things to do before you get started: @@ -56,7 +57,7 @@ make -j4 ``` Then proceed to package installation instructions for python or R in this page. -### OSX +### Building on OSX On OSX, we can install the dependencies by ```bash From 69292992bdb5c6e9a6ccb2f19368b60c0eed1acf Mon Sep 17 00:00:00 2001 From: Chuntao Hong Date: Thu, 22 Oct 2015 10:44:14 +0800 Subject: [PATCH 069/122] disable ps-lite on windows for now --- CMakeLists.txt | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d55086fd197a..fe020b81502b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,9 +33,6 @@ else(MSVC) endif(MSVC) if(USE_OPENCV) - if(MSVC) - set(OpenCV_STATIC OFF) - endif() find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs) if(NOT OpenCV_FOUND) # if not OpenCV 3.x, then imgcodecs are not found find_package(OpenCV REQUIRED COMPONENTS core highgui imgproc) @@ -68,7 +65,9 @@ if(USE_CUDNN) endif() add_subdirectory("dmlc-core") -add_subdirectory("ps-lite") +if(NOT MSVC) + add_subdirectory("ps-lite") +endif() mxnet_source_group("Source" GLOB_RECURSE "src/*.cc") mxnet_source_group("Source\\Cuda" GLOB_RECURSE "src/*.cu") @@ -93,8 +92,10 @@ endif() add_library(mxnet SHARED ${SOURCE}) target_link_libraries(mxnet ${mshadow_LINKER_LIBS}) target_link_libraries(mxnet dmlccore) -target_link_libraries(mxnet pslite) -target_link_libraries(mxnet ${pslite_LINKER_LIBS}) +if(NOT MSVC) + target_link_libraries(mxnet pslite) + target_link_libraries(mxnet ${pslite_LINKER_LIBS}) +endif() set_target_properties(mxnet PROPERTIES OUTPUT_NAME "libmxnet") # ---[ Linter target From b5596bf29778e3f77d486fe56763cc9522ec572b Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 21 Oct 2015 22:01:56 -0700 Subject: [PATCH 070/122] [OP] Allow registering symbolic and ndarray unary operators in one place --- include/mxnet/operator.h | 8 +- src/common/tblob_op_registry.cc | 317 +++++++++++++++++++++------ src/common/tblob_op_registry.h | 78 +++++--- src/ndarray/unary_function-inl.h | 79 ++++++-- src/operator/mshadow_op.h | 2 +- 5 files changed, 383 insertions(+), 101 deletions(-) diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h index 72c5f6c28823..dc6176fe8b51 100644 --- a/include/mxnet/operator.h +++ b/include/mxnet/operator.h @@ -400,7 +400,7 @@ class OperatorProperty { }; /*! \brief typedef the factory function of operator property */ -typedef OperatorProperty *(*OperatorPropertyFactory)(); +typedef std::function<OperatorProperty *()> OperatorPropertyFactory; /*! * \brief Registry entry for OperatorProperty factory functions. 
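+ * With std::function as the factory type, registration can pass a C++11 lambda directly (see the MXNET_REGISTER_OP_PROPERTY change below).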
*/ @@ -454,12 +454,8 @@ struct OperatorPropertyReg * \endcode */ #define MXNET_REGISTER_OP_PROPERTY(name, OperatorPropertyType) \ - static ::mxnet::OperatorProperty* __create__ ## OperatorProperty ## name ## __() { \ - OperatorProperty* ret = new OperatorPropertyType(); \ - return ret; \ - } \ DMLC_REGISTRY_REGISTER(::mxnet::OperatorPropertyReg, OperatorPropertyReg, name) \ - .set_body(__create__ ## OperatorProperty ## name ## __) \ + .set_body([]() { return new OperatorPropertyType(); }) \ .check_name() #endif // DMLC_USE_CXX11 diff --git a/src/common/tblob_op_registry.cc b/src/common/tblob_op_registry.cc index e205f29cc42c..ae1f54da3c3a 100644 --- a/src/common/tblob_op_registry.cc +++ b/src/common/tblob_op_registry.cc @@ -11,11 +11,14 @@ namespace mxnet { namespace common { - +class TBlobUnaryOpProp; class TBlobOpRegEntryImpl : public TBlobOpRegEntry { public: - TSelf& set_function(int dev_mask, UnaryFunction funary) override { + // functions + TSelf& set_function(int dev_mask, + UnaryFunction funary, + bool inplace_in_out) override { std::lock_guard lock(mutex_); ++reg_counter_; if (funary_.size() <= static_cast(dev_mask)) { @@ -26,54 +29,46 @@ class TBlobOpRegEntryImpl : public TBlobOpRegEntry { << " already registerd for device " << dev_mask; } funary_[dev_mask] = funary; - // return if it is already registered. - if (reg_counter_ != 1) return *this; + inplace_in0_out_forward_ = inplace_in_out; + if (reg_counter_ == 1) this->DoRegisterUnary(); + return *this; + } - // The body to be registered - auto body = [this] (NDArray **used_vars, - real_t *s, - NDArray **mutate_vars) { - NDArray src = *used_vars[0]; - NDArray *out = mutate_vars[0]; - - if (out->is_none()) { - *out = NDArray(src.shape(), src.ctx(), true); - } else { - CHECK(out->ctx() == src.ctx()) << "target context mismatch"; - CHECK(out->shape() == src.shape()) << "target shape mismatch"; - } - // important: callback must always capture by value - NDArray ret = *out; - // get the const variables - std::vector const_vars; - if (src.var() != ret.var()) const_vars.push_back(src.var()); - // check if the function exist - int dev_mask = src.ctx().dev_mask(); - if (static_cast(dev_mask) >= funary_.size() || - funary_[dev_mask] == nullptr) { - if (dev_mask == gpu::kDevMask) LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; - LOG(FATAL) << "Function " << this->name << "not registered for device " << dev_mask; - } - // invoke the function - UnaryFunction fun = funary_[dev_mask]; - Engine::Get()->PushSync([src, ret, fun, dev_mask](RunContext ctx) { - ret.CheckAndAlloc(); - TBlob tmp = ret.data(); - (*fun)(src.data(), &tmp, ctx); -#if MXNET_USE_CUDA - if (dev_mask == gpu::kDevMask) { - ctx.get_stream()->Wait(); - } -#endif - }, src.ctx(), const_vars, {ret.var()}); - }; - // register the function. 
- NDArrayReg() - .set_body(body) - .set_num_use_vars(1) - .set_num_mutate_vars(1) - .set_type_mask(kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget) - .add_argument("src", "NDArray", "Source input to the function"); + TSelf& set_gradient(int dev_mask, + UnaryGradType1 fgrad, + bool inplace_out_in_grad) override { + std::lock_guard lock(mutex_); + if (funary_grad_t1_.size() <= static_cast(dev_mask)) { + funary_grad_t1_.resize(dev_mask + 1, nullptr); + } + if (funary_grad_t1_[dev_mask] != nullptr) { + LOG(FATAL) << "Device gradient function " << this->name + << " already registerd for device " << dev_mask; + } + funary_grad_t1_[dev_mask] = fgrad; + inplace_out_in0_grad_ = inplace_out_in_grad; + return *this; + } + + TSelf& set_gradient(int dev_mask, + UnaryGradType2 fgrad, + bool inplace_out_in_grad) override { + std::lock_guard lock(mutex_); + if (funary_grad_t2_.size() <= static_cast(dev_mask)) { + funary_grad_t2_.resize(dev_mask + 1, nullptr); + } + if (funary_grad_t2_[dev_mask] != nullptr) { + LOG(FATAL) << "Device gradient function " << this->name + << " already registerd for device " << dev_mask; + } + funary_grad_t2_[dev_mask] = fgrad; + inplace_out_in0_grad_ = inplace_out_in_grad; + return *this; + } + + TSelf& set_shape_infer(UnaryShapeInfer fshapeinfer) override { + std::lock_guard lock(mutex_); + unary_infer_ = fshapeinfer; return *this; } @@ -81,22 +76,32 @@ class TBlobOpRegEntryImpl : public TBlobOpRegEntry { std::lock_guard lock(mutex_); if (reg_counter_ != 1) return *this; NDArrayReg().describe(description); + OpReg().describe(description); return *this; } - GenericTBlobOp *GetOp() const override { - return nullptr; - } - private: + // make friend with unary op + friend class TBlobUnaryOpProp; // internal mutex std::mutex mutex_; - // unary functions on each device mask - std::vector funary_; // registration counter int reg_counter_{0}; + // unary shape inferencer + UnaryShapeInfer unary_infer_{nullptr}; + // unary functions on each device mask + std::vector funary_; + // type 1 gradient function + std::vector funary_grad_t1_; + // type 2 gradient function + std::vector funary_grad_t2_; + // whether do inplace optimization of in 0 and output + bool inplace_in0_out_forward_{true}; + // whether do inplace optimization of out_grad and in_grad0 + bool inplace_out_in0_grad_{false}; // NDArray registry NDArrayFunctionReg *ndarray_reg_{nullptr}; + OperatorPropertyReg *op_reg_{nullptr}; // internal function to register NDArray function. inline NDArrayFunctionReg &NDArrayReg() { if (ndarray_reg_ == nullptr) { @@ -106,8 +111,209 @@ class TBlobOpRegEntryImpl : public TBlobOpRegEntry { } return *ndarray_reg_; } + // internal function to register NDArray function. + inline OperatorPropertyReg &OpReg() { + if (op_reg_ == nullptr) { + OperatorPropertyReg ® = + ::dmlc::Registry::Get()->__REGISTER__(this->name); + op_reg_ = ® + } + return *op_reg_; + } + // start registering all stuffs + void DoRegisterUnary(); +}; + +// Unary operator to invoke generic TBlob function. 
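+// It bridges the registered TBlob callbacks to the standard Operator
+// Forward/Backward interface; exactly one of the two backward variants
+// (backward1, which uses the saved output, or backward2, which uses the
+// saved input) is expected to be set.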
+struct TBlobUnaryOperator : public Operator { + TBlobOpRegEntry::UnaryFunction forward; + TBlobOpRegEntry::UnaryGradType1 backward1{nullptr}; + TBlobOpRegEntry::UnaryGradType2 backward2{nullptr}; + + void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) override { + CHECK_EQ(in_data.size(), 1); + CHECK_EQ(out_data.size(), 1); + TBlob out = out_data[0]; + (*forward)(in_data[0], &out, req[0], ctx.run_ctx); + } + + void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) override { + CHECK_EQ(out_grad.size(), 1); + CHECK(in_data.size() == 1 && in_grad.size() == 1); + CHECK_EQ(req.size(), 1); + arg::OutGrad ograd; ograd.data = out_grad[0]; + TBlob igrad = in_grad[0]; + if (backward1 != nullptr) { + arg::OutValue out_value; out_value.data = out_data[0]; + (*backward1)(ograd, out_value, &igrad, req[0], ctx.run_ctx); + } else if (backward2 != nullptr) { + arg::Input0 in0; in0.data = in_data[0]; + (*backward2)(ograd, in0, &igrad, req[0], ctx.run_ctx); + } else { + LOG(FATAL) << "Backward is not supported"; + } + } +}; // class UnaryOperator + +class TBlobUnaryOpProp : public OperatorProperty { + public: + std::string name; + TBlobOpRegEntryImpl* source; + + void Init(const std::vector >& kwargs) override { + } + + std::map GetParams() const override { + return std::map(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; + const TShape &dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + out_shape->clear(); + if (source->unary_infer_ == nullptr) { + out_shape->push_back(dshape); + } else { + out_shape->push_back((*(source->unary_infer_))(dshape)); + } + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new TBlobUnaryOpProp(); + ptr->source = source; + ptr->name = name; + return ptr; + } + + std::string TypeString() const override { + return name; + } + + // decalre dependency and inplace optimization options + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + if (source->funary_grad_t1_.size() != 0) { + return {out_grad[0], out_data[0]}; + } else if (source->funary_grad_t2_.size() != 0) { + return {out_grad[0], in_data[0]}; + } else { + LOG(FATAL) << "Backward of " << name << " is not decalred"; + return {}; + } + } + + std::vector > BackwardInplaceOption( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &in_grad) const override { + if (source->inplace_out_in0_grad_) { + return {{out_grad[0], in_grad[0]}}; + } else { + return {}; + } + } + + std::vector > ForwardInplaceOption( + const std::vector &in_data, + const std::vector &out_data) const override { + if (source->inplace_in0_out_forward_) { + return {{in_data[0], out_data[0]}}; + } else { + return {}; + } + } + + Operator* CreateOperator(Context ctx) const { + size_t dev_mask = ctx.dev_mask(); + TBlobUnaryOperator *op = new TBlobUnaryOperator(); + CHECK(dev_mask < source->funary_.size() && source->funary_[dev_mask] != nullptr); + op->forward = source->funary_[dev_mask]; + if (dev_mask < source->funary_grad_t1_.size()) { + op->backward1 = 
source->funary_grad_t1_[dev_mask]; + } + if (dev_mask < source->funary_grad_t2_.size()) { + op->backward2 = source->funary_grad_t2_[dev_mask]; + } + return op; + } }; +void TBlobOpRegEntryImpl::DoRegisterUnary() { + CHECK_EQ(reg_counter_, 1); + // The body to be registered + auto body = [this] (NDArray **used_vars, + real_t *s, + NDArray **mutate_vars) { + NDArray src = *used_vars[0]; + NDArray *out = mutate_vars[0]; + + if (out->is_none()) { + *out = NDArray(src.shape(), src.ctx(), true); + } else { + CHECK(out->ctx() == src.ctx()) << "target context mismatch"; + CHECK(out->shape() == src.shape()) << "target shape mismatch"; + } + // important: callback must always capture by value + NDArray ret = *out; + // get the const variables + std::vector const_vars; + if (src.var() != ret.var()) const_vars.push_back(src.var()); + // check if the function exist + int dev_mask = src.ctx().dev_mask(); + if (static_cast(dev_mask) >= funary_.size() || + funary_[dev_mask] == nullptr) { + if (dev_mask == gpu::kDevMask) LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; + LOG(FATAL) << "Function " << this->name << "not registered for device " << dev_mask; + } + // invoke the function + UnaryFunction fun = funary_[dev_mask]; + Engine::Get()->PushSync([src, ret, fun, dev_mask](RunContext ctx) { + ret.CheckAndAlloc(); + TBlob tmp = ret.data(); + (*fun)(src.data(), &tmp, kWriteTo, ctx); +#if MXNET_USE_CUDA + if (dev_mask == gpu::kDevMask) { + ctx.get_stream()->Wait(); + } +#endif + }, src.ctx(), const_vars, {ret.var()}); + }; + // register the function. + NDArrayReg() + .set_body(body) + .set_num_use_vars(1) + .set_num_mutate_vars(1) + .set_type_mask(kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget) + .add_argument("src", "NDArray", "Source input to the function"); + // register the operator + auto op_factory = [this]() { + TBlobUnaryOpProp *prop = new TBlobUnaryOpProp(); + prop->name = this->name; + prop->source = this; + return prop; + }; + OpReg() + .set_body(op_factory) + .add_argument("src", "Symbol", "Source symbolic input to the function"); +} TBlobOpRegEntry& TBlobOpRegistry::__REGISTER_OR_FIND__(const std::string &name) { if (fmap_.count(name) != 0) return *fmap_.at(name); @@ -127,6 +333,5 @@ TBlobOpRegistry::~TBlobOpRegistry() { delete kv.second; } } - } // namespace common } // namespace mxnet diff --git a/src/common/tblob_op_registry.h b/src/common/tblob_op_registry.h index 910543efacb3..495144aa931e 100644 --- a/src/common/tblob_op_registry.h +++ b/src/common/tblob_op_registry.h @@ -11,44 +11,90 @@ #include #include +#include #include #include #include +#if DMLC_USE_CXX11 +#include +#endif + namespace mxnet { namespace common { +/*! \brief namespace of arguments */ +namespace arg { +/*! \brief super class of all gradient function argument */ +struct GradFunctionArgument { + /*! \brief The real data */ + TBlob data; +}; +/*! \brief First input to the function */ +struct Input0 : GradFunctionArgument {}; +/*! \brief Second input to the function */ +struct Input1 : GradFunctionArgument {}; -/*! \brief pre-declare generic TBlob function*/ -struct GenericTBlobOp; +/*! \brief Ouput value of the function to the function */ +struct OutValue : GradFunctionArgument {}; +/*! \brief Gradient of output value */ +struct OutGrad : GradFunctionArgument {}; +} // namespace arg /*! \brief registry for function entry */ class TBlobOpRegEntry { public: - /*! 
\brief unary tblob function */ typedef void (*UnaryFunction)(const TBlob &src, - TBlob *ret, + TBlob* ret, + OpReqType req, RunContext ctx); + typedef TShape (*UnaryShapeInfer)(const TShape &src); + typedef void (*UnaryGradType1)(const arg::OutGrad& out_grad, + const arg::OutValue& out_value, + TBlob* in_grad, + OpReqType req, + RunContext ctx); + typedef void (*UnaryGradType2)(const arg::OutGrad& out_grad, + const arg::Input0& in_data0, + TBlob* in_grad, + OpReqType req, + RunContext ctx); /*! \brief declare self type */ typedef TBlobOpRegEntry TSelf; /*! \brief name of the entry */ std::string name; + /*! + * \brief set shape inference function, by default use same shape. + * \param dev_mask The device mask of the function can act on. + * \param funary The unary function that peforms the operation. + */ + virtual TSelf& set_shape_infer(UnaryShapeInfer fshapeinfer) = 0; /*! * \brief set function of the function to be funary * \param dev_mask The device mask of the function can act on. * \param funary The unary function that peforms the operation. + * \param inplace_in_out Whether do inplace optimization on in and out. + */ + virtual TSelf& set_function(int dev_mask, + UnaryFunction funary, + bool inplace_in_out) = 0; + /*! + * \brief set gradient of the function of this function. + * \param dev_mask The device mask of the function can act on. + * \param fgrad The gradient function to be set. + * \param inplace_out_in_grad whether out_grad and in_grad can share memory. */ - virtual TSelf& set_function(int dev_mask, UnaryFunction funary) = 0; + virtual TSelf& set_gradient(int dev_mask, + UnaryGradType1 fgrad, + bool inplace_out_in_grad) = 0; + virtual TSelf& set_gradient(int dev_mask, + UnaryGradType2 fgrad, + bool inplace_out_in_grad) = 0; /*! * \brief Describe the function. * \param description The description of the function. * \return reference to self. */ virtual TSelf& describe(const std::string &description) = 0; - /*! - * \brief get the internal function representation - * \return the internal function representation. - */ - virtual GenericTBlobOp *GetOp() const = 0; /*! \brief destructor */ virtual ~TBlobOpRegEntry() {} }; @@ -80,22 +126,10 @@ class TBlobOpRegistry { std::map fmap_; }; -#if DMLC_USE_CXX11 -struct GenericTBlobOp { - /*! \brief function type of the function */ - typedef std::function &in, - TBlob *out, - RunContext ctx)> OpType; - /*! 
\brief the real operator */ - OpType op; -}; -#endif - #define MXNET_REGISTER_TBLOB_FUN(Name, DEV) \ static ::mxnet::common::TBlobOpRegEntry & \ __make_ ## TBlobOpRegEntry ## _ ## Name ## __ ## DEV ##__ = \ ::mxnet::common::TBlobOpRegistry::Get()->__REGISTER_OR_FIND__(#Name) - } // namespace common } // namespace mxnet #endif // MXNET_COMMON_TBLOB_OP_REGISTRY_H_ diff --git a/src/ndarray/unary_function-inl.h b/src/ndarray/unary_function-inl.h index 7832ce1798cd..45e3e42f2495 100644 --- a/src/ndarray/unary_function-inl.h +++ b/src/ndarray/unary_function-inl.h @@ -8,39 +8,86 @@ #include "../common/tblob_op_registry.h" #include "../operator/mshadow_op.h" - +#include "../operator/operator_common.h" #if defined(__CUDACC__) -#define DEVICE gpu +#define XPU gpu #else -#define DEVICE cpu +#define XPU cpu #endif namespace mxnet { namespace ndarray { +using namespace common; // NOLINT(*) + template -void EvalUnary_(const TBlob &src, - TBlob *ret, RunContext ctx) { +void UnaryForward_(const TBlob &src, + TBlob *ret, + OpReqType req, + RunContext ctx) { + using namespace mxnet::op; using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); - ret->FlatTo2D(s) - = F(src.FlatTo2D(s)); + mshadow::Tensor out = ret->FlatTo2D(s); + Assign(out, req, F(src.FlatTo2D(s))); } -// helper macro to register mshadow element-wise unary opts -// usually you only need to use this to register common operations -#define REGISTER_MSHADOW_UNARY(Name, Op) \ - MXNET_REGISTER_TBLOB_FUN(Name, DEVICE) \ - .set_function(DEVICE::kDevMask, EvalUnary_) +// backward function that takes input value of the op +template +void UnaryBackwardUseIn_(const arg::OutGrad& out_grad, + const arg::Input0& in_data0, + TBlob *in_grad, + OpReqType req, + RunContext ctx) { + using namespace mxnet::op; + using namespace mshadow::expr; + mshadow::Stream *s = ctx.get_stream(); + mshadow::Tensor igrad = in_grad->FlatTo2D(s); + Assign(igrad, req, + F(in_data0.data.FlatTo2D(s)) * + out_grad.data.FlatTo2D()); +} +// backward function that takes output value of the op +template +void UnaryBackwardUseOut_(const arg::OutGrad& out_grad, + const arg::OutValue& out_value, + TBlob *in_grad, + OpReqType req, + RunContext ctx) { + using namespace mxnet::op; + using namespace mshadow::expr; + mshadow::Stream *s = ctx.get_stream(); + mshadow::Tensor igrad = in_grad->FlatTo2D(s); + Assign(igrad, req, + F(out_value.data.FlatTo2D(s)) * + out_grad.data.FlatTo2D()); +} -// register all unary operations here -REGISTER_MSHADOW_UNARY(square, op::mshadow_op::square) +// Register all unary operations here +// Square +struct square_grad { + MSHADOW_XINLINE static real_t Map(real_t a) { + return 2.0f * a; + } +}; +// The true means inplace can be enabled. 
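+// square: d/dx x^2 = 2x, so the gradient needs the forward *input*
+// (UnaryBackwardUseIn_); sqrt: d/dx sqrt(x) = 0.5/sqrt(x), which is
+// cheaper to compute from the forward *output* (UnaryBackwardUseOut_).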
+MXNET_REGISTER_TBLOB_FUN(square, XPU) +.set_function(XPU::kDevMask, UnaryForward_, true) +.set_gradient(XPU::kDevMask, UnaryBackwardUseIn_, true) .describe("Take square of the src"); -REGISTER_MSHADOW_UNARY(sqrt, op::mshadow_op::square_root) -.describe("Take square root of the src"); +// Square root +struct square_root_grad { + MSHADOW_XINLINE static real_t Map(real_t a) { + return 0.5f / a; + } +}; +MXNET_REGISTER_TBLOB_FUN(sqrt, XPU) +.set_function(XPU::kDevMask, UnaryForward_, true) +.set_gradient(XPU::kDevMask, UnaryBackwardUseOut_, true) +.describe("Take square root of the src"); } // namespace ndarray } // namespace mxnet #endif // MXNET_NDARRAY_UNARY_FUNCTION_INL_H_ diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index 9238ee049c0b..c8ca495d3349 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -80,7 +80,6 @@ struct tanh_grad { } }; - struct square { MSHADOW_XINLINE static real_t Map(real_t a) { return a * a; @@ -107,6 +106,7 @@ struct square_root { return sqrt(a); } }; + } // namespace mshadow_op } // namespace op } // namespace mxnet From cd1fb67acc1ed1b70b28a47a8b2dae9755fc8d66 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Wed, 21 Oct 2015 23:10:17 -0600 Subject: [PATCH 071/122] [OP] Fix reshape --- src/operator/reshape-inl.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/operator/reshape-inl.h b/src/operator/reshape-inl.h index a2a7e58cb3e7..20411b78c585 100644 --- a/src/operator/reshape-inl.h +++ b/src/operator/reshape-inl.h @@ -23,6 +23,7 @@ namespace op { enum ReshapeOpInputs {kData}; enum ReshapeOpOutputs {kOut}; + struct ReshapeParam : public dmlc::Parameter { TShape target_shape; DMLC_DECLARE_PARAMETER(ReshapeParam) { @@ -33,7 +34,7 @@ struct ReshapeParam : public dmlc::Parameter { template class ReshapeOp : public Operator { public: - explicit ReshapeOp(ReshapeParam param) {} // Do nothing, just make a special factory + explicit ReshapeOp(ReshapeParam param) {} // Do nothing virtual void Forward(const OpContext &ctx, const std::vector &in_data, @@ -47,9 +48,8 @@ class ReshapeOp : public Operator { CHECK_EQ(out_data.size(), 1); if (req[kOut] == kNullOp) return; Stream *s = ctx.get_stream(); - // TODO(bing): potentail bug here for non-4D input - Tensor data = in_data[kData].get(s); - Tensor out = out_data[kOut].get(s); + Tensor data = in_data[kData].FlatTo2D(s); + Tensor out = out_data[kOut].FlatTo2D(s); CHECK_EQ(data.CheckContiguous(), true); CHECK_EQ(out.CheckContiguous(), true); if (data.dptr_ == out.dptr_) return; @@ -71,8 +71,8 @@ class ReshapeOp : public Operator { CHECK_EQ(out_grad.size(), 1); CHECK_EQ(in_grad.size(), 1); Stream *s = ctx.get_stream(); - Tensor grad_out = out_grad[kData].get(s); - Tensor grad_in = in_grad[kOut].get(s); + Tensor grad_in = in_grad[kOut].FlatTo2D(s); + Tensor grad_out = out_grad[kData].FlatTo2D(s); CHECK_EQ(grad_out.CheckContiguous(), true); CHECK_EQ(grad_in.CheckContiguous(), true); if (grad_out.dptr_ == grad_in.dptr_) return; From d54c2777a7fbfb1a09833a2c728c318c326e4394 Mon Sep 17 00:00:00 2001 From: piiswrong Date: Thu, 22 Oct 2015 10:44:48 -0700 Subject: [PATCH 072/122] Optimizer factory is now case insensitive. Changed to explicit registration because metaclass grammar is different between python 2 and python 3. 
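The registry introduced below can be exercised end to end. A minimal sketch (`MyOpt` is an illustrative name, not an optimizer that ships with MXNet):

```python
from mxnet.optimizer import Optimizer, register, create

@register                      # convenience wrapper around Optimizer.Register
class MyOpt(Optimizer):
    """Toy optimizer used only to demonstrate registration."""
    def create_state(self, index, weight):
        return None            # stateless update rule

    def update(self, index, weight, grad, state):
        weight[:] -= 0.01 * (grad * self.rescale_grad)

# Lookup is case-insensitive because names are stored lowercased.
opt = create('myopt', rescale_grad=1.0 / 128)
assert isinstance(opt, MyOpt)
```

Storing names lowercased is what makes `create` case-insensitive; registering a second class under an existing name prints the overriding warning rather than raising.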
--- python/mxnet/optimizer.py | 94 +++++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 44 deletions(-) diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index 1c708c533ca0..7f44a1cdcae1 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -4,17 +4,49 @@ class Optimizer(object): """Base class of all optimizers.""" - class __metaclass__(type): - """Meta class for tracking all subclasses(implementations) - of Optimizer.""" - __optimizers__ = {} - - def __new__(meta, name, bases, attrs): - cls = type.__new__(meta, name, bases, attrs) - #Allow overriding of existing optimizer. - #Always keep the last one. - meta.__optimizers__[cls.__name__] = cls - return cls + opt_registry = {} + + @staticmethod + def Register(klass): + """Register optimizers to the optimizer factory""" + assert(isinstance(klass, type)) + name = klass.__name__.lower() + if name in Optimizer.opt_registry: + print('WARNING: New optimizer %s.%s is overriding ' \ + 'existing optimizer %s.%s'%( + klass.__module__, klass.__name__, + Optimizer.opt_registry[name].__module__, + Optimizer.opt_registry[name].__name__)) + Optimizer.opt_registry[name] = klass + return klass + + @staticmethod + def CreateOptimizer(name, rescale_grad=1, **kwargs): + """Create an optimizer with specified name. + + Parameters + ---------- + name: str + Name of required optimizer. Should be the name + of a subclass of Optimizer. Case insensitive. + + rescale_grad : float + Rescaling factor on gradient. + + kwargs: dict + Parameters for optimizer + + Returns + ------- + opt : Optimizer + The result optimizer. + """ + if name.lower() in Optimizer.opt_registry: + return Optimizer.opt_registry[name.lower()]( + rescale_grad=rescale_grad, + **kwargs) + else: + raise ValueError('Cannot find optimizer %s' % name) def __init__(self, rescale_grad=1): self.iteration = 0 @@ -37,7 +69,10 @@ def create_state(self, index, weight): def update(self, index, weight, grad, state): """Update the parameters. override in implementations""" +#convenience wrapper for Optimizer.Register +register = Optimizer.Register +@register class SGD(Optimizer): """A very simple SGD optimizer with momentum and weight regularization. @@ -123,12 +158,12 @@ def update(self, index, weight, grad, state): assert self.momentum == 0.0 weight[:] += -lr * (grad * self.rescale_grad + self.wd * weight) - +@register class Test(Optimizer): """For test use""" def __init__(self, rescale_grad=1): super(Test, self).__init__(rescale_grad) - + # pylint: disable=no-self-use def create_state(self, index, weight): """Create a state to duplicate weight""" @@ -139,37 +174,8 @@ def update(self, index, weight, grad, state): weight[:] += grad * self.rescale_grad state[:] = weight -def create(name, rescale_grad=1, **kwargs): - """Create an optimizer with specified name. - - Parameters - ---------- - name: str - Name of required optimizer - - rescale_grad : float - Rescaling factor on gradient. - - kwargs: dict - Parameters for optimizer - - Returns - ------- - opt : Optimizer - The result optimizer. - """ - #TODO(eric): kept for backward compatibility. - # remove after all downstream functions move to - # new naming standard. 
- if name == 'sgd' or name == 'SGD': - return SGD(rescale_grad=rescale_grad, **kwargs) - if name == 'test': - return Test(rescale_grad=rescale_grad) - - if name in Optimizer.__optimizers__: - return Optimizer.__optimizers__[name](rescale_grad=rescale_grad, **kwargs) - else: - raise ValueError('Cannot find optimizer %s' % name) +#backward compatibility wrapper for Optimizer.CreateOptimizer +create = Optimizer.CreateOptimizer def get_updater(optimizer): """Return a clossure of the updater needed for kvstore From c5128011c5fd7014e0e9204d7fab48c41a07ef45 Mon Sep 17 00:00:00 2001 From: piiswrong Date: Thu, 22 Oct 2015 14:43:14 -0700 Subject: [PATCH 073/122] add compatibility for old style caffe prototxt --- tools/caffe_converter/convert_symbol.py | 43 +++++++++++++++++-------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/tools/caffe_converter/convert_symbol.py b/tools/caffe_converter/convert_symbol.py index ea673c4a7863..9b5bcde99848 100644 --- a/tools/caffe_converter/convert_symbol.py +++ b/tools/caffe_converter/convert_symbol.py @@ -2,6 +2,7 @@ from caffe.proto import caffe_pb2 from google.protobuf import text_format import argparse +import sys def readProtoSolverFile(filepath): solver_config = caffe.proto.caffe_pb2.NetParameter() @@ -22,7 +23,12 @@ def proto2script(proto_file): top = dict() flatten_count = 0 symbol_string = "" - layer = proto.layer + if len(proto.layer): + layer = proto.layer + elif len(proto.layers): + layer = proto.layers + else: + raise Exception('Invalid proto file.') # We assume the first bottom blob of first layer is the output from data layer input_name = layer[0].bottom[0] @@ -33,7 +39,7 @@ def proto2script(proto_file): type_string = '' param_string = '' name = layer[i].name.replace('/', '_') - if layer[i].type == 'Convolution': + if layer[i].type == 'Convolution' or layer[i].type == 4: type_string = 'mx.symbol.Convolution' param = layer[i].convolution_param pad = 0 if len(param.pad) == 0 else param.pad[0] @@ -42,7 +48,7 @@ def proto2script(proto_file): (param.num_output, pad, pad, param.kernel_size[0],\ param.kernel_size[0], stride, stride, not param.bias_term) need_flatten[name] = True - if layer[i].type == 'Pooling': + if layer[i].type == 'Pooling' or layer[i].type == 17: type_string = 'mx.symbol.Pooling' param = layer[i].pooling_param param_string = "pad=(%d,%d), kernel=(%d,%d), stride=(%d,%d)" %\ @@ -55,37 +61,37 @@ def proto2script(proto_file): else: raise Exception("Unknown Pooling Method!") need_flatten[name] = True - if layer[i].type == 'ReLU': + if layer[i].type == 'ReLU' or layer[i].type == 18: type_string = 'mx.symbol.Activation' param_string = "act_type='relu'" - need_flatten[name] = need_flatten[mapping[proto.layer[i].bottom[0]]] - if layer[i].type == 'LRN': + need_flatten[name] = need_flatten[mapping[layer[i].bottom[0]]] + if layer[i].type == 'LRN' or layer[i].type == 15: type_string = 'mx.symbol.LRN' param = layer[i].lrn_param param_string = "alpha=%f, beta=%f, knorm=%f, nsize=%d" %\ (param.alpha, param.beta, param.k, param.local_size) need_flatten[name] = True - if layer[i].type == 'InnerProduct': + if layer[i].type == 'InnerProduct' or layer[i].type == 14: type_string = 'mx.symbol.FullyConnected' param = layer[i].inner_product_param param_string = "num_hidden=%d, no_bias=%s" % (param.num_output, not param.bias_term) need_flatten[name] = False - if layer[i].type == 'Dropout': + if layer[i].type == 'Dropout' or layer[i].type == 6: type_string = 'mx.symbol.Dropout' param = layer[i].dropout_param param_string = "p=%f" % 
param.dropout_ratio - need_flatten[name] = need_flatten[mapping[proto.layer[i].bottom[0]]] - if layer[i].type == 'Softmax': + need_flatten[name] = need_flatten[mapping[layer[i].bottom[0]]] + if layer[i].type == 'Softmax' or layer[i].type == 20: type_string = 'mx.symbol.Softmax' # We only support single output network for now. output_name = name - if layer[i].type == 'Flatten': + if layer[i].type == 'Flatten' or layer[i].type == 8: type_string = 'mx.symbol.Flatten' need_flatten[name] = False - if layer[i].type == 'Split': + if layer[i].type == 'Split' or layer[i].type == 22: type_string = 'split' - if layer[i].type == 'Concat': + if layer[i].type == 'Concat' or layer[i].type == 3: type_string = 'mx.symbol.Concat' need_flatten[name] = True if type_string == '': @@ -121,3 +127,14 @@ def proto2symbol(proto_file): exec(sym) exec("ret = " + output_name) return ret + +def main(): + symbol_string, output_name = proto2script(sys.argv[1]) + if len(sys.argv) > 2: + with open(sys.argv[2], 'w') as fout: + fout.write(symbol_string) + else: + print(symbol_string) + +if __name__ == '__main__': + main() \ No newline at end of file From e4e773813042f80a8841ba2c14a9f28cef8db367 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Thu, 22 Oct 2015 13:07:53 -0600 Subject: [PATCH 074/122] [OP] update flatten --- src/operator/reshape-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/reshape-inl.h b/src/operator/reshape-inl.h index 20411b78c585..d02d1cedebd1 100644 --- a/src/operator/reshape-inl.h +++ b/src/operator/reshape-inl.h @@ -173,7 +173,7 @@ class FlattenProp : public ReshapeProp { for (uint32_t i = 1; i < dshape.ndim(); ++i) { target_dim *= dshape[i]; } - out_shape->push_back(mshadow::Shape4(dshape[0], 1, 1, target_dim)); + out_shape->push_back(mshadow::Shape2(dshape[0], target_dim)); return true; } From 195586dea5271866b7ffcf471973ecce152c2549 Mon Sep 17 00:00:00 2001 From: piiswrong Date: Thu, 22 Oct 2015 15:30:48 -0700 Subject: [PATCH 075/122] fixed naming --- python/mxnet/optimizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index 7f44a1cdcae1..e72a0cdd1163 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -7,7 +7,7 @@ class Optimizer(object): opt_registry = {} @staticmethod - def Register(klass): + def register(klass): """Register optimizers to the optimizer factory""" assert(isinstance(klass, type)) name = klass.__name__.lower() @@ -21,7 +21,7 @@ def Register(klass): return klass @staticmethod - def CreateOptimizer(name, rescale_grad=1, **kwargs): + def create_optimizer(name, rescale_grad=1, **kwargs): """Create an optimizer with specified name. Parameters @@ -70,7 +70,7 @@ def update(self, index, weight, grad, state): """Update the parameters. 
override in implementations""" #convenience wrapper for Optimizer.Register -register = Optimizer.Register +register = Optimizer.register @register class SGD(Optimizer): @@ -175,7 +175,7 @@ def update(self, index, weight, grad, state): state[:] = weight #backward compatibility wrapper for Optimizer.CreateOptimizer -create = Optimizer.CreateOptimizer +create = Optimizer.create_optimizer def get_updater(optimizer): """Return a clossure of the updater needed for kvstore From 00bcab3de0d662cc54b8a6290a9ff009256cd04f Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 22 Oct 2015 15:22:24 -0700 Subject: [PATCH 076/122] Move the link to the figure to web-data --- doc/developer-guide/multi_node.md | 2 +- doc/developer-guide/note_engine.md | 24 ++++++++++++------------ doc/program_model.md | 8 ++++---- doc/python/symbol_in_pictures.md | 18 +++++++++--------- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/doc/developer-guide/multi_node.md b/doc/developer-guide/multi_node.md index 3f43636b41dd..14772251580e 100644 --- a/doc/developer-guide/multi_node.md +++ b/doc/developer-guide/multi_node.md @@ -4,7 +4,7 @@ MXNet uses a two-level *parameter server* for data synchronization. - + - On the first layer, data are synchronized over multiple devices within a single worker machine. A device could be a GPU card, CPU, or other computational diff --git a/doc/developer-guide/note_engine.md b/doc/developer-guide/note_engine.md index a71949886f1d..5c8dedd73da1 100644 --- a/doc/developer-guide/note_engine.md +++ b/doc/developer-guide/note_engine.md @@ -35,7 +35,7 @@ However, it is quite hard to code the sequence manually, as the last operation, ```D = B * C```, needs to wait for both the above operations to complete before it starts running. We can represent the computation as the following dependency graph. -![Dep Simple](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/dep_simple.png) +![Dep Simple](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_simple.png) In this specific case, the graph is also called data-flow graph, as it represents the dependency in terms of data and computation. @@ -56,7 +56,7 @@ learning libraries when things go parallel. ### Data Flow Dependency The central thing that almost every dependency engine will have to solve, is the dataflow dependency problem. -![Dep Simple](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/dep_simple.png) +![Dep Simple](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_simple.png) Data Flow dependency describes how the outcome of one computation can be used in other computations. As we have elaborated this in last section, we will only put the same figure here. Libraries that have @@ -68,7 +68,7 @@ This is simple in the serial case. Because we can simply recycle the memory afte go out of scope. However, things becomes a bit harder in parallel case. Consider the following example -![Dep Del](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/dep_del.png) +![Dep Del](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_del.png) In the above example, because both computation needs to use values from A. We cannot perform the memory recycling before these computation completes. So a correct engine @@ -80,7 +80,7 @@ is executed after both ```B = A + 1``` and ```C = A + 2``` completes. Random number generators are commonly used in machine learning. 
However, they also bring interesting challenges for dependency engine. Consider the following example -![Dep Rand](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/dep_rand.png) +![Dep Rand](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_rand.png) Here we are generating random numbers in a sequence. While it seems that the two random number generations can be parallelized. This is usually not the case. Because usually a pseudorandom @@ -131,7 +131,7 @@ a simple SGD update, and copies the updated weight back to each GPU. This is a common data parallel program written in a serial manner. The following dependency graph shows how it can be parallelized: -![Dep Net](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/dep_net.png) +![Dep Net](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_net.png) Few important notes: - The copy of gradient to CPU, can happen as soon as we get gradient of that layer. @@ -202,14 +202,14 @@ Because we cannot assume the object we are scheduling on. What we can do instead ```virtual tag``` that is associated with each object to represent what we need to schedule. So at the beginning, user can allocate the variable tag, and attach it to each of object that we want to schedule. -![Dep Net](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/tag_var.png) +![Dep Net](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/tag_var.png) After having the variable tags, user call ```push``` to tell the engine about the function we want to execute. In addition, user need to specify the dependencies of the operation by ```read_vars``` and ```write_vars```. - ```read_vars``` are variable tags of objects which the operation will "read from", without changing its internal state. - ```mutate_vars``` are variable tags of objects which the operation will mutate their internal states. -![Push Op](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/push_var.png) +![Push Op](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/push_var.png) The above figure shows how we can push operation ```B = A + 1``` to dependency engine. Here ```B.data```, ```A.data``` are the real allocated space. We should note that engine is ***only aware of variable tags***. @@ -227,16 +227,16 @@ The first line reads variable `A` and mutates variable `B`. The second line read The engine is going to maintain a queue for each variable, as the following animation shows for each of the four lines. Green blocks represents a read action, while a red one represents a mutation. -![Dependency Queue](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/dep_queue.gif) +![Dependency Queue](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_queue.gif) Upon building this queue, the engine sees that the first two green blocks at the front of A's queue, could actually be run in parallel, because they are both read actions and won't conflict with each other. The following graph illustrates this point. -![Dependency Parallelism](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/dep_parallel.png) +![Dependency Parallelism](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_parallel.png) The cool thing about all this scheduling is, it is not confined to numerical calculations. Since everything scheduled is only a tag, the engine could schedule everything! 
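To make that queue discipline concrete, here is a toy, single-threaded sketch that tracks per-variable queues and reports which pushed operations could start immediately. It only mimics the bookkeeping (the real engine is multi-threaded C++), and every name in it is invented for illustration.

```python
class ToyEngine(object):
    def __init__(self):
        self.queues = {}  # variable tag -> list of (op, mode), in push order

    def push(self, op, read_vars, mutate_vars):
        for v in read_vars:
            self.queues.setdefault(v, []).append((op, 'read'))
        for v in mutate_vars:
            self.queues.setdefault(v, []).append((op, 'mutate'))

    def ready_ops(self):
        # An op can start once, in every queue it sits in, it is either at
        # the front, or it and everything ahead of it are reads.
        ops = set(op for q in self.queues.values() for op, _ in q)
        ready = set()
        for op in ops:
            ok = True
            for q in self.queues.values():
                for i, (o, mode) in enumerate(q):
                    if o == op:
                        ahead = q[:i]
                        if ahead and not (mode == 'read' and
                                          all(m == 'read' for _, m in ahead)):
                            ok = False
            if ok:
                ready.add(op)
        return ready

engine = ToyEngine()
engine.push('B = A + 1', read_vars=['A'], mutate_vars=['B'])
engine.push('C = A + 2', read_vars=['A'], mutate_vars=['C'])
engine.push('D = B * C', read_vars=['B', 'C'], mutate_vars=['D'])
print(engine.ready_ops())  # the two reads of A can proceed in parallel
```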
The following figure gives a complete push sequence of the programs we mentioned in previous sections. -![Push Seq](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/push_seq.png) +![Push Seq](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/push_seq.png) ### Port Existing Codes to the Dependency Engine Because the generic interface do not take control of things like memory allocation and what operation to execute. @@ -259,11 +259,11 @@ The general idea is as follows The following figure gives a visual example of the scheduling algorithm, which might give you a better sense of what is going on in the engine. -![Dep Tracking](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/engine_queue_step.png) +![Dep Tracking](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/engine_queue_step.png) The following figure gives another example that involves random number generations. -![Dep Rand](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/engine_queue_rand.png) +![Dep Rand](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/engine_queue_rand.png) As we can see, the algorithm is mainly about update pending queues of operations and doing the right state transition when operation completed. More care should be taken to make sure the state transition diff --git a/doc/program_model.md b/doc/program_model.md index 753d7d77acb4..fdfb6799d882 100644 --- a/doc/program_model.md +++ b/doc/program_model.md @@ -38,7 +38,7 @@ The difference in symbolic programs is when ```C = B * A``` is executed, there i Instead, these operations generates a computation graph (symbolic graph) that represents the computation it described. The following picture gives a computation graph to compute ```D```. -![Comp Graph](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/prog_model/comp_graph.png) +![Comp Graph](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/prog_model/comp_graph.png) Most symbolic style programs will contain, either explicitly or implicitly, a ```compile``` step. This converts the computation graph into a function that can be called. @@ -88,7 +88,7 @@ d = c + 1 ... ``` -![Comp Graph](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/prog_model/comp_graph.png) +![Comp Graph](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/prog_model/comp_graph.png) Assume each cell in the array cost 8 bytes. How many memory do we need to cost if we are going to execute the above program in python console? Let us do some math, we need memory for 4 arrays of size 10, that means we will need ```4 * 10 * 8 = 320``` bytes. On the other hand, @@ -110,7 +110,7 @@ Another optimization that symbolic programs can do is operation folding. In the Which is represented in the following graph. This means one GPU kernel will be executed(instead of two) if the computation runs on GPU. This is actually what we will do to hand crafted operations in optimized libraries such as cxxnet, caffe. Doing so will improve the computation efficiency. -![Comp Graph Folded](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/prog_model/comp_graph_fold.png) +![Comp Graph Folded](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/prog_model/comp_graph_fold.png) We cannot do that in imperative programs. Because the intermediate value can be reference some point in the future. 
The reason that such optimization is possible in symbolic programs, is that we get the entire computation graph, and a clear @@ -178,7 +178,7 @@ grad_a, grad_b = f(A=np.ones(10), B=np.ones(10)*2) The grad function of D generate a backward computation graph, and return a gradient node ```gA, gB```. They corresponds to the red nodes in the following figure. -![Comp Graph Folded](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/prog_model/comp_graph_backward.png) +![Comp Graph Folded](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/prog_model/comp_graph_backward.png) What the imperative program did was actually the same as the symbolic way. It implicitly saves a backward computation graph in the grad closure. When we invoked the ```d.grad```, we start from ```d(D)```, diff --git a/doc/python/symbol_in_pictures.md b/doc/python/symbol_in_pictures.md index 64c5d9cb1f25..dd924f98c141 100644 --- a/doc/python/symbol_in_pictures.md +++ b/doc/python/symbol_in_pictures.md @@ -8,7 +8,7 @@ Compose Symbols The symbols are description of computation we want to do. The symbolic construction API generates the computation graph that describes the need of computation. The following picture is how we compose symbols to describe basic computations. -![Symbol Compose](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/symbol/compose_basic.png) +![Symbol Compose](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/compose_basic.png) - The ```mxnet.symbol.Variable``` function creates argument nodes that represents inputs to the computation. - The Symbol is overloaded with basic element-wise arithmetic operations. @@ -18,14 +18,14 @@ Configure Neural Nets Besides fine-grained operations, mxnet also provide a way to perform big operations that is analogy to layers in neural nets. We can use these operators to describe a neural net configuration. -![Net Compose](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/symbol/compose_net.png) +![Net Compose](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/compose_net.png) Example of Multi-Input Net -------------------------- The following is an example of configuring multiple input neural nets. -![Multi Input](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/symbol/compose_multi_in.png) +![Multi Input](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/compose_multi_in.png) Bind and Execute Symbol @@ -33,11 +33,11 @@ Bind and Execute Symbol When we need to execute a symbol graph. We call bind function to bind ```NDArrays``` to the argument nodes to get a ```Executor```. -![Bind](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/symbol/bind_basic.png) +![Bind](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/bind_basic.png) You can call ```Executor.Forward``` to get the output results, given the binded NDArrays as input. -![Forward](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/symbol/executor_forward.png) +![Forward](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/executor_forward.png) Bind Multiple Outputs @@ -45,7 +45,7 @@ Bind Multiple Outputs You can use ```mx.symbol.Group``` to group symbols together then bind them to get outputs of both. 
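A minimal sketch of that pattern, written against the 2015-era API these pages document; the dict form of ```bind``` and the shapes are illustrative assumptions.

```python
import mxnet as mx

a = mx.symbol.Variable('a')
b = mx.symbol.Variable('b')
c = a + b                        # first output
d = a * b                        # second output
group = mx.symbol.Group([c, d])  # a single symbol with two outputs

# Bind NDArrays to the argument nodes, then run forward once to get both.
executor = group.bind(ctx=mx.cpu(),
                      args={'a': mx.nd.ones((2, 3)),
                            'b': mx.nd.ones((2, 3)) * 2})
executor.forward()
for out in executor.outputs:     # outputs[0] is c, outputs[1] is d
    print(out.asnumpy())
```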
-![MultiOut](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/symbol/executor_multi_out.png)
+![MultiOut](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/executor_multi_out.png)
 
 But always remember, only bind what you need, so system can do more optimizations for you.
 
@@ -55,7 +55,7 @@ Calculate Gradient
 You can specify gradient holder NDArrays in bind, then call ```Executor.backward``` after
 ```Executor.forward``` will give you the corresponding gradients.
 
-![Gradient](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/symbol/executor_backward.png)
+![Gradient](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/executor_backward.png)
 
 Simple Bind Interface for Neural Nets
@@ -65,14 +65,14 @@ graph like neural nets. ```Symbol.simple_bind``` provides a way to simplify the
 procedure. You only need to specify input data shapes, and the function will allocate the arguments, and bind
 the Executor for you.
 
-![SimpleBind](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/symbol/executor_simple_bind.png)
+![SimpleBind](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/executor_simple_bind.png)
 
 Auxiliary States
 ----------------
 Auxiliary states are just like arguments, except that you cannot take gradient of them. These are states that may not be part of computation, but can be helpful to track. You can pass the auxiliary state in the same way as arguments.
 
-![SimpleBind](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/symbol/executor_aux_state.png)
+![SimpleBind](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/executor_aux_state.png)
 
 More Information
 ----------------

From 52f02d9c7a57f475529af0771a343f2743f0953f Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Thu, 22 Oct 2015 18:44:23 -0600
Subject: [PATCH 077/122] [Example] Fast sign adversary

---
 example/README.md                            |   3 +-
 example/adversary/adversary_generation.ipynb | 375 +++++++++++++++++++
 2 files changed, 377 insertions(+), 1 deletion(-)
 create mode 100644 example/adversary/adversary_generation.ipynb

diff --git a/example/README.md b/example/README.md
index 3921d3fcd198..34ad16da9a83 100644
--- a/example/README.md
+++ b/example/README.md
@@ -14,7 +14,8 @@ Contents
 --------
 * [mnist](mnist) gives examples on training mnist.
 * [cifar10](cifar10) gives examples on CIFAR10 dataset.
-
+* [adversary](adversary) Find adversarial samples by using the fast sign method
+* [rnn](rnn) LSTM example
 
 Python Howto
 ------------
diff --git a/example/adversary/adversary_generation.ipynb b/example/adversary/adversary_generation.ipynb
new file mode 100644
index 000000000000..3fafebd9b14d
--- /dev/null
+++ b/example/adversary/adversary_generation.ipynb
@@ -0,0 +1,375 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Fast Sign Adversary Generation Example\n",
+    "\n",
+    "This notebook demonstrates how to find adversarial examples by using the symbolic API together with NumPy",
+    "\n",
+    "Reference: \n",
+    "\n",
+    "[1] Goodfellow, Ian J., Jonathon Shlens, and Christian Szegedy. \"Explaining and harnessing adversarial examples.\" arXiv preprint arXiv:1412.6572 (2014)."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import mxnet as mx\n", + "import numpy as np\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.cm as cm\n", + "\n", + "from data import mnist_iterator" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Build Network\n", + "\n", + "note: in this network, we will calculate softmax, gradient in numpy" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "dev = mx.gpu()\n", + "batch_size = 100\n", + "train_iter, val_iter = mnist_iterator(batch_size=batch_size, input_shape = (1,28,28))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# input\n", + "data = mx.symbol.Variable('data')\n", + "# first conv\n", + "conv1 = mx.symbol.Convolution(data=data, kernel=(5,5), num_filter=20)\n", + "tanh1 = mx.symbol.Activation(data=conv1, act_type=\"tanh\")\n", + "pool1 = mx.symbol.Pooling(data=tanh1, pool_type=\"max\",\n", + " kernel=(2,2), stride=(2,2))\n", + "# second conv\n", + "conv2 = mx.symbol.Convolution(data=pool1, kernel=(5,5), num_filter=50)\n", + "tanh2 = mx.symbol.Activation(data=conv2, act_type=\"tanh\")\n", + "pool2 = mx.symbol.Pooling(data=tanh2, pool_type=\"max\",\n", + " kernel=(2,2), stride=(2,2))\n", + "# first fullc\n", + "flatten = mx.symbol.Flatten(data=pool2)\n", + "fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=500)\n", + "tanh3 = mx.symbol.Activation(data=fc1, act_type=\"tanh\")\n", + "# second fullc\n", + "fc2 = mx.symbol.FullyConnected(data=tanh3, num_hidden=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def Softmax(theta):\n", + " max_val = np.max(theta, axis=1, keepdims=True)\n", + " tmp = theta - max_val\n", + " exp = np.exp(tmp)\n", + " norm = np.sum(exp, axis=1, keepdims=True)\n", + " return exp / norm" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def LogLossGrad(alpha, label):\n", + " grad = np.copy(alpha)\n", + " for i in range(alpha.shape[0]):\n", + " grad[i, label[i]] -= 1.\n", + " return grad" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Prepare useful data for the network" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data_shape = (batch_size, 1, 28, 28)\n", + "arg_names = fc2.list_arguments() # 'data' \n", + "arg_shapes, output_shapes, aux_shapes = fc2.infer_shape(data=data_shape)\n", + "\n", + "arg_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]\n", + "grad_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]\n", + "reqs = [\"write\" for name in arg_names]\n", + "\n", + "model = fc2.bind(ctx=dev, args=arg_arrays, args_grad = grad_arrays, grad_req=reqs)\n", + "arg_map = dict(zip(arg_names, arg_arrays))\n", + "grad_map = dict(zip(arg_names, grad_arrays))\n", + "data_grad = grad_map[\"data\"]\n", + "out_grad = mx.nd.zeros(model.outputs[0].shape, ctx=dev)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Init weight " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ 
+ "for name in arg_names:\n", + " if \"weight\" in name:\n", + " arr = arg_map[name]\n", + " arr[:] = mx.rnd.uniform(-0.07, 0.07, arr.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def SGD(weight, grad, lr=0.1, grad_norm=batch_size):\n", + " weight[:] -= lr * grad / batch_size\n", + "\n", + "def CalAcc(pred_prob, label):\n", + " pred = np.argmax(pred_prob, axis=1)\n", + " return np.sum(pred == label) * 1.0\n", + "\n", + "def CalLoss(pred_prob, label):\n", + " loss = 0.\n", + " for i in range(pred_prob.shape[0]):\n", + " loss += -np.log(max(pred_prob[i, label[i]], 1e-10))\n", + " return loss" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train a network" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train Accuracy: 0.92\t Train Loss: 0.28077\n", + "Train Accuracy: 0.97\t Train Loss: 0.08434\n", + "Train Accuracy: 0.98\t Train Loss: 0.05849\n", + "Train Accuracy: 0.99\t Train Loss: 0.04577\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py:11: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n", + "/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py:4: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n" + ] + } + ], + "source": [ + "num_round = 4\n", + "train_acc = 0.\n", + "nbatch = 0\n", + "for i in range(num_round):\n", + " train_loss = 0.\n", + " train_acc = 0.\n", + " nbatch = 0\n", + " train_iter.reset()\n", + " for data, label in train_iter:\n", + " arg_map[\"data\"][:] = data\n", + " model.forward(is_train=True)\n", + " theta = model.outputs[0].asnumpy()\n", + " alpha = Softmax(theta)\n", + " train_acc += CalAcc(alpha, label.asnumpy()) / batch_size\n", + " train_loss += CalLoss(alpha, label.asnumpy()) / batch_size\n", + " losGrad_theta = LogLossGrad(alpha, label.asnumpy())\n", + " out_grad[:] = losGrad_theta\n", + " model.backward([out_grad])\n", + " # data_grad[:] = grad_map[\"data\"]\n", + " for name in arg_names:\n", + " if name != \"data\":\n", + " SGD(arg_map[name], grad_map[name])\n", + " \n", + " nbatch += 1\n", + " #print(np.linalg.norm(data_grad.asnumpy(), 2))\n", + " train_acc /= nbatch\n", + " train_loss /= nbatch\n", + " print(\"Train Accuracy: %.2f\\t Train Loss: %.5f\" % (train_acc, train_loss))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get pertubation by using fast sign method, check validation change" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Val Batch Accuracy: 1.0\n", + "Val Batch Accuracy after pertubation: 0.04\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py:4: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n" + ] + } + ], + "source": [ + "val_iter.reset()\n", + "data, label = val_iter.next()\n", + "arg_map[\"data\"][:] = data\n", + "model.forward(is_train=True)\n", + "theta = model.outputs[0].asnumpy()\n", + "alpha = Softmax(theta)\n", + "print(\"Val Batch Accuracy: \", 
CalAcc(alpha, label.asnumpy()) / batch_size)\n", + "#########\n", + "grad = LogLossGrad(alpha, label.asnumpy())\n", + "out_grad[:] = grad\n", + "model.backward([out_grad])\n", + "noise = np.sign(data_grad.asnumpy())\n", + "arg_map[\"data\"][:] = data.asnumpy() + 0.15 * noise\n", + "model.forward(is_train=True)\n", + "raw_output = model.outputs[0].asnumpy()\n", + "pred = Softmax(raw_output)\n", + "print(\"Val Batch Accuracy after pertubation: \", CalAcc(pred, label.asnumpy()) / batch_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Visualize example after pertubation" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "true: 9\n", + "pred: 8\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPwAAAD8CAYAAABTq8lnAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJztfU2MLNlZ5blVr6oyq+q913YztC2rB7Ogd0hGI3njGZmF\nhUCWYNgYtYSwkBmxYBiEWNhmAQwsBixhIVigYbCRzSB+NAiPWcBgpEFjFvwY2WNmMDRI3ZLxuLv9\n09XvvarMrKzKO4uXX76TJ78bEZkZERmZeY8UisisrIjIyHvu9//dEGNERkbGfuBg0zeQkZHRHjLh\nMzL2CJnwGRl7hEz4jIw9QiZ8RsYeIRM+I2OPsDLhQwjfGUL4+xDCP4YQ3l/nTWVkZDSDsEocPoRw\nCOAfALwLwJcA/DWA52OMX6DP5AB/RsYGEWMM+t6dFc/1dgD/FGN8CQBCCL8D4HsAfIE/9O53v3t2\n/MILL+C5555b8XLNY9P3N5lMMBgMcHV1hcFgsHA8HA43dm9todfr4fT0FP1+f7a37fT0FKPRKPl8\nrq6usOtJZAcHBzg6OsKdO3fc7eDgicL+4osv+udY8dpvAfBFev3P0/cyMjI6jFUl/EpTqc3AvO/K\nrDyZTHBzc7Px69/e3uL29haTyWT2fOwZhRAK90Czz7jo+re3tzg8PFzr/Hfu3MHh4SEODw9xcHAw\n2/j7hRBm79tn79y5g6OjI0wmk+S57f6858P7XceqhP8SgGfp9bN4LOXn8MILLzy50J07s0E8mUzc\n403i9PQUV1dXG7t+jBHD4RCj0Qjj8Rjj8XiO+MCTwc6Dnl/bs0w953URQii89p07qw6nxzg5OcHJ\nyQmOj49xdHQ0Iz5PLEzy4+Pjue9VNIZubm5wcHDQ2fG3LmzslGHVX+gzAL4lhPBWAP8PwPcBeF4/\nxDaxPViTYLpt+oGfnJxgMBhs7PqTyQTX19ez7ebmBjc3Ny7hbdDzdnBw4D5Xk751gAln1+TX6+L4\n+Hi2sV1qtil/f5PoMcbZRFQ2hrzxZ+iStrkKzP9huLi4cD+3EuFjjDchhH8P4H8AOATwEfbQp2AP\n3AYzb3UNym1FjHEm2T0Jz4Pdc9gcHh4uPNMQwmzSqANl98Cq9yowqX10dDRzTh0eHs5J+IODgzlt\nkTWOMnjPB9gfdR5YXcIjxvhHAP5oic/PEf76+nphcO8zYozuJOhJeLNZebtz587sWdrgt2e+LhEN\nJuH1+kbSdeFNYqrSs4TXCaiMuDbu+Jwxxr0ae+sZXUuCCT8ej3F9fY3RaITr6+u9eugebOAVmTus\nUhvRjo+PcXJyMkcQO58977oIz84yvf7x8fHa5/dMFSY9E1zJXsUstMlQyV5FO9gVtEZ4HoAmiUaj\nEUajEYbD4UY95F2B50xiCW+EMwl7fHyMXq83I5wNZtWm6pTwdn1Tv09OTmb3sO51Ug5BddqFEDCZ\nTObU+ypmy2g0mj0fI7udr4oPYBfQKuHtIbOEHw6HGAwGmfDwQ2k8CFml9ghnn2eysw28LjwNo9fr\nodfrod/v13J+3WtIzjYLsS0TVjNJbhOEjcMs4WsCk9ge7ng8njvm9zLSUNIyGZgIReSuEscv+l+b\naNR3YMdNY5n79XB7e7vgH9gnsgMNE57j2jc3NzN73cJO5pTKqAaT3iyZjOS3t7czE0mfb1Ecfxkv\nN5sPnCRTlwaR0TwaJTzHtc1DaoklNiC3Pf7ZJtTpybanPl+LfKjTz4vj23tlMLKbV17j5BndR2sS\n3hJLWIXPEn45sCd/PB7P3mepb884lbiTiqFXyZLTUKCGzTK6j9YkvA1K27zU0YxisEqvry0OXxTH\n57CaJrlUscFTCT+Z8NuDVgnvxZgz4auBve/62vLEvWer6aeauMNqehmKYuQZ24HWVHqz1Tm+nMm+\nHLgIxsjOzjcvhu+p9BxW07BeETROzk7DjO1AaxIe8OPL2WlXHWbDc7os78vi+F7ijhG+1+uVXt+L\nj2eybxcaJTw7ljLqhanpvDd4Utfi6Op4W8aG33ZwroLW03NBDrC79fKt5tJnrIeyevgymOrObZL2\nyQbX/HsusQUwC2HuYr28IRN+i1BWD1/2v1zoso9hNXZa3t7ezjkqDw4O5joOcT8BYHdMz0z4LUFZ\nLXrVODonzexbppyW1/L7h4eHC2HNuvsJdAGZ8FuEonr4qnF07Xq6bxL+8PCwMBnJkpdYsmfCZ2wE\nRfXwZYS3/1WTYN8IbxOmV0+vz8OiIrtUPpsJv0UoqoevEkf3HH77SHg+5np6Nm+4hHaXnk8m/Bah\nqB4+x9HLwVEOr57eutpy34YqDtFtQqOEX6VtcSo5J2O+TbT2ZN+HOHoT4FwGr8fArqFRwnPb3DKY\nc8Rr77RLcdCMzSE1tuyY+wloCfeuoFHCL9P2qKiJ4644TDI2Cy448jbrs8iEr9ovb1vQGQmv/ep3\nNSySsTlohaGVafOxtl3btSYtnZHw3M22qb7qGfsNbRFmmzYN8XoK7Ao6RXgOi2j9d0bGumCz0VqC\n2TYajeYIvqv9Gjqj0ntqvDV2yBI+ow6UrY2gTUN2sW9DZyS8tm3SRQIyMtYFCxJu+mlrI3A3Idvv\nCtENjRJ+mdiwFSrYdnR0NPd61x78srCssKK10zeNKrXk6
/TGbwpK7E3fU5O5KJ3KtNNaZXaY7Hti\nSQhhri98F8tbvTZm/Nrrhc/Zb02jrB5+00IlVYtfp9OwM4TnJYT0h7Ba5X1GCGGu4WQXq93YKebF\nu73inTbLc8vq4Tftjbdnps8OQG3O684QHpiX8Pbw7Ufa9I/RBWhb6S5KeI1z86a1/MB8QUvTKKuH\n37SEbyMXZS3ChxBeAvAAwC2AcYzx7Wuca/ZjeCul7DvheW231Prpm0Yqzm2xbpuoeDJvm/BF9fBd\nIHzT69evK+EjgG+PMX593RthcvNrU78y4cOCKty1jjWpOLelq/LvyCac2f5Nf4+yevhNE17D0PY8\neRnwdVGHSl/br2QDQCX7LoZHVoHn9OqShNdlmDnsNRqNZqRisrdJtLJ6+E3D1gsEFnNR6kIdEv5P\nQwi3AP5zjPG/rHoi+wG4LhnYzVjoquh6PXsqscXi3JYmrY082iZ8qh5+0zC/BmtKnIvSBQn/jhjj\nl0MI/wLAp0IIfx9j/LT98YUXXph98Omnn8bTTz9dekKuT2Y7i//epRhu16DPoujZecfLDCq9lk7S\nGlqyRh29Xg/9fh/9fh+np6ez47LfsSzOz9fV1zyWujpeYowL+Sf8OsY415qMV/4xLaoMaxE+xvjl\n6f4rIYQ/APB2ADPCP/fcc8ucy61RZnWrqCd7V3/ENsFNHJTM+tw0gYe1KgDusTfR8vFwOFzYTMIP\nh8MZ0ZnwtvV6vUqEZxNP+yV4efDblBOvfoXj4+O5+7Y2XN5vaJOn4eLiwr3GyoQPIZwCOIwxPgwh\nnAH4DgD/cdXzAcU18UDxYoaZ8Ivk1j2HxeyY9ymJacc6oehmNntqY+nu7ct+Q8519445OsAlr9tS\nYl2UixJCmJlE/LsuO/bXkfDPAPiD6cXuAPitGOOfrHqyshhujHEuJGVx1Ez0eagkZ2lQtMwUh6VU\nFbbNm0R4z9VnVnLKr7n/nrdVIbyND++YIwJ2zM6vrkMnZo5o1BWaXpnwMcYXAbxt7Tt4cr45wnOd\nsq1R5816qoruM3j256w223Nba2/P6nKK8KnOtwcHBwuxd92Y8N5xGeG9hhUsFIbDIY6OjjAYDGY+\nIO48uw3jRHNRvNCht1WdDDqVaceE11plAAtxXFPrt+GHbAOenc7NLq2tNW9MPM8+5mPP9ufXJm09\nDe3m5mahrTYfn5yclBLeS+bRxB5tNV13WKtJpHJRWOKrOWOvq05onSG8DS6V8ObwAbCQbstqT8Zj\nMOk1ldUI53nI+/3+ghNMnafsN1E/isXUizbTJlJbGeFVCOhrS6hhyW6Za9sAzRMwNZ43nlRtr5GI\nInSG8AAWVHr28Koazxl4WcIveuhVuvN68P1+H2dnZzg7O8Pp6els71VpMelTmX4qgVKbZ1JwQVAV\nwpvnX7fhcDiTjEz2rhUYFcHLEwB8Ycix+WXSbztDeJ6lWCqwp1XLZveR7Kk4uw0SJjc75Y6Pj3F6\nejq3mZS3LUVUlfBMcn5dFjYzYjPpeatCeG99PLs+O+500cyUN7tr46fI627Sn3+bZSezzhA+oxyp\n+LptTCZPirIKz2vFczqzntsqFT2/gBcW8o7NsVoU0lvmGXgLcZgGoZEHb3VdL7uua8RvCpnwWwQl\nCcdiDw4OXKecOujUWcZltl7M3Zx1mvShmV4eae2c6uVf1ab2/BOshXiaDWsAXuadfd99QSb8lkE9\n8Szxjo6OZoRmaW7HRgbTBJgULIWNEHzsheE8dTIlrb1kkVUSpjwJb4TV76RqfyoCAewP6TPhtwgq\n3b04uznl2BlneyWB1tWrBORQj71OJd0Yce1zfM98vpRWUIX4+t21ht1LJuJNCW6Za/tCdiATfuug\nZNc4O0v4s7MznJ+f4/z8HGdnZ8k8bM6lT6XV2rWLbHBP2vPE4ZF9GSmv313vKyXh7Rlxaiqv+74N\nHvy6kAm/JUhJd4/wLOHPz89x79493L17d3YePp++B6SdWqn/KZLUTPoqdn+V56DJKfZeKmXYSM85\nG6x18H3uOjLhtwhqv7PzymxzS6w5PT3F+fk57t69i3v37uHevXuuDVuUtJEiQFW7Xf/uORtX+f58\nX/xMOBTnmS0e4dUE2XU0SvhlChZ0La+udCHZJFRyKsE1zq4lpxxnt0w6L76uz1qJvork43tn1bvo\nc1XBXn7WFlLPxTSfEMJCc8hl8tB3AY0S/urqqvJnb25uZllTvHLnviXXpNRjc1JprJ03U93Pzs5m\nsXYv+cTCbBZfLyqaMSzzG6T+T9uVVU0H5efBJgJ/HwBuVqGZOKPRCIeHh3MLlvK59gWNEn4wGFT+\n7O3t7Vx+tBJ+H36UIvvYCH90dJSsKTfnnBHeEm44vVRj66nsOD2uCrX/lfzL5H1XeU6s4nuE7/V6\nszFlGhJLeO63sA/ojISfTCYLS/fuY7daz4NtAzvlmNOUWduY8Oyl5tg6E1BVfgBLha48suvxuqQv\n0oB4UmTCm1RnbcC+r6n42WlXA5aR8PbwvbW590XCM1JeeXbMnZ2dzVT48/PzhZ5xvV5vQcIXkdBC\nVYZVnrt3bu9a/PlVia8TVkrC88IO9gysUMvCk/uCThFeyyn30X5XoquEt2w6IzyH3bymFmzDF8XZ\njdys3q7qxU5J8jIfQdXkG++9FOHH4/Gc2q4VmV3q698GOqPSe2ol25H7BI/42sTC1Pm7d+/i/v37\nuHfv3ozcXhyaVVmDquBsTvDvsSohlNz8W3o2fpXnohMDv2c5CUZ4k+w8nnShDO6LuA/ojIQH0svk\n7hPhPbKzfco2PMfZn3rqqcJ6dUtUST1LT81ehexVpPoqZOfnY/+rTk6V8EZu/j68NrxNhFmlrwnW\niy6jHCmSmwTSmDKHnGxLFbiwBCsiMDvouH0S9yQAikld5PFnqXtzc4OTk5OF2m6+Ry+zz/sOHMnw\nmn4Y8a2pSlfX5msDOdOuQ2D1XdtIm+3OtrmVti5btpoC+1G446yFS7WfGvd+Lwrx2V7Lc7VkNzVR\ned/JC2HaXh2dmlOvz2ufkAnfIagdyt1dzHbnDjEqqcpIUQazcb2egqPRKNmckluJFxG/qC+9JcYo\nMZWk7MzUjDvbq4akpK8jp39bkQnfIbCtbiTXrDHP++51oFl2MKudq+vCDYfDhX7zrAWMx+PSBB6t\nz7f9aDRCv99fmOTY6Xh0dDRX1WdkV3NFHZ0s5T0JnwmfsTGoR97UeO5koyo9O54823cZqM3Oq79e\nXV0tNJDU10XqfIxxlhBkyULD4RCnp6ezSYO1F/ZZWBIQd67hZ6bOv2Ul/D4hE74jsMHHKj1XwDHZ\nPQnPDq9ViZ9S6QeDAS4vL2eSfjAYLGzD4bA0Vdcq+AaDAc7OzuZqJ8bj8UKfeg7jcZKNRRxS39ez\n4S3JJiXh94X4mfAdAifYcAhOV2lRCa+E5/Ppe0UoUumvrq5wdXWFy8tLd391dVXquDOym1+A06gt\nVGaZcbroSKos
NpVLwJNnSsLvm4ceyITvFFgqqYS3Yhi14ZXwjGUHc0qlN8I/evTI3R4+fIjLy8sF\nwivx1SxgsnOMnHP52czRZ2WhPG8i8FR6zwG4T9Id2ELCcx40Z01V/dHKYrybhGfDlznsVpVUaveW\nSffLy8sFkuveU+P5NUtd1WBMrbf10HWhSC3sub29nSvz1co+dd55XXf3jezAlhG+KA+6LMlHwzne\n8SahA1Sl/DKSPYWitFomu0r2y8vLmRQ3FZ7VcitnTmXVaaKO2tfqrFPfhKb78qTPSNVg2ETDnvtV\nIhm7gK0hPP/IvNyO/Y0XGvCgNl3K0bVJqITXhheedF/FMZfKkPOW+GJV3shujjpVzb1z63Xse3oZ\ncZ5fQidjTvnVQh9NAuJnas4+nej3DVtFePYiczGIeWCLwM4w3jSBY5PggchFMGW2+yrqvG5VJLwR\nnW1xdrx5OfLesTonOQSnWXE8ofG96iTAf/MID8BV57sy2beFUsKHED4K4N0AXo0xfuv0vTcC+F0A\n3wTgJQDviTFeNHifAJ6o9DwA2J4rAueje4OhC/BSaz0Jz6vFLEP4FNGLwnFsv7OHXVX6KnUT+tzZ\nbPEmNE8K831X+V58vXVTj3cBVST8bwD4FQAfp/c+AOBTMcYPhRDeP339gQbubwa121gqVbFlzcFj\nA4IHnTqwNgXPacfSz7Nx15Hu6gRLOezMKZdKujEJr99FX3uE50nNWx7Ks+GN7HbsTXrqvPNU+kx4\nBzHGT4cQ3ipvfzeAd06PPwbgz9Aw4af3Mit3ZMle5YfjBhBsQ3ap3r6I8EUScBXSq9fbJk8umFEJ\nr+uyq0pv3yG1TxFel78q8lHwhGV/YxU/lXiUkvB8j/uAVW34Z2KMr0yPXwHwTE33kwTbbqkfuAhm\n46vDiJ1Jm4TnpdfQFdu5no1bhpTqWybhzYa30Bnn0vPm2cWp16kUYrXhVcLzd7HzGVLS217nsFwN\nTrsYYwwhtMKYdcgZYyztfa9OJ88J1RRijC6ZTJpeX18DWCz/PDg4qOS0BLAg0bm8NbUV9RjkGPfR\n0dEC2fS111pbJbtHdnXc6XPjY804ZNJrvH6V3zUVhaiKIg2o7Pycm6D5BzY5l2FVwr8SQnhTjPHl\nEMKbAby64nk6A8/Lqw+1SXB1GPskjJCWdsplpb1eby4jrQxKdD1Wu1ftbM5L9yYLnoh0Ugoh4I1v\nfOOsHdf5+flCo80U2ZcBT0imCdo5NJV32d+2bHyUncd7JuqY9M7PJiyTnr+DZWIaLi58H/qqhP8k\ngPcC+IXp/hMrnqdT0AHMr1kTaAJefgCr2tfX17NqMy5HBZ50eqny/ZTk+p4NWs/GLhqM5jzzmnHY\n9oY3vAFPPfUU7t27N6uYY8J7zSmWNVl4r2DNaZVGqUo8HSdl52FzjY+BJz6mMs0r9RtW/Q5VwnK/\njccOum8IIXwRwE8B+HkAvxdCeB+mYblKV+s4ODzltcxuEjzQgUWyD4dD3L17d5bVxpLdiFmGlHQq\nSlhh0gPpBSYAf0Dze/fv38f9+/dx9+5dnJ+flxJ+2dCZqsGquhvZPROl6vn5d9GtTChwQpX5j/hZ\nVzk/hxxXMU2qeOmfT/zpXZWvsgVgEpjzirdl1slbBRxiU7KbE82T7BzWMqQIooRXp12ZF93+lnLO\naXWaZjbywpYpwq+b8qzfjb+jt8jJOoRXx2UZ4e1ZqibF11cHKvt06tAytybTrg2wGqU93ZomvP3w\nrGVweGwwGCxIdm59VeX+VDoU+So80quKruo3E533dmxr1dtmdqfXyMObVKp+P88p6RGeO9pWIT0T\n3sjIYcoqhOeJVR2u3u/PY9DMJu/5VH1OmfBT6MPmjDP7UZuEeZF1MA2HQ/R6vVm9uX2WyW7JLwov\nGcVTB5XwNnhUwnvSW4+LNm85LF0dx+57FYcdfx8mtW3qtFtVpef6fe77V0b4o6OjuQnVnpv+Fqm+\nghaF0Ml2mQSsTHiC97BNwjZN+BDCgmTn0FW/3599zluQgouJUvDi8CrtDSq1OQtOF7zw+tB5e2ur\nzUtiMeFTiTDLEF+92byp024dlV4n5cFgUEp48wPxZHrnzp25/0ud3yYU1aA4FyNL+BVR5PhZ5n/5\nHGWwH8zzJVg/dwvJGcm1gKVKjLfMscWFOzypWK16aqtCel4GSzMHV7HXvWfvSUmewNeR8N71il4X\n3Z83MTHRPT8STwwcOq3VabdPUInG9laZFzylHi8jRXTAarYY23S6eZluTKJUgoc63VgSs8/Aio9Y\nRdfXqc3Lmfcy6daFkkkdX+uG5divYeOD/7+KDW/ajH3eNA8gvWT6sp74wnuo5Sw7ALVb1YFVhfBe\n/NT+tqxTSNW0MsLb+ufs+GKbL6Umq81uhOcwkEl9JbA65dQzr691vbt1EmxSz09Dq2yaMZE08aaq\nFqZCgcdIGeHtOZoaHmOcOVvtd9f6BLvHupAJT+AfU+1Zi0OnwM4cbrllJK4ClVCs4ocQSgnvJXQU\nEYn/poT3HIQaZvM2vr6+9iR/nRIegCvhjfDcIXcVld4IzwKBJ8sq5+EJzp7xZDKZaR0aCl7X7FBk\nwk/BarBJc35dlnhjP9p4PJ6TsEbWKmAJxeetSngjkU5WLIVskHrvGeGV7EdHRwtr16mXOJVdp5/T\nyaAJlZ5t4yIJv65Kz++pkEjdn27cKcgIrwlfdZEdyISfA6eoqvpWJqUnkydNEoFF9bwMbL/zexau\nA7Cw8oses2rKUoRztQ2q0gOYG8ga+jMfgZcHzv6Cor03CdSlzuszVPudVeU6bHh7zfUFZedhzUML\nlIoKmFibWBeZ8ASOA9sP64WsPDCxmOzLSDANkZlkNylc5rRTwnseXJbu9tpgA5nJrsUxVTc7d8qR\nqMd1oEjCG+nX8dLbPduxVQpWtbPNQWfS3CS82uye87cuZMJPwVKOiVLVoWM/uDqOliU8Z9LxvZUR\n3gYSS1MbkB7pPdh35z4BGrbT+1rlOLWvA15YTlV6L722CniS4oQZu24ZLDnKJk/20g+HwwWPvB5n\nCS/wBpIdq6PIsx/LBqAXY+d9HSg6p673ZktA6TJUtnEM3+rVq0pnQ51krIKq10898zLCe86wZcNe\nKR9IFXixeLvPpgu0gB0iPKuJnp1piR/cWWUZL7H+UKp23d7eLvRpr9vDak4dIzpnqE0mkwXC6+Z5\n1Tl0xs/Rjvn5No0yc6BswmW/hEf6ulJrU2p32Xk4954nnjoFRhl2hvDAfHmmeoO5N5x2fq06mNk+\n1Hg7p8Rq+mYdP6iZCab+XV5ezpHdpHnRpv3f9bXa1oY2pbxn73PUw56Ft3FkxLPjvXDXMjFuPbeO\nhbLf+ebmplDTaAM7Q3j1quvmpYIuS3iVHJy66aVvNiHhjfBM9pubGwyHQ1et581Uf16llZNr2NHH\nr+35Ng1PS9P7UDXcc6oWSXiW7m3Xw3MVpo6RtrAzhAfmwyQqwbxUUA6jl
SE1iDRRItW0YF1wnHYw\nGMyRfTQa4erqqpDwli5rVWr9fn8h3s5kt/frchZVhZKek3c0gsGRDM53KCK8Sudl7Hc977L18N5E\n0aZ0B3aI8GqvcydUzd9WdX8ZCV8U7imKodYBk/CmutqgY7KnSH9ycoLz8/PZIOXKLXtemrSjIbym\n4ZGdN7abmeQa2WBistPOSKlbVdLreZeth/fMgazSrwHNc2Y1Vhs48MBa1oZP1csXOXTqgBHeyD4a\njeY0lzLC88Rkz4uTawzqOGub9Ep4LiPV1GMmmRJeTS2WqKwlLCvhV62HV81kmRh+XdgZwtvg1Mos\nU2XNKcWf1eMi8CBJ1cun4qd1qvRGdm/i0mWp1H7X6reijjlG9E1I+JQ/RsnOWYxseqRUektjtc/w\nvgpSEr5qPbydIzVO2sDOEN5DKsRUNJBTk4L9n8ZPWYo0jZS2YPeqDkQNP5mjzrrkeCvH2Pk4c5Cd\nZk0ilY9vmz0DYD7BiT3ymoeum2FVgvHEr9dsI46+LnaG8DwAbMBzOEclmELDQHrcFXhONB28nqeb\nO6PY/6gU5P+1c/L/6zWKXq+CIt9KUYTE9rbY5SbDXl3HzhAeeFJZZoOAbdAqhPfKOu1v2wJNVvGK\nVQCf8Px5m1j4PTs/7/V4HXhVdKn75cIh23tr1nthr30m/84QXgcED5Tb29XWjwfaU2eXQSpUxmT0\nJDwTnk0TLuvV/HsmvJI9ZTKtCi/+ztAJykwSIzlLeI5GbMJW7ip2hvDAE3J7UqGMtJaNp11MNhGL\nroIi0qtkVjVZHZBGIK/5h52HJ8CmCO9l1+n9elESi5QY4VMSft/JDuwQ4XlA6OsqCTYHBwdz6ZGq\nynYRfK8ML2NNnwE7vIxAVtetEl67o/I1eKsDqXOmYuvsJTeVfpO56l3HzhAemA9dcfimSqydk06Y\n7Db4uybhGSqRVVJ6hPUkpteY0XP6pba64J2rSMLbyqmehK87F2LbsTOEtwEBYC7NsupgZEluA9zK\nSrdpsKhKn7KLPSeYNbwosuFZ1dZrrYtUjJz3XtmrEd7Ue69iMeMxdobwwHr16ZPJZOasYxXXbFzz\n9Gs65DLJKZ7Ti/frIISw0CXWq/1nwmvGmWaeeROHR/S6JLxmwWlWmhGaVXiT7Lb3vPTLJtekJhzN\na1hlDGwaO0X4daEqI3umDw8P52K8XP5aFakYfx0SMoRQud4/pZIXEVvDcHWr8QBmk6qRSo+vrq5w\neXk52/i1SvhVqtFSE429tslknQll08iEn0ITd1T9PTw8dBcJqGofMqFSRTzrIITg1vt74TQ7rkr0\npmx1haYt6/7q6iq5sf3u9Yir8huxmeMVuGgPQfYRbAsy4QleHN/ePzg4cJcrWsYhZB7v1Aot60LL\ngb0lnMrI7IXF2iK+PX9u46WeeFXh+VgJySWwVZBKmbVJSDeW8Nsi5UtHWQjhowDeDeDVGOO3Tt/7\nGQA/BOCRfG5iAAAdbklEQVQr0499MMb4x03dZBvQ2LS9Z68PDg4WUjqXTdvUaj4l6LpI1ft75Ew5\n9spIr+eoEyzhPTvdNv4bv/b6Eqyi0nMyEre6TuXq75qE/w0AvwLg4/ReBPDhGOOHG7mrDSEVxzeJ\nv04tMxOMy3d5UcV1YNpD2UIPTNxlJbv+v72uC2bDs/f90aNHMzvdiM3OO968gpllJmQvGYmz+TZd\ny14HSgkfY/x0COGtzp+6G5heEZyR5cXx1YmzbIyXSWkNOqx67fj4eO37V2egEt4j+jJqvJ6jbrB0\nHQ6HM6fcw4cP8fDhw7msOk/lZwJ6TS7KoMlImtyj3vlN1LOvi3X0yB8NIfwAgM8A+IkY40VN97QR\n2ICwH9WTaBym8Y7LYARUCW9NOtaFd88pNVwJrsdF6nxT5DfpyhLeCP/666/POeW4vNeONUS2rG3t\nZfNxco86ALcxR39Vwv8qgJ+dHv8cgF8E8L5a7qgAKUnFWOfHWPfHSxGBnXXemuq2rYui64cQ3Iae\nXmdbjud7PgDLPKx7sGthDDvqLi8v56IkqZV36sa2ENwmwzKsRPgY46t2HEL4dQB/uMp5lkEqhKTN\nEbz4aVt2lkpHPua++EVe9Dqu7z2nw8PDWQPLs7Mzd+v3++j3+8lYPg94LUhZ5/na/6bWzPPCYU3Y\n0PbcuGsSp1ZbAtY6Zl1T6PV6OD09nb2+uPAV7pUIH0J4c4zxy9OX3wvgb1c5z7JIxbAtLdZzqLQV\nNuHB4t2n9pxrYrlkvgfv+kZ43pjwvV5vbhUbXsucoVl5yzxf73P2XhHRta/8sjH2KuDnZ6vo2rkt\nSqNjjM29TZO+CqqE5X4bwDsBfEMI4YsAfhrAt4cQ3obH3voXAfxwo3eJ8r7zNgN7bYCbcDAV3WNZ\nX3ztolvH/ZU9n6OjoyTRbeNMPV2sw1CUjVYVTAw+TiXc6EKQTXnJ2anKZcL2vsX27T5szO2U0y7G\n+Lzz9kcbuJdSFMWxQwizAcKzrvYy29T9aYy8SQmfstO5L72S//z8vHSxDi/X3bSoZWLd3h4ol/Ae\n2euU8vz8lOz2m3HrNLv/nSJ8V2A2PBOKe7Gz/Qo8+SHqJNSq91e0tlvdhOewH5sQJycnher82dnZ\njOCe9mRgsqcaRBYhRfgYY6kNz6msdZPdnh/3QOAJnPsqsD/DIgvsxOwytorwHNbS5ZNSpZ9tER4o\nDrsdHx+7Dr067y91fV11JqXSqw8i1ThDSV+1Y6tHciU8Z7Z5hOfr12032+/Cx9wem5+FvWdZmNuC\nrSE8sKgy82BWspsN3+aP4UlYdoSl4uR1S3glvHnfmewe8VMJObZ5ZOdElarJLSnCp1R5s+HZXm8i\nPMbf2Rqi8HWY7Pa96/LBtIVGCV9HfrjBi1vz6xCCW/RgnymTLlVQFmdXG1i3dVGUWHNwML8QhUd0\nXleOJyJT+71rAYu92DlezoUl/Hne63upLVX15pUiN6U+F03AJkjUt1HnpN00GiU8xwXXxeHh4Uw1\n9hxKng3NMVRuV7VKHNXLQmO1nO+v6bCbt1mcXReM1GMOu+mS2UxKL87uVYwxKT0Se+E7df7ZsaXQ\nXl5ezppZVFmkMaM6GiV8v9+v7VyHh4dz9d7e6q9qw9pAMsJ7cXqgegyVY9wa7+Y4+6rrzxdBY+za\n3SYVZ1fSe4k1nlPOI28qbGb7VK1BUVMJPr68vJwVyzDhtcR1G5xjXcXWSPiDgydtpD0JqnFoL6zi\nxemXCat4sW4Ot2m5a1NeeF3+2p6LR3jeeH14vk+bND0pzAQtcqjZUsys9nPIjsNoutnfvPp2dQhm\nsq+HrZHwXkKLEoqdLfaaCerF6Vclu8bYvTh73amzmvbJ+e8admPHnG2aN+89w6I4uxKei1c0Tu4d\nexMA77kyjbvPNuGR31dsFeE9dZrJzjFUfq1tq8xe5RLYKvC84KzGtxVn5+uz402973xsK+iqw7Mo\nk05NoBTZtc9bakulPuv5telE
W4lT+4CtUekBfykiT8Iz2TU5wshug2wZQnoSnp1gbcTZ1V9g3vZ+\nvz8jue5tU9s/ZcN7HnntAuO1ofKceuzJT0n+stfZaVcftkbCA344it8zSXVw8GT1UxvA3MmGB1TV\nkAr7Cbw49/HxcaNxdu/6RngvocYjfqrS0Mse08Qabi7JUp5J72XJeX0A+ZxK7CKPfsb6aJTwdcSe\nl4FOBvo3tf1tW/b8dqyvU9de9vypycyT6p4Kb954jbVrYk0qhqxqPRNfa9V580J1/F6K6PbeLqAo\nVwIoX2yjaWxVpt0mwSSwgc8+AavKS0nRqlpEymzhenYvNVaJruHLqgkiqVg6d6Ph9lO8KYm9vVf0\nsivSuyhPw557KiwJtEP6TPglYOaAEV59Al58Hqim1muc3atnTzWu4PJWTazhSIFKn5R091R7j/CP\nHj2aJcs8evSoNA7vhed2heyA/xvyazYn2UcCoDXHZCb8EuAfyVJJ2cHF4TgNDVYB2+hejN+T7ufn\n57Njr12V5irYdbwJSJNbqkh46zf3+uuvV35+uy7hUz0JYnyysg6vfcC+iqaRCV8ROvD5PZP6FvLi\nlF52IJbBiwJonJ0dcUz2s7MzN+ymKj1Q3IQy5TRjwnML6QcPHuDi4gIXFxdzURNPtV0lpXmboITX\nPA3LZVAVv64oThVkwi8B9farF5tDSBoarAKOs2shDHvjVZU34nOojfcq4e1aRd9Tyc6tm1mlN8J/\n/etfXwj3aQiwKG13V2C/OWeFmok1mUzmwqD2us08g0z4JcDOFZN4XLzikb2qFPMy+bw4e4rsWs/u\nOf9S0QtGymFXZMO//vrreO211xaqGTkb0c6t19gleBKe/Soszdmeryt0WwWZ8BVhg9OkuNrD3DiB\nJfUyaqsX59fwW5EdX8UxVybZ+biKDc8SnjUSO2bzxruO93pb4flgeNLWXJBN9GzYG8Lz4GepzMRc\nFcsQWvd2zNWANki8enYOu2kv+dT1qlaapYpbjLSaBadxdn22XmXiumDNoO0YtppuRY5HnVg9Kd6m\n7W7YG8IDizYyx0DXbdZxcHBQWg9fFGfX8lav+IVLW+3cLC3WRSrvnZ+TZ3qYJqLRgbprCTyHnxKv\nSXB2obc+PPs7xuPxQr2HFQhpf/02NZy9IbwOVB3EdRC+qB6+LM6uYTeP9JxQYwPJBhl3nLHvtCy8\nVVFVytt38fr2sWe6ifJgzfFfpYnmOtCUYW6qyfd3c3ODw8PDmdbDppFH+DajFXtDeGC+QYb9CEbE\ndQdMCKG0Hp41DI2zHx8fu2Rn0mtPe076qUvCF2XFGTzn4snJycJ3aoLwXhsz7m3QJNSMYQlt1+aI\nhsbZbWL2CN8W9orwRjhOirGBWwfhy+rhecIx4nr17FoAY+WtPFmoqmj3sA6KylhTKj13D/a68TQl\n4c2ByARsmvA62ah/QlV6luymDXgLpWSVvgHwQNXX6zrt7HxF9fCpsJvZ5L1eb0GF13XfvPzsOrO0\nUio9O6aKwofssON9E4TXMt3r6+vGieOZE/zs+f74tan46vj0ztE09obwwJPyWU2KqeuBlxVOsEah\n9q9544sWevTCWZwMtC60hFXtePsOXnJJr9ebm9h4Xzfh7R61Jr9p1TjlMFQJb8fWb8E2TWbS/28D\ne0N4Dn959fJ1XiOVt56SjEVLQDHhPUeaqtzrwFPnPTveSy6x9e2L0nbXhUp4JvxgMGhFSmpYUEOD\nHML00pnL/r9p7A3hgWYGYeoa+l5KnWfJrj3jNc4eQphTFw0sWcpQlPTCvea9zbrWcCiJtZY2UHT/\nRTH6tgjV9QzCvSJ800hlugFIqvHcmkpbSJsJwrajhsmWGWD6eU2jTbWosu3q6gpXV1dzoaU2vczs\nJ+FcCoNNRPqMdq1IZx1kwtcITWdlO7bIbjcJb1Jfm0t6hF9lIGu6rB6zmuy1s9KVYdpOHFH/geYG\npOLz9plM+kz42qFOKw7F6QKPVgGXKm81wscY53ror5Nd5jmMbNMutLpZK6vhcLgxCW/Pksluz1hN\nEI2DZ5QQPoTwLICPA/hGABHAr8UYfzmE8EYAvwvgmwC8BOA9McaLhu+10/AkO1euFan0RnovpAcs\nVul5qnkVsERnh59JQpbm1hteNyM/t5DelIRX34hNQpusN+86yiT8GMCPxxg/F0I4B/A3IYRPAfhB\nAJ+KMX4ohPB+AB+YbnsNj+ycOss17l4oTs0AHrRqby9rvxuK0lO9BpV8rCvNbErCaytyS5zSuP9k\n0n69eddRSPgY48sAXp4ePwohfAHAWwB8N4B3Tj/2MQB/hkx4APMFMtqTrsyGT3mWWcLbfhXvsxcH\n5uwxlfC27JPZ7txjnr32bTvtQnjcNJTVe3vNajzXm2c8RmUbPoTwVgDfBuAvATwTY3xl+qdXADxT\n+51tIdR+Z7J7hNfGFuyUq+qgW8Vpx9dgwrOEZ8JzV1pPO2hTpbfNkqZ44lMnpy0vZv+TnXYVCT9V\n538fwI/FGB/yjBljjCEE90m+8MILs+Onn34aTz/99Hp323GwU0kdcGyz86KOnF7LJPJi7qkYv4dU\nvJqz1bSene10z25POfvaJDzvFUZy9oHUmdrbZdjvVIZSwocQjvCY7L8ZY/zE9O1XQghvijG+HEJ4\nM4BXvf997rnnlrjl7YZJHfXE8+o0d+/encXceQVX1go8u9z+VoaipBM2D3gVGfbCa9iNk3BS8f8s\nNbsBi/gYLi58H3qZlz4A+AiAv4sx/hL96ZMA3gvgF6b7Tzj/vnfgSjiT6GyrG+HtfW2UwWopS7Oq\nqbNFSTX8vqnxbKvz3sJu7In3zIu2s9gy1keZhH8HgO8H8PkQwmen730QwM8D+L0QwvswDcs1dodb\nAk4xNcIXrRKjhFcpbnZnVbW5yIvPcWjNRWdbndV5XhyyqKVTJvt2ocxL/+cAUrrku+q/ne2FEVYX\neTw/P8fdu3dxfn4+F3dPqfR2Lk4qqWone9lznjRWwlsHWs2s0zXf7f+zOr+9yJl2NUJVeiP8vXv3\ncH5+PuesS6n0RnZT76tmiakH3rQD29u5WcKbNDdPfNFyzxwa9HwDGduBTPiakFLpTcLfu3dvYSko\nlfB2HiVU1cUsONzGJoGdyz7DTjsmfKpKTru6GLJav33IhK8RptIfHx/PVHeT8Pfv359rfaW937RR\nxDoZdCzJOevMPscS3rLoLM6udfDaxkmvmbFdaJTwXmNFrynAtsRJU/cdwnwDSwvJac58qmOtFyte\n9pmYNsAE1+Qab812Ds15ST9tx9rL4DkMbe+FELt0711Ao4S/urqaHXPKqXfcddJznrtXDWdqfGqR\nCE0GqeM760Bm+1ybWVxfX8/F2L3OqavW2bcFTQvWY84pYN9DF7/LptAo4QeDweyYmxfoti3Q/Hh+\nrc64KmRXNb4qUim2Zp9zXjxLco6z62IK3Dyiq2E3Nlu8zSvsaTP1dxvQmoTnnHJv/fSuS
3hgsa88\nb6a6c4Yd2+lNkV1TZrXfGxfDWCadva8LKmhIr2uk56ShVAsu7fteNWlpX9CahOcup/YDcDpp18Eq\nvObKszrPKj2vQMO2eh3EB/y+dmyvc+MKzqQrWv0klaHXBeh31L70Xk++LOHn0Srh2YHCJaTbAm1E\nyWt/a2GMSnjOptNmGavAc1yphDc1XstcuWONR3g7X5cJ7/WlT3Xd7dJ32DRaU+k9NV5LHLsO9kOw\nN57z5rkvndrw6xI95Z3WGLv2bDfCs4T3bPiy62wanp+CU4E952P21M+jNQl/dHQEYJ7sbTZPWBfa\nYcVbv52LYjzHnZ1H9+v4L8pseLPdlfBVutZ0jSRF33EwGBQ2Csl4jEYJryuaWkMCU3PN5mqrpzmQ\nXixC/67/4y2+oF1rtKe8LqhYdp0qSNnZbLsbmbWRhRJePfTbAJbg2rxjW77DJtEa0zyHC3vndXKo\nG9qNpsyBpsdVl4fi3vK6ZDTfyypgFVXV1pubG7eBhdnwRXXumSj7g40R3ohmf2vaeeflAXB1Wtmm\najy3pzo/P5+R3at1LyL7MuTnZ6hpr1bfrk0o2WnHDi5W5TPh9wet5tIz4W2g2yCu0tFlHVg4TTfN\nA0hl1KWWiDIJf35+Pte2SiW8nXtVaDWcruXOYThuT8Wk99Ym3yZ1PmN9bETCm/rO6mgbhDcJrdEC\ng5JdF5Lw2kwb2c/Pz5POupRjbtV8eW4r7XmsmexMei9OnSX8fqFVCc8EZ0cTS8GmwI0krMiEQ4MG\nJjynz3oqvUl4I7xWwamEZ6walvOcVSzdUxL+6urK7TqbCb9faF3CA/OVXJ4HuwlwzJ/teR7wbLNr\nb3l12vFCkCblU3n2dVUFskrveeU5Ju3Z8RqbzjHq/UPrhGdy2b6NPHqOgzOJOVZbRnZNtlEJr5l0\ndVcDeumzWuJaZMN78elM9v1C6yr9pgbYZDJZcNhxTgBrAN5myxmpROTJITWJWaupMmhuPB/HGBfI\nzVlzHHrTNeDMUbcN8CYj22vqbBfTf7uOvep4w34DywMw+9rWJmOpzq9vbm4WVn7t9/uz6rPRaJQM\n6VX1T/AA1mKWyWTixtn5+PLyctZ9dhOLPa6Lonr3yWSykDefnY7LY28IX5YHYP4EtcFtPx6P3VVf\njWwnJyeuGl9Vpdewm263t7eFK7pamyouf2177bd1Yb9DquZd6921315GOfaG8MATZ6HlAYQQZiSz\nZYaZ6LyNx+O5hBuT7ka6Xq/ndsJZxoZP9ZOzTReL0E3TZ7dNwmtxTFm9e84jWB57RXiTIJz0YxKf\nk3C8tNvxeDznmTdysUrvxe95q3JvXMetx0z2FPG1/HWbJGBZvbvmEGSVfnnsDeF5MOlrU/FT9epM\n+NPTU1xeXuL09HTBSabaAZsEVVR6lmTeXhtZ6J5t3G204VnCc36Bfh+vu+62fMdNY28ID8y3ajay\nsyQHFotmmPBaKKMqtPa8M0dgldThGOOcuqrqKy8LpRl0Nul4i0hsq4TXxS6Hw2GycCiTvTr2hvA2\nKIwAqRBaqrjl+vp6lmDD6jyHwLwGnTY4qxCenVK6pZJp+D64JfU2dnzhtGGviUeud18fjRLeq3Mv\nirM2jXWvx8ktrGradufOnZXbIpsq62XMpVZ5VQnPlXR6vA1SXp1zumVyr49GCc/rVafiq9umlmls\nnL3ppi2YBlFVstveVFn2uGu3Gi/+bq9TIb1tITxPpLl8txk0Svh+vz87ZnVNt22BFyvn76GpuUWS\nXt+356Mrutp2dXW1oGHoVpS0sg2kYW1JV5HJqAeFhA8hPAvg4wC+EUAE8Gsxxl8OIfwMgB8C8JXp\nRz8YY/xj/X+W8NpPXOvht2FAAoukV8JbVR5/J4/c3jm5vfRgMMCjR4/w4MEDPHjwAA8fPnTtek5E\nUY2Jj7eBNBqK27YowzagTMKPAfx4jPFzIYRzAH8TQvgUHpP/wzHGDxf9M0t4bi3MGW5V88y7AFXn\nOUHGJjGuwPMkfOq1p9Ib4V977TU8ePBgwYuvHv1UWu62OLdSq9Zuw71vCwoJH2N8GcDL0+NHIYQv\nAHjL9M+lLFXCc8YZS8htQpFKr6G4MqJpgYyp9KPRaEb4119/HRcXF7i4uHDj85yMog7RbSI7gKTJ\nl0lfHyrb8CGEtwL4NgB/AeAdAH40hPADAD4D4CdijBf6P6zSe2o8273bACWRl/5aRHYluO6LJPxr\nr722IP30deq8eu2uwvM7ZLLXi0qEn6rz/w3Aj00l/a8C+Nnpn38OwC8CeJ/+H0t4bWtl5Gi6000T\n8ApbVCLxoC1T6+29FOEvLi7wta99rTDP3tOUto0o3iS1bd+h6yglfAjhCMDvA/ivMcZPAECM8VX6\n+68D+EPvf1988cXZ8f3799Hv93F0dDSTStZY4ubmyTrym4zTF8EmKu4hZ7Hwy8vLuW444/F41tvu\n+vp61uPOzuPtr6+v8fDhQzx8+HDOO2/b5eWlG1/n44z9hYVmy1DmpQ8APgLg72KMv0TvvznG+OXp\ny+8F8Lfe/z/33HOzY+ubzos5sLrGSStdVOvMi27rtRnJrZPOeDyekZ73dsxLbXmq/Xg8xte+9jV8\n9atfnTnpLi8vMRwO58JTRQ7BjP2FdWAyXFwsWNgAyiX8OwB8P4DPhxA+O33vJwE8H0J4Gx57618E\n8MNlN8RNI43cJpWs73vKaWNlrJuExskfPXo0K4qZTCYYjUazBpbenvvue4S/ubnBxcXFzF5nwlvI\nLZM+Y12Ueen/HIBnZP/RKhdjwttAtYlASx/VwdcFsIS3iIOp+YPBYK5dlh4b4VP2/O3t7SzebrF3\nJrzFoz2tJ5M+oypaK55hCa8LS3KPua7G6VnC2yKZRvbRaISTkxO3eIZLZFPOOjvX5eXlrE2VHbOE\nT8XYMzKqotVqOSa4JamYKq/LMrE3vwtgwrNkt5j50dHRQsMLro33Ji0mq5kFXq48S3j7v0z4jFXQ\nuoQH5ju9msSySQBYXJGmKxLeogmTyWSWAsuqu9dAo6jvvtrymkzDx54Kn1X6jGXROuGZ1LY3wttr\njtN3JTHHJLxNRB6hvbp63ReRU9V2jVR495SRsQwaJXyRs02JYKWkXouprsC+T1fMjLqRmqT4N6ii\nZaSaiGSsh1T7tWXQKOGvr68rf5ZTRDmZJNup7UHbbOtxUeltjNFtDeZNGhmrQduor0L8Rgm/zGon\nXBTCrZky2dsDR1K8NfKK2mgblvFhZCwHJfwqz7ZThNeyyC5k2O0TNDmKk4ds9R3egPk8CSW6agkZ\n68GLAG094bNKvzno0tiaImz191rmrM7KVGgyYz14C50s+1w7Y8PrwgtZpW8fnCfBxUBWCMRSBXhM\ndpP89v8s4dU0yGr9+vCWMNtKCW+loVml3xwsGcqyIXnxzJOTkzlpwp1++H2WQLo4Z8b68JyhW+m0\ns/h2Vuk3A5YaZr8b4fv9Pnq93qyIyXISbm5u
3GWylfR2vox6sE7Ys9Ffgb23g8FgriGGIlUJlsne\nHjx1nJ134/F4bhntlJqect5tEmXjb9No6/5a+xWGw2Fbl8rIWEDXx19b95ddpxkZe4RM+IyMPUJo\nykYOIWTjOyNjg4gxLjhYGiN8RkZG95BV+oyMPUImfEbGHqEVwocQvjOE8PchhH8MIby/jWsugxDC\nSyGEz4cQPhtC+KsO3M9HQwivhBD+lt57YwjhUyGEF0IIfxJCeKpj9/czIYR/nj7Dz4YQvnND9/Zs\nCOF/hhD+bwjh/4QQ/sP0/U48v4L7a+X5NW7DhxAOAfwDgHcB+BKAvwbwfIzxC41eeAmEEF4E8K9i\njF/f9L0AQAjh3wB4BODjMcZvnb73IQBfjTF+aDppviHG+IEO3d9PA3gYSxYYbeHe3gTgTZEWQAXw\nbwH8IDrw/Aru7z1o4fm1IeHfDuCfYowvxRjHAH4HwPe0cN1l0ZnKjhjjpwG8Jm9/N4CPTY8/hseD\nZCNI3B/QgWcYY3w5xvi56fEjALYAaieeX8H9AS08vzYI/xYAX6TX/4wnX7AriAD+NITwmRDCv9v0\nzSTwTIzxlenxKwCe2eTNJPCjIYT/HUL4yCZNDkN4sgDqX6KDz4/u7y+mbzX+/Nog/DbE/d4RY/w2\nAN8F4EemKmtnER/bYV17rr8K4JsBvA3Al/F4gdGNYaou/z4eL4D6kP/WhecXZIFWtPT82iD8lwA8\nS6+fxWMp3xnYOnkxxq8A+AM8NkO6hlem9h9CCG8G8GrJ51tFjPHVOAWAX8cGn2F4sgDqb8bpAqjo\n0PMLiQVa23h+bRD+MwC+JYTw1hDCMYDvA/DJFq5bCSGE0xDC3enxGYDvQGJxzA3jkwDeOz1+L4BP\nFHy2dUxJZEguMNrCfbgLoKIjzy91f209v1Yy7UII3wXglwAcAvhIjPE/NX7RigghfDMeS3Xgcbnw\nb236/kIIvw3gnQC+AY/tzZ8C8N8B/B6AfwngJQDviTH6S4S2f38/DeDb8VgdnS0wSjZzm/f2rwH8\nLwCfxxO1/YMA/godeH6J+/tJAM+jheeXU2szMvYIOdMuI2OPkAmfkbFHyITPyNgjZMJnZOwRMuEz\nMvYImfAZGXuETPiMjD1CJnxGxh7h/wP1lwmC4rJXUQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import random as rnd\n", + "idx = rnd.randint(0, 99)\n", + "images = data.asnumpy() + 0.15 * noise\n", + "plt.imshow(images[idx, :].reshape(28,28), cmap=cm.Greys_r)\n", + "print(\"true: %d\" % label.asnumpy()[idx])\n", + "print(\"pred: %d\" % np.argmax(pred, axis=1)[idx])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From ad385c04841ba77092acf8276342df1bfcfe6479 Mon Sep 17 00:00:00 2001 From: II-Matto Date: Fri, 23 Oct 2015 11:45:50 +0800 Subject: [PATCH 078/122] Fix num_pad of io.NDArrayIter. 
---
 python/mxnet/io.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/python/mxnet/io.py b/python/mxnet/io.py
index f6056d9c25e1..4da24e1c1cf1 100644
--- a/python/mxnet/io.py
+++ b/python/mxnet/io.py
@@ -156,10 +156,7 @@ def __init__(self, data, label,
             self.batch_data[i, 0:actual_size, ::] = data[loc:loc+actual_size, ::]
             self.batch_label[i, 0:actual_size] = label[loc:loc+actual_size]
             loc += batch_size
-        if data.shape[0] > batch_size:
-            self.num_pad = data.shape[0] % batch_size
-        else:
-            self.num_pad = batch_size - data.shape[0]
+        self.num_pad = batch_size - data.shape[0] % batch_size
         self.out_data = None
         self.out_label = None
         self.current_batch = -1

From cf84037c7a55f3bc98fec447ae3a39929030f002 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Thu, 22 Oct 2015 21:21:31 -0700
Subject: [PATCH 079/122] [EXAMPLE] check in memcost

---
 example/memcost/Makefile             | 22 +++++++
 example/memcost/README.md            | 30 ++++++++++
 example/memcost/inception_memcost.py | 90 ++++++++++++++++++++++++++++
 3 files changed, 142 insertions(+)
 create mode 100644 example/memcost/Makefile
 create mode 100644 example/memcost/README.md
 create mode 100644 example/memcost/inception_memcost.py

diff --git a/example/memcost/Makefile b/example/memcost/Makefile
new file mode 100644
index 000000000000..ca6b543be4d4
--- /dev/null
+++ b/example/memcost/Makefile
@@ -0,0 +1,22 @@
+
+.PHONY: no_optimization with_inplace with_sharing with_both
+
+no_optimization:
+	@echo "Estimating the cost with no optimization..."
+	@MXNET_EXEC_ENABLE_INPLACE=false MXNET_EXEC_MATCH_RANGE=0 python inception_memcost.py
+
+with_inplace:
+	@echo "Estimating the cost with inplace optimization..."
+	@MXNET_EXEC_ENABLE_INPLACE=true MXNET_EXEC_MATCH_RANGE=0 python inception_memcost.py
+
+with_sharing:
+	@echo "Estimating the cost with memory sharing ..."
+	@MXNET_EXEC_ENABLE_INPLACE=false python inception_memcost.py
+
+with_both:
+	@echo "Estimating the cost with all optimizations ..."
+	@python inception_memcost.py
+
+forward_only:
+	@echo "Estimating the cost of forward only ..."
+	@python inception_memcost.py 'null'
diff --git a/example/memcost/README.md b/example/memcost/README.md
new file mode 100644
index 000000000000..c3a908490bc7
--- /dev/null
+++ b/example/memcost/README.md
@@ -0,0 +1,30 @@
+Memory Cost of Deep Nets under Different Allocations
+====================================================
+This folder contains a script to show the memory cost of different allocation strategies,
+discussed in [Note on Memory Optimization](http://mxnet.readthedocs.org/en/latest/developer-guide/note_memory.html).
+
+We use inception-bn as an example, with a batch size of 32.
+
+How to See the Cost
+-------------------
+The possible options are gathered together in the [Makefile](Makefile).
+Type the following commands to see the allocation cost. Look for the
+final message: ```Total x MB allocated```
+- ```make no_optimization```
+  - Shows the cost without any optimization.
+- ```make with_inplace```
+  - Shows the cost with inplace optimization.
+- ```make with_sharing```
+  - Shows the cost with the memory allocation algorithm for sharing.
+- ```make with_both```
+  - Shows the cost of memory allocation with both inplace and sharing optimizations.
+- ```make forward_only```
+  - Shows the cost when we only want to run the forward pass.
+
+Notes
+-----
+- You can change the symbol in [inception_memcost.py](inception_memcost.py) to the net you are interested in.
+- You will need to install mxnet or type make in the root folder before using the script.
+
+Notes
+-----
+- You can change the symbol in the [inception_memcost.py](inception_memcost.py) to the net you are interested in.
+- You will need to install mxnet or type make in the root folder before using the script.
+- The estimation is only on the space cost of intermediate nodes and weights.
+  - The cost of temporary workspace is not estimated.
+- The estimation does real allocation on CPU; the plan is the same on GPU.
diff --git a/example/memcost/inception_memcost.py b/example/memcost/inception_memcost.py
new file mode 100644
index 000000000000..8183c6774724
--- /dev/null
+++ b/example/memcost/inception_memcost.py
@@ -0,0 +1,90 @@
+# pylint: skip-file
+import sys
+sys.path.append('../../python/')
+import mxnet as mx
+import logging
+
+def ConvFactory(data, num_filter, kernel, stride=(1,1), pad=(0, 0), name=None, suffix=''):
+    conv = mx.symbol.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, name='conv_%s%s' %(name, suffix))
+    bn = mx.symbol.BatchNorm(data=conv, name='bn_%s%s' %(name, suffix))
+    act = mx.symbol.Activation(data=bn, act_type='relu', name='relu_%s%s' %(name, suffix))
+    return act
+
+def InceptionFactoryA(data, num_1x1, num_3x3red, num_3x3, num_d3x3red, num_d3x3, pool, proj, name):
+    # 1x1
+    c1x1 = ConvFactory(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_1x1' % name))
+    # 3x3 reduce + 3x3
+    c3x3r = ConvFactory(data=data, num_filter=num_3x3red, kernel=(1, 1), name=('%s_3x3' % name), suffix='_reduce')
+    c3x3 = ConvFactory(data=c3x3r, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_3x3' % name))
+    # double 3x3 reduce + double 3x3
+    cd3x3r = ConvFactory(data=data, num_filter=num_d3x3red, kernel=(1, 1), name=('%s_double_3x3' % name), suffix='_reduce')
+    cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), name=('%s_double_3x3_0' % name))
+    cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), name=('%s_double_3x3_1' % name))
+    # pool + proj
+    pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = ConvFactory(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_proj' % name))
+    # concat
+    concat = mx.symbol.Concat(*[c1x1, c3x3, cd3x3, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def InceptionFactoryB(data, num_3x3red, num_3x3, num_d3x3red, num_d3x3, name):
+    # 3x3 reduce + 3x3
+    c3x3r = ConvFactory(data=data, num_filter=num_3x3red, kernel=(1, 1), name=('%s_3x3' % name), suffix='_reduce')
+    c3x3 = ConvFactory(data=c3x3r, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_3x3' % name))
+    # double 3x3 reduce + double 3x3
+    cd3x3r = ConvFactory(data=data, num_filter=num_d3x3red, kernel=(1, 1), name=('%s_double_3x3' % name), suffix='_reduce')
+    cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_double_3x3_0' % name))
+    cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_double_3x3_1' % name))
+    # pool
+    pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type="max", name=('max_pool_%s_pool' % name))
+    # concat
+    concat = mx.symbol.Concat(*[c3x3, cd3x3, pooling], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def inception(nhidden, grad_scale):
+    # data
+    data = mx.symbol.Variable(name="data")
+    # stage 1
+    conv1 = ConvFactory(data=data, num_filter=64, kernel=(7, 7), stride=(2, 2), pad=(3, 3), name='conv1')
+    pool1 = mx.symbol.Pooling(data=conv1, kernel=(3, 3), stride=(2, 2), name='pool1', pool_type='max')
+    # stage 2
+    conv2red = ConvFactory(data=pool1, num_filter=64, kernel=(1, 1), stride=(1, 1), name='conv2red')
+    conv2 = ConvFactory(data=conv2red, num_filter=192, kernel=(3, 3), stride=(1, 1), pad=(1, 1), name='conv2')
+    pool2 = mx.symbol.Pooling(data=conv2, kernel=(3, 3), stride=(2, 2), name='pool2', pool_type='max')
+    # stage 3
+    in3a = InceptionFactoryA(pool2, 64, 64, 64, 64, 96, "avg", 32, '3a')
+    in3b = InceptionFactoryA(in3a, 64, 64, 96, 64, 96, "avg", 64, '3b')
+    in3c = InceptionFactoryB(in3b, 128, 160, 64, 96, '3c')
+    # stage 4
+    in4a = InceptionFactoryA(in3c, 224, 64, 96, 96, 128, "avg", 128, '4a')
+    in4b = InceptionFactoryA(in4a, 192, 96, 128, 96, 128, "avg", 128, '4b')
+    in4c = InceptionFactoryA(in4b, 160, 128, 160, 128, 160, "avg", 128, '4c')
+    in4d = InceptionFactoryA(in4c, 96, 128, 192, 160, 192, "avg", 128, '4d')
+    in4e = InceptionFactoryB(in4d, 128, 192, 192, 256, '4e')
+    # stage 5
+    in5a = InceptionFactoryA(in4e, 352, 192, 320, 160, 224, "avg", 128, '5a')
+    in5b = InceptionFactoryA(in5a, 352, 192, 320, 192, 224, "max", 128, '5b')
+    # global avg pooling
+    avg = mx.symbol.Pooling(data=in5b, kernel=(7, 7), stride=(1, 1), name="global_pool", pool_type='avg')
+    # linear classifier
+    flatten = mx.symbol.Flatten(data=avg, name='flatten')
+    fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=nhidden, name='fc1')
+    softmax = mx.symbol.Softmax(data=fc1, name='softmax')
+    return softmax
+
+
+softmax = inception(1000, 1.0)
+batch_size = 32
+
+# grad_req='null' (passed as a command line argument) skips gradient
+# allocation, i.e. estimates the forward-only cost
+if len(sys.argv) == 2:
+    grad_req = sys.argv[1]
+else:
+    grad_req = 'write'
+
+texec = softmax.simple_bind(ctx=mx.cpu(),
+                            data=(batch_size, 3, 224, 224),
+                            grad_req=grad_req)
+# We extract the memory cost from the execution plan
+print(texec.debug_str().split('\n')[-3])

From 3c4f424e59bc46bf91863ee47d28dd7d9e2a98a3 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Thu, 22 Oct 2015 21:35:45 -0700
Subject: [PATCH 080/122] Note memory

---
 README.md                          |   3 +-
 doc/developer-guide/index.md       |   4 +-
 doc/developer-guide/note_memory.md | 234 +++++++++++++++++++++++++++++
 doc/index.md                       |   1 +
 4 files changed, 240 insertions(+), 2 deletions(-)
 create mode 100644 doc/developer-guide/note_memory.md

diff --git a/README.md b/README.md
index 1641d2905a68..17a2069e3ca8 100644
--- a/README.md
+++ b/README.md
@@ -12,9 +12,10 @@ deep learning programs together to maximize the efficiency and your productivity
 
 What's New
 ----------
+* [Design Note: Squeeze the Memory Consumption of Deep Learning](http://mxnet.readthedocs.org/en/latest/developer-guide/note_memory.html)
 * [LSTM Example by using symbolic API](https://github.com/dmlc/mxnet/tree/master/example/rnn)
 * [MXNet R Package brings Deep learning for R!](https://github.com/dmlc/mxnet/tree/master/R-package)
-* [Note on Dependency Engine for Deep Learning](http://mxnet.readthedocs.org/en/latest/developer-guide/note_engine.html)
+* [Design Note: Dependency Engine for Deep Learning](http://mxnet.readthedocs.org/en/latest/developer-guide/note_engine.html)
 
 Contents
 --------
diff --git a/doc/developer-guide/index.md b/doc/developer-guide/index.md
index f54e4fd04a99..9f8bf6938ea2 100644
--- a/doc/developer-guide/index.md
+++ b/doc/developer-guide/index.md
@@ -57,7 +57,9 @@ Open Source Design Notes
 * [Dependency Engine for Deep Learning](note_engine.md)
   - Introduces the dependency tracking and scheduling component for general deep learning, this motivates the design of Engine module.
-
+* [Squeeze the Memory Consumption of Deep Learning](note_memory.md)
+  - Introduces how we can reduce the memory consumption of deep nets
+
 List of Other Resources
 -----------------------
 * [Doxygen Version of C++ API](https://mxnet.readthedocs.org/en/latest/doxygen) gives a comprehensive document of C++ API.
diff --git a/doc/developer-guide/note_memory.md b/doc/developer-guide/note_memory.md
new file mode 100644
index 000000000000..544a2d1f6f1e
--- /dev/null
+++ b/doc/developer-guide/note_memory.md
@@ -0,0 +1,234 @@
+Squeeze the Memory Consumption of Deep Learning
+===============================================
+One important theme in deep learning is to train deeper and larger nets.
+While hardware has been upgraded rapidly in recent years, these huge deep-net monsters are
+always hungry for GPU RAM. Being able to use less memory for the same net also means we can
+user larger batch size, and usually higher GPU utilization rate.
+
+This article discusses how memory allocation optimization can be done for deep neural nets, and provides
+some candidate solutions. The solutions discussed here are by no means complete,
+but rather serve as examples that we think are useful in most cases.
+
+Computation Graph
+-----------------
+We will start the discussion by introducing the computation graph, since this is the tool that will help us
+in the rest of this note. A computation graph describes the (data-flow) dependencies between the operations in a deep net.
+The operations in the graph can be either fine-grained or coarse-grained.
+The following figure gives two examples of computation graphs.
+
+![Comp Graph Example](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/comp_graph_example.png)
+
+The idea of a computation graph is deeply rooted in packages such as Theano and CGT. It also exists implicitly
+in most libraries as the network configuration. The major difference between these libraries lies in how they calculate the gradient.
+There are mainly two ways: doing back-propagation on the same graph, or having an explicit backward path that calculates
+the gradients needed.
+
+![Backward Graph](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/back_graph.png)
+
+Libraries like caffe, cxxnet, and torch do backprop on the same graph, while libraries like Theano and CGT take the explicit
+backward path approach. We will adopt the ***explicit backward path*** view in this article, because it brings several advantages
+in terms of optimization.
+
+However, we should emphasize that choosing the explicit backward path for execution does not restrict us
+to the scope of symbolic libraries such as Theano and CGT. We can also use the explicit backward path for gradient calculation in
+layer-based libraries (which tie forward and backward together). The following figure shows how this can be done,
+and the sketch after it makes the idea concrete. Basically, we introduce a backward node that links to the forward node
+of the graph and calls ```layer.backward``` in the backward operation.
+
+![Backward Layer](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/explicit_back_layer.png)
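+
+Here is a minimal sketch of such a wrapper (the class and method names are
+hypothetical, not the API of any particular library): the backward node keeps
+a link to its forward node and simply delegates to ```layer.backward```.
+
+```python
+class ForwardNode(object):
+    """Graph node that evaluates layer.forward on its input values."""
+    def __init__(self, layer, inputs):
+        self.layer, self.inputs = layer, inputs
+    def forward(self, values):
+        # values maps each input node to its computed output
+        return self.layer.forward([values[x] for x in self.inputs])
+
+class BackwardNode(object):
+    """Explicit backward node: linked to a forward node, calls layer.backward."""
+    def __init__(self, forward_node):
+        self.fwd = forward_node
+    def backward(self, out_grad):
+        # returns the gradients w.r.t. the forward node's inputs
+        return self.fwd.layer.backward(out_grad)
+```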
+So this discussion applies to almost all deep learning libraries that exist
+(there are differences between these libraries, e.g. higher-order differentiation, which are beyond the scope of this article).
+
+Why is the explicit backward path better? Let us explain it with two examples. The first reason is that the explicit backward path
+clearly describes the dependencies between the computations. Consider the following case, where we want to get
+the gradients of A and B. As we can see clearly from the graph, the computation of the ```d(C)``` gradient does not depend on F.
+This means we can free the memory of ```F``` right after the forward computation is done; similarly, the memory
+of ```C``` can be recycled.
+
+![Backward Prune](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/back_dep_prune.png)
+
+Another advantage of the explicit backward path is the ability to have a backward path that is not simply a mirror of the forward one.
+One common example is the split connection case, as shown in the following figure.
+
+![Backward Agg](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/back_agg_grad.png)
+
+In this example, the output of B is referenced by two operations. If we want to do the gradient calculation in the same
+network, an explicit split layer needs to be introduced. This means we need to do the split in the forward pass as well.
+In this figure, the forward pass does not contain a split layer; instead, the graph automatically inserts a gradient
+aggregation node before passing the gradient back to B. This saves us the memory cost of allocating the output of the
+split layer, as well as the operation cost of replicating the data in the forward pass.
+
+If we adopt the explicit backward view of the computation graph, there is no difference between the forward pass
+and the backward pass: we simply go through the computation graph in topological order and carry out the computations.
+This also simplifies our discussion. The problem now becomes:
+
+- How do we allocate the memory for each output node of a computation graph?
+
+Hmm, this seems to have nothing to do with deep learning, and more with compilation, data-flow optimization, etc.
+But it is really the hungry monster of deep learning that motivates us to attack this problem, and benefit from it.
+
+What can be Optimized
+---------------------
+Hopefully you are now convinced that the computation graph is a good way to discuss memory allocation optimization techniques.
+As you can see, some memory savings are already bought by using the explicit backward graph. Let us discuss further
+what optimizations we can do, and what the baseline is.
+
+Assume we want to build a neural net with ```n``` layers. A typical implementation of a neural net will
+need to allocate node space for the output of each layer, as well as for the gradient values used in back-propagation.
+This means we need roughly ```2 n``` memory cells. This is the same in the explicit backward graph case, as
+the number of nodes in the backward pass is roughly the same as in the forward pass.
+
+### Inplace Operations
+One of the very first things we can do is inplace memory sharing of operations. This is usually done for
+simple operations such as activation functions. Consider the following case, where we want to
+compute the value of three chained sigmoid functions.
+
+![Inplace op](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_inline.png)
+
+Because we can compute sigmoid in an ```inplace``` manner, that is, using the same memory for input and output,
+we can simply allocate one copy of memory and use it to compute a sigmoid chain of arbitrary length,
+as the sketch below illustrates.
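+
+Here is the pattern in plain numpy (just an illustration of the memory
+sharing, not mxnet code):
+
+```python
+import numpy as np
+
+def sigmoid_inplace(a):
+    """Compute sigmoid(a) = 1 / (1 + exp(-a)), writing back into the same buffer."""
+    np.negative(a, out=a)
+    np.exp(a, out=a)
+    a += 1.0
+    np.reciprocal(a, out=a)
+    return a
+
+x = np.random.randn(1000).astype(np.float32)
+for _ in range(3):   # three chained sigmoids, a single memory cell in total
+    sigmoid_inplace(x)
+```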
+
+However, inplace optimization can sometimes be done in the wrong way, especially when the package tries
+to be a bit general. Consider the following case, where the value of B is used not only by C, but also by F.
+
+![Inplace trap](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_inline_trap.png)
+
+Here we cannot perform inplace optimization, because the value of B is still needed after ```C=sigmoid(B)``` is computed.
+An algorithm that simply does inplace optimization for every sigmoid operation would fall into this trap,
+so we need to be careful about when we can do it.
+
+### Normal Memory Sharing
+Memory can also be shared beyond inplace operations. Consider the following case: because the
+value of B is no longer needed when we compute E, we can reuse B's memory to hold the result of E.
+
+![Normal Sharing](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_normal.png)
+
+We would like to point out that ***memory sharing does not necessarily require the same data shape***.
+In the above example, the shapes of ```B``` and ```E``` can be different; we can simply allocate a
+memory region whose size is the maximum of the two and share it between them.
+
+### Real Neural Net Allocation Example
+The above examples are all made-up cases that contain only the forward computation.
+The same ideas hold for real neural nets. The following figure shows an allocation
+plan we can make for a two-layer perceptron.
+
+![Net Alloc](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_mlp.png)
+
+In the above example:
+- Inplace optimization is applied when computing ```act1```, ```d(fc1)```, ```out``` and ```d(fc2)```.
+- Memory sharing is used between ```d(act1)``` and ```d(A)```.
+
+Memory Allocation Algorithm
+---------------------------
+We have discussed the general techniques to optimize memory allocation in the previous section.
+However, we have also seen that there are traps to avoid, as in the inplace case.
+How can we allocate the memory correctly? This is not a new problem: for example, it is very similar
+to register allocation in compilers, so there is a lot we can borrow. We do not attempt to give
+a comprehensive review of the techniques here, but rather introduce some simple but useful tricks to attack
+the problem.
+
+The key problem is to place resources such that they do not conflict with each other.
+More specifically, each variable has a ```life time``` from the time it is computed until the last time it is used.
+In the multilayer perceptron case, the ```life time``` of ```fc1``` ends after ```act1``` is computed.
+
+![Net Alloc](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_mlp.png)
+
+The principle is ***to only allow memory sharing between variables whose lifetimes do not overlap***. There are multiple
+ways to solve this problem. One possible way is to construct the conflict graph, with each variable as a node and an edge
+between variables with overlapping lifespans, and run a graph-coloring algorithm. This will likely take ```$O(n^2)$```
+time, where ```n``` is the number of nodes in the graph, which could be a reasonable price to pay.
+
+We will introduce another simple heuristic here. The idea is to simulate the procedure of traversing the graph,
+and keep a counter of the future operations that depend on each node.
+
+![Alloc](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_step.png)
+
+- An inplace optimization can be performed when only the current operation depends on the source (i.e. counter=1).
+- A memory cell can be recycled into the box on the upper right corner when the counter goes to 0.
+- Every time we need new memory, we can either take it from the box or allocate a new one.
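+
+The following toy simulator shows the heuristic end to end (a sketch over
+made-up node objects; it ignores block sizes and whether an op actually
+supports inplace, both of which a real planner must check):
+
+```python
+def plan_memory(topo_nodes):
+    """topo_nodes: graph nodes in topological order, each with an .inputs list."""
+    refcount = {n: 0 for n in topo_nodes}
+    for n in topo_nodes:
+        for s in n.inputs:
+            refcount[s] += 1
+    free_pool, plan = [], {}
+    for n in topo_nodes:
+        dying = []
+        for s in n.inputs:
+            refcount[s] -= 1
+            if refcount[s] == 0:
+                dying.append(s)
+        if dying:                   # inplace: take over a dying input's cell
+            plan[n] = plan[dying.pop()]
+        elif free_pool:             # reuse a recycled cell from the box
+            plan[n] = free_pool.pop()
+        else:                       # otherwise allocate a fresh (virtual) cell
+            plan[n] = object()
+        free_pool.extend(plan[s] for s in dying)   # recycle the rest
+    return plan
+```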
+
+One note is that during the simulation no memory is actually allocated; we rather keep a record of how much memory
+each node needs, and allocate the maximum over the shared nodes in the final memory plan.
+
+### Static vs Dynamic Allocation
+
+If you think carefully, you will find that the above strategy exactly simulates the dynamic memory allocation procedure of imperative
+languages such as python. The counter is the reference counter of each memory object, and an object gets garbage collected when
+its reference counter goes to zero. In that sense, we are simulating dynamic memory allocation once to create a static allocation plan.
+Now the question is: can we simply use an imperative language that dynamically allocates and de-allocates memory?
+
+The major difference is that the static allocation is only done once, so we can afford to use more complicated algorithms:
+- For example, search for free memory blocks whose sizes are similar to the required memory block.
+- The allocation can also be made graph aware; see more discussion in the next section.
+- Dynamic allocation, by contrast, puts more pressure on fast memory allocation and garbage collection.
+
+There is also one takeaway for users who want to rely on dynamic memory allocation:
+***do not take unnecessary references to objects***. For example, if we organize all the nodes in
+a list and store them in a Net object, these nodes will never get de-referenced, and we gain no space.
+Unfortunately, this is one common way to organize the code.
+
+
+Allocation for Parallel Operations
+----------------------------------
+In the previous section, we discussed how we can ```simulate``` the running procedure of a computation graph
+to get a static allocation plan. However, there are more problems when we want to optimize for parallel computation,
+as resource sharing and parallelization are on the two ends of a balance.
+Let us look at the following two allocation plans for the same graph:
+
+![Parallel Alloc](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/parallel_alloc.png)
+
+Both allocation plans are valid if we run the computation serially from ```A[1]``` to ```A[8]```.
+However, the allocation plan on the left side introduces extra dependencies, which means we cannot
+run the computations of ```A[2]``` and ```A[5]``` in parallel, while the right one can.
+
+As we can see, if we want to parallelize the computation, more care needs to be taken with the allocation.
+
+### Stay Safe and Correct First
+Staying correct is the very first principle we need to follow. This means executing in a way that takes the implicit dependencies
+introduced by memory sharing into consideration. This can be done by adding the implicit dependency edges to the execution graph.
+Or, even simpler, if the execution engine is mutation aware, as described in the
+[dependency engine note](http://mxnet.readthedocs.org/en/latest/developer-guide/note_engine.html), push the operations
+in sequence and write to the same variable tag that represents the same memory region, as the sketch below shows.
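+
+A minimal sketch of that second option (a hypothetical engine API in the
+spirit of the dependency engine note, not mxnet's actual signatures): two
+operations that share one memory block are pushed with the same
+mutate-variable tag, so the engine serializes them even when they could
+otherwise be scheduled in parallel.
+
+```python
+mem_tag = engine.new_variable()      # stands for the shared memory region
+engine.push(op_a, read_vars=[x], mutate_vars=[mem_tag])
+engine.push(op_b, read_vars=[y], mutate_vars=[mem_tag])  # runs after op_a
+```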
+
+Another way is to always produce a memory allocation plan that is safe, which means never allocating the same memory
+to nodes that can be parallelized. This may not be ideal, because sometimes memory reduction is more desirable, and there is not too
+much gain to get from multiple computing streams executing on the same GPU.
+
+### Try to Allow More Parallelization
+Given that we can always stay correct, we are now safe to do some optimizations. The general idea is to
+encourage memory sharing between nodes that cannot be parallelized. This again can be done by creating an ancestor-relation
+graph and querying it during allocation, which costs around ```$O(n^2)$``` time to construct. We can also use a heuristic here;
+for example, one way is to color the paths in the graph.
+The idea, shown in the figure below, is that every time we try to find a longest path in the graph, we color it with the same color,
+and continue.
+
+![Path Color](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/graph_color.png)
+
+After we get the colors of the nodes, we only allow (or encourage) sharing between nodes of the same color.
+This is a stricter version of the ancestor relation, but it costs only ```$O(n)$``` time if we search for only the first ```k``` paths.
+
+The strategy discussed here is by no means the only solution; we can expect more sophisticated approaches along this line.
+
+How much can We Save
+--------------------
+Thanks for reading this far! We have discussed the techniques and algorithms we can use to squeeze the memory usage of deep learning.
+Now comes the question on how much we can really save by using these techniques. The answer is we can roughly reduce the memory consumption
+by half using these techniques.
+
+Most of the ideas in this article inspired the design of mxnet.
+We provide a [Memory Cost Estimation Script](https://github.com/dmlc/mxnet/tree/master/example/memcost),
+which you can play with to see how much memory we need under different strategies.
+
+If you play with the script, there is one option called ```forward_only``` that shows the cost of running only the forward pass.
+You will find that this cost is extremely low compared to the others. You won't be surprised if you have read the earlier parts of
+this article: it is simply because there is more memory re-use if we only run the forward pass. So here are the two takeaways:
+
+- Use the computation graph to allocate the memory smartly and correctly.
+- Running deep learning prediction costs much less memory than deep learning training.
+
+Contribution to this Note
+-------------------------
+This note is part of our effort to [open-source system design notes](http://mxnet.readthedocs.org/en/latest/#open-source-design-notes)
+for deep learning libraries. You are very welcome to contribute to this note by submitting a pull request.
+
diff --git a/doc/index.md b/doc/index.md
index 6ca7dff52683..c288fab8face 100644
--- a/doc/index.md
+++ b/doc/index.md
@@ -35,6 +35,7 @@ in terms of abstraction, optimization and trade-offs.
 * [Programming Models for Deep Learning](program_model.md)
 * [Dependency Engine for Deep Learning](developer-guide/note_engine.md)
+* [Squeeze the Memory Consumption of Deep Learning](developer-guide/note_memory.md)
 
 Indices and tables
 ------------------

From 68a4b7739c5a76d853ecc30377ed24ae756d13f7 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Thu, 22 Oct 2015 21:43:51 -0700
Subject: [PATCH 081/122] Update note_memory.md

---
 doc/developer-guide/note_memory.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/doc/developer-guide/note_memory.md b/doc/developer-guide/note_memory.md
index 544a2d1f6f1e..5bffb8e13a59 100644
--- a/doc/developer-guide/note_memory.md
+++ b/doc/developer-guide/note_memory.md
@@ -213,8 +213,9 @@ The strategy discussed here is by no means the only solution; we can expect more
 How much can We Save
 --------------------
 Thanks for reading this far! We have discussed the techniques and algorithms we can use to squeeze the memory usage of deep learning.
-Now comes the question on how much we can really save by using these techniques. The answer is we can roughly reduce the memory consumption
-by half using these techniques.
+Now comes the question on how much we can really save by using these techniques.
+
+The answer is that we can roughly reduce the memory consumption ***by half*** using these techniques. This is on coarse-grained operation graphs that are already optimized with big operations. Even more memory reduction could be seen when optimizing a fine-grained computation network used by symbolic libraries such as Theano.
 
 Most of the ideas in this article inspired the design of mxnet.
 We provide a [Memory Cost Estimation Script](https://github.com/dmlc/mxnet/tree/master/example/memcost),

From 0b595e55d49bb03d98969d51d062e390ac39a932 Mon Sep 17 00:00:00 2001
From: Chiyuan Zhang
Date: Fri, 23 Oct 2015 01:26:41 -0400
Subject: [PATCH 082/122] Add news about julia binding release

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 17a2069e3ca8..9d7e30e31d67 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@ deep learning programs together to maximize the efficiency and your productivity
 
 What's New
 ----------
+* [MXNet.jl Julia binding initial release](https://github.com/dmlc/MXNet.jl)
 * [Design Note: Squeeze the Memory Consumption of Deep Learning](http://mxnet.readthedocs.org/en/latest/developer-guide/note_memory.html)
 * [LSTM Example by using symbolic API](https://github.com/dmlc/mxnet/tree/master/example/rnn)
 * [MXNet R Package brings Deep learning for R!](https://github.com/dmlc/mxnet/tree/master/R-package)

From dbb94c9d982c78119416e2b9549916931d845521 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Thu, 22 Oct 2015 23:24:29 -0700
Subject: [PATCH 083/122] Update index.md

add link to julia package
---
 doc/index.md | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/doc/index.md b/doc/index.md
index c288fab8face..e74040996138 100644
--- a/doc/index.md
+++ b/doc/index.md
@@ -3,16 +3,12 @@ MXNet Documentation
 [MXNet](https://github.com/dmlc/mxnet) is a deep learning framework designed for both *efficiency* and *flexibility*.
 It allows you to mix the flavours of deep learning programs together to maximize
 the efficiency and your productivity.
 
-How to Get Started
-------------------
-* Check out [Python Getting started Guide](python/tutorial.md)
-* The [example](../example) folder contains example usecases of mxnet.
-
 User Guide
 ----------
 * [Build and Installation](build.md)
 * [Python Package Document](python/index.md)
 * [R Package Document](R-package/index.md)
+* [MXNet.jl Julia Package](https://github.com/dmlc/MXNet.jl)
 * [Frequently Asked Questions](faq.md)

From 5511fb47c8e26934a12b08de0b6147f6c94f51d6 Mon Sep 17 00:00:00 2001
From: XIN
Date: Sat, 24 Oct 2015 01:21:07 +0800
Subject: [PATCH 084/122] Update note_memory.md

---
 doc/developer-guide/note_memory.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/developer-guide/note_memory.md b/doc/developer-guide/note_memory.md
index 5bffb8e13a59..43a3299d0618 100644
--- a/doc/developer-guide/note_memory.md
+++ b/doc/developer-guide/note_memory.md
@@ -3,7 +3,7 @@ Squeeze the Memory Consumption of Deep Learning
 ===============================================
 One important theme in deep learning is to train deeper and larger nets.
 While hardware has been upgraded rapidly in recent years, these huge deep-net monsters are
 always hungry for GPU RAM. Being able to use less memory for the same net also means we can
-user larger batch size, and usually higher GPU utilization rate.
+use a larger batch size, and usually achieve a higher GPU utilization rate.
 
 This article discusses how memory allocation optimization can be done for deep neural nets, and provides
 some candidate solutions. The solutions discussed here are by no means complete,

From 8dea6d1f5ca0c431ff72eaa7aa0fe5f97b916320 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Fri, 23 Oct 2015 15:45:50 -0700
Subject: [PATCH 085/122] Update README.md

---
 example/memcost/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/example/memcost/README.md b/example/memcost/README.md
index c3a908490bc7..446c31e65ff2 100644
--- a/example/memcost/README.md
+++ b/example/memcost/README.md
@@ -25,6 +25,6 @@ Notes
 -----
 - You can change the symbol in the [inception_memcost.py](inception_memcost.py) to the net you are interested in.
 - You will need to install mxnet or type make in the root folder before using the script.
-- The estimation is only on the space cost of intermediate nodes and weights.
-  - The cost of temporary workspace is not estimated.
+- The estimation is only on the space cost of intermediate nodes.
+  - The cost of temporary workspace is not estimated, so you will likely need more memory when running real nets.
 - The estimation does real allocation on CPU; the plan is the same on GPU.
From 44f5a498aec270b5f25ca16631eed26712391f41 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 23 Oct 2015 15:46:16 -0700 Subject: [PATCH 086/122] [MSHADOW] Upgrade get_with_shape --- mshadow | 2 +- src/operator/batch_norm-inl.h | 6 ++---- src/operator/concat-inl.h | 12 ++++-------- src/operator/convolution-inl.h | 19 ++++++++----------- src/operator/leaky_relu-inl.h | 6 ++---- src/operator/slice_channel-inl.h | 15 ++++++--------- 6 files changed, 23 insertions(+), 37 deletions(-) diff --git a/mshadow b/mshadow index 129e060d76cd..f00b2086218e 160000 --- a/mshadow +++ b/mshadow @@ -1 +1 @@ -Subproject commit 129e060d76cd5d7f42ac4c26cf39d3289a8540a6 +Subproject commit f00b2086218e116bd78a84d8e96d3bef8d4229d1 diff --git a/src/operator/batch_norm-inl.h b/src/operator/batch_norm-inl.h index 625614725282..50f89878e615 100644 --- a/src/operator/batch_norm-inl.h +++ b/src/operator/batch_norm-inl.h @@ -66,8 +66,7 @@ class BatchNormOp : public Operator { Tensor data; Tensor out, out_no_affine; if (in_data[kData].ndim() == 2) { - uint32_t ds[] = {in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1}; - TShape dshape(ds, ds + 4); + Shape<4> dshape = Shape4(in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1); data = in_data[kData].get_with_shape(dshape, s); out = out_data[kOut].get_with_shape(dshape, s); if (ctx.is_train) { @@ -125,8 +124,7 @@ class BatchNormOp : public Operator { const real_t scale = static_cast(out_data[kOut].shape_[1]) / static_cast(out_data[kOut].shape_.Size()); if (in_data[kData].ndim() == 2) { - uint32_t ds[] = {out_data[kOut].shape_[0], out_data[kOut].shape_[1], 1, 1}; - TShape dshape(ds, ds + 4); + Shape<4> dshape = Shape4(out_data[kOut].shape_[0], out_data[kOut].shape_[1], 1, 1); data = in_data[kData].get_with_shape(dshape, s); grad = out_grad[kOut].get_with_shape(dshape, s); grad_in = in_grad[kData].get_with_shape(dshape, s); diff --git a/src/operator/concat-inl.h b/src/operator/concat-inl.h index cc51079462b8..a8821588b7ab 100644 --- a/src/operator/concat-inl.h +++ b/src/operator/concat-inl.h @@ -53,13 +53,11 @@ class ConcatOp : public Operator { if (in_data[kData0].ndim() == 2) { uint32_t dim = 0; for (int i = 0; i < size_; ++i) { - uint32_t ds[] = {in_data[i].shape_[0], in_data[i].shape_[1], 1, 1}; - TShape dshape(ds, ds + 4); + Shape<4> dshape = Shape4(in_data[i].shape_[0], in_data[i].shape_[1], 1, 1); data[i] = in_data[i].get_with_shape(dshape, s); dim += in_data[i].shape_[1]; } - uint32_t ds_out[] = {in_data[kData0].shape_[0], dim, 1, 1}; - TShape dshape_out(ds_out, ds_out + 4); + Shape<4> dshape_out = Shape4(in_data[kData0].shape_[0], dim, 1, 1); out = out_data[kOut].get_with_shape(dshape_out, s); } else { for (int i = 0; i < size_; ++i) { @@ -87,14 +85,12 @@ class ConcatOp : public Operator { if (out_grad[kOut].ndim() == 2) { uint32_t dim = 0; for (int i = 0; i < size_; ++i) { - uint32_t ds[] = {in_grad[i].shape_[0], in_grad[i].shape_[1], 1, 1}; - TShape dshape(ds, ds + 4); + Shape<4> dshape = Shape4(in_grad[i].shape_[0], in_grad[i].shape_[1], 1, 1); grad_in[i] = in_grad[i].get_with_shape(dshape, s); dim += in_grad[i].shape_[1]; CHECK_EQ(req[i], kWriteTo); } - uint32_t ds_out[] = {in_grad[kData0].shape_[0], dim, 1, 1}; - TShape dshape_out(ds_out, ds_out + 4); + Shape<4> dshape_out = Shape4(in_grad[kData0].shape_[0], dim, 1, 1); grad = out_grad[kOut].get_with_shape(dshape_out, s); } else { for (int i = 0; i < size_; ++i) { diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h index 7299ad97ec0b..eccb4df9448c 100644 --- 
a/src/operator/convolution-inl.h +++ b/src/operator/convolution-inl.h @@ -72,14 +72,12 @@ class ConvolutionOp : public Operator { size_t expected = param_.no_bias ? 2 : 3; CHECK_EQ(in_data.size(), expected); CHECK_EQ(out_data.size(), 1); - // TODO(bing): check the BLAS Handle, be careful Stream *s = ctx.get_stream(); Tensor data = in_data[kData].get(s); - uint32_t ws[] = {param_.num_group, - param_.num_filter / param_.num_group, - data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1] - }; - TShape wmat_shape(ws, ws + 3); + Shape<3> wmat_shape = + Shape3(param_.num_group, + param_.num_filter / param_.num_group, + data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1]); Tensor wmat = in_data[kWeight].get_with_shape(wmat_shape, s); Tensor out = out_data[kOut].get(s); #if defined(__CUDACC__) @@ -149,11 +147,10 @@ class ConvolutionOp : public Operator { // get data Stream *s = ctx.get_stream(); Tensor data = in_data[kData].get(s); - uint32_t ws[] = {param_.num_group, - param_.num_filter / param_.num_group, - data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1] - }; - TShape wmat_shape(ws, ws + 3); + Shape<3> wmat_shape = + Shape3(param_.num_group, + param_.num_filter / param_.num_group, + data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1]); Tensor wmat = in_data[kWeight].get_with_shape(wmat_shape, s); Tensor grad = out_grad[kOut].get(s); Tensor gdata = in_grad[kData].get(s); diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h index 68cb52eea25f..dc2c45127a03 100644 --- a/src/operator/leaky_relu-inl.h +++ b/src/operator/leaky_relu-inl.h @@ -75,8 +75,7 @@ class LeakyReLUOp : public Operator { Tensor mask; Tensor weight; if (in_data[kData].ndim() == 2) { - uint32_t ds[] = {in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1}; - TShape dshape(ds, ds + 4); + Shape<4> dshape = Shape4(in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1); data = in_data[kData].get_with_shape(dshape, s); out = out_data[kOut].get_with_shape(dshape, s); if (param_.act_type == kRReLU) { @@ -139,8 +138,7 @@ class LeakyReLUOp : public Operator { Tensor weight; Tensor grad_weight; if (in_data[kData].ndim() == 2) { - uint32_t ds[] = {in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1}; - TShape dshape(ds, ds + 4); + Shape<4> dshape = Shape4(in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1); grad = out_grad[kOut].get_with_shape(dshape, s); gdata = in_grad[kData].get_with_shape(dshape, s); output = out_data[kOut].get_with_shape(dshape, s); diff --git a/src/operator/slice_channel-inl.h b/src/operator/slice_channel-inl.h index 25d8ef2cd844..ad0910df8731 100644 --- a/src/operator/slice_channel-inl.h +++ b/src/operator/slice_channel-inl.h @@ -51,12 +51,10 @@ class SliceChannelOp : public Operator { std::vector > outputs(size_); Tensor data; if (in_data[kData].ndim() == 2) { - uint32_t ds[] = {in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1}; - TShape dshape(ds, ds + 4); + Shape<4> dshape = Shape4(in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1); data = in_data[kData].get_with_shape(dshape, s); - uint32_t dim = data.shape_[1] / size_; - ds[1] = dim; - TShape slice_shape(ds, ds + 4); + Shape<4> slice_shape = dshape; + slice_shape[1] = dshape[1] / size_; for (int i = 0; i < size_; ++i) { outputs[i] = out_data[i].get_with_shape(slice_shape, s); } @@ -84,13 +82,12 @@ class SliceChannelOp : public Operator { std::vector > grad_out(size_); Tensor grad; if (out_grad[kOut0].ndim() == 2) { - uint32_t ds[] 
= {out_grad[kOut0].shape_[0], out_grad[kOut0].shape_[1], 1, 1}; - TShape slice_shape(ds, ds + 4); + Shape<4> slice_shape = Shape4(out_grad[kOut0].shape_[0], out_grad[kOut0].shape_[1], 1, 1); for (int i = 0; i < size_; ++i) { grad_out[i] = out_grad[i].get_with_shape(slice_shape, s); } - ds[1] *= size_; - TShape dshape(ds, ds + 4); + Shape<4> dshape = slice_shape; + dshape[1] *= size_; grad = in_grad[kData].get_with_shape(dshape, s); } else { for (int i = 0; i < size_; ++i) { From 6cc65db04d60e05f0382b5c43141072b5582ee35 Mon Sep 17 00:00:00 2001 From: muli Date: Fri, 23 Oct 2015 21:55:02 -0400 Subject: [PATCH 087/122] [kvstore] bugfix for not allowing pull immediately after init --- src/kvstore/kvstore_local.h | 5 ++--- tests/python/unittest/test_kvstore.py | 8 ++++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 4426d4e82f32..e31930436821 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -64,7 +64,8 @@ class KVStoreLocal : public KVStore { for (size_t i = 0; i < uniq_keys.size(); ++i) { int key = uniq_keys[i]; - if (updater_ != nullptr) { + auto it = merge_buf_.find(key); + if (updater_ != nullptr || it == merge_buf_.end()) { auto it = local_.find(key); CHECK(it != local_.end()) << "key " << key << " has not been inited"; const NDArray& src = it->second; @@ -72,8 +73,6 @@ class KVStoreLocal : public KVStore { CopyFromTo(src, vptr, priority); } } else { - auto it = merge_buf_.find(key); - CHECK(it != merge_buf_.end()) << "key " << key << " has not been pushed"; auto& src = it->second.merged; for (auto* vptr : grouped_vals[i]) { CopyFromTo(src, vptr, priority); diff --git a/tests/python/unittest/test_kvstore.py b/tests/python/unittest/test_kvstore.py index 77439677320f..dd8149d4822e 100644 --- a/tests/python/unittest/test_kvstore.py +++ b/tests/python/unittest/test_kvstore.py @@ -27,6 +27,13 @@ def test_single_kv_pair(): kv.pull(3, out = val) check_diff_to_scalar(val, 1) +def test_init(): + """test init""" + kv = mx.kv.create() + kv.init(3, mx.nd.ones(shape)*4) + a = mx.nd.zeros(shape) + kv.pull(3, out=a) + check_diff_to_scalar(a, 4) def test_list_kv_pair(): """list key-value pair push & pull""" @@ -110,6 +117,7 @@ def test_get_type(): assert kv.type == kvtype if __name__ == '__main__': + test_init() test_get_type() test_single_kv_pair() test_list_kv_pair() From 66bec9b838d0217308d4b0749c72a1ea8c51cac0 Mon Sep 17 00:00:00 2001 From: Mu Li Date: Sat, 24 Oct 2015 00:07:01 -0400 Subject: [PATCH 088/122] Update dist_async_inception.py --- tests/python/multi-node/dist_async_inception.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/python/multi-node/dist_async_inception.py b/tests/python/multi-node/dist_async_inception.py index cb7fd656471f..8cb116fc26e2 100755 --- a/tests/python/multi-node/dist_async_inception.py +++ b/tests/python/multi-node/dist_async_inception.py @@ -6,15 +6,11 @@ mx.random.seed(0) logging.basicConfig(level=logging.DEBUG) - kv = mx.kvstore.create('dist_async') - (train, val) = common.cifar10(num_parts = kv.num_workers, part_index = kv.rank, batch_size = 128, input_shape=(3,28,28)) - -# assume each worker has two gpus devs = [mx.gpu(i) for i in range(2)] model = mx.model.FeedForward.create( ctx = devs, From 9a4484a1bf67f4ecb9a704c7c91e5fbd1bc5324a Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 23 Oct 2015 23:55:35 -0700 Subject: [PATCH 089/122] [API] Add predict API --- Makefile | 2 +- include/mxnet/c_predict_api.h | 148 +++++++++++ 
 include/mxnet/kvstore.h                       |   3 +-
 predict/python/mxnet_predict.py               | 203 +++++++++++++++
 python/mxnet/model.py                         |   2 -
 src/{ => c_api}/c_api.cc                      |  97 +-------
 src/c_api/c_api_error.cc                      |  21 ++
 src/c_api/c_api_error.h                       |  39 +++
 src/c_api/c_predict_api.cc                    | 231 ++++++++++++++++++
 src/common/tblob_op_registry.h                |   3 +-
 src/common/thread_local.h                     |  77 ++++++
 tests/python/predict/mxnet_predict_example.py |  61 +++++
 12 files changed, 787 insertions(+), 100 deletions(-)
 create mode 100644 include/mxnet/c_predict_api.h
 create mode 100644 predict/python/mxnet_predict.py
 rename src/{ => c_api}/c_api.cc (91%)
 create mode 100644 src/c_api/c_api_error.cc
 create mode 100644 src/c_api/c_api_error.h
 create mode 100644 src/c_api/c_predict_api.cc
 create mode 100644 src/common/thread_local.h
 create mode 100644 tests/python/predict/mxnet_predict_example.py

diff --git a/Makefile b/Makefile
index 8cd116b57800..df365a3cd2dd 100644
--- a/Makefile
+++ b/Makefile
@@ -137,7 +137,7 @@ include tests/cpp/unittest.mk
 test: $(TEST)
 
 lint: rcpplint
-	python dmlc-core/scripts/lint.py mxnet ${LINT_LANG} include src scripts python
+	python dmlc-core/scripts/lint.py mxnet ${LINT_LANG} include src scripts python predict/python
 
 doc: doxygen

diff --git a/include/mxnet/c_predict_api.h b/include/mxnet/c_predict_api.h
new file mode 100644
index 000000000000..f997f91c4787
--- /dev/null
+++ b/include/mxnet/c_predict_api.h
@@ -0,0 +1,148 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file c_predict_api.h
+ * \brief C predict API of mxnet, contains a minimal API to run prediction.
+ *  This file is self-contained and does not depend on any other files.
+ */
+#ifndef MXNET_C_PREDICT_API_H_
+#define MXNET_C_PREDICT_API_H_
+
+#ifdef __cplusplus
+#define MXNET_EXTERN_C extern "C"
+#endif
+
+#ifdef _WIN32
+#ifdef MXNET_EXPORTS
+#define MXNET_DLL MXNET_EXTERN_C __cdecl __declspec(dllexport)
+#else
+#define MXNET_DLL MXNET_EXTERN_C __cdecl __declspec(dllimport)
+#endif
+#else
+#define MXNET_DLL MXNET_EXTERN_C
+#endif
+
+/*! \brief manually define unsigned int */
+typedef unsigned int mx_uint;
+/*! \brief manually define float */
+typedef float mx_float;
+/*! \brief handle to Predictor */
+typedef void *PredictorHandle;
+/*! \brief handle to NDArray list */
+typedef void *NDListHandle;
+
+/*!
+ * \brief Get the last error that happened.
+ * \return The last error that happened in the predictor.
+ */
+MXNET_DLL const char* MXGetLastError();
+/*!
+ * \brief Create a predictor.
+ * \param symbol_file The path to the symbol file.
+ * \param param_file The path to the parameter file.
+ * \param dev_type The device type, 1: cpu, 2: gpu
+ * \param dev_id The device id of the predictor.
+ * \param num_input_nodes Number of input nodes to the net.
+ *    For a feedforward net, this is 1.
+ * \param input_keys The names of the input arguments.
+ *    For a feedforward net, this is {"data"}.
+ * \param input_shape_indptr Index pointer of shapes of each input node.
+ *    The length of this array = num_input_nodes + 1.
+ *    For a feedforward net that takes 4 dimensional input, this is {0, 4}.
+ * \param input_shape_data Flattened shape data of each input node.
+ *    For a feedforward net that takes 4 dimensional input, this is the shape data.
+ * \param out The created predictor handle.
+ * \return 0 when success, -1 when failure.
+ */
+MXNET_DLL int MXPredCreate(const char* symbol_file,
+                           const char* param_file,
+                           int dev_type, int dev_id,
+                           mx_uint num_input_nodes,
+                           const char** input_keys,
+                           const mx_uint* input_shape_indptr,
+                           const mx_uint* input_shape_data,
+                           PredictorHandle* out);
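+/*
+ * Example (an illustrative sketch, the file names are placeholders):
+ * create a predictor on CPU (dev_type=1, dev_id=0) for a feedforward
+ * net with one 4-dimensional "data" input of shape (1, 3, 224, 224).
+ *
+ *   const char* input_keys[] = {"data"};
+ *   const mx_uint input_shape_indptr[] = {0, 4};
+ *   const mx_uint input_shape_data[] = {1, 3, 224, 224};
+ *   PredictorHandle pred;
+ *   MXPredCreate("model-symbol.json", "model-0000.params", 1, 0,
+ *                1, input_keys, input_shape_indptr, input_shape_data,
+ *                &pred);
+ */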
+/*!
+ * \brief Get the shape of an output node.
+ *  The returned shape_data and shape_ndim are only valid before the next call to an MXPred function.
+ * \param handle The handle of the predictor.
+ * \param index The index of the output node, set to 0 if there is only one output.
+ * \param shape_data Used to hold a pointer to the shape data.
+ * \param shape_ndim Used to hold the shape dimension.
+ * \return 0 when success, -1 when failure.
+ */
+MXNET_DLL int MXPredGetOutputShape(PredictorHandle handle,
+                                   mx_uint index,
+                                   mx_uint** shape_data,
+                                   mx_uint* shape_ndim);
+/*!
+ * \brief Set the input data of the predictor.
+ * \param handle The predictor handle.
+ * \param key The name of the input node to set.
+ *     For a feedforward net, this is "data".
+ * \param data The pointer to the data to be set, with the shape specified in MXPredCreate.
+ * \param size The size of the data array, used for safety checking.
+ * \return 0 when success, -1 when failure.
+ */
+MXNET_DLL int MXPredSetInput(PredictorHandle handle,
+                             const char* key,
+                             const mx_float* data,
+                             mx_uint size);
+/*!
+ * \brief Run a forward pass to get the output.
+ * \param handle The handle of the predictor.
+ * \return 0 when success, -1 when failure.
+ */
+MXNET_DLL int MXPredForward(PredictorHandle handle);
+/*!
+ * \brief Get the output value of the prediction.
+ * \param handle The handle of the predictor.
+ * \param index The index of the output node, set to 0 if there is only one output.
+ * \param data User allocated data to hold the output.
+ * \param size The size of the data array, used for safety checking.
+ * \return 0 when success, -1 when failure.
+ */
+MXNET_DLL int MXPredGetOutput(PredictorHandle handle,
+                              mx_uint index,
+                              mx_float* data,
+                              mx_uint size);
+/*!
+ * \brief Free a predictor handle.
+ * \param handle The handle of the predictor.
+ * \return 0 when success, -1 when failure.
+ */
+MXNET_DLL int MXPredFree(PredictorHandle handle);
+/*!
+ * \brief Create an NDArray list by loading it from an ndarray file.
+ *  This can be used to load a mean image file.
+ * \param nd_file The path to the ndarray file to load.
+ * \param out The output NDListHandle.
+ * \param out_length Length of the list.
+ * \return 0 when success, -1 when failure.
+ */
+MXNET_DLL int MXNDListCreate(const char* nd_file,
+                             NDListHandle *out,
+                             mx_uint* out_length);
+/*!
+ * \brief Get an element from the list.
+ * \param handle The handle to the NDArray list.
+ * \param index The index in the list.
+ * \param out_key The output key of the item.
+ * \param out_data The data region of the item.
+ * \param out_shape The shape of the item.
+ * \param out_ndim The number of dimensions in the shape.
+ * \return 0 when success, -1 when failure.
+ */
+MXNET_DLL int MXNDListGet(NDListHandle handle,
+                          mx_uint index,
+                          const char** out_key,
+                          const mx_float** out_data,
+                          const mx_uint** out_shape,
+                          mx_uint* out_ndim);
+/*!
+ * \brief Free an NDArray list handle.
+ * \param handle The handle of the NDArray list.
+ * \return 0 when success, -1 when failure.
+ */
+MXNET_DLL int MXNDListFree(NDListHandle handle);
+
+#endif  // MXNET_C_PREDICT_API_H_
diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h
index 8f547df0831a..6a3fb6a769f9 100644
--- a/include/mxnet/kvstore.h
+++ b/include/mxnet/kvstore.h
@@ -199,8 +199,7 @@ class KVStore {
    *
    * But note that, this functions only blocks the main thread of workers until
    * all of them are reached this point. It doesn't guarantee that all
-   * operations issued before are actually finished, such as \ref Push and \ref
In that case, we need to call \ref Wait or \ref WaitAll + * operations issued before are actually finished, such as \ref Push and \ref Pull. */ virtual void Barrier() { } diff --git a/predict/python/mxnet_predict.py b/predict/python/mxnet_predict.py new file mode 100644 index 000000000000..69e83bae011c --- /dev/null +++ b/predict/python/mxnet_predict.py @@ -0,0 +1,203 @@ +# coding: utf-8 +# pylint: disable=invalid-name, too-many-arguments +"""Lightweight API for mxnet prediction. + +This is for prediction only, use mxnet python package instead for most tasks. +""" +from __future__ import absolute_import + +import os +import sys +import ctypes +import numpy as np + +__all__ = ["Predictor", "load_ndarray_file"] + +if sys.version_info[0] == 3: + py_str = lambda x: x.decode('utf-8') +else: + py_str = lambda x: x + +def c_str(string): + """"Convert a python string to C string.""" + return ctypes.c_char_p(string.encode('utf-8')) + +def c_array(ctype, values): + """Create ctypes array from a python array.""" + return (ctype * len(values))(*values) + +def _find_lib_path(): + """Find mxnet library.""" + curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) + api_path = os.path.join(curr_path, '../../lib/') + dll_path = [curr_path, api_path] + dll_path = [os.path.join(p, 'libmxnet.so') for p in dll_path] + \ + [os.path.join(p, 'libmxnet_predict.so') for p in dll_path] + lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] + if len(lib_path) == 0: + raise RuntimeError('Cannot find the files.\n' + + 'List of candidates:\n' + str('\n'.join(dll_path))) + return lib_path + + +def _load_lib(): + """Load libary by searching possible path.""" + lib_path = _find_lib_path() + lib = ctypes.cdll.LoadLibrary(lib_path[0]) + # DMatrix functions + lib.MXGetLastError.restype = ctypes.c_char_p + return lib + + +def _check_call(ret): + """Check the return value of API.""" + if ret != 0: + raise RuntimeError(py_str(_LIB.MXGetLastError())) + +_LIB = _load_lib() +# type definitions +mx_uint = ctypes.c_uint +mx_float = ctypes.c_float +mx_float_p = ctypes.POINTER(mx_float) +PredictorHandle = ctypes.c_void_p +NDListHandle = ctypes.c_void_p + +devstr2type = {'cpu': 1, 'gpu': 2, 'cpu_pinned': 3} + +class Predictor(object): + """A predictor class that runs prediction. + + Parameters + ---------- + symbol_file : str + Path to the symbol file. + + param_file : str + Path to the parameter file. + + input_shapes : dict of str to tuple + The shape of input data + + dev_type : str, optional + The device type of the predictor. + + dev_id : int, optional + The device id of the predictor. + """ + def __init__(self, symbol_file, param_file, input_shapes, + dev_type="cpu", dev_id=0): + dev_type = devstr2type[dev_type] + indptr = [0] + sdata = [] + keys = [] + for k, v in input_shapes.items(): + if not isinstance(v, tuple): + raise ValueError("Expect input_shapes to be dict str->tuple") + keys.append(c_str(k)) + sdata.extend(v) + indptr.append(len(sdata)) + handle = PredictorHandle() + _check_call(_LIB.MXPredCreate( + c_str(symbol_file), c_str(param_file), + ctypes.c_int(dev_type), ctypes.c_int(dev_id), + mx_uint(len(indptr) - 1), + c_array(ctypes.c_char_p, keys), + c_array(mx_uint, indptr), + c_array(mx_uint, sdata), + ctypes.byref(handle))) + self.handle = handle + + def __del__(self): + _check_call(_LIB.MXPredFree(self.handle)) + + def forward(self, **kwargs): + """Perform forward to get the output. + + Parameters + ---------- + **kwargs + Keyword arguments of input variable name to data. 
+
+        Examples
+        --------
+        >>> predictor.forward(data=mydata)
+        >>> out = predictor.get_output(0)
+        """
+        for k, v in kwargs.items():
+            if not isinstance(v, np.ndarray):
+                raise ValueError("Expect numpy ndarray as input")
+            v = np.ascontiguousarray(v, dtype=np.float32)
+            _check_call(_LIB.MXPredSetInput(
+                self.handle, c_str(k),
+                v.ctypes.data_as(mx_float_p),
+                mx_uint(v.size)))
+        _check_call(_LIB.MXPredForward(self.handle))
+
+    def get_output(self, index):
+        """Get the index-th output.
+
+        Parameters
+        ----------
+        index : int
+            The index of the output.
+
+        Returns
+        -------
+        out : numpy array.
+            The output array.
+        """
+        pdata = ctypes.POINTER(mx_uint)()
+        ndim = mx_uint()
+        _check_call(_LIB.MXPredGetOutputShape(
+            self.handle, index,
+            ctypes.byref(pdata),
+            ctypes.byref(ndim)))
+        shape = tuple(pdata[:ndim.value])
+        data = np.empty(shape, dtype=np.float32)
+        _check_call(_LIB.MXPredGetOutput(
+            self.handle, mx_uint(index),
+            data.ctypes.data_as(mx_float_p),
+            mx_uint(data.size)))
+        return data
+
+
+def load_ndarray_file(nd_file):
+    """Load an ndarray file and return as a list of numpy arrays.
+
+    Parameters
+    ----------
+    nd_file : str
+        The name of the ndarray file.
+
+    Returns
+    -------
+    out : dict of str to numpy array or list of numpy array
+        The output list or dict, depending on whether the saved type is list or dict.
+    """
+    handle = NDListHandle()
+    olen = mx_uint()
+    _check_call(_LIB.MXNDListCreate(
+        c_str(nd_file), ctypes.byref(handle), ctypes.byref(olen)))
+    keys = []
+    arrs = []
+
+    for i in range(olen.value):
+        key = ctypes.c_char_p()
+        cptr = mx_float_p()
+        pdata = ctypes.POINTER(mx_uint)()
+        ndim = mx_uint()
+        _check_call(_LIB.MXNDListGet(
+            handle, mx_uint(i), ctypes.byref(key),
+            ctypes.byref(cptr), ctypes.byref(pdata), ctypes.byref(ndim)))
+        shape = tuple(pdata[:ndim.value])
+        dbuffer = (mx_float * np.prod(shape)).from_address(ctypes.addressof(cptr.contents))
+        ret = np.frombuffer(dbuffer, dtype=np.float32).reshape(shape)
+        keys.append(py_str(key.value))
+        arrs.append(ret)
+    _check_call(_LIB.MXNDListFree(handle))
+
+    if len(keys) == 0 or len(keys[0]) == 0:
+        return arrs
+    else:
+        return {keys[i] : arrs[i] for i in range(len(keys))}
diff --git a/python/mxnet/model.py b/python/mxnet/model.py
index 6e6e01287539..d09f37eb8d90 100644
--- a/python/mxnet/model.py
+++ b/python/mxnet/model.py
@@ -608,8 +608,6 @@ def _init_iter(self, X, y, is_train):
             raise TypeError('y must be ndarray when X is numpy.ndarray')
         if X.shape[0] != y.shape[0]:
             raise ValueError("The numbers of data points and labels not equal")
-        if X.ndim != 2:
-            raise ValueError("Data must be 2D")
         if y.ndim == 2 and y.shape[1] == 1:
             y = y.flatten()
         if y.ndim != 1:
diff --git a/src/c_api.cc b/src/c_api/c_api.cc
similarity index 91%
rename from src/c_api.cc
rename to src/c_api/c_api.cc
index 5be4ad29a150..1d5fef33768a 100644
--- a/src/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -20,26 +20,13 @@
 #include
 #include
 #include
-
-// macro hanlding for threadlocal variables
-#ifdef __GNUC__
-  #define MX_TREAD_LOCAL __thread
-#elif __STDC_VERSION__ >= 201112L
-  #define MX_TREAD_LOCAL _Thread_local
-#elif defined(_MSC_VER)
-  #define MX_TREAD_LOCAL __declspec(thread)
-#endif
-
-#ifndef MX_TREAD_LOCAL
-#message("Warning: Threadlocal is not enabled");
-#endif
+#include "./c_api_error.h"
+#include "../common/thread_local.h"
 
 using namespace mxnet;
 
 /*! \brief entry to to easily hold returning information */
 struct MXAPIThreadLocalEntry {
-  /*! \brief holds last error message */
-  std::string last_error;
   /*!
\brief result holder for returning string */ std::string ret_str; /*! \brief result holder for returning strings */ @@ -68,84 +55,8 @@ struct MXAPIThreadLocalEntry { } }; -/*! - * \brief A threadlocal store to store threadlocal variables. - * Will return a thread local singleton of type T - * \tparam T the type we like to store - */ -class MXAPIThreadLocalStore { - public: - /*! \brief store return entry */ - typedef MXAPIThreadLocalEntry T; - /*! \return get a thread local singleton */ - static T* Get() { - static MX_TREAD_LOCAL T* ptr = nullptr; - if (ptr == nullptr) { - ptr = new T(); - Singleton()->RegisterDelete(ptr); - } - return ptr; - } - - private: - /*! \brief constructor */ - MXAPIThreadLocalStore() {} - /*! \brief destructor */ - ~MXAPIThreadLocalStore() { - for (size_t i = 0; i < data_.size(); ++i) { - delete data_[i]; - } - } - /*! \return singleton of the store */ - static MXAPIThreadLocalStore *Singleton() { - static MXAPIThreadLocalStore inst; - return &inst; - } - /*! - * \brief register str for internal deletion - * \param str the string pointer - */ - void RegisterDelete(T *str) { - std::unique_lock lock(mutex_); - data_.push_back(str); - lock.unlock(); - } - /*! \brief internal mutex */ - std::mutex mutex_; - /*!\brief internal data */ - std::vector data_; -}; - -// NOTE: all functions return 0 upon success -// consider add try/catch block for user error -// handling in the future - -/*! \brief macro to guard beginning and end section of all functions */ -#define API_BEGIN() try { -/*! \brief every function starts with API_BEGIN(); - and finishes with API_END() or API_END_HANDLE_ERROR */ -#define API_END() } catch(dmlc::Error &_except_) { return MXHandleException(_except_); } return 0; -/*! - * \brief every function starts with API_BEGIN(); - * and finishes with API_END() or API_END_HANDLE_ERROR - * The finally clause contains procedure to cleanup states when an error happens. - */ -#define API_END_HANDLE_ERROR(Finalize) } catch(dmlc::Error &_except_) { Finalize; return MXHandleException(_except_); } return 0; // NOLINT(*) - -/*! \brief return str message of the last error */ -const char *MXGetLastError() { - return MXAPIThreadLocalStore::Get()->last_error.c_str(); -} - -/*! - * \brief handle exception throwed out - * \param e the exception - * \return the return value of API after exception is handled - */ -int MXHandleException(const dmlc::Error &e) { - MXAPIThreadLocalStore::Get()->last_error = e.what(); - return -1; -} +// define the threadlocal store. +typedef mxnet::common::ThreadLocalStore MXAPIThreadLocalStore; // Internal function to get the information // from function registry diff --git a/src/c_api/c_api_error.cc b/src/c_api/c_api_error.cc new file mode 100644 index 000000000000..2e9c74985d8f --- /dev/null +++ b/src/c_api/c_api_error.cc @@ -0,0 +1,21 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file c_api_error.cc + * \brief C error handling + */ +#include "./c_api_error.h" +#include "../common/thread_local.h" + +struct ErrorEntry { + std::string last_error; +}; + +typedef mxnet::common::ThreadLocalStore MXAPIErrorStore; + +const char *MXGetLastError() { + return MXAPIErrorStore::Get()->last_error.c_str(); +} + +void MXAPISetLastError(const char* msg) { + MXAPIErrorStore::Get()->last_error = msg; +} diff --git a/src/c_api/c_api_error.h b/src/c_api/c_api_error.h new file mode 100644 index 000000000000..fe47052f704b --- /dev/null +++ b/src/c_api/c_api_error.h @@ -0,0 +1,39 @@ +/*! 
+ * Copyright (c) 2015 by Contributors
+ * \file c_api_error.h
+ * \brief Error handling for C API.
+ */
+#ifndef MXNET_C_API_C_API_ERROR_H_
+#define MXNET_C_API_C_API_ERROR_H_
+
+#include
+#include
+#include
+
+/*! \brief macro to guard beginning and end section of all functions */
+#define API_BEGIN() try {
+/*! \brief every function starts with API_BEGIN();
+     and finishes with API_END() or API_END_HANDLE_ERROR */
+#define API_END() } catch(dmlc::Error &_except_) { return MXAPIHandleException(_except_); } return 0;  // NOLINT(*)
+/*!
+ * \brief every function starts with API_BEGIN();
+ *   and finishes with API_END() or API_END_HANDLE_ERROR
+ *   The finally clause contains procedure to cleanup states when an error happens.
+ */
+#define API_END_HANDLE_ERROR(Finalize) } catch(dmlc::Error &_except_) { Finalize; return MXAPIHandleException(_except_); } return 0; // NOLINT(*)
+
+/*!
+ * \brief Set the last error message needed by C API
+ * \param msg The error message to set.
+ */
+void MXAPISetLastError(const char* msg);
+/*!
+ * \brief handle exception thrown out
+ * \param e the exception
+ * \return the return value of API after exception is handled
+ */
+inline int MXAPIHandleException(const dmlc::Error &e) {
+  MXAPISetLastError(e.what());
+  return -1;
+}
+#endif  // MXNET_C_API_C_API_ERROR_H_
diff --git a/src/c_api/c_predict_api.cc b/src/c_api/c_predict_api.cc
new file mode 100644
index 000000000000..c0d6deafcb60
--- /dev/null
+++ b/src/c_api/c_predict_api.cc
@@ -0,0 +1,231 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file c_predict_api.cc
+ * \brief C predict API of mxnet
+ */
+#include
+#include
+#include
+#include
+#include "./c_api_error.h"
+
+using namespace mxnet;
+
+// predictor interface
+struct MXAPIPredictor {
+  // output arrays
+  std::vector<NDArray> out_arrays;
+  // argument arrays
+  std::vector<NDArray> arg_arrays;
+  // output shapes
+  std::vector<TShape> out_shapes;
+  // key to arguments
+  std::unordered_map<std::string, size_t> key2arg;
+  // executor
+  std::unique_ptr<Executor> exec;
+};
+
+struct MXAPINDList {
+  std::vector<std::string> keys;
+  std::vector<TShape> shapes;
+  std::vector<size_t> indptr;
+  std::vector<mx_float> data;
+};
+
+int MXPredCreate(const char* symbol_file,
+                 const char* param_file,
+                 int dev_type, int dev_id,
+                 mx_uint num_input_nodes,
+                 const char** input_keys,
+                 const mx_uint* input_shape_indptr,
+                 const mx_uint* input_shape_data,
+                 PredictorHandle* out) {
+  MXAPIPredictor* ret = new MXAPIPredictor();
+  API_BEGIN();
+  Symbol sym;
+  // load in the symbol.
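+  // The symbol file read below is the network JSON saved from a Symbol, and
+  // the parameter file is a saved NDArray list whose entry names carry an
+  // "arg:" or "aux:" prefix; the prefixes are stripped while filling
+  // arg_params and aux_params.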
+  {
+    std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(symbol_file, "r"));
+    dmlc::istream is(fi.get());
+    dmlc::JSONReader reader(&is);
+    sym.Load(&reader);
+    is.set_stream(nullptr);
+  }
+  // load the parameters
+  std::unordered_map<std::string, NDArray> arg_params, aux_params;
+  {
+    std::vector<NDArray> data;
+    std::vector<std::string> names;
+    NDArray::Load(param_file, &data, &names);
+    CHECK_EQ(names.size(), data.size())
+        << "Invalid param file format";
+    for (size_t i = 0; i < names.size(); ++i) {
+      if (!strncmp(names[i].c_str(), "aux:", 4)) {
+        aux_params[std::string(names[i].c_str() + 4)] = data[i];
+      }
+      if (!strncmp(names[i].c_str(), "arg:", 4)) {
+        arg_params[std::string(names[i].c_str() + 4)] = data[i];
+      }
+    }
+  }
+
+  // shape inference and bind
+  std::unordered_map<std::string, TShape> known_shape;
+  for (mx_uint i = 0; i < num_input_nodes; ++i) {
+    known_shape[std::string(input_keys[i])] =
+        TShape(input_shape_data + input_shape_indptr[i],
+               input_shape_data + input_shape_indptr[i + 1]);
+  }
+  std::vector<TShape> arg_shapes;
+  std::vector<std::string> arg_names = sym.ListArguments();
+  std::vector<std::string> aux_names = sym.ListAuxiliaryStates();
+  std::vector<TShape> out_shapes(sym.ListOutputs().size());
+  std::vector<TShape> aux_shapes(aux_names.size());
+  for (size_t i = 0; i < arg_names.size(); ++i) {
+    std::string key = arg_names[i];
+    ret->key2arg[key] = i;
+    if (known_shape.count(key) != 0) {
+      arg_shapes.push_back(known_shape[key]);
+    } else {
+      arg_shapes.push_back(TShape());
+    }
+  }
+  CHECK(sym.InferShape(&arg_shapes, &out_shapes, &aux_shapes))
+      << "The shape information is not enough to get the shapes";
+  ret->out_shapes = out_shapes;
+  Context ctx = Context::Create(static_cast<Context::DeviceType>(dev_type), dev_id);
+
+  std::vector<NDArray> arg_arrays, aux_arrays;
+  for (size_t i = 0; i < arg_shapes.size(); ++i) {
+    NDArray nd = NDArray(arg_shapes[i], ctx);
+    if (arg_params.count(arg_names[i]) != 0) {
+      CopyFromTo(arg_params[arg_names[i]], &nd);
+    }
+    arg_arrays.push_back(nd);
+  }
+  for (size_t i = 0; i < aux_shapes.size(); ++i) {
+    NDArray nd = NDArray(aux_shapes[i], ctx);
+    if (aux_params.count(aux_names[i]) != 0) {
+      CopyFromTo(aux_params[aux_names[i]], &nd);
+    }
+    aux_arrays.push_back(nd);
+  }
+  ret->arg_arrays = arg_arrays;
+  // bind
+  {
+    std::vector<NDArray> grad_store(arg_arrays.size());
+    std::vector<OpReqType> grad_req(arg_arrays.size(), kNullOp);
+    ret->exec.reset(Executor::Bind(sym, ctx, arg_arrays,
+                                   grad_store, grad_req,
+                                   aux_arrays));
+    ret->out_arrays = ret->exec->outputs();
+  }
+  *out = ret;
+  API_END_HANDLE_ERROR(delete ret);
+}
+
+int MXPredGetOutputShape(PredictorHandle handle,
+                         mx_uint out_index,
+                         mx_uint** shape_data,
+                         mx_uint* shape_ndim) {
+  MXAPIPredictor* p = static_cast<MXAPIPredictor*>(handle);
+  API_BEGIN();
+  CHECK_LT(out_index, p->out_arrays.size())
+      << "Index exceeds number of outputs";
+  *shape_data = p->out_shapes[out_index].data();
+  *shape_ndim = p->out_shapes[out_index].ndim();
+  API_END();
+}
+
+int MXPredSetInput(PredictorHandle handle,
+                   const char* key,
+                   const mx_float* data,
+                   mx_uint size) {
+  MXAPIPredictor* p = static_cast<MXAPIPredictor*>(handle);
+  API_BEGIN();
+  auto it = p->key2arg.find(key);
+  if (it == p->key2arg.end()) {
+    LOG(FATAL) << "cannot find input key " << key;
+  }
+  NDArray& nd = p->arg_arrays[it->second];
+  nd.SyncCopyFromCPU(data, size);
+  API_END();
+}
+
+int MXPredForward(PredictorHandle handle) {
+  MXAPIPredictor* p = static_cast<MXAPIPredictor*>(handle);
+  API_BEGIN();
+  p->exec->Forward(false);
+  API_END();
+}
+
+int MXPredGetOutput(PredictorHandle handle,
+                    mx_uint index,
+                    mx_float* data,
+                    mx_uint size) {
+  MXAPIPredictor* p = static_cast<MXAPIPredictor*>(handle);
+  API_BEGIN();
+  CHECK_LT(index, p->out_arrays.size())
+      << "Output index out of range";
+  const NDArray& nd = p->out_arrays[index];
+  nd.SyncCopyToCPU(data, size);
+  API_END();
+}
+
+int MXPredFree(PredictorHandle handle) {
+  API_BEGIN();
+  delete static_cast<MXAPIPredictor*>(handle);
+  API_END();
+}
+
+int MXNDListCreate(const char* nd_file,
+                   NDListHandle *out,
+                   mx_uint* out_length) {
+  MXAPINDList* ret = new MXAPINDList();
+  API_BEGIN();
+  std::vector<NDArray> arrays;
+  NDArray::Load(nd_file,
+                &(arrays),
+                &(ret->keys));
+  if (ret->keys.size() == 0) {
+    ret->keys.resize(arrays.size());
+  }
+  ret->indptr.push_back(0);
+  for (size_t i = 0; i < arrays.size(); ++i) {
+    TShape shape = arrays[i].shape();
+    size_t begin = ret->data.size();
+    size_t size = shape.Size();
+    ret->shapes.push_back(shape);
+    ret->data.resize(begin + size);
+    arrays[i].SyncCopyToCPU(dmlc::BeginPtr(ret->data) + begin, size);
+    ret->indptr.push_back(begin + size);
+  }
+  *out = ret;
+  *out_length = static_cast<mx_uint>(arrays.size());
+  API_END();
+}
+
+int MXNDListGet(NDListHandle handle,
+                mx_uint index,
+                const char** out_key,
+                const mx_float** out_data,
+                const mx_uint** out_shape,
+                mx_uint* out_ndim) {
+  MXAPINDList* p = static_cast<MXAPINDList*>(handle);
+  API_BEGIN();
+  CHECK_LT(index, p->shapes.size())
+      << "Index out of range";
+  *out_key = p->keys[index].c_str();
+  *out_data = dmlc::BeginPtr(p->data) + p->indptr[index];
+  *out_shape = p->shapes[index].data();
+  *out_ndim = p->shapes[index].ndim();
+  API_END();
+}
+
+int MXNDListFree(NDListHandle handle) {
+  API_BEGIN();
+  delete static_cast<MXAPINDList*>(handle);
+  API_END();
+}
+
+
diff --git a/src/common/tblob_op_registry.h b/src/common/tblob_op_registry.h
index 495144aa931e..2a521909d49b 100644
--- a/src/common/tblob_op_registry.h
+++ b/src/common/tblob_op_registry.h
@@ -64,8 +64,7 @@ class TBlobOpRegEntry {
   std::string name;
   /*!
    * \brief set shape inference function, by default use same shape.
-   * \param dev_mask The device mask of the function can act on.
-   * \param funary The unary function that peforms the operation.
+   * \param fshapeinfer The unary function that performs the operation.
    */
   virtual TSelf& set_shape_infer(UnaryShapeInfer fshapeinfer) = 0;
   /*!
diff --git a/src/common/thread_local.h b/src/common/thread_local.h
new file mode 100644
index 000000000000..4853694df79f
--- /dev/null
+++ b/src/common/thread_local.h
@@ -0,0 +1,77 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file thread_local.h
+ * \brief Common utility for thread local storage.
+ */
+#ifndef MXNET_COMMON_THREAD_LOCAL_H_
+#define MXNET_COMMON_THREAD_LOCAL_H_
+
+#include
+#include
+#include
+
+namespace mxnet {
+namespace common {
+
+// macro handling for threadlocal variables
+#ifdef __GNUC__
+  #define MX_TREAD_LOCAL __thread
+#elif __STDC_VERSION__ >= 201112L
+  #define MX_TREAD_LOCAL _Thread_local
+#elif defined(_MSC_VER)
+  #define MX_TREAD_LOCAL __declspec(thread)
+#endif
+
+#ifndef MX_TREAD_LOCAL
+#message("Warning: Threadlocal is not enabled");
+#endif
+
+/*!
+ * \brief A threadlocal store to store threadlocal variables.
+ *  Will return a thread local singleton of type T
+ * \tparam T the type we like to store
+ */
+template<typename T>
+class ThreadLocalStore {
+ public:
+  /*! \return get a thread local singleton */
+  static T* Get() {
+    static MX_TREAD_LOCAL T* ptr = nullptr;
+    if (ptr == nullptr) {
+      ptr = new T();
+      Singleton()->RegisterDelete(ptr);
+    }
+    return ptr;
+  }
+
+ private:
+  /*! \brief constructor */
+  ThreadLocalStore() {}
+  /*! \brief destructor */
+  ~ThreadLocalStore() {
+    for (size_t i = 0; i < data_.size(); ++i) {
+      delete data_[i];
+    }
+  }
+  /*! \return singleton of the store */
+  static ThreadLocalStore *Singleton() {
+    static ThreadLocalStore inst;
+    return &inst;
+  }
+  /*!
+   * \brief register str for internal deletion
+   * \param str the string pointer
+   */
+  void RegisterDelete(T *str) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    data_.push_back(str);
+    lock.unlock();
+  }
+  /*! \brief internal mutex */
+  std::mutex mutex_;
+  /*!\brief internal data */
+  std::vector<T*> data_;
+};
+}  // namespace common
+}  // namespace mxnet
+#endif  // MXNET_COMMON_THREAD_LOCAL_H_
diff --git a/tests/python/predict/mxnet_predict_example.py b/tests/python/predict/mxnet_predict_example.py
new file mode 100644
index 000000000000..8760c87a50f7
--- /dev/null
+++ b/tests/python/predict/mxnet_predict_example.py
@@ -0,0 +1,61 @@
+import sys, os
+curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+sys.path.append("../../predict/python/")
+sys.path.append("../../python/")
+
+from mxnet_predict import Predictor, load_ndarray_file
+import mxnet as mx
+import logging
+import numpy as np
+from skimage import io, transform
+
+# Load the pre-trained model
+prefix = "Inception/Inception_BN"
+num_round = 39
+symbol_file = "%s-symbol.json" % prefix
+param_file = "%s-0039.params" % prefix
+predictor = Predictor(symbol_file, param_file, {'data':(1, 3, 224, 224)})
+mean_img = load_ndarray_file("Inception/mean_224.nd")["mean_img"]
+
+synset = [l.strip() for l in open('Inception/synset.txt').readlines()]
+
+def PreprocessImage(path, show_img=False):
+    # load image
+    img = io.imread(path)
+    print("Original Image Shape: ", img.shape)
+    # we crop image from center
+    short_edge = min(img.shape[:2])
+    yy = int((img.shape[0] - short_edge) / 2)
+    xx = int((img.shape[1] - short_edge) / 2)
+    crop_img = img[yy : yy + short_edge, xx : xx + short_edge]
+    # resize to 224, 224
+    resized_img = transform.resize(crop_img, (224, 224))
+    if show_img:
+        io.imshow(resized_img)
+    # convert to numpy.ndarray
+    sample = np.asarray(resized_img) * 256
+    # swap channel from RGB to BGR
+    sample = sample[:, :, [2,1,0]]
+    # swap axes to make image from (224, 224, 3) to (3, 224, 224)
+    sample = np.swapaxes(sample, 0, 2)
+    sample = np.swapaxes(sample, 1, 2)
+
+    # sub mean
+    normed_img = sample - mean_img
+    normed_img.resize(1, 3, 224, 224)
+    return normed_img
+
+# Get preprocessed batch (single image batch)
+batch = PreprocessImage('./download.png', True)
+
+predictor.forward(data=batch)
+prob = predictor.get_output(0)[0]
+
+pred = np.argsort(prob)[::-1]
+# Get top1 label
+top1 = synset[pred[0]]
+print("Top1: ", top1)
+# Get top5 label
+top5 = [synset[pred[i]] for i in range(5)]
+print("Top5: ", top5)
+
From a6bd422395888ce74350ec14b3519ed74e6e1726 Mon Sep 17 00:00:00 2001
From: piiswrong
Date: Thu, 22 Oct 2015 23:01:03 -0700
Subject: [PATCH 090/122] a python script for generating image list required
 by im2rec and an option center crop that allow im2rec to output rectangular
 images

---
 tools/im2rec.cc    | 17 ++++++++++-
 tools/make_list.py | 73 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+), 1 deletion(-)
 create mode 100644 tools/make_list.py

diff --git a/tools/im2rec.cc b/tools/im2rec.cc
index 61cee06bd23d..89410bae2e6a 100644
--- a/tools/im2rec.cc
+++ b/tools/im2rec.cc
@@ -28,13 +28,15 @@ int main(int argc, char *argv[]) {
            "\tresize=newsize resize the shorter edge of image to the newsize, original images will be packed by default\n"\
            "\tlabel_width=WIDTH[default=1] specify the label_width in the list, by default set to 1\n"\
            "\tnsplit=NSPLIT[default=1] used for part generation, logically split the image.list to NSPLIT parts by position\n"\
-           "\tpart=PART[default=0] used for part generation, pack the images from the specific part in image.list\n");
+           "\tpart=PART[default=0] used for part generation, pack the images from the specific part in image.list\n"
+           "\tcenter_crop=CENTER_CROP[default=0] specify whether to crop the center of the image to make it square.\n");
     return 0;
   }
   int label_width = 1;
   int new_size = -1;
   int nsplit = 1;
   int partid = 0;
+  int center_crop = 0;
   for (int i = 4; i < argc; ++i) {
     char key[128], val[128];
     if (sscanf(argv[i], "%[^=]=%s", key, val) == 2) {
@@ -42,6 +44,7 @@
       if (!strcmp(key, "label_width")) label_width = atoi(val);
       if (!strcmp(key, "nsplit")) nsplit = atoi(val);
       if (!strcmp(key, "part")) partid = atoi(val);
+      if (!strcmp(key, "center_crop")) center_crop = atoi(val);
     }
   }
   if (new_size > 0) {
@@ -49,6 +52,9 @@
   } else {
     LOG(INFO) << "Keep origin image size";
   }
+  if (center_crop) {
+    LOG(INFO) << "Center cropping to square";
+  }
   using namespace dmlc;
   const static size_t kBufferSize = 1 << 20UL;
@@ -111,6 +117,15 @@
     if (new_size > 0) {
       cv::Mat img = cv::imdecode(decode_buf, CV_LOAD_IMAGE_COLOR);
       CHECK(img.data != NULL) << "OpenCV decode fail:" << path;
+      if (center_crop) {
+        if (img.rows > img.cols) {
+          int margin = (img.rows - img.cols)/2;
+          img = img(cv::Range(margin, margin+img.cols), cv::Range(0, img.cols));
+        } else {
+          int margin = (img.cols - img.rows)/2;
+          img = img(cv::Range(0, img.rows), cv::Range(margin, margin + img.rows));
+        }
+      }
       cv::Mat res;
       if (img.rows > img.cols) {
         cv::resize(img, res, cv::Size(new_size, img.rows * new_size / img.cols),
diff --git a/tools/make_list.py b/tools/make_list.py
new file mode 100644
index 000000000000..926902807a54
--- /dev/null
+++ b/tools/make_list.py
@@ -0,0 +1,73 @@
+import os
+import random
+import numpy as np
+import argparse
+
+def list_image(root, recursive, exts):
+    image_list = []
+    if recursive:
+        cat = {}
+        for path, subdirs, files in os.walk(root):
+            print path
+            for fname in files:
+                fpath = os.path.join(path, fname)
+                suffix = os.path.splitext(fname)[1].lower()
+                if os.path.isfile(fpath) and (suffix in exts):
+                    if path not in cat:
+                        cat[path] = len(cat)
+                    image_list.append((os.path.relpath(fpath, root), cat[path]))
+    else:
+        for fname in os.listdir(root):
+            fpath = os.path.join(root, fname)
+            suffix = os.path.splitext(fname)[1].lower()
+            if os.path.isfile(fpath) and (suffix in exts):
+                image_list.append((os.path.relpath(fpath, root), 0))
+    return image_list
+
+def write_list(path_out, image_list):
+    with open(path_out, 'w') as fout:
+        for i in xrange(len(image_list)):
+            fout.write('%d \t %d \t %s\n'%(i, image_list[i][1], image_list[i][0]))
+
+
+def make_list(prefix_out, root, recursive, exts, num_chunks, train_ratio):
+    image_list = list_image(root, recursive, exts)
+    random.shuffle(image_list)
+    N = len(image_list)
+    chunk_size = N/num_chunks
+    for i in xrange(num_chunks):
+        chunk = image_list[i*chunk_size:(i+1)*chunk_size]
+        if num_chunks > 1:
+            str_chunk = '_%d'%i
+        else:
+            str_chunk = ''
+        if train_ratio < 1:
+            sep = int(chunk_size*train_ratio)
+            write_list(prefix_out+str_chunk+'_train.lst', chunk[:sep])
+            write_list(prefix_out+str_chunk+'_val.lst', chunk[sep:])
+        else:
+            write_list(prefix_out+str_chunk+'.lst', chunk)
+
+def main():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description='Make image list files that are\
+        required by im2rec')
+    parser.add_argument('root', help='path to folder that contain images.')
+    parser.add_argument('prefix', help='prefix of output list files.')
+    parser.add_argument('--exts', nargs='+', default=['.jpeg', '.jpg'],
+                        help='list of acceptable image extensions.')
+    parser.add_argument('--chunks', type=int, default=1, help='number of chunks.')
+    parser.add_argument('--train_ratio', type=float, default=1.0,
+                        help='Percent of images to use for training.')
+    parser.add_argument('--recursive', action='store_true',
+                        help='If set, recursively walk through subdirs and assign a unique label\
+                        to images in each folder. Otherwise only include images in the root folder\
+                        and give them label 0.')
+    args = parser.parse_args()
+
+    make_list(args.prefix, args.root, args.recursive,
+              args.exts, args.chunks, args.train_ratio)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
From d5dd41920122822ad68d9888e2509e9bf6291a96 Mon Sep 17 00:00:00 2001
From: piiswrong
Date: Fri, 23 Oct 2015 22:32:51 -0700
Subject: [PATCH 091/122] fixed get_with_shape calls to comply with changes in
 mshadow

---
 src/operator/cudnn_activation-inl.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/operator/cudnn_activation-inl.h b/src/operator/cudnn_activation-inl.h
index 99bbfe93e871..cf5c49f9fd59 100644
--- a/src/operator/cudnn_activation-inl.h
+++ b/src/operator/cudnn_activation-inl.h
@@ -52,8 +52,7 @@ class CuDNNActivationOp : public Operator {
     Tensor data;
     Tensor out;
     if (in_data[kData].ndim() == 2) {
-      uint32_t ds[] = {in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1};
-      TShape dshape(ds, ds + 4);
+      Shape<4> dshape = Shape4(in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1);
      data = in_data[kData].get_with_shape(dshape, s);
      out = out_data[kOut].get_with_shape(dshape, s);
     } else {
@@ -106,8 +105,7 @@ class CuDNNActivationOp : public Operator {
     Tensor output_data;
     Tensor input_grad;
     if (in_data[kData].ndim() == 2) {
-      uint32_t ds[] = {in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1};
-      TShape dshape(ds, ds + 4);
+      Shape<4> dshape = Shape4(in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1);
      data = in_data[kData].get_with_shape(dshape, s);
      grad = out_grad[kOut].get_with_shape(dshape, s);
      output_data = out_data[kOut].get_with_shape(dshape, s);
From 7f6376634e1fcbe1f1eede0407f9dcc20dee384d Mon Sep 17 00:00:00 2001
From: piiswrong
Date: Fri, 23 Oct 2015 22:33:56 -0700
Subject: [PATCH 092/122] multi output softmax

---
 CONTRIBUTORS.md            |  1 +
 mshadow                    |  2 +-
 src/operator/softmax-inl.h | 45 ++++++++++++++++++++++++++++++--------
 3 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 47d0de5078a7..9f72042fb3ce 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -54,3 +54,4 @@ List of Contributors
 * [Shuzhe Wu](https://github.com/II-Matto)
 * [Xiaodong](https://github.com/XD-DENG)
 * [Nan Xiao](https://github.com/road2stat)
+* [Junyuan Xie](https://github.com/piiswrong)
diff --git a/mshadow b/mshadow
index f00b2086218e..ded43f16aedd 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit f00b2086218e116bd78a84d8e96d3bef8d4229d1
+Subproject commit ded43f16aeddd607fde8c2754df67c92bbcaee76
diff --git a/src/operator/softmax-inl.h b/src/operator/softmax-inl.h
index 87cab0cb4568..ca17f02f593c 100644
--- a/src/operator/softmax-inl.h
+++ b/src/operator/softmax-inl.h
@@ -25,9 +25,14 @@ enum SoftmaxOpOutputs {kOut};

 struct SoftmaxParam : public dmlc::Parameter<SoftmaxParam> {
   float grad_scale;
+  bool multi_output;
   DMLC_DECLARE_PARAMETER(SoftmaxParam) {
     DMLC_DECLARE_FIELD(grad_scale).set_default(1.0f)
     .describe("Scale the gradient by a float factor");
+    DMLC_DECLARE_FIELD(multi_output).set_default(false)
+    .describe("If set to true, for a (n,k,x_1,..,x_n) dimensional "
+              "input tensor, softmax will generate n*x_1*...*x_n outputs, "
+              "each of which has k classes");
   };
 };

@@ -46,9 +51,18 @@ class SoftmaxOp : public Operator {
     CHECK_EQ(in_data.size(), 2) << "Softmax Input: [data, label]";
     CHECK_EQ(out_data.size(), 1) << "Softmax Output: [output]";
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 2> data = in_data[kData].FlatTo2D<xpu, real_t>(s);
-    Tensor<xpu, 2> out = out_data[kOut].FlatTo2D<xpu, real_t>(s);
-    Softmax(out, data);
+    if (param_.multi_output) {
+      int n = in_data[kData].size(0);
+      int k = in_data[kData].size(1);
+      Shape<3> s3 = Shape3(n, k, static_cast<index_t>(in_data[kData].Size()/n/k));
+      Tensor<xpu, 3> data = in_data[kData].get_with_shape<xpu, 3, real_t>(s3, s);
+      Tensor<xpu, 3> out = out_data[kOut].get_with_shape<xpu, 3, real_t>(s3, s);
+      Softmax(out, data);
+    } else {
+      Tensor<xpu, 2> data = in_data[kData].FlatTo2D<xpu, real_t>(s);
+      Tensor<xpu, 2> out = out_data[kOut].FlatTo2D<xpu, real_t>(s);
+      Softmax(out, data);
+    }
   }

   virtual void Backward(const OpContext &ctx,
@@ -65,12 +79,25 @@ class SoftmaxOp : public Operator {
     CHECK_GE(in_grad.size(), 1);
     CHECK_GE(req.size(), 1);
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 1> label = in_data[kLabel].get<xpu, 1, real_t>(s);
-    Tensor<xpu, 2> out = out_data[kOut].FlatTo2D<xpu, real_t>(s);
-    Tensor<xpu, 2> grad = in_grad[kData].FlatTo2D<xpu, real_t>(s);
-    SoftmaxGrad(grad, out, label);
-    if (param_.grad_scale < 1.0) {
-      grad *= param_.grad_scale;
+    if (param_.multi_output) {
+      int n = out_data[kOut].size(0);
+      int k = out_data[kOut].size(1);
+      Shape<3> s3 = Shape3(n, k, static_cast<index_t>(out_data[kOut].Size()/n/k));
+      Tensor<xpu, 2> label = in_data[kLabel].FlatTo2D<xpu, real_t>(s);
+      Tensor<xpu, 3> out = out_data[kOut].get_with_shape<xpu, 3, real_t>(s3, s);
+      Tensor<xpu, 3> grad = in_grad[kData].get_with_shape<xpu, 3, real_t>(s3, s);
+      SoftmaxGrad(grad, out, label);
+      if (param_.grad_scale < 1.0) {
+        grad *= param_.grad_scale;
+      }
+    } else {
+      Tensor<xpu, 1> label = in_data[kLabel].get<xpu, 1, real_t>(s);
+      Tensor<xpu, 2> out = out_data[kOut].FlatTo2D<xpu, real_t>(s);
+      Tensor<xpu, 2> grad = in_grad[kData].FlatTo2D<xpu, real_t>(s);
+      SoftmaxGrad(grad, out, label);
+      if (param_.grad_scale < 1.0) {
+        grad *= param_.grad_scale;
+      }
     }
   }
From 18ab5d9c38bfc54d18c970ce7367b22624d49c81 Mon Sep 17 00:00:00 2001
From: Junyuan Xie
Date: Sat, 24 Oct 2015 15:53:35 -0700
Subject: [PATCH 093/122] proper debug flag (-g -G) for nvcc

---
 Makefile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 500ffeaf6623..da5e73ce504b 100644
--- a/Makefile
+++ b/Makefile
@@ -34,7 +34,11 @@ else
 endif
 CFLAGS += -I./mshadow/ -I./dmlc-core/include -fPIC -Iinclude $(MSHADOW_CFLAGS)
 LDFLAGS = -pthread $(MSHADOW_LDFLAGS) $(DMLC_LDFLAGS)
-NVCCFLAGS = --use_fast_math -g -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
+ifeq ($(DEBUG), 1)
+	NVCCFLAGS = -g -G -O0 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
+else
+	NVCCFLAGS = --use_fast_math -g -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
+endif
 ROOTDIR = $(CURDIR)

 ifndef LINT_LANG
From 511c041c875f6359d8362a7f78bb8f85d459d292 Mon Sep 17 00:00:00 2001
From: Chiyuan Zhang
Date: Sat, 24 Oct 2015 21:45:40 -0400
Subject: [PATCH 094/122] also do grad clipping when momentum == 0

---
 python/mxnet/optimizer.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index e72a0cdd1163..105fc5646f75 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -144,19 +144,19 @@ def update(self, index, weight, grad, state):
             lr = self.lr_scheduler(self.iteration)
         else:
             lr = self.lr
+
+        grad = grad * self.rescale_grad
+        if self.clip_gradient is not None:
+            grad = clip(grad, -self.clip_gradient, self.clip_gradient)
+
         if state:
             mom = state
             mom[:] *= self.momentum
-            if self.clip_gradient == None:
-                mom[:] += -lr * (grad * self.rescale_grad + self.wd * weight)
-            else:
-                mom[:] += -lr * (clip(grad * self.rescale_grad, -self.clip_gradient,
-                                      self.clip_gradient) +
-                                 self.wd * weight)
+            mom[:] += -lr * (grad + self.wd * weight)
             weight[:] += mom
         else:
             assert self.momentum == 0.0
-            weight[:] += -lr * (grad * self.rescale_grad + self.wd * weight)
+            weight[:] += -lr * (grad + self.wd * weight)

 @register
 class Test(Optimizer):
From 86b2c1242348552af47b619cfbc1c85ae7b78415 Mon Sep 17 00:00:00 2001
From: Junyuan Xie
Date: Sat, 24 Oct 2015 15:57:42 -0700
Subject: [PATCH 095/122] multi softmax fix and test case

---
 mshadow                                |  2 +-
 src/operator/softmax-inl.h             |  6 ++++-
 tests/python/gpu/test_operator_gpu.py  |  7 ++++++
 tests/python/unittest/test_operator.py | 33 ++++++++++++++++++++++++++
 4 files changed, 46 insertions(+), 2 deletions(-)
 create mode 100644 tests/python/gpu/test_operator_gpu.py

diff --git a/mshadow b/mshadow
index ded43f16aedd..28ffc0a1e25f 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit ded43f16aeddd607fde8c2754df67c92bbcaee76
+Subproject commit 28ffc0a1e25f0d7d3afff625a8706d2aa4720b78
diff --git a/src/operator/softmax-inl.h b/src/operator/softmax-inl.h
index ca17f02f593c..4a178f19d0aa 100644
--- a/src/operator/softmax-inl.h
+++ b/src/operator/softmax-inl.h
@@ -131,7 +131,11 @@ class SoftmaxProp : public OperatorProperty {
     CHECK_EQ(in_shape->size(), 2) << "Input:[data, label]";
     const TShape &dshape = in_shape->at(0);
     if (dshape.ndim() == 0) return false;
-    SHAPE_ASSIGN_CHECK(*in_shape, kLabel, Shape1(dshape[0]));
+    if (param_.multi_output) {
+      SHAPE_ASSIGN_CHECK(*in_shape, kLabel, Shape2(dshape[0], dshape.Size()/dshape[0]/dshape[1]));
+    } else {
+      SHAPE_ASSIGN_CHECK(*in_shape, kLabel, Shape1(dshape[0]));
+    }
     out_shape->clear();
     out_shape->push_back(dshape);
     return true;
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
new file mode 100644
index 000000000000..f8f43e3d52dc
--- /dev/null
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -0,0 +1,7 @@
+import sys
+sys.path.insert(0, '../unittest')
+from test_operator import *
+
+if __name__ == '__main__':
+    check_softmax_with_shape((3, 4), mx.gpu())
+    check_multi_softmax_with_shape((3, 4, 5), mx.gpu())
\ No newline at end of file
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index fbc007b9fed7..b0743a6f0bb6 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -171,8 +171,41 @@ def test_regression():
                      lambda x: x,
                      lambda x, y : x - y)

+def check_softmax_with_shape(shape, xpu):
+    X = mx.symbol.Variable('X')
+    L = mx.symbol.Variable('L')
+    Y = mx.symbol.Softmax(data=X, label=L)
+    x = mx.random.uniform(-1, 1, shape, ctx = xpu)
+    l = mx.nd.empty((shape[0],), ctx = xpu)
+    l[:] = np.random.randint(0, shape[1]-1, (shape[0],))
+    grad = mx.nd.empty(shape, ctx = xpu)
+
+    exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad})
+    print('forward')
+    exec1.forward()
+    print(exec1.outputs[0].asnumpy())
+    exec1.backward()
+    print(grad.asnumpy())
+
+def check_multi_softmax_with_shape(shape, xpu):
+    X = mx.symbol.Variable('X')
+    L = mx.symbol.Variable('L')
+    Y = mx.symbol.Softmax(data=X, label=L, multi_output=True)
+    x = mx.random.uniform(-1, 1, shape,
ctx = xpu) + l = mx.nd.empty((shape[0], shape[2]), ctx = xpu) + l[:] = np.random.randint(0, shape[1]-1, (shape[0], shape[2])) + grad = mx.nd.empty(shape, ctx = xpu) + + exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad}) + exec1.forward() + print(exec1.outputs[0].asnumpy()) + exec1.backward() + print(grad.asnumpy()) + if __name__ == '__main__': test_elementwise_sum() test_concat() test_slice_channel() test_regression() + #check_softmax_with_shape((3,4), mx.cpu()) + #check_multi_softmax_with_shape((3,4,5), mx.cpu()) From 1e2bd62e37afe31d609da7a50c3722f93c620fbe Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sat, 24 Oct 2015 02:50:12 -0600 Subject: [PATCH 096/122] [Python] Fix numpy IO --- python/mxnet/io.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/mxnet/io.py b/python/mxnet/io.py index 4da24e1c1cf1..5ec7de10bd82 100644 --- a/python/mxnet/io.py +++ b/python/mxnet/io.py @@ -157,6 +157,8 @@ def __init__(self, data, label, self.batch_label[i, 0:actual_size] = label[loc:loc+actual_size] loc += batch_size self.num_pad = batch_size - data.shape[0] % batch_size + if data.shape[0] % batch_size == 0: + self.num_pad = 0 self.out_data = None self.out_label = None self.current_batch = -1 From 7bcbf2422c8c0a91a8d09e09e52bb83ee5c2186d Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sun, 25 Oct 2015 14:06:23 -0600 Subject: [PATCH 097/122] [example] Move --- example/README.md | 1 - example/imagenet/README.md | 6 +- example/imagenet/inception-full.py | 101 ++++++++++++++++++ .../predict-with-pretrained-model.ipynb | 0 4 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 example/imagenet/inception-full.py rename example/{notebooks => imagenet}/predict-with-pretrained-model.ipynb (100%) diff --git a/example/README.md b/example/README.md index 34ad16da9a83..1f8f271d98e4 100644 --- a/example/README.md +++ b/example/README.md @@ -7,7 +7,6 @@ Notebooks * [composite symbol](notebooks/composite_symbol.ipynb) gives you a demo of how to composite a symbolic Inception-BatchNorm Network * [cifar-10 recipe](notebooks/cifar-recipe.ipynb) gives you a step by step demo of how to use MXNet * [cifar-100](notebooks/cifar-100.ipynb) gives you a demo of how to train a 75.68% accuracy CIFAR-100 model -* [predict with pretained model](notebooks/predict-with-pretrained-model.ipynb) gives you a demo of use a pretrained Inception-BN Network * [simple bind](notebooks/simple_bind.ipynb) gives you a demo of some details in ```mx.model``` module. Contents diff --git a/example/imagenet/README.md b/example/imagenet/README.md index 33bc50c9feae..2039b2ec6c18 100644 --- a/example/imagenet/README.md +++ b/example/imagenet/README.md @@ -12,7 +12,11 @@ Note: A commonly mistake is forgetting shuffle the image list. 
This will lead fa
 - [alexnet.py](alexnet.py) : alexnet with 5 convolution layers followed by 3 fully connnected layers
-- [inception.py](inception.py) : inception + batch norm network
+- [inception.py](inception.py) : inception + batch norm network for the ImageNet 1000-class problem
+- [inception-full.py](inception-full.py) : this inception network is used for the full ImageNet with 21,841 classes
+
+## Notebooks
+- [predict with pretrained model](predict-with-pretrained-model.ipynb) gives you a demo of using a pretrained Inception-BN network

 ## Results
diff --git a/example/imagenet/inception-full.py b/example/imagenet/inception-full.py
new file mode 100644
index 000000000000..71a7cfd16ef0
--- /dev/null
+++ b/example/imagenet/inception-full.py
@@ -0,0 +1,101 @@
+# pylint: skip-file
+import sys
+sys.path.insert(0, "../mxnet/python")
+import mxnet as mx
+import logging
+from data import ilsvrc12_iterator
+
+
+logger = logging.getLogger()
+logger.setLevel(logging.DEBUG)
+
+def ConvFactory(data, num_filter, kernel, stride=(1,1), pad=(0, 0), name=None, suffix=''):
+    conv = mx.symbol.Convolution(data=data, workspace=512, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, name='conv_%s%s' %(name, suffix))
+    bn = mx.symbol.BatchNorm(data=conv, name='bn_%s%s' %(name, suffix))
+    act = mx.symbol.Activation(data=bn, act_type='relu', name='relu_%s%s' %(name, suffix))
+    return act
+
+def InceptionFactoryA(data, num_1x1, num_3x3red, num_3x3, num_d3x3red, num_d3x3, pool, proj, name):
+    # 1x1
+    c1x1 = ConvFactory(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_1x1' % name))
+    # 3x3 reduce + 3x3
+    c3x3r = ConvFactory(data=data, num_filter=num_3x3red, kernel=(1, 1), name=('%s_3x3' % name), suffix='_reduce')
+    c3x3 = ConvFactory(data=c3x3r, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_3x3' % name))
+    # double 3x3 reduce + double 3x3
+    cd3x3r = ConvFactory(data=data, num_filter=num_d3x3red, kernel=(1, 1), name=('%s_double_3x3' % name), suffix='_reduce')
+    cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), name=('%s_double_3x3_0' % name))
+    cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), name=('%s_double_3x3_1' % name))
+    # pool + proj
+    pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = ConvFactory(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_proj' % name))
+    # concat
+    concat = mx.symbol.Concat(*[c1x1, c3x3, cd3x3, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def InceptionFactoryB(data, num_3x3red, num_3x3, num_d3x3red, num_d3x3, name):
+    # 3x3 reduce + 3x3
+    c3x3r = ConvFactory(data=data, num_filter=num_3x3red, kernel=(1, 1), name=('%s_3x3' % name), suffix='_reduce')
+    c3x3 = ConvFactory(data=c3x3r, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_3x3' % name))
+    # double 3x3 reduce + double 3x3
+    cd3x3r = ConvFactory(data=data, num_filter=num_d3x3red, kernel=(1, 1), name=('%s_double_3x3' % name), suffix='_reduce')
+    cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_double_3x3_0' % name))
+    cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_double_3x3_1' % name))
+    # pool + proj
+    pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type="max", name=('max_pool_%s_pool' % name))
+    # concat
+    concat = mx.symbol.Concat(*[c3x3, cd3x3, pooling], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def inception(nhidden, grad_scale):
+    # data
+    data = mx.symbol.Variable(name="data")
+    # stage 1
+    conv1 = ConvFactory(data=data, num_filter=96, kernel=(7, 7), stride=(2, 2), pad=(3, 3), name='conv1')
+    pool1 = mx.symbol.Pooling(data=conv1, kernel=(3, 3), stride=(2, 2), name='pool1', pool_type='max')
+    # stage 2
+    conv2red = ConvFactory(data=pool1, num_filter=128, kernel=(1, 1), stride=(1, 1), name='conv2red')
+    conv2 = ConvFactory(data=conv2red, num_filter=288, kernel=(3, 3), stride=(1, 1), pad=(1, 1), name='conv2')
+    pool2 = mx.symbol.Pooling(data=conv2, kernel=(3, 3), stride=(2, 2), name='pool2', pool_type='max')
+    # stage 2
+    in3a = InceptionFactoryA(pool2, 96, 96, 96, 96, 144, "avg", 48, '3a')
+    in3b = InceptionFactoryA(in3a, 96, 96, 144, 96, 144, "avg", 96, '3b')
+    in3c = InceptionFactoryB(in3b, 192, 240, 96, 144, '3c')
+    # stage 3
+    in4a = InceptionFactoryA(in3c, 224, 64, 96, 96, 128, "avg", 128, '4a')
+    in4b = InceptionFactoryA(in4a, 192, 96, 128, 96, 128, "avg", 128, '4b')
+    in4c = InceptionFactoryA(in4b, 160, 128, 160, 128, 160, "avg", 128, '4c')
+    in4d = InceptionFactoryA(in4c, 96, 128, 192, 160, 96, "avg", 128, '4d')
+    in4e = InceptionFactoryB(in4d, 128, 192, 192, 256, '4e')
+    # stage 4
+    in5a = InceptionFactoryA(in4e, 352, 192, 320, 160, 224, "avg", 128, '5a')
+    in5b = InceptionFactoryA(in5a, 352, 192, 320, 192, 224, "max", 128, '5b')
+    # global avg pooling
+    avg = mx.symbol.Pooling(data=in5b, kernel=(7, 7), stride=(1, 1), name="global_pool", pool_type='avg')
+    # linear classifier
+    flatten = mx.symbol.Flatten(data=avg, name='flatten')
+    fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=nhidden, name='fc1')
+    softmax = mx.symbol.Softmax(data=fc1, name='softmax')
+    return softmax
+
+softmax = inception(21841, 1.0)
+
+batch_size = 64
+num_gpu = 4
+gpus = [mx.gpu(i) for i in range(num_gpu)]
+input_shape = (3, 224, 224)
+
+train = ilsvrc12_iterator(batch_size=batch_size, input_shape=(3,224,224))
+
+model_prefix = "model/Inception-Full"
+num_round = 10
+
+logging.info("This script is used to train the full ImageNet set with 21,841 classes.")
+logging.info("For the normal 1,000-class problem, please use inception.py")
+
+model = mx.model.FeedForward(ctx=gpus, symbol=softmax, num_round=num_round,
+                             learning_rate=0.05, momentum=0.9, wd=0.00001)
+
+model.fit(X=train,
+          eval_metric="acc",
+          epoch_end_callback=[mx.callback.Speedometer(batch_size), mx.callback.log_train_metric(100)],
+          iter_end_callback=mx.callback.do_checkpoint(model_prefix))
diff --git a/example/notebooks/predict-with-pretrained-model.ipynb b/example/imagenet/predict-with-pretrained-model.ipynb
similarity index 100%
rename from example/notebooks/predict-with-pretrained-model.ipynb
rename to example/imagenet/predict-with-pretrained-model.ipynb
From 5730e4999f989115f2152d5c5622d4f9b0c2da0c Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Sun, 25 Oct 2015 14:27:19 -0600
Subject: [PATCH 098/122] [DOC] Add imagenet full

---
 README.md                     |   2 +
 doc/index.md                  |   7 ++-
 doc/pretrained.md             |   7 +++
 doc/tutorial/imagenet_full.md | 108 ++++++++++++++++++++++++++++++++++
 4 files changed, 123 insertions(+), 1 deletion(-)
 create mode 100644 doc/pretrained.md
 create mode 100644 doc/tutorial/imagenet_full.md

diff --git a/README.md b/README.md
index 9d7e30e31d67..aef489ff7920 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@ deep learning programs together to maximize the efficiency and your productivity

 What's New
 ----------
+* [Training Deep Net on 14 Million Images on A Single Machine](https://mxnet-bing.readthedocs.org/en/latest/tutorial/imagenet_full.html)
 * [MXNet.jl Julia binding initial release](https://github.com/dmlc/MXNet.jl)
 * [Design Note: Squeeze the Memory Consumption of Deep Learning](http://mxnet.readthedocs.org/en/latest/developer-guide/note_memory.html)
 * [LSTM Example by using symbolic API](https://github.com/dmlc/mxnet/tree/master/example/rnn)
@@ -23,6 +24,7 @@ Contents
 * [Documentation and Tutorials](http://mxnet.readthedocs.org/en/latest/)
 * [Open Source Design Notes](http://mxnet.readthedocs.org/en/latest/#open-source-design-notes)
 * [Code Examples](example)
+* [Pretrained Models](https://github.com/dmlc/mxnet-model-gallery)
 * [Installation](http://mxnet.readthedocs.org/en/latest/build.html)
 * [Features](#features)
 * [Contribute to MXNet](http://mxnet.readthedocs.org/en/latest/contribute.html)
diff --git a/doc/index.md b/doc/index.md
index e74040996138..6298ba800996 100644
--- a/doc/index.md
+++ b/doc/index.md
@@ -10,7 +10,7 @@ User Guide
 * [R Package Document](R-package/index.md)
 * [MXNet.jl Julia Package](https://github.com/dmlc/MXNet.jl)
 * [Frequently Asked Questions](faq.md)
-
+* [Pretrained Model Gallery](pretrained.md)

 Developer Guide
 ---------------
@@ -33,6 +33,11 @@ in terms of abstraction, optimization and trade-offs.
 * [Dependency Engine for Deep Learning](developer-guide/note_engine.md)
 * [Squeeze the Memory Consumption of Deep Learning](developer-guide/note_memory.md)

+Tutorial
+--------
+* [Training Deep Net on 14 Million Images on A Single Machine](tutorial/imagenet_full.md)
+
+
 Indices and tables
 ------------------
diff --git a/doc/pretrained.md b/doc/pretrained.md
new file mode 100644
index 000000000000..159b56017698
--- /dev/null
+++ b/doc/pretrained.md
@@ -0,0 +1,7 @@
+Pretrained Model Gallery
+========================
+This document lists the pretrained models released for MXNet
+
+* [89.9% Top-5 Validation Accuracy for ImageNet 1,000 Classes Challenge](https://github.com/dmlc/mxnet-model-gallery/tree/master/imagenet-1k-inception)
+* [37.2% Top-1 Training Accuracy for Full ImageNet 21,841 Classes](https://github.com/dmlc/mxnet-model-gallery/tree/master/imagenet-21k-inception)
+
diff --git a/doc/tutorial/imagenet_full.md b/doc/tutorial/imagenet_full.md
new file mode 100644
index 000000000000..64db843d3f8e
--- /dev/null
+++ b/doc/tutorial/imagenet_full.md
@@ -0,0 +1,108 @@
+# Training Deep Net on 14 Million Images on A Single Machine
+
+This note describes how to train a neural network on the full ImageNet dataset [1] with 14,197,087 images in 21,841 classes. **We achieved a state-of-the-art model by using 4 GeForce GTX 980 cards on a single machine in 8.5 days.**
+
+There are several technical challenges in this problem.
+1. How to pack and store the massive data.
+2. How to minimize the memory consumption of the network, so that we can use a net with more capacity than those used for ImageNet 1K.
+3. How to train the model fast.
+
+We also released our pre-trained model for this full ImageNet dataset.
+
+## Data Preprocessing
+The raw full ImageNet dataset is more than 1TB. Before training the network, we need to shuffle these images and then load batches of images to feed the neural network.
+Before we describe how we solve it, let's do some calculation first:
+
+Assume we have two good storage devices [2]:
+
+```
+| Device                    | 4K Random Seek        | Sequential Seek |
+| ------------------------- | --------------------- | --------------- |
+| WD Black (HDD)            | 0.43 MB/s (110 IOPS)  | 170 MB/s        |
+| Samsung 850 PRO (SSD)     | 40 MB/s (10,000 IOPS) | 550 MB/s        |
+```
+
+A very naive approach is loading from a list by random seeking. If we used this approach, we would spend 677 hours with the HDD or 6.7 hours with the SSD, respectively. And this is only for reading. Although the SSD does not look bad, a 1TB SSD is not affordable for everyone.
+
+But we notice that sequential seek is much faster than random seek, and that loading batch by batch is a sequential action. Can we take advantage of this? The answer is that we cannot do sequential seek directly: we need to randomly shuffle the training data first, then pack it into a sequential binary package.
+
+This is the normal solution used by most deep learning packages. However, unlike the ImageNet 1K dataset, we ***cannot*** store the images in raw pixel format, because we would then need more than 1TB of space. Instead, we need to pack the images in a compressed format.
+
+***The key ingredients are***
+- Store the images in jpeg format, and pack them into a binary record.
+- Split the list, and pack into several record files, instead of one file.
+  - This allows us to pack the images in a distributed fashion, because we will eventually be bounded by the IO cost during packing.
+  - We need to make the package able to read from several record files, which is not too hard.
+This will allow us to store the entire ImageNet dataset in around 250G of space.
+
+After packing, together with a threaded buffer iterator, we can easily achieve an IO speed of around 3,000 images/sec on a normal HDD.
+
+## Training the model
+
+Now we have the data. We need to consider which network structure to use. We use an Inception-BN [3] style model; compared to other models such as VGG, it has fewer parameters, which also simplifies the synchronization problem. Considering that our problem is much more challenging than the 1k-class problem, we add suitable capacity to the original Inception-BN structure by increasing the filter sizes by a factor of 1.5 in the bottom layers of the original Inception-BN network.
+This, however, creates a challenge for GPU memory, as a GTX 980 only has 4GB of GPU RAM. We really need to minimize the memory consumption to fit a larger batch size into the training. To solve this problem we use techniques such as node memory reuse and inplace optimization, which reduce the memory consumption by half; more details can be found in the [memory optimization note](http://mxnet.readthedocs.org/en/latest/developer-guide/note_memory.html)
+
+Finally, we cannot train the model using a single GPU because this is a really large net with a lot of data. We use data parallelism on four GPUs to train this model, which involves smart synchronization of parameters between different GPUs, and overlapping the communication and computation. A [runtime dependency engine](https://mxnet.readthedocs.org/en/latest/developer-guide/note_engine.html) is used to simplify this task, allowing us to run the training at around 170 images/sec.
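+
+As a rough sketch, the packed record files can then be consumed through such a threaded iterator. The file name and parameter values below are illustrative only, and the exact iterator arguments may vary across MXNet versions:
+
+```python
+import mxnet as mx
+
+# Read packed record files through a threaded iterator: jpeg decoding and
+# augmentation happen in background threads while the GPUs compute.
+train = mx.io.ImageRecordIter(
+    path_imgrec="data/imagenet21k_train.rec",  # hypothetical packed record file
+    data_shape=(3, 224, 224),    # channels, height, width fed to the network
+    batch_size=256,              # total batch size across the 4 GPUs
+    rand_crop=True,              # random crop for data augmentation
+    rand_mirror=True,            # random horizontal flip
+    preprocess_threads=4)        # number of decoding threads
+```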
+
+Here is a learning curve of the training process:
+![alt text](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/imagenet_full/curve.png "Learning Curve")
+
+## Evaluate the Performance
+Train Top-1 Accuracy over 21,841 classes: 37.19%
+
+There is no official validation set over the 21,841 classes, so we use the ILSVRC2012 validation set to check the performance. Here is the result:
+
+```
+| Accuracy | Over 1,000 classes | Over 21,841 classes |
+| -------- | ------------------ | ------------------- |
+| Top-1    | 68.3%              | 41.9%               |
+| Top-5    | 89.0%              | 69.6%               |
+| Top-20   | 96.0%              | 83.6%               |
+```
+
+As we can see, we get a quite reasonable result after 9 iterations. Notably, far fewer iterations are needed to achieve stable performance, mainly because we are training on a larger dataset.
+
+We should note that this result is by no means optimal, as we did not carefully pick the parameters and the experiment cycle is longer than for the 1k dataset. We think there is definitely space for improvement, and you are welcome to try it out by yourself!
+
+
+## The Code and Model
+The code and a step-by-step guide are publicly available at [https://github.com/dmlc/mxnet/tree/master/example/imagenet](https://github.com/dmlc/mxnet/tree/master/example/imagenet)
+
+We also release a pretrained model under [https://github.com/dmlc/mxnet-model-gallery/tree/master/imagenet-21k-inception](https://github.com/dmlc/mxnet-model-gallery/tree/master/imagenet-21k-inception)
+
+## How to Use The Model
+We should point out that the 21k-class problem is much more challenging than the 1k one, and directly using the raw prediction is not reasonable.
+
+Look at this picture, which I took at Mount Rainier this summer:
+
+![alt text](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/imagenet_full/rainier.png "Mount Rainier")
+
+We can figure out that there is a mountain, a valley, trees and a bridge. And the prediction probability is:
+
+![alt text](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/imagenet_full/prob.png "Probability")
+
+We notice there are several peaks. Let's print out the label text among the 21k classes and the ImageNet 1k classes:
+
+```
+| Rank  | Over 1,000 classes          | Over 21,841 classes        |
+| ----- | --------------------------- | -------------------------- |
+| Top-1 | n09468604 valley            | n11620673 Fir              |
+| Top-2 | n09332890 lakeside          | n11624531 Spruce           |
+| Top-3 | n04366367 suspension bridge | n11621281 Amabilis fir     |
+| Top-4 | n09193705 alp               | n11628456 Douglas fir      |
+| Top-5 | n09428293 seashore          | n11627908 Mountain hemlock |
+```
+
+There is no doubt that directly using the probability over 21k classes loses diversity in the prediction. If you carefully choose a subset by using the WordNet hierarchy relation, I am sure you will find more interesting results.
+
+## Note
+[1] Deng, Jia, et al. "Imagenet: A large-scale hierarchical image database." *Computer Vision and Pattern Recognition*, 2009. CVPR 2009. IEEE Conference on. IEEE, 2009.
+
+[2] The HDD/SSD data is from public websites and may not be accurate.
+
+[3] Ioffe, Sergey, and Christian Szegedy. "Batch normalization: Accelerating deep network training by reducing internal covariate shift." *arXiv preprint arXiv:1502.03167* (2015).
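+
+As a closing sketch for the subset idea above: one simple way to exploit a chosen subset of the 21,841 classes is to mask and renormalize the raw probability vector. The subset selection itself (for example, walking the WordNet hierarchy) is assumed to be given, and the helper below is hypothetical, not part of the released code:
+
+```python
+import numpy as np
+
+def topk_in_subset(prob, synset, subset_ids, k=5):
+    # keep only the classes in the chosen subset, then renormalize
+    subset_ids = np.asarray(subset_ids)
+    sub_prob = prob[subset_ids]
+    sub_prob = sub_prob / sub_prob.sum()
+    # indices of the k largest probabilities, in descending order
+    order = np.argsort(sub_prob)[::-1][:k]
+    return [(synset[subset_ids[i]], float(sub_prob[i])) for i in order]
+```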
+
+
+
+
+
From c39de261314dd93205c956c59ed03176740434a5 Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Sun, 25 Oct 2015 16:25:25 -0600
Subject: [PATCH 099/122] Update imagenet_full.md

---
 doc/tutorial/imagenet_full.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/tutorial/imagenet_full.md b/doc/tutorial/imagenet_full.md
index 64db843d3f8e..bf692ec5882b 100644
--- a/doc/tutorial/imagenet_full.md
+++ b/doc/tutorial/imagenet_full.md
@@ -1,4 +1,4 @@
-# Training Deep Net on 14 Million Images on A Single Machine
+# Training Deep Net on 14 Million Images by Using A Single Machine

 This note describes how to train a neural network on the full ImageNet dataset [1] with 14,197,087 images in 21,841 classes. **We achieved a state-of-the-art model by using 4 GeForce GTX 980 cards on a single machine in 8.5 days.**

From 22988582e43734d5104854ef8458cbb525123768 Mon Sep 17 00:00:00 2001
From: Chuntao Hong
Date: Mon, 26 Oct 2015 10:57:38 +0800
Subject: [PATCH 100/122] fix compile issue in VS

---
 CMakeLists.txt                 | 10 ++++++++++
 include/mxnet/c_api.h          |  4 ++--
 src/common/tblob_op_registry.h |  1 +
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fe020b81502b..67e0b881df5b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -75,6 +75,16 @@ mxnet_source_group("Source\\Cuda"   GLOB_RECURSE "src/*.cu")
 FILE(GLOB_RECURSE SOURCE "src/*.cc")
 FILE(GLOB_RECURSE cuda "src/*.cu")

+if(MSVC)
+  foreach(flag_var
+        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+    if(${flag_var} MATCHES "/MD")
+      string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+    endif(${flag_var} MATCHES "/MD")
+  endforeach(flag_var)
+endif()
+
 if(USE_CUDA)
   # define preprocessor macro so that we will not include the generated forcelink header
   mshadow_cuda_compile(cuda_objs ${cuda})
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 82ab2ccb1239..dd3c225f66bc 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -13,9 +13,9 @@
 /*!
\brief MXNET_DLL prefix for windows" */ #ifdef _WIN32 #ifdef MXNET_EXPORTS -#define MXNET_DLL MXNET_EXTERN_C __cdecl __declspec(dllexport) +#define MXNET_DLL MXNET_EXTERN_C __declspec(dllexport) #else -#define MXNET_DLL MXNET_EXTERN_C __cdecl __declspec(dllimport) +#define MXNET_DLL MXNET_EXTERN_C __declspec(dllimport) #endif #else #define MXNET_DLL MXNET_EXTERN_C diff --git a/src/common/tblob_op_registry.h b/src/common/tblob_op_registry.h index 2a521909d49b..731e53900415 100644 --- a/src/common/tblob_op_registry.h +++ b/src/common/tblob_op_registry.h @@ -15,6 +15,7 @@ #include #include #include +#include #if DMLC_USE_CXX11 #include From 9af318171541a6a51b3ad9077e61b97aaf0287b0 Mon Sep 17 00:00:00 2001 From: Chuntao Hong Date: Mon, 26 Oct 2015 11:06:31 +0800 Subject: [PATCH 101/122] get rid of __cdecl --- include/mxnet/c_predict_api.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/mxnet/c_predict_api.h b/include/mxnet/c_predict_api.h index f997f91c4787..26ca23247627 100644 --- a/include/mxnet/c_predict_api.h +++ b/include/mxnet/c_predict_api.h @@ -13,9 +13,9 @@ #ifdef _WIN32 #ifdef MXNET_EXPORTS -#define MXNET_DLL MXNET_EXTERN_C __cdecl __declspec(dllexport) +#define MXNET_DLL MXNET_EXTERN_C __declspec(dllexport) #else -#define MXNET_DLL MXNET_EXTERN_C __cdecl __declspec(dllimport) +#define MXNET_DLL MXNET_EXTERN_C __declspec(dllimport) #endif #else #define MXNET_DLL MXNET_EXTERN_C From 549bd7830a58d502fb8d6a7f7001b5a8c41ce36b Mon Sep 17 00:00:00 2001 From: tqchen Date: Mon, 26 Oct 2015 20:12:07 -0700 Subject: [PATCH 102/122] [OP] Add norm --- mshadow | 2 +- python/mxnet/ndarray.py | 14 ++++++++++++ src/common/tblob_op_registry.cc | 32 ++++++++++++++++++++------- src/common/tblob_op_registry.h | 4 +++- src/ndarray/unary_function-inl.h | 28 +++++++++++++++++++++-- tests/python/unittest/test_ndarray.py | 7 +++--- 6 files changed, 72 insertions(+), 15 deletions(-) diff --git a/mshadow b/mshadow index 28ffc0a1e25f..f2d0e252fe3b 160000 --- a/mshadow +++ b/mshadow @@ -1 +1 @@ -Subproject commit 28ffc0a1e25f0d7d3afff625a8706d2aa4720b78 +Subproject commit f2d0e252fe3b6891322ceb565389ea6c05e3b402 diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py index a2b5ccadb2d7..1e52f66cea81 100644 --- a/python/mxnet/ndarray.py +++ b/python/mxnet/ndarray.py @@ -305,6 +305,20 @@ def asnumpy(self): ctypes.c_size_t(data.size))) return data + def asscalar(self): + """Return a CPU scalar(float) of current ndarray. + + This ndarray must have shape (1,) + + Returns + ------- + scalar : np.float + The scalar representation of the ndarray. + """ + if self.shape != (1,): + raise ValueError("The current array is not a scalar") + return self.asnumpy()[0] + def copyto(self, other): """Copy the content of current array to other. 
diff --git a/src/common/tblob_op_registry.cc b/src/common/tblob_op_registry.cc index ae1f54da3c3a..8dac8944f144 100644 --- a/src/common/tblob_op_registry.cc +++ b/src/common/tblob_op_registry.cc @@ -18,7 +18,8 @@ class TBlobOpRegEntryImpl : public TBlobOpRegEntry { // functions TSelf& set_function(int dev_mask, UnaryFunction funary, - bool inplace_in_out) override { + bool inplace_in_out, + bool register_symbolic) override { std::lock_guard lock(mutex_); ++reg_counter_; if (funary_.size() <= static_cast(dev_mask)) { @@ -30,7 +31,13 @@ class TBlobOpRegEntryImpl : public TBlobOpRegEntry { } funary_[dev_mask] = funary; inplace_in0_out_forward_ = inplace_in_out; - if (reg_counter_ == 1) this->DoRegisterUnary(); + if (reg_counter_ == 1) { + this->RegisterUnary(); + register_symbolic_ = register_symbolic; + if (register_symbolic) { + this->RegisterUnarySymbolic(); + } + } return *this; } @@ -76,7 +83,9 @@ class TBlobOpRegEntryImpl : public TBlobOpRegEntry { std::lock_guard lock(mutex_); if (reg_counter_ != 1) return *this; NDArrayReg().describe(description); - OpReg().describe(description); + if (register_symbolic_) { + OpReg().describe(description); + } return *this; } @@ -87,6 +96,7 @@ class TBlobOpRegEntryImpl : public TBlobOpRegEntry { std::mutex mutex_; // registration counter int reg_counter_{0}; + bool register_symbolic_{true}; // unary shape inferencer UnaryShapeInfer unary_infer_{nullptr}; // unary functions on each device mask @@ -121,7 +131,8 @@ class TBlobOpRegEntryImpl : public TBlobOpRegEntry { return *op_reg_; } // start registering all stuffs - void DoRegisterUnary(); + void RegisterUnary(); + void RegisterUnarySymbolic(); }; // Unary operator to invoke generic TBlob function. @@ -256,7 +267,7 @@ class TBlobUnaryOpProp : public OperatorProperty { } }; -void TBlobOpRegEntryImpl::DoRegisterUnary() { +void TBlobOpRegEntryImpl::RegisterUnary() { CHECK_EQ(reg_counter_, 1); // The body to be registered auto body = [this] (NDArray **used_vars, @@ -264,12 +275,15 @@ void TBlobOpRegEntryImpl::DoRegisterUnary() { NDArray **mutate_vars) { NDArray src = *used_vars[0]; NDArray *out = mutate_vars[0]; + TShape dshape = src.shape(); + if (unary_infer_ != nullptr) dshape = unary_infer_(dshape); if (out->is_none()) { - *out = NDArray(src.shape(), src.ctx(), true); + *out = NDArray(dshape, src.ctx(), true); } else { CHECK(out->ctx() == src.ctx()) << "target context mismatch"; - CHECK(out->shape() == src.shape()) << "target shape mismatch"; + CHECK(out->shape() == dshape) << "target shape mismatch " + << out->shape() << " vs. 
" << dshape; } // important: callback must always capture by value NDArray ret = *out; @@ -303,6 +317,9 @@ void TBlobOpRegEntryImpl::DoRegisterUnary() { .set_num_mutate_vars(1) .set_type_mask(kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget) .add_argument("src", "NDArray", "Source input to the function"); +} + +void TBlobOpRegEntryImpl::RegisterUnarySymbolic() { // register the operator auto op_factory = [this]() { TBlobUnaryOpProp *prop = new TBlobUnaryOpProp(); @@ -314,7 +331,6 @@ void TBlobOpRegEntryImpl::DoRegisterUnary() { .set_body(op_factory) .add_argument("src", "Symbol", "Source symbolic input to the function"); } - TBlobOpRegEntry& TBlobOpRegistry::__REGISTER_OR_FIND__(const std::string &name) { if (fmap_.count(name) != 0) return *fmap_.at(name); TBlobOpRegEntry *e = new TBlobOpRegEntryImpl(); diff --git a/src/common/tblob_op_registry.h b/src/common/tblob_op_registry.h index 731e53900415..d6f5b1644b74 100644 --- a/src/common/tblob_op_registry.h +++ b/src/common/tblob_op_registry.h @@ -73,10 +73,12 @@ class TBlobOpRegEntry { * \param dev_mask The device mask of the function can act on. * \param funary The unary function that peforms the operation. * \param inplace_in_out Whether do inplace optimization on in and out. + * \param register_symbolic Whether register a symbolic operator as well. */ virtual TSelf& set_function(int dev_mask, UnaryFunction funary, - bool inplace_in_out) = 0; + bool inplace_in_out, + bool register_symbolic = true) = 0; /*! * \brief set gradient of the function of this function. * \param dev_mask The device mask of the function can act on. diff --git a/src/ndarray/unary_function-inl.h b/src/ndarray/unary_function-inl.h index 45e3e42f2495..0d3323620814 100644 --- a/src/ndarray/unary_function-inl.h +++ b/src/ndarray/unary_function-inl.h @@ -64,6 +64,24 @@ void UnaryBackwardUseOut_(const arg::OutGrad& out_grad, out_grad.data.FlatTo2D()); } +// return a shape of scalar +TShape ScalarShape(const TShape& ishape) { + return mshadow::Shape1(1); +} + +template +void L2Norm(const TBlob &src, + TBlob *ret, + OpReqType req, + RunContext ctx) { + mshadow::Stream *s = ctx.get_stream(); + mshadow::Tensor out = ret->get(s); + mshadow::Tensor in = + src.get_with_shape(mshadow::Shape1(src.shape_.Size())); + mshadow::VectorDot(out, in, in); + out = mshadow::expr::F(out); +} + // Register all unary operations here // Square struct square_grad { @@ -77,8 +95,7 @@ MXNET_REGISTER_TBLOB_FUN(square, XPU) .set_gradient(XPU::kDevMask, UnaryBackwardUseIn_, true) .describe("Take square of the src"); - -// Square root +// square root struct square_root_grad { MSHADOW_XINLINE static real_t Map(real_t a) { return 0.5f / a; @@ -88,6 +105,13 @@ MXNET_REGISTER_TBLOB_FUN(sqrt, XPU) .set_function(XPU::kDevMask, UnaryForward_, true) .set_gradient(XPU::kDevMask, UnaryBackwardUseOut_, true) .describe("Take square root of the src"); + +// L2 norm +MXNET_REGISTER_TBLOB_FUN(norm, XPU) +.set_function(XPU::kDevMask, L2Norm, false, false) +.set_shape_infer(ScalarShape) +.describe("Take L2 norm of the src." 
+ "The result will be ndarray of shape (1,) on the same device."); } // namespace ndarray } // namespace mxnet #endif // MXNET_NDARRAY_UNARY_FUNCTION_INL_H_ diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py index d41c47662e2f..c00350a3ad28 100644 --- a/tests/python/unittest/test_ndarray.py +++ b/tests/python/unittest/test_ndarray.py @@ -33,8 +33,9 @@ def check_with_uniform(uf, arg_shapes, dim=None, npuf=None, rmin=-10): else: out2 = npuf(*numpy_arg) assert out1.shape == out2.shape - dif = reldiff(out1.asnumpy(), out2) - assert reldiff(out1.asnumpy(), out2) < 1e-6 + if isinstance(out1, mx.nd.NDArray): + out1 = out1.asnumpy() + assert reldiff(out1, out2) < 1e-6 def random_ndarray(dim): @@ -54,7 +55,7 @@ def test_ndarray_elementwise(): check_with_uniform(lambda x, y: x / y, 2, dim) check_with_uniform(mx.nd.sqrt, 2, dim, np.sqrt, rmin=0) check_with_uniform(mx.nd.square, 2, dim, np.square, rmin=0) - + check_with_uniform(lambda x: mx.nd.norm(x).asscalar(), 1, dim, np.linalg.norm) def test_ndarray_negate(): npy = np.random.uniform(-10, 10, (2,3,4)) From 7033d178bcbfcc4fe71479ab5e754c8b5dc80daf Mon Sep 17 00:00:00 2001 From: tqchen Date: Mon, 26 Oct 2015 20:54:53 -0700 Subject: [PATCH 103/122] Update mshadow --- mshadow | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mshadow b/mshadow index f2d0e252fe3b..27ba6a635e81 160000 --- a/mshadow +++ b/mshadow @@ -1 +1 @@ -Subproject commit f2d0e252fe3b6891322ceb565389ea6c05e3b402 +Subproject commit 27ba6a635e81ac6e9f0f30a1ab1bf1d32e56f7d8 From e13dbd3899f449000d05a9fc79295511daaa586c Mon Sep 17 00:00:00 2001 From: tqchen Date: Mon, 26 Oct 2015 22:21:40 -0600 Subject: [PATCH 104/122] fix --- ps-lite | 2 +- src/ndarray/unary_function-inl.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ps-lite b/ps-lite index 7121aa1bdb67..504faa73a826 160000 --- a/ps-lite +++ b/ps-lite @@ -1 +1 @@ -Subproject commit 7121aa1bdb673f047c7600eb4347fd2911021710 +Subproject commit 504faa73a82638c4b2fe66f5696330da38637c96 diff --git a/src/ndarray/unary_function-inl.h b/src/ndarray/unary_function-inl.h index 0d3323620814..b3a54653fd5f 100644 --- a/src/ndarray/unary_function-inl.h +++ b/src/ndarray/unary_function-inl.h @@ -65,8 +65,9 @@ void UnaryBackwardUseOut_(const arg::OutGrad& out_grad, } // return a shape of scalar -TShape ScalarShape(const TShape& ishape) { - return mshadow::Shape1(1); +inline TShape ScalarShape(const TShape& ishape) { + mshadow::index_t shape[] = {1}; + return TShape(shape, shape + 1); } template From 87c24928b8dc944bbb3fd2507040d964e5bf9f5a Mon Sep 17 00:00:00 2001 From: Kublai-Jing Date: Tue, 27 Oct 2015 10:31:27 -0400 Subject: [PATCH 105/122] add exp, log for ndarray --- src/ndarray/unary_function-inl.h | 12 ++++++++++++ src/operator/mshadow_op.h | 25 +++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/src/ndarray/unary_function-inl.h b/src/ndarray/unary_function-inl.h index b3a54653fd5f..7148cb4c7185 100644 --- a/src/ndarray/unary_function-inl.h +++ b/src/ndarray/unary_function-inl.h @@ -107,6 +107,18 @@ MXNET_REGISTER_TBLOB_FUN(sqrt, XPU) .set_gradient(XPU::kDevMask, UnaryBackwardUseOut_, true) .describe("Take square root of the src"); +// exp +MXNET_REGISTER_TBLOB_FUN(exp, XPU) +.set_function(XPU::kDevMask, UnaryForward_, true) +.set_gradient(XPU::kDevMask, UnaryBackwardUseOut_, true) +.describe("Take exp of the src"); + +//log +MXNET_REGISTER_TBLOB_FUN(log, XPU) +.set_function(XPU::kDevMask, UnaryForward_, true) +.set_gradient(XPU::kDevMask, 
UnaryBackwardUseOut_, true) +.describe("Take log of the src"); + // L2 norm MXNET_REGISTER_TBLOB_FUN(norm, XPU) .set_function(XPU::kDevMask, L2Norm, false, false) diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index c8ca495d3349..97e02951ab7c 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -80,6 +80,31 @@ struct tanh_grad { } }; + +struct exp { + MSHADOW_XINLINE static real_t Map(real_t a) { + return expf(a); + } +}; +struct exp_grad { + MSHADOW_XINLINE static real_t Map(real_t a) { + return expf(a); + } +}; + +struct log { + MSHADOW_XINLINE static real_t Map(real_t a) { + return logf(a); + } +}; +struct log_grad { + MSHADOW_XINLINE static real_t Map(real_t a) { + return 1.0f/a; + } +}; + + + struct square { MSHADOW_XINLINE static real_t Map(real_t a) { return a * a; From 891f4f51700e33112ec1196f89becf784a7ab917 Mon Sep 17 00:00:00 2001 From: Kublai-Jing Date: Tue, 27 Oct 2015 10:56:58 -0400 Subject: [PATCH 106/122] change to UseIN_ --- src/ndarray/unary_function-inl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ndarray/unary_function-inl.h b/src/ndarray/unary_function-inl.h index 7148cb4c7185..68ed5e43187d 100644 --- a/src/ndarray/unary_function-inl.h +++ b/src/ndarray/unary_function-inl.h @@ -104,19 +104,19 @@ struct square_root_grad { }; MXNET_REGISTER_TBLOB_FUN(sqrt, XPU) .set_function(XPU::kDevMask, UnaryForward_, true) -.set_gradient(XPU::kDevMask, UnaryBackwardUseOut_, true) +.set_gradient(XPU::kDevMask, UnaryBackwardUseIn_, true) .describe("Take square root of the src"); // exp MXNET_REGISTER_TBLOB_FUN(exp, XPU) .set_function(XPU::kDevMask, UnaryForward_, true) -.set_gradient(XPU::kDevMask, UnaryBackwardUseOut_, true) +.set_gradient(XPU::kDevMask, UnaryBackwardUseIn_, true) .describe("Take exp of the src"); //log MXNET_REGISTER_TBLOB_FUN(log, XPU) .set_function(XPU::kDevMask, UnaryForward_, true) -.set_gradient(XPU::kDevMask, UnaryBackwardUseOut_, true) +.set_gradient(XPU::kDevMask, UnaryBackwardUseIn_, true) .describe("Take log of the src"); // L2 norm From e88187b2c40209fe46219f718ea68f97fb14f481 Mon Sep 17 00:00:00 2001 From: Kublai-Jing Date: Tue, 27 Oct 2015 11:49:19 -0400 Subject: [PATCH 107/122] fix lint issue --- src/ndarray/unary_function-inl.h | 3 ++- src/operator/mshadow_op.h | 9 ++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ndarray/unary_function-inl.h b/src/ndarray/unary_function-inl.h index 68ed5e43187d..d6e21640505c 100644 --- a/src/ndarray/unary_function-inl.h +++ b/src/ndarray/unary_function-inl.h @@ -102,6 +102,7 @@ struct square_root_grad { return 0.5f / a; } }; + MXNET_REGISTER_TBLOB_FUN(sqrt, XPU) .set_function(XPU::kDevMask, UnaryForward_, true) .set_gradient(XPU::kDevMask, UnaryBackwardUseIn_, true) @@ -113,7 +114,7 @@ MXNET_REGISTER_TBLOB_FUN(exp, XPU) .set_gradient(XPU::kDevMask, UnaryBackwardUseIn_, true) .describe("Take exp of the src"); -//log +// log MXNET_REGISTER_TBLOB_FUN(log, XPU) .set_function(XPU::kDevMask, UnaryForward_, true) .set_gradient(XPU::kDevMask, UnaryBackwardUseIn_, true) diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index 97e02951ab7c..58cef216be16 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -80,12 +80,12 @@ struct tanh_grad { } }; - struct exp { MSHADOW_XINLINE static real_t Map(real_t a) { return expf(a); } }; + struct exp_grad { MSHADOW_XINLINE static real_t Map(real_t a) { return expf(a); @@ -97,14 +97,13 @@ struct log { return logf(a); } }; -struct log_grad { + +struct 
log_grad {
   MSHADOW_XINLINE static real_t Map(real_t a) {
-    return 1.0f/a;
+    return logf(a);
   }
 };
-
-
 struct square {
   MSHADOW_XINLINE static real_t Map(real_t a) {
     return a * a;

From 93a036df8671774e93511f025fcdcf8370a84cf5 Mon Sep 17 00:00:00 2001
From: Kublai-Jing
Date: Tue, 27 Oct 2015 12:39:43 -0400
Subject: [PATCH 108/122] UseOut for exp, fix grad for log

---
 src/ndarray/unary_function-inl.h | 2 +-
 src/operator/mshadow_op.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ndarray/unary_function-inl.h b/src/ndarray/unary_function-inl.h
index d6e21640505c..dcf0a6868197 100644
--- a/src/ndarray/unary_function-inl.h
+++ b/src/ndarray/unary_function-inl.h
@@ -111,7 +111,7 @@ MXNET_REGISTER_TBLOB_FUN(sqrt, XPU)
 // exp
 MXNET_REGISTER_TBLOB_FUN(exp, XPU)
 .set_function(XPU::kDevMask, UnaryForward_, true)
-.set_gradient(XPU::kDevMask, UnaryBackwardUseIn_, true)
+.set_gradient(XPU::kDevMask, UnaryBackwardUseOut_, true)
 .describe("Take exp of the src");

 // log
diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h
index 58cef216be16..069b04e7252c 100644
--- a/src/operator/mshadow_op.h
+++ b/src/operator/mshadow_op.h
@@ -100,7 +100,7 @@ struct log {

 struct log_grad {
   MSHADOW_XINLINE static real_t Map(real_t a) {
-    return logf(a);
+    return 1.0f/a;
   }
 };

From 064cab6132e835fdd326dbc7520106ff5abc1df1 Mon Sep 17 00:00:00 2001
From: Kaixhin
Date: Tue, 27 Oct 2015 09:37:19 +0000
Subject: [PATCH 109/122] Add Docker images to installation docs

---
 doc/build.md | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/doc/build.md b/doc/build.md
index e7cef14fc87f..b3b354d19f49 100644
--- a/doc/build.md
+++ b/doc/build.md
@@ -17,6 +17,7 @@ Contents
   - Introduces how to build mxnet with advanced features such as HDFS/S3 support, CUDNN
 - [Python Package Installation](#python-package-installation)
 - [R Package Installation](#r-package-installation)
+- [Docker Images](#docker-images)

 Build MXNet Library
 -------------------
@@ -146,3 +147,18 @@ Hopefully, we will now have mxnet on R!
 ## Note on Library Build
 We isolate the library build with Rcpp end to maximize the portability
 - MSVC is needed on windows to build the mxnet library, because of CUDA compatibility issue of toolchains.
+
+Docker Images
+-------------
+Builds of MXNet are available as [Docker](https://www.docker.com/whatisdocker) images:
+[MXNet Docker (CPU)](https://hub.docker.com/r/kaixhin/mxnet/) or [MXNet Docker (CUDA)](https://hub.docker.com/r/kaixhin/cuda-mxnet/).
+These are updated on a weekly basis with the latest builds of MXNet. Examples of running bash in a Docker container
+are as follows:
+
+```bash
+sudo docker run -it kaixhin/mxnet
+sudo docker run -it --device /dev/nvidiactl --device /dev/nvidia-uvm --device /dev/nvidia0 kaixhin/cuda-mxnet:7.0
+```
+
+For a guide to Docker, see the [official docs](https://docs.docker.com/userguide/). For more details on how to use the
+MXNet Docker images, including requirements for CUDA support, consult the [source project](https://github.com/Kaixhin/dockerfiles).
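The `UnaryBackwardUseIn_` / `UnaryBackwardUseOut_` churn in patches 105-108 above comes down to which tensor each gradient formula actually needs: since d/dx exp(x) = exp(x), the backward pass for `exp` can reuse the saved forward output (`UseOut`), while d/dx log(x) = 1/x needs the original input (`UseIn`); `sqrt` settles on the same `UseOut` treatment in patch 111 below. A minimal NumPy sketch of the convention, for illustration only (this is not MXNet's registry code):

```python
import numpy as np

# Plain-NumPy illustration of the UseOut vs. UseIn gradient convention,
# assuming elementwise ops and an incoming output gradient g.
x = np.random.uniform(0.5, 2.0, size=(3, 4))
g = np.ones_like(x)                # gradient arriving from the next layer

out_exp = np.exp(x)                # forward pass for exp
grad_exp = g * out_exp             # "UseOut": exp'(x) == exp(x), the saved output

grad_log = g * (1.0 / x)           # "UseIn": log'(x) == 1/x, needs the saved input

out_sqrt = np.sqrt(x)              # forward pass for sqrt
grad_sqrt = g * (0.5 / out_sqrt)   # "UseOut": sqrt'(x) == 0.5 / sqrt(x)

assert np.allclose(grad_exp, g * np.exp(x))
assert np.allclose(grad_log, g / x)
assert np.allclose(grad_sqrt, g * 0.5 / np.sqrt(x))
```

Reusing the forward output where the math allows it (`exp`, `sqrt`) spares the backward pass from holding on to the input tensor.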
From 7542da6b0d73f22ff9b3b06823ec4560a78d0782 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Tue, 27 Oct 2015 13:54:33 -0600 Subject: [PATCH 110/122] [OP] update convolution stride --- ps-lite | 2 +- src/operator/convolution-inl.h | 24 ++++++++++-------------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/ps-lite b/ps-lite index 504faa73a826..0cc04093f7c9 160000 --- a/ps-lite +++ b/ps-lite @@ -1 +1 @@ -Subproject commit 504faa73a82638c4b2fe66f5696330da38637c96 +Subproject commit 0cc04093f7c9e07155f585552f31a90715bacef6 diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h index eccb4df9448c..74dca3f1a792 100644 --- a/src/operator/convolution-inl.h +++ b/src/operator/convolution-inl.h @@ -100,15 +100,15 @@ class ConvolutionOp : public Operator { temp_col = unpack_patch2col(data.Slice(i, i + step), param_.kernel[0], param_.kernel[1], - param_.stride[0]); - // TODO(bing): make mshadow support dual stride + param_.stride[0], + param_.stride[1]); } else { temp_col = unpack_patch2col(pad(data.Slice(i, i + step), param_.pad[0], param_.pad[1]), param_.kernel[0], param_.kernel[1], - param_.stride[0]); - // TODO(bing): make mshadow support dual stride + param_.stride[0], + param_.stride[1]); } const index_t gstride = temp_col.size(0) / param_.num_group; for (uint32_t gid = 0; gid < param_.num_group; ++gid) { @@ -173,17 +173,17 @@ class ConvolutionOp : public Operator { shape_dstunit_[2] * step), s); temp_dst = reshape(swapaxis<1, 0>(grad.Slice(i, i + step)), temp_dst.shape_); if (param_.pad[0] == 0 && param_.pad[1] == 0) { - // TODO(bing): dual stride temp_col = unpack_patch2col(data.Slice(i, i + step), param_.kernel[0], param_.kernel[1], - param_.stride[0]); + param_.stride[0], + param_.stride[1]); } else { - // TODO(bing): dual stride temp_col = unpack_patch2col(pad(data.Slice(i, i + step), param_.pad[0], param_.pad[1]), param_.kernel[0], param_.kernel[1], - param_.stride[0]); + param_.stride[0], + param_.stride[1]); } const index_t gstride = temp_col.size(0) / param_.num_group; for (uint32_t gid = 0; gid < param_.num_group; ++gid) { @@ -302,10 +302,6 @@ class ConvolutionProp : public OperatorProperty { out_shape->push_back(dshape); const index_t ksize_y = static_cast(param_.kernel[0]); const index_t ksize_x = static_cast(param_.kernel[1]); - const index_t kstride = static_cast(param_.stride[0]); - // TODO(bing) : support dual stride - CHECK_EQ(param_.stride[0], param_.stride[1]) - << "Only support same stride now"; CHECK_EQ(dshape[1] % param_.num_group, 0) \ << "input num_filter must divide group size"; CHECK_EQ(param_.num_filter % param_.num_group, 0) \ @@ -317,8 +313,8 @@ class ConvolutionProp : public OperatorProperty { CHECK(ksize_x <= dshape[3] && ksize_y <= dshape[2]) << "kernel size exceed input"; (*out_shape)[kOut][1] = param_.num_filter; - (*out_shape)[kOut][2] = (dshape[2] + 2 * param_.pad[0] - ksize_y) / kstride + 1; - (*out_shape)[kOut][3] = (dshape[3] + 2 * param_.pad[1] - ksize_x) / kstride + 1; + (*out_shape)[kOut][2] = (dshape[2] + 2 * param_.pad[0] - ksize_y) / param_.stride[0] + 1; + (*out_shape)[kOut][3] = (dshape[3] + 2 * param_.pad[1] - ksize_x) / param_.stride[1] + 1; return true; } From d34a7be0afb27625d08063837796e3c25ed8da23 Mon Sep 17 00:00:00 2001 From: Kublai-Jing Date: Tue, 27 Oct 2015 16:46:43 -0400 Subject: [PATCH 111/122] delete exp_grad --- src/ndarray/unary_function-inl.h | 27 ++++++--------------------- src/operator/mshadow_op.h | 20 +++++++++++++------- 2 files changed, 19 insertions(+), 28 deletions(-) diff --git 
a/src/ndarray/unary_function-inl.h b/src/ndarray/unary_function-inl.h index dcf0a6868197..c084bb4a4e95 100644 --- a/src/ndarray/unary_function-inl.h +++ b/src/ndarray/unary_function-inl.h @@ -82,50 +82,35 @@ void L2Norm(const TBlob &src, mshadow::VectorDot(out, in, in); out = mshadow::expr::F(out); } - // Register all unary operations here -// Square -struct square_grad { - MSHADOW_XINLINE static real_t Map(real_t a) { - return 2.0f * a; - } -}; // The true means inplace can be enabled. +// square MXNET_REGISTER_TBLOB_FUN(square, XPU) .set_function(XPU::kDevMask, UnaryForward_, true) -.set_gradient(XPU::kDevMask, UnaryBackwardUseIn_, true) +.set_gradient(XPU::kDevMask, UnaryBackwardUseIn_, true) .describe("Take square of the src"); - -// square root -struct square_root_grad { - MSHADOW_XINLINE static real_t Map(real_t a) { - return 0.5f / a; - } -}; - +// sqrt MXNET_REGISTER_TBLOB_FUN(sqrt, XPU) .set_function(XPU::kDevMask, UnaryForward_, true) -.set_gradient(XPU::kDevMask, UnaryBackwardUseIn_, true) -.describe("Take square root of the src"); - +.set_gradient(XPU::kDevMask, UnaryBackwardUseOut_, true) +.describe("Take sqrt of the src"); // exp MXNET_REGISTER_TBLOB_FUN(exp, XPU) .set_function(XPU::kDevMask, UnaryForward_, true) .set_gradient(XPU::kDevMask, UnaryBackwardUseOut_, true) .describe("Take exp of the src"); - // log MXNET_REGISTER_TBLOB_FUN(log, XPU) .set_function(XPU::kDevMask, UnaryForward_, true) .set_gradient(XPU::kDevMask, UnaryBackwardUseIn_, true) .describe("Take log of the src"); - // L2 norm MXNET_REGISTER_TBLOB_FUN(norm, XPU) .set_function(XPU::kDevMask, L2Norm, false, false) .set_shape_infer(ScalarShape) .describe("Take L2 norm of the src." "The result will be ndarray of shape (1,) on the same device."); + } // namespace ndarray } // namespace mxnet #endif // MXNET_NDARRAY_UNARY_FUNCTION_INL_H_ diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index 069b04e7252c..94db8d78cde0 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -86,12 +86,6 @@ struct exp { } }; -struct exp_grad { - MSHADOW_XINLINE static real_t Map(real_t a) { - return expf(a); - } -}; - struct log { MSHADOW_XINLINE static real_t Map(real_t a) { return logf(a); @@ -100,7 +94,7 @@ struct log { struct log_grad { MSHADOW_XINLINE static real_t Map(real_t a) { - return 1.0f/a; + return 1.0f / a; } }; @@ -110,6 +104,12 @@ struct square { } }; +struct square_grad { + MSHADOW_XINLINE static real_t Map(real_t a) { + return 2.0f * a; + } +}; + /*! \brief used for generate Bernoulli mask */ struct threshold { MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { @@ -131,6 +131,12 @@ struct square_root { } }; +struct square_root_grad { + MSHADOW_XINLINE static real_t Map(real_t a) { + return 0.5f / a; + } +}; + } // namespace mshadow_op } // namespace op } // namespace mxnet From 053ba9203e47fb0c5dbb441e5becc47e87fd049c Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Tue, 27 Oct 2015 20:01:35 -0600 Subject: [PATCH 112/122] Delete --- src/operator/param.h | 73 -------------------------------------------- 1 file changed, 73 deletions(-) delete mode 100644 src/operator/param.h diff --git a/src/operator/param.h b/src/operator/param.h deleted file mode 100644 index 9b08c197a160..000000000000 --- a/src/operator/param.h +++ /dev/null @@ -1,73 +0,0 @@ -/*! 
- * Copyright (c) 2015 by Contributors - * \file param.h - * \brief Common operator parameters - * \author Bing Xu -*/ -#ifndef MXNET_OPERATOR_PARAM_H_ -#define MXNET_OPERATOR_PARAM_H_ - -#include - -namespace mxnet { -namespace op { -/*! \brief possible parameter for each operator */ -struct Param { - /*! \brief number of hidden layers */ - int num_hidden; - /*! \brief number of output channel */ - int num_channel; - /*! \brief number of parallel group */ - int num_group; - /*! \brief kernel height */ - int kernel_y; - /*! \brief kernel width */ - int kernel_x; - /*! \brief stride in y dimension*/ - int stride_y; - /*! \brief stride in x dimension */ - int stride_x; - /*! \brief padding in y dimension */ - int pad_y; - /*! \brief padding in x dimension */ - int pad_x; - /*! \brief whether not include bias term */ - int no_bias; - /*! \brief maximum temp_col_size allowed in each layer */ - int temp_col_max; - /*! \brief reserved fields, for future compatibility */ - int reserved[64]; - - // constructor - Param() { - memset(this, 0, sizeof(Param)); - } - - inline void SetParam(const char *name, const char* val) { - if (!strcmp(name, "num_hidden")) num_hidden = atoi(val); - if (!strcmp(name, "num_channel")) num_channel = atoi(val); - if (!strcmp(name, "num_group")) num_group = atoi(val); - if (!strcmp(name, "kernel_size")) { - kernel_y = kernel_x = atoi(val); - } - if (!strcmp(name, "kernel_height")) kernel_y = atoi(val); - if (!strcmp(name, "kernel_width")) kernel_x = atoi(val); - if (!strcmp(name, "stride")) { - stride_y = stride_x = atoi(val); - } - if (!strcmp(name, "stride_y")) stride_y = atoi(val); - if (!strcmp(name, "stride_x")) stride_x = atoi(val); - - if (!strcmp(name, "pad")) { - pad_y = pad_x = atoi(val); - } - if (!strcmp(name, "pad_y")) pad_y = atoi(val); - if (!strcmp(name, "pad_x")) pad_x = atoi(val); - if (!strcmp(name, "no_bias")) no_bias = atoi(val); - if (!strcmp(name, "temp_col_max")) temp_col_max = atoi(val) << 18; - } -}; // struct Param -} // namespace op -} // namespace mxnet - -#endif // MXNET_OPERATOR_PARAM_H_ From ef16e8a85d4efe674536af4b69ae6dbbc4307d86 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Tue, 27 Oct 2015 23:15:18 -0600 Subject: [PATCH 113/122] add namespace --- src/operator/activation-inl.h | 32 ++-- src/operator/activation.cc | 6 +- src/operator/batch_norm-inl.h | 108 +++++++------- src/operator/block_grad-inl.h | 12 +- src/operator/concat-inl.h | 22 +-- src/operator/convolution-inl.h | 50 ++++--- src/operator/cudnn_activation-inl.h | 40 ++--- src/operator/cudnn_convolution-inl.h | 32 ++-- src/operator/cudnn_lrn-inl.h | 16 +- src/operator/cudnn_pooling-inl.h | 20 +-- src/operator/dropout-inl.h | 28 ++-- src/operator/elementwise_binary_op-inl.h | 116 +++++++-------- src/operator/elementwise_binary_op.cc | 2 +- src/operator/elementwise_binary_op.cu | 2 +- src/operator/elementwise_sum-inl.h | 36 ++--- src/operator/fully_connected-inl.h | 42 +++--- src/operator/leaky_relu-inl.h | 138 +++++++++--------- src/operator/lrn-inl.h | 25 ++-- src/operator/pooling-inl.h | 41 +++--- src/operator/pooling.cc | 6 +- src/operator/pooling.cu | 6 +- src/operator/regression_output-inl.h | 31 ++-- src/operator/regression_output.cc | 12 +- src/operator/regression_output.cu | 6 +- src/operator/reshape-inl.h | 29 ++-- src/operator/slice_channel-inl.h | 22 +-- src/operator/softmax-inl.h | 45 +++--- .../python/multi-node/dist_async_inception.py | 0 tests/python/multi-node/dist_async_lenet.py | 0 tests/python/multi-node/dist_async_mlp.py | 0 
.../multi-node/dist_imagenet_inception.py | 0 .../python/multi-node/dist_sync_inception.py | 0 tests/python/multi-node/dist_sync_kvstore.py | 0 tests/python/multi-node/dist_sync_lenet.py | 0 tests/python/multi-node/dist_sync_mlp.py | 0 tests/python/multi-node/local_inception.py | 0 tests/python/multi-node/local_lenet.py | 0 tests/python/multi-node/local_mlp.py | 0 tests/python/multi-node/test_data.py | 0 39 files changed, 489 insertions(+), 436 deletions(-) mode change 100644 => 100755 src/operator/activation-inl.h mode change 100644 => 100755 src/operator/activation.cc mode change 100644 => 100755 src/operator/batch_norm-inl.h mode change 100644 => 100755 src/operator/block_grad-inl.h mode change 100644 => 100755 src/operator/concat-inl.h mode change 100644 => 100755 src/operator/convolution-inl.h mode change 100644 => 100755 src/operator/cudnn_activation-inl.h mode change 100644 => 100755 src/operator/cudnn_convolution-inl.h mode change 100644 => 100755 src/operator/cudnn_lrn-inl.h mode change 100644 => 100755 src/operator/cudnn_pooling-inl.h mode change 100644 => 100755 src/operator/dropout-inl.h mode change 100644 => 100755 src/operator/elementwise_binary_op-inl.h mode change 100644 => 100755 src/operator/elementwise_binary_op.cc mode change 100644 => 100755 src/operator/elementwise_binary_op.cu mode change 100644 => 100755 src/operator/elementwise_sum-inl.h mode change 100644 => 100755 src/operator/fully_connected-inl.h mode change 100644 => 100755 src/operator/leaky_relu-inl.h mode change 100644 => 100755 src/operator/lrn-inl.h mode change 100644 => 100755 src/operator/pooling-inl.h mode change 100644 => 100755 src/operator/pooling.cc mode change 100644 => 100755 src/operator/pooling.cu mode change 100644 => 100755 src/operator/regression_output-inl.h mode change 100644 => 100755 src/operator/regression_output.cc mode change 100644 => 100755 src/operator/regression_output.cu mode change 100644 => 100755 src/operator/reshape-inl.h mode change 100644 => 100755 src/operator/slice_channel-inl.h mode change 100644 => 100755 src/operator/softmax-inl.h mode change 100755 => 100644 tests/python/multi-node/dist_async_inception.py mode change 100755 => 100644 tests/python/multi-node/dist_async_lenet.py mode change 100755 => 100644 tests/python/multi-node/dist_async_mlp.py mode change 100755 => 100644 tests/python/multi-node/dist_imagenet_inception.py mode change 100755 => 100644 tests/python/multi-node/dist_sync_inception.py mode change 100755 => 100644 tests/python/multi-node/dist_sync_kvstore.py mode change 100755 => 100644 tests/python/multi-node/dist_sync_lenet.py mode change 100755 => 100644 tests/python/multi-node/dist_sync_mlp.py mode change 100755 => 100644 tests/python/multi-node/local_inception.py mode change 100755 => 100644 tests/python/multi-node/local_lenet.py mode change 100755 => 100644 tests/python/multi-node/local_mlp.py mode change 100755 => 100644 tests/python/multi-node/test_data.py diff --git a/src/operator/activation-inl.h b/src/operator/activation-inl.h old mode 100644 new mode 100755 index e18dbe68fb25..6280c1664e84 --- a/src/operator/activation-inl.h +++ b/src/operator/activation-inl.h @@ -21,18 +21,20 @@ namespace mxnet { namespace op { // Declare enumeration of input order to make code more intuitive. 
// // These enums are only visible within this header +namespace activation { enum ActivationOpInputs {kData}; enum ActivationOpOutputs {kOut}; enum ActivationOpType {kReLU, kSigmoid, kTanh}; +} // activation struct ActivationParam : public dmlc::Parameter { // use int for enumeration int act_type; DMLC_DECLARE_PARAMETER(ActivationParam) { DMLC_DECLARE_FIELD(act_type) - .add_enum("relu", kReLU) - .add_enum("sigmoid", kSigmoid) - .add_enum("tanh", kTanh) + .add_enum("relu", activation::kReLU) + .add_enum("sigmoid", activation::kSigmoid) + .add_enum("tanh", activation::kTanh) .describe("Activation function to be applied."); } }; @@ -54,9 +56,9 @@ class ActivationOp : public Operator { CHECK_EQ(in_data.size(), 1); CHECK_EQ(out_data.size(), 1); Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].FlatTo2D(s); - Tensor out = out_data[kOut].FlatTo2D(s); - Assign(out, req[kOut], F(data)); + Tensor data = in_data[activation::kData].FlatTo2D(s); + Tensor out = out_data[activation::kOut].FlatTo2D(s); + Assign(out, req[activation::kOut], F(data)); } virtual void Backward(const OpContext &ctx, @@ -72,10 +74,10 @@ class ActivationOp : public Operator { CHECK(in_data.size() == 1 && in_grad.size() == 1); CHECK_EQ(req.size(), 1); Stream *s = ctx.get_stream(); - Tensor m_out_grad = out_grad[kOut].FlatTo2D(s); - Tensor m_out_data = out_data[kOut].FlatTo2D(s); - Tensor m_in_grad = in_grad[kData].FlatTo2D(s); - Assign(m_in_grad, req[kData], F(m_out_data) * m_out_grad); + Tensor m_out_grad = out_grad[activation::kOut].FlatTo2D(s); + Tensor m_out_data = out_data[activation::kOut].FlatTo2D(s); + Tensor m_in_grad = in_grad[activation::kData].FlatTo2D(s); + Assign(m_in_grad, req[activation::kData], F(m_out_data) * m_out_grad); } }; // class ActivationOp @@ -99,7 +101,7 @@ class ActivationProp : public OperatorProperty { std::vector *aux_shape) const override { using namespace mshadow; CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; - const TShape &dshape = in_shape->at(kData); + const TShape &dshape = in_shape->at(activation::kData); if (dshape.ndim() == 0) return false; out_shape->clear(); out_shape->push_back(dshape); @@ -122,9 +124,9 @@ class ActivationProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data) const override { #if MXNET_USE_CUDNN == 1 - return {out_grad[kOut], out_data[kOut], in_data[kData]}; + return {out_grad[activation::kOut], out_data[activation::kOut], in_data[activation::kData]}; #else - return {out_grad[kOut], out_data[kOut]}; + return {out_grad[activation::kOut], out_data[activation::kOut]}; #endif // MXNET_USE_CUDNN } @@ -133,13 +135,13 @@ class ActivationProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const override { - return {{out_grad[kOut], in_grad[kData]}}; + return {{out_grad[activation::kOut], in_grad[activation::kData]}}; } std::vector > ForwardInplaceOption( const std::vector &in_data, const std::vector &out_data) const override { - return {{in_data[kData], out_data[kOut]}}; + return {{in_data[activation::kData], out_data[activation::kOut]}}; } Operator* CreateOperator(Context ctx) const; diff --git a/src/operator/activation.cc b/src/operator/activation.cc old mode 100644 new mode 100755 index 04a8da24eed9..019ac4bf0bb7 --- a/src/operator/activation.cc +++ b/src/operator/activation.cc @@ -12,11 +12,11 @@ namespace op { template<> Operator *CreateOp(ActivationParam param) { switch (param.act_type) { - case kReLU: + case activation::kReLU: return new ActivationOp(); - 
case kSigmoid: + case activation::kSigmoid: return new ActivationOp(); - case kTanh: + case activation::kTanh: return new ActivationOp(); default: LOG(FATAL) << "unknown activation type"; diff --git a/src/operator/batch_norm-inl.h b/src/operator/batch_norm-inl.h old mode 100644 new mode 100755 index 50f89878e615..f031058f899e --- a/src/operator/batch_norm-inl.h +++ b/src/operator/batch_norm-inl.h @@ -19,10 +19,13 @@ namespace mxnet { namespace op { + +namespace batchnorm { enum BatchNormOpInputs {kData, kGamma, kBeta}; enum BatchNormOpOutputs {kOut, kOutNoAffine, kMean, kVar}; enum BatchNormOpAuxiliary {kMovingMean, kMovingVar}; enum BatchNormBackResource {kTempSpace}; +} // namespace batchnorm struct BatchNormParam : public dmlc::Parameter { float eps; @@ -57,47 +60,48 @@ class BatchNormOp : public Operator { } else { CHECK_GE(out_data.size(), 1); CHECK_GE(req.size(), 1); - CHECK_EQ(req[kOut], kWriteTo); + CHECK_EQ(req[batchnorm::kOut], kWriteTo); } Stream *s = ctx.get_stream(); - const real_t scale = static_cast(in_data[kData].shape_[1]) / - static_cast(in_data[kData].shape_.Size()); + const real_t scale = static_cast(in_data[batchnorm::kData].shape_[1]) / + static_cast(in_data[batchnorm::kData].shape_.Size()); Tensor data; Tensor out, out_no_affine; - if (in_data[kData].ndim() == 2) { - Shape<4> dshape = Shape4(in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1); - data = in_data[kData].get_with_shape(dshape, s); - out = out_data[kOut].get_with_shape(dshape, s); + if (in_data[batchnorm::kData].ndim() == 2) { + Shape<4> dshape = Shape4(in_data[batchnorm::kData].shape_[0], + in_data[batchnorm::kData].shape_[1], 1, 1); + data = in_data[batchnorm::kData].get_with_shape(dshape, s); + out = out_data[batchnorm::kOut].get_with_shape(dshape, s); if (ctx.is_train) { - out_no_affine = out_data[kOutNoAffine].get_with_shape(dshape, s); + out_no_affine = out_data[batchnorm::kOutNoAffine].get_with_shape(dshape, s); } } else { - data = in_data[kData].get(s); - out = out_data[kOut].get(s); + data = in_data[batchnorm::kData].get(s); + out = out_data[batchnorm::kOut].get(s); if (ctx.is_train) { - out_no_affine = out_data[kOutNoAffine].get(s); + out_no_affine = out_data[batchnorm::kOutNoAffine].get(s); } } - Tensor slope = in_data[kGamma].get(s); - Tensor bias = in_data[kBeta].get(s); - Tensor moving_mean = aux_states[kMovingMean].get(s); - Tensor moving_var = aux_states[kMovingVar].get(s); + Tensor slope = in_data[batchnorm::kGamma].get(s); + Tensor bias = in_data[batchnorm::kBeta].get(s); + Tensor moving_mean = aux_states[batchnorm::kMovingMean].get(s); + Tensor moving_var = aux_states[batchnorm::kMovingVar].get(s); // cal if (ctx.is_train) { - Tensor mean = out_data[kMean].get(s); - Tensor var = out_data[kVar].get(s); - Assign(mean, req[kMean], scale * sumall_except_dim<1>(data)); - Assign(var, req[kVar], scale * sumall_except_dim<1>( + Tensor mean = out_data[batchnorm::kMean].get(s); + Tensor var = out_data[batchnorm::kVar].get(s); + Assign(mean, req[batchnorm::kMean], scale * sumall_except_dim<1>(data)); + Assign(var, req[batchnorm::kVar], scale * sumall_except_dim<1>( F(data - broadcast<1>(mean, data.shape_)))); - Assign(out_no_affine, req[kOutNoAffine], (data - broadcast<1>(mean, data.shape_)) / + Assign(out_no_affine, req[batchnorm::kOutNoAffine], (data - broadcast<1>(mean, data.shape_)) / F(broadcast<1>(var + param_.eps, data.shape_))); - Assign(out, req[kOut], out_no_affine * broadcast<1>(slope, out.shape_) + + Assign(out, req[batchnorm::kOut], out_no_affine * broadcast<1>(slope, 
out.shape_) + broadcast<1>(bias, out.shape_)); moving_mean = moving_mean * param_.momentum + mean * (1 - param_.momentum); moving_var = moving_var * param_.momentum + var * (1 - param_.momentum); } else { - Assign(out, req[kOut], broadcast<1>(slope / + Assign(out, req[batchnorm::kOut], broadcast<1>(slope / F(moving_var + param_.eps), data.shape_) * data + broadcast<1>(bias - (slope * moving_mean) / @@ -121,31 +125,32 @@ class BatchNormOp : public Operator { Stream *s = ctx.get_stream(); Tensor data, grad, grad_in; Tensor out, out_no_affine; - const real_t scale = static_cast(out_data[kOut].shape_[1]) / - static_cast(out_data[kOut].shape_.Size()); - if (in_data[kData].ndim() == 2) { - Shape<4> dshape = Shape4(out_data[kOut].shape_[0], out_data[kOut].shape_[1], 1, 1); - data = in_data[kData].get_with_shape(dshape, s); - grad = out_grad[kOut].get_with_shape(dshape, s); - grad_in = in_grad[kData].get_with_shape(dshape, s); - out = out_data[kOut].get_with_shape(dshape, s); - out_no_affine = out_data[kOutNoAffine].get_with_shape(dshape, s); + const real_t scale = static_cast(out_data[batchnorm::kOut].shape_[1]) / + static_cast(out_data[batchnorm::kOut].shape_.Size()); + if (in_data[batchnorm::kData].ndim() == 2) { + Shape<4> dshape = Shape4(out_data[batchnorm::kOut].shape_[0], + out_data[batchnorm::kOut].shape_[1], 1, 1); + data = in_data[batchnorm::kData].get_with_shape(dshape, s); + grad = out_grad[batchnorm::kOut].get_with_shape(dshape, s); + grad_in = in_grad[batchnorm::kData].get_with_shape(dshape, s); + out = out_data[batchnorm::kOut].get_with_shape(dshape, s); + out_no_affine = out_data[batchnorm::kOutNoAffine].get_with_shape(dshape, s); } else { - data = in_data[kData].get(s); - grad = out_grad[kOut].get(s); - grad_in = in_grad[kData].get(s); - out = out_data[kOut].get(s); - out_no_affine = out_data[kOutNoAffine].get(s); + data = in_data[batchnorm::kData].get(s); + grad = out_grad[batchnorm::kOut].get(s); + grad_in = in_grad[batchnorm::kData].get(s); + out = out_data[batchnorm::kOut].get(s); + out_no_affine = out_data[batchnorm::kOutNoAffine].get(s); } - Tensor mean = out_data[kMean].get(s); - Tensor var = out_data[kVar].get(s); - Tensor slope = in_data[kGamma].get(s); + Tensor mean = out_data[batchnorm::kMean].get(s); + Tensor var = out_data[batchnorm::kVar].get(s); + Tensor slope = in_data[batchnorm::kGamma].get(s); // Tensor bias = in_data[kBeta].get(s); - Tensor gslope = in_grad[kGamma].get(s); - Tensor gbias = in_grad[kBeta].get(s); + Tensor gslope = in_grad[batchnorm::kGamma].get(s); + Tensor gbias = in_grad[batchnorm::kBeta].get(s); // get requested temp space - Tensor workspace = ctx.requested[kTempSpace].get_space( + Tensor workspace = ctx.requested[batchnorm::kTempSpace].get_space( mshadow::Shape2(3, out.shape_[1]), s); Tensor gmean = workspace[0]; Tensor gvar = workspace[1]; @@ -162,9 +167,9 @@ class BatchNormOp : public Operator { tmp *= gvar; gmean += tmp; // assign - Assign(gslope, req[kGamma], sumall_except_dim<1>(grad * out_no_affine)); - Assign(gbias, req[kBeta], sumall_except_dim<1>(grad)); - Assign(grad_in, req[kData], (grad * broadcast<1>(slope, data.shape_)) * + Assign(gslope, req[batchnorm::kGamma], sumall_except_dim<1>(grad * out_no_affine)); + Assign(gbias, req[batchnorm::kBeta], sumall_except_dim<1>(grad)); + Assign(grad_in, req[batchnorm::kData], (grad * broadcast<1>(slope, data.shape_)) * broadcast<1>(1.0f / F(var + param_.eps), data.shape_) + broadcast<1>(gvar, data.shape_) * scale * 2.0f * (data - broadcast<1>(mean, data.shape_)) + @@ -224,9 +229,14 @@ class 
BatchNormProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - return {out_grad[kOut], - out_data[kOut], out_data[kOutNoAffine], out_data[kMean], out_data[kVar], - in_data[kData], in_data[kGamma], in_data[kBeta] + return {out_grad[batchnorm::kOut], + out_data[batchnorm::kOut], + out_data[batchnorm::kOutNoAffine], + out_data[batchnorm::kMean], + out_data[batchnorm::kVar], + in_data[batchnorm::kData], + in_data[batchnorm::kGamma], + in_data[batchnorm::kBeta] }; } @@ -235,7 +245,7 @@ class BatchNormProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const override { - return {{out_grad[kOut], in_grad[kData]}}; + return {{out_grad[batchnorm::kOut], in_grad[batchnorm::kData]}}; } std::vector BackwardResource( diff --git a/src/operator/block_grad-inl.h b/src/operator/block_grad-inl.h old mode 100644 new mode 100755 index 0b34d691b244..012dc7a2da63 --- a/src/operator/block_grad-inl.h +++ b/src/operator/block_grad-inl.h @@ -19,8 +19,10 @@ namespace mxnet { namespace op { +namespace blockgrad { enum BlockGradientOpInputs {kData}; enum BlockGradientOpOutputs {kOut}; +} // namespace blockgrad template class BlockGradientOp : public Operator { @@ -35,8 +37,8 @@ class BlockGradientOp : public Operator { CHECK_EQ(in_data.size(), 1); CHECK_EQ(out_data.size(), 1); Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].FlatTo2D(s); - Tensor out = out_data[kOut].FlatTo2D(s); + Tensor data = in_data[blockgrad::kData].FlatTo2D(s); + Tensor out = out_data[blockgrad::kOut].FlatTo2D(s); out = F(data); } @@ -50,7 +52,7 @@ class BlockGradientOp : public Operator { using namespace mshadow; using namespace mshadow::expr; Stream *s = ctx.get_stream(); - Tensor grad = in_grad[kData].FlatTo2D(s); + Tensor grad = in_grad[blockgrad::kData].FlatTo2D(s); grad = 0.f; } }; // class BlockGradientOp @@ -72,7 +74,7 @@ class BlockGradientProp : public OperatorProperty { std::vector *aux_shape) const override { using namespace mshadow; CHECK_EQ(in_shape->size(), 1); - const TShape &dshape = in_shape->at(kData); + const TShape &dshape = in_shape->at(blockgrad::kData); if (dshape.ndim() == 0) return false; out_shape->clear(); out_shape->push_back(dshape); @@ -97,7 +99,7 @@ class BlockGradientProp : public OperatorProperty { std::vector > ForwardInplaceOption( const std::vector &in_data, const std::vector &out_data) const override { - return {{in_data[kData], out_data[kOut]}}; + return {{in_data[blockgrad::kData], out_data[blockgrad::kOut]}}; } Operator* CreateOperator(Context ctx) const; diff --git a/src/operator/concat-inl.h b/src/operator/concat-inl.h old mode 100644 new mode 100755 index a8821588b7ab..3e9c812603e3 --- a/src/operator/concat-inl.h +++ b/src/operator/concat-inl.h @@ -20,8 +20,10 @@ namespace mxnet { namespace op { +namespace concat_enum { enum ConcatOpInputs {kData0, kData1, kData2, kData3, kData4}; enum ConcatOpOutputs {kOut}; +} // namespace concat_enum struct ConcatParam : public dmlc::Parameter { int num_args; @@ -46,24 +48,24 @@ class ConcatOp : public Operator { using namespace mshadow::expr; CHECK_EQ(static_cast(in_data.size()), size_); CHECK_EQ(out_data.size(), 1); - CHECK_EQ(req[kOut], kWriteTo); + CHECK_EQ(req[concat_enum::kOut], kWriteTo); Stream *s = ctx.get_stream(); std::vector > data(size_); Tensor out; - if (in_data[kData0].ndim() == 2) { + if (in_data[concat_enum::kData0].ndim() == 2) { uint32_t dim = 0; for (int i = 0; i < size_; ++i) { 
Shape<4> dshape = Shape4(in_data[i].shape_[0], in_data[i].shape_[1], 1, 1); data[i] = in_data[i].get_with_shape(dshape, s); dim += in_data[i].shape_[1]; } - Shape<4> dshape_out = Shape4(in_data[kData0].shape_[0], dim, 1, 1); - out = out_data[kOut].get_with_shape(dshape_out, s); + Shape<4> dshape_out = Shape4(in_data[concat_enum::kData0].shape_[0], dim, 1, 1); + out = out_data[concat_enum::kOut].get_with_shape(dshape_out, s); } else { for (int i = 0; i < size_; ++i) { data[i] = in_data[i].get(s); } - out = out_data[kOut].get(s); + out = out_data[concat_enum::kOut].get(s); } Concatenate(data, &out); } @@ -82,7 +84,7 @@ class ConcatOp : public Operator { Stream *s = ctx.get_stream(); std::vector > grad_in(size_); Tensor grad; - if (out_grad[kOut].ndim() == 2) { + if (out_grad[concat_enum::kOut].ndim() == 2) { uint32_t dim = 0; for (int i = 0; i < size_; ++i) { Shape<4> dshape = Shape4(in_grad[i].shape_[0], in_grad[i].shape_[1], 1, 1); @@ -90,14 +92,14 @@ class ConcatOp : public Operator { dim += in_grad[i].shape_[1]; CHECK_EQ(req[i], kWriteTo); } - Shape<4> dshape_out = Shape4(in_grad[kData0].shape_[0], dim, 1, 1); - grad = out_grad[kOut].get_with_shape(dshape_out, s); + Shape<4> dshape_out = Shape4(in_grad[concat_enum::kData0].shape_[0], dim, 1, 1); + grad = out_grad[concat_enum::kOut].get_with_shape(dshape_out, s); } else { for (int i = 0; i < size_; ++i) { grad_in[i] = in_grad[i].get(s); CHECK_EQ(req[i], kWriteTo); } - grad = out_grad[kOut].get(s); + grad = out_grad[concat_enum::kOut].get(s); } Split(grad, &grad_in); } @@ -133,7 +135,7 @@ class ConcatProp : public OperatorProperty { std::vector *aux_shape) const override { using namespace mshadow; CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); - TShape dshape = in_shape->at(kData0); + TShape dshape = in_shape->at(concat_enum::kData0); if (dshape.ndim() == 0) return false; CHECK_GT(dshape.ndim(), 1); for (int i = 1; i < param_.num_args; ++i) { diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h old mode 100644 new mode 100755 index 74dca3f1a792..29a9288b2870 --- a/src/operator/convolution-inl.h +++ b/src/operator/convolution-inl.h @@ -21,9 +21,11 @@ namespace mxnet { namespace op { +namespace conv { enum ConvolutionOpInputs {kData, kWeight, kBias}; enum ConvolutionOpOutputs {kOut}; enum ConvolutionOpResource {kTempSpace}; +} struct ConvolutionParam : public dmlc::Parameter { TShape kernel; @@ -68,24 +70,24 @@ class ConvolutionOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(req[kOut], kWriteTo); + CHECK_EQ(req[conv::kOut], kWriteTo); size_t expected = param_.no_bias ? 
2 : 3; CHECK_EQ(in_data.size(), expected); CHECK_EQ(out_data.size(), 1); Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].get(s); + Tensor data = in_data[conv::kData].get(s); Shape<3> wmat_shape = Shape3(param_.num_group, param_.num_filter / param_.num_group, data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1]); - Tensor wmat = in_data[kWeight].get_with_shape(wmat_shape, s); - Tensor out = out_data[kOut].get(s); + Tensor wmat = in_data[conv::kWeight].get_with_shape(wmat_shape, s); + Tensor out = out_data[conv::kOut].get(s); #if defined(__CUDACC__) CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) << "Must init CuBLAS handle in stream"; #endif const index_t nbatch = data.size(0); - Tensor workspace = ctx.requested[kTempSpace].get_space( + Tensor workspace = ctx.requested[conv::kTempSpace].get_space( Shape1(this->InitTemp(data.shape_, out.shape_)), s); for (index_t i = 0; i < nbatch; i += nstep_) { const index_t step = std::min(nstep_, nbatch - i); @@ -124,7 +126,7 @@ class ConvolutionOp : public Operator { } if (!param_.no_bias) { // add bias, broadcast bias to dim 1: channel - Tensor bias = in_data[kBias].get(s); + Tensor bias = in_data[conv::kBias].get(s); out += broadcast<1>(bias, out.shape_); } } @@ -143,24 +145,24 @@ class ConvolutionOp : public Operator { size_t expected = param_.no_bias == 0 ? 3 : 2; CHECK(in_data.size() == expected && in_grad.size() == expected); CHECK_EQ(req.size(), expected); - CHECK_EQ(in_data[kWeight].CheckContiguous(), true); + CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true); // get data Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].get(s); + Tensor data = in_data[conv::kData].get(s); Shape<3> wmat_shape = Shape3(param_.num_group, param_.num_filter / param_.num_group, data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1]); - Tensor wmat = in_data[kWeight].get_with_shape(wmat_shape, s); - Tensor grad = out_grad[kOut].get(s); - Tensor gdata = in_grad[kData].get(s); - Tensor gwmat = in_grad[kWeight].get_with_shape(wmat_shape, s); + Tensor wmat = in_data[conv::kWeight].get_with_shape(wmat_shape, s); + Tensor grad = out_grad[conv::kOut].get(s); + Tensor gdata = in_grad[conv::kData].get(s); + Tensor gwmat = in_grad[conv::kWeight].get_with_shape(wmat_shape, s); #if defined(__CUDACC__) CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) << "Must init CuBLAS handle in stream"; #endif const index_t nbatch = data.size(0); - Tensor workspace = ctx.requested[kTempSpace].get_space( + Tensor workspace = ctx.requested[conv::kTempSpace].get_space( Shape1(this->InitTemp(data.shape_, grad.shape_)), s); for (index_t i = 0; i < nbatch; i += nstep_) { const index_t step = std::min(nstep_, nbatch - i); @@ -190,12 +192,12 @@ class ConvolutionOp : public Operator { Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); if (i == 0) { Tensor tmp_gwmat = gwmat[gid]; - Assign(tmp_gwmat, req[kWeight], dot(temp_dst[gid], tmpc.T())); + Assign(tmp_gwmat, req[conv::kWeight], dot(temp_dst[gid], tmpc.T())); } else { gwmat[gid] += dot(temp_dst[gid], tmpc.T()); } } - if (req[kData] == kWriteTo || req[kData] == kWriteInplace) { + if (req[conv::kData] == kWriteTo || req[conv::kData] == kWriteInplace) { for (uint32_t gid = 0; gid < param_.num_group; ++gid) { Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); tmpc = dot(wmat[gid].T(), temp_dst[gid]); @@ -220,8 +222,8 @@ class ConvolutionOp : public Operator { } } if (!param_.no_bias) { - Tensor gbias = in_grad[kBias].get(s); - Assign(gbias, 
req[kBias], sumall_except_dim<1>(grad)); + Tensor gbias = in_grad[conv::kBias].get(s); + Assign(gbias, req[conv::kBias], sumall_except_dim<1>(grad)); } } @@ -288,15 +290,15 @@ class ConvolutionProp : public OperatorProperty { } else { CHECK_EQ(in_shape->size(), 2) << "Input:[data, weight]"; } - const TShape &dshape = (*in_shape)[kData]; + const TShape &dshape = (*in_shape)[conv::kData]; if (dshape.ndim() == 0) return false; CHECK_EQ(dshape.ndim(), 4) \ << "Input data should be 4D in batch-num_filter-y-x"; SHAPE_ASSIGN_CHECK(*in_shape, - kWeight, + conv::kWeight, Shape4(param_.num_filter, dshape[1], param_.kernel[0], param_.kernel[1])); if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, kBias, Shape1(param_.num_filter)); + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); } out_shape->clear(); out_shape->push_back(dshape); @@ -312,9 +314,9 @@ class ConvolutionProp : public OperatorProperty { << "incorrect stride size: " << param_.stride; CHECK(ksize_x <= dshape[3] && ksize_y <= dshape[2]) << "kernel size exceed input"; - (*out_shape)[kOut][1] = param_.num_filter; - (*out_shape)[kOut][2] = (dshape[2] + 2 * param_.pad[0] - ksize_y) / param_.stride[0] + 1; - (*out_shape)[kOut][3] = (dshape[3] + 2 * param_.pad[1] - ksize_x) / param_.stride[1] + 1; + (*out_shape)[conv::kOut][1] = param_.num_filter; + (*out_shape)[conv::kOut][2] = (dshape[2] + 2 * param_.pad[0] - ksize_y) / param_.stride[0] + 1; + (*out_shape)[conv::kOut][3] = (dshape[3] + 2 * param_.pad[1] - ksize_x) / param_.stride[1] + 1; return true; } @@ -332,7 +334,7 @@ class ConvolutionProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - return {out_grad[kOut], in_data[kData], in_data[kWeight]}; + return {out_grad[conv::kOut], in_data[conv::kData], in_data[conv::kWeight]}; } std::vector ForwardResource( diff --git a/src/operator/cudnn_activation-inl.h b/src/operator/cudnn_activation-inl.h old mode 100644 new mode 100755 index cf5c49f9fd59..7e6acea7c952 --- a/src/operator/cudnn_activation-inl.h +++ b/src/operator/cudnn_activation-inl.h @@ -20,13 +20,13 @@ class CuDNNActivationOp : public Operator { init_cudnn_ = false; dtype_ = CUDNN_DATA_FLOAT; switch (param_.act_type) { - case kReLU: + case activation::kReLU: mode_ = CUDNN_ACTIVATION_RELU; break; - case kSigmoid: + case activation::kSigmoid: mode_ = CUDNN_ACTIVATION_SIGMOID; break; - case kTanh: + case activation::kTanh: mode_ = CUDNN_ACTIVATION_TANH; break; default: @@ -51,13 +51,14 @@ class CuDNNActivationOp : public Operator { Stream *s = ctx.get_stream(); Tensor data; Tensor out; - if (in_data[kData].ndim() == 2) { - Shape<4> dshape = Shape4(in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1); - data = in_data[kData].get_with_shape(dshape, s); - out = out_data[kOut].get_with_shape(dshape, s); + if (in_data[activation::kData].ndim() == 2) { + Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], + in_data[activation::kData].shape_[1], 1, 1); + data = in_data[activation::kData].get_with_shape(dshape, s); + out = out_data[activation::kOut].get_with_shape(dshape, s); } else { - data = in_data[kData].get(s); - out = out_data[kOut].get(s); + data = in_data[activation::kData].get(s); + out = out_data[activation::kOut].get(s); } float alpha = 1.0f; float beta = 0.0f; @@ -104,17 +105,18 @@ class CuDNNActivationOp : public Operator { Tensor data; Tensor output_data; Tensor input_grad; - if (in_data[kData].ndim() == 2) { - Shape<4> dshape = Shape4(in_data[kData].shape_[0], 
in_data[kData].shape_[1], 1, 1); - data = in_data[kData].get_with_shape(dshape, s); - grad = out_grad[kOut].get_with_shape(dshape, s); - output_data = out_data[kOut].get_with_shape(dshape, s); - input_grad = in_grad[kData].get_with_shape(dshape, s); + if (in_data[activation::kData].ndim() == 2) { + Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], + in_data[activation::kData].shape_[1], 1, 1); + data = in_data[activation::kData].get_with_shape(dshape, s); + grad = out_grad[activation::kOut].get_with_shape(dshape, s); + output_data = out_data[activation::kOut].get_with_shape(dshape, s); + input_grad = in_grad[activation::kData].get_with_shape(dshape, s); } else { - data = in_data[kData].get(s); - output_data = out_data[kOut].get(s); - grad = out_grad[kOut].get(s); - input_grad = in_grad[kData].get(s); + data = in_data[activation::kData].get(s); + output_data = out_data[activation::kOut].get(s); + grad = out_grad[activation::kOut].get(s); + input_grad = in_grad[activation::kData].get(s); } CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); CHECK_EQ(cudnnActivationBackward(s->dnn_handle_, diff --git a/src/operator/cudnn_convolution-inl.h b/src/operator/cudnn_convolution-inl.h old mode 100644 new mode 100755 index 2a89e7ee72bc..123187608237 --- a/src/operator/cudnn_convolution-inl.h +++ b/src/operator/cudnn_convolution-inl.h @@ -47,16 +47,16 @@ class CuDNNConvolutionOp : public Operator { CHECK_EQ(in_data.size(), expected); CHECK_EQ(out_data.size(), 1); Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].get(s); - Tensor wmat = in_data[kWeight].get(s); - Tensor out = out_data[kOut].get(s); + Tensor data = in_data[conv::kData].get(s); + Tensor wmat = in_data[conv::kWeight].get(s); + Tensor out = out_data[conv::kOut].get(s); CHECK_EQ(data.CheckContiguous(), true); CHECK_EQ(wmat.CheckContiguous(), true); CHECK_EQ(out.CheckContiguous(), true); if (!init_cudnn_) { Init(s, in_data, out_data); } - Tensor workspace = ctx.requested[kTempSpace].get_space( + Tensor workspace = ctx.requested[conv::kTempSpace].get_space( mshadow::Shape1(forward_workspace_), s); CHECK_EQ(cudnnConvolutionForward(s->dnn_handle_, &alpha, @@ -73,7 +73,7 @@ class CuDNNConvolutionOp : public Operator { out.dptr_), CUDNN_STATUS_SUCCESS); if (!param_.no_bias) { beta = 1.0f; - Tensor bias = in_data[kBias].get(s); + Tensor bias = in_data[conv::kBias].get(s); CHECK_EQ(cudnnAddTensor(s->dnn_handle_, CUDNN_ADD_SAME_C, &alpha, @@ -100,17 +100,17 @@ class CuDNNConvolutionOp : public Operator { CHECK_EQ(out_grad.size(), 1); CHECK(in_data.size() == expected && in_grad.size() == expected); // TODO(bing): think about how to support add to - CHECK_EQ(req[kWeight], kWriteTo); + CHECK_EQ(req[conv::kWeight], kWriteTo); Stream *s = ctx.get_stream(); - Tensor grad = out_grad[kOut].get(s); - Tensor wmat = in_data[kWeight].get(s); - Tensor gwmat = in_grad[kWeight].get(s); - Tensor data = in_data[kData].get(s); - Tensor gdata = in_grad[kData].get(s); - Tensor workspace = ctx.requested[kTempSpace].get_space( + Tensor grad = out_grad[conv::kOut].get(s); + Tensor wmat = in_data[conv::kWeight].get(s); + Tensor gwmat = in_grad[conv::kWeight].get(s); + Tensor data = in_data[conv::kData].get(s); + Tensor gdata = in_grad[conv::kData].get(s); + Tensor workspace = ctx.requested[conv::kTempSpace].get_space( mshadow::Shape1(backward_workspace_), s); if (!param_.no_bias) { - Tensor gbias = in_grad[kBias].get(s); + Tensor gbias = in_grad[conv::kBias].get(s); CHECK_EQ(cudnnConvolutionBackwardBias(s->dnn_handle_, &alpha, out_desc_, 
@@ -160,8 +160,8 @@ class CuDNNConvolutionOp : public Operator { size_t workspace_byte = static_cast(param_.workspace * sizeof(real_t)); size_t back_size = 0; size_t back_size_w = 0; - Tensor data = in_data[kData].get(s); - Tensor out = out_data[kOut].get(s); + Tensor data = in_data[conv::kData].get(s); + Tensor out = out_data[conv::kOut].get(s); CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&bias_desc_), CUDNN_STATUS_SUCCESS); @@ -196,7 +196,7 @@ class CuDNNConvolutionOp : public Operator { out.shape_[2], out.shape_[3]), CUDNN_STATUS_SUCCESS); if (!param_.no_bias) { - Tensor bias = in_data[kBias].get(s); + Tensor bias = in_data[conv::kBias].get(s); CHECK_EQ(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW, dtype_, diff --git a/src/operator/cudnn_lrn-inl.h b/src/operator/cudnn_lrn-inl.h old mode 100644 new mode 100755 index eb520b2fbe68..e14c9f742eaa --- a/src/operator/cudnn_lrn-inl.h +++ b/src/operator/cudnn_lrn-inl.h @@ -38,8 +38,8 @@ class CuDNNLocalResponseNormOp : public Operator { float alpha = 1.0f; float beta = 0.0f; Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].get(s); - Tensor out = out_data[kOut].get(s); + Tensor data = in_data[lrn_enum::kData].get(s); + Tensor out = out_data[lrn_enum::kOut].get(s); if (!init_cudnn_) { this->Init(s, in_data, out_data); } @@ -72,10 +72,10 @@ class CuDNNLocalResponseNormOp : public Operator { float alpha = 1.0f; float beta = 0.0f; Stream *s = ctx.get_stream(); - Tensor grad = out_grad[kOut].get(s); - Tensor data = in_data[kData].get(s); - Tensor output_data = out_data[kOut].get(s); - Tensor input_grad = in_grad[kData].get(s); + Tensor grad = out_grad[lrn_enum::kOut].get(s); + Tensor data = in_data[lrn_enum::kData].get(s); + Tensor output_data = out_data[lrn_enum::kOut].get(s); + Tensor input_grad = in_grad[lrn_enum::kData].get(s); CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); CHECK_EQ(cudnnLRNCrossChannelBackward(s->dnn_handle_, lrn_desc_, @@ -101,8 +101,8 @@ class CuDNNLocalResponseNormOp : public Operator { CHECK_EQ(out_data.size(), 1); if (!init_cudnn_) { init_cudnn_ = true; - Tensor data = in_data[kData].get(s); - Tensor out = out_data[kOut].get(s); + Tensor data = in_data[lrn_enum::kData].get(s); + Tensor out = out_data[lrn_enum::kOut].get(s); unsigned lrn_n = param_.nsize; double alpha = param_.alpha; double beta = param_.beta; diff --git a/src/operator/cudnn_pooling-inl.h b/src/operator/cudnn_pooling-inl.h old mode 100644 new mode 100755 index 67958ed46f26..3a56b6e545b8 --- a/src/operator/cudnn_pooling-inl.h +++ b/src/operator/cudnn_pooling-inl.h @@ -22,10 +22,10 @@ class CuDNNPoolingOp : public Operator { // TODO(xxx): fp16 dtype_ = CUDNN_DATA_FLOAT; switch (param_.pool_type) { - case kMaxPooling: + case pool_enum::kMaxPooling: mode_ = CUDNN_POOLING_MAX; break; - case kAvgPooling: + case pool_enum::kAvgPooling: mode_ = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; break; default: @@ -49,8 +49,8 @@ class CuDNNPoolingOp : public Operator { CHECK_EQ(in_data.size(), 1); CHECK_EQ(out_data.size(), 1); Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].get(s); - Tensor out = out_data[kOut].get(s); + Tensor data = in_data[pool_enum::kData].get(s); + Tensor out = out_data[pool_enum::kOut].get(s); CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); if (!init_cudnn_) { this->Init(s, in_data, out_data); @@ -85,10 +85,10 @@ class CuDNNPoolingOp : public 
Operator { CHECK_EQ(in_grad.size(), 1); Stream *s = ctx.get_stream(); - Tensor m_out_grad = out_grad[kOut].get(s); - Tensor m_in_data = in_data[kData].get(s); - Tensor m_out_data = out_data[kOut].get(s); - Tensor m_in_grad = in_grad[kData].get(s); + Tensor m_out_grad = out_grad[pool_enum::kOut].get(s); + Tensor m_in_data = in_data[pool_enum::kData].get(s); + Tensor m_out_data = out_data[pool_enum::kOut].get(s); + Tensor m_in_grad = in_grad[pool_enum::kData].get(s); CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); float alpha = 1.0f; float beta = 0.0f; @@ -115,8 +115,8 @@ class CuDNNPoolingOp : public Operator { CHECK_EQ(out_data.size(), 1); if (!init_cudnn_) { init_cudnn_ = true; - Tensor data = in_data[kData].get(s); - Tensor out = out_data[kOut].get(s); + Tensor data = in_data[pool_enum::kData].get(s); + Tensor out = out_data[pool_enum::kOut].get(s); CHECK_EQ(cudnnCreatePoolingDescriptor(&pooling_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); diff --git a/src/operator/dropout-inl.h b/src/operator/dropout-inl.h old mode 100644 new mode 100755 index 877eab61226b..fa76bd38ccf6 --- a/src/operator/dropout-inl.h +++ b/src/operator/dropout-inl.h @@ -17,9 +17,11 @@ #include "./operator_common.h" #include "./mshadow_op.h" +namespace dropout { enum DropoutOpInputs {kData}; enum DropoutOpOutputs {kOut, kMask}; enum DropoutOpForwardResource {kRandom}; +} // namespace dropout namespace mxnet { namespace op { @@ -52,15 +54,15 @@ class DropoutOp : public Operator { CHECK_EQ(out_data.size(), 2); } Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].FlatTo2D(s); - Tensor out = out_data[kOut].FlatTo2D(s); + Tensor data = in_data[dropout::kData].FlatTo2D(s); + Tensor out = out_data[dropout::kOut].FlatTo2D(s); if (ctx.is_train) { - Tensor mask = out_data[kMask].FlatTo2D(s); - Random *prnd = ctx.requested[kRandom].get_random(s); + Tensor mask = out_data[dropout::kMask].FlatTo2D(s); + Random *prnd = ctx.requested[dropout::kRandom].get_random(s); mask = F(prnd->uniform(mask.shape_), pkeep_) * (1.0f / pkeep_); - Assign(out, req[kOut], data * mask); + Assign(out, req[dropout::kOut], data * mask); } else { - Assign(out, req[kOut], F(data)); + Assign(out, req[dropout::kOut], F(data)); } } @@ -76,10 +78,10 @@ class DropoutOp : public Operator { CHECK_EQ(out_grad.size(), 1); CHECK_EQ(in_grad.size(), 1); Stream *s = ctx.get_stream(); - Tensor grad = out_grad[kOut].FlatTo2D(s); - Tensor mask = out_data[kMask].FlatTo2D(s); - Tensor gdata = in_grad[kData].FlatTo2D(s); - Assign(gdata, req[kData], grad * mask); + Tensor grad = out_grad[dropout::kOut].FlatTo2D(s); + Tensor mask = out_data[dropout::kMask].FlatTo2D(s); + Tensor gdata = in_grad[dropout::kData].FlatTo2D(s); + Assign(gdata, req[dropout::kData], grad * mask); } private: @@ -128,7 +130,7 @@ class DropoutProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - return {out_grad[kOut], out_data[kMask]}; + return {out_grad[dropout::kOut], out_data[dropout::kMask]}; } std::vector > BackwardInplaceOption( @@ -136,13 +138,13 @@ class DropoutProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const override { - return {{out_grad[kOut], in_grad[kData]}}; + return {{out_grad[dropout::kOut], in_grad[dropout::kData]}}; } std::vector > ForwardInplaceOption( const std::vector 
&in_data, const std::vector &out_data) const override { - return {{in_data[kData], out_data[kOut]}}; + return {{in_data[dropout::kData], out_data[dropout::kOut]}}; } std::vector ForwardResource( diff --git a/src/operator/elementwise_binary_op-inl.h b/src/operator/elementwise_binary_op-inl.h old mode 100644 new mode 100755 index 6b1cf62242bd..4a751146b769 --- a/src/operator/elementwise_binary_op-inl.h +++ b/src/operator/elementwise_binary_op-inl.h @@ -19,31 +19,33 @@ namespace mxnet { namespace op { +namespace elembinary { enum ElementWiseBinaryOpInputs {kLhs, kRhs}; enum ElementWiseBinaryOpOutputs {kOut}; enum ElementWiseBinaryOpType {kPlus, kMinus, kMul, kDiv}; +} // elembinary template -inline ElementWiseBinaryOpType GetOpType(); +inline elembinary::ElementWiseBinaryOpType GetOpType(); template inline const char* GetOpTypeString(); template<> -inline ElementWiseBinaryOpType GetOpType() { - return kPlus; +inline elembinary::ElementWiseBinaryOpType GetOpType() { + return elembinary::kPlus; } template<> -inline ElementWiseBinaryOpType GetOpType() { - return kMinus; +inline elembinary::ElementWiseBinaryOpType GetOpType() { + return elembinary::kMinus; } template<> -inline ElementWiseBinaryOpType GetOpType() { - return kMul; +inline elembinary::ElementWiseBinaryOpType GetOpType() { + return elembinary::kMul; } template<> -inline ElementWiseBinaryOpType GetOpType() { - return kDiv; +inline elembinary::ElementWiseBinaryOpType GetOpType() { + return elembinary::kDiv; } template<> @@ -78,10 +80,10 @@ class ElementWiseBinaryOp : public Operator { CHECK_EQ(in_data.size(), 2); CHECK_EQ(out_data.size(), 1); Stream *s = ctx.get_stream(); - Tensor lhs = in_data[kLhs].FlatTo2D(s); - Tensor rhs = in_data[kRhs].FlatTo2D(s); - Tensor out = out_data[kOut].FlatTo2D(s); - Assign(out, req[kOut], F(lhs, rhs)); + Tensor lhs = in_data[elembinary::kLhs].FlatTo2D(s); + Tensor rhs = in_data[elembinary::kRhs].FlatTo2D(s); + Tensor out = out_data[elembinary::kOut].FlatTo2D(s); + Assign(out, req[elembinary::kOut], F(lhs, rhs)); } virtual void Backward(const OpContext &ctx, @@ -98,37 +100,37 @@ class ElementWiseBinaryOp : public Operator { CHECK_EQ(req.size(), 2); Stream *s = ctx.get_stream(); - Tensor m_out_grad = out_grad[kOut].FlatTo2D(s); - Tensor lhs_grad = in_grad[kLhs].FlatTo2D(s); - Tensor rhs_grad = in_grad[kRhs].FlatTo2D(s); + Tensor m_out_grad = out_grad[elembinary::kOut].FlatTo2D(s); + Tensor lhs_grad = in_grad[elembinary::kLhs].FlatTo2D(s); + Tensor rhs_grad = in_grad[elembinary::kRhs].FlatTo2D(s); switch (GetOpType()) { - case kPlus: { - Assign(lhs_grad, req[kLhs], F(m_out_grad)); - Assign(rhs_grad, req[kRhs], F(m_out_grad)); + case elembinary::kPlus: { + Assign(lhs_grad, req[elembinary::kLhs], F(m_out_grad)); + Assign(rhs_grad, req[elembinary::kRhs], F(m_out_grad)); break; } - case kMinus: { - Assign(lhs_grad, req[kLhs], F(m_out_grad)); - Assign(rhs_grad, req[kRhs], F(m_out_grad)); + case elembinary::kMinus: { + Assign(lhs_grad, req[elembinary::kLhs], F(m_out_grad)); + Assign(rhs_grad, req[elembinary::kRhs], F(m_out_grad)); break; } - case kMul: { - Tensor lhs_data = in_data[kLhs].FlatTo2D(s); - Tensor rhs_data = in_data[kRhs].FlatTo2D(s); + case elembinary::kMul: { + Tensor lhs_data = in_data[elembinary::kLhs].FlatTo2D(s); + Tensor rhs_data = in_data[elembinary::kRhs].FlatTo2D(s); // rhs cannot do inplace - CHECK_NE(req[kRhs], kWriteInplace); - Assign(rhs_grad, req[kRhs], lhs_data * m_out_grad); - Assign(lhs_grad, req[kLhs], rhs_data * m_out_grad); + CHECK_NE(req[elembinary::kRhs], kWriteInplace); + 
Assign(rhs_grad, req[elembinary::kRhs], lhs_data * m_out_grad); + Assign(lhs_grad, req[elembinary::kLhs], rhs_data * m_out_grad); break; } - case kDiv: { - Tensor lhs_data = in_data[kLhs].FlatTo2D(s); - Tensor rhs_data = in_data[kRhs].FlatTo2D(s); + case elembinary::kDiv: { + Tensor lhs_data = in_data[elembinary::kLhs].FlatTo2D(s); + Tensor rhs_data = in_data[elembinary::kRhs].FlatTo2D(s); // rhs cannot do inplace - CHECK_NE(req[kRhs], kWriteInplace); - Assign(rhs_grad, req[kRhs], + CHECK_NE(req[elembinary::kRhs], kWriteInplace); + Assign(rhs_grad, req[elembinary::kRhs], F(m_out_grad * lhs_data) / F(rhs_data)); - Assign(lhs_grad, req[kLhs], m_out_grad / rhs_data); + Assign(lhs_grad, req[elembinary::kLhs], m_out_grad / rhs_data); break; } } @@ -137,15 +139,15 @@ class ElementWiseBinaryOp : public Operator { template -inline Operator* CreateElementWiseBinaryOp_(ElementWiseBinaryOpType type) { +inline Operator* CreateElementWiseBinaryOp_(elembinary::ElementWiseBinaryOpType type) { switch (type) { - case kPlus: + case elembinary::kPlus: return new ElementWiseBinaryOp(); - case kMinus: + case elembinary::kMinus: return new ElementWiseBinaryOp(); - case kMul: + case elembinary::kMul: return new ElementWiseBinaryOp(); - case kDiv: + case elembinary::kDiv: return new ElementWiseBinaryOp(); } LOG(FATAL) << "unknown op type"; @@ -154,7 +156,7 @@ inline Operator* CreateElementWiseBinaryOp_(ElementWiseBinaryOpType type) { // Declare Factory function, used for dispatch specialization template -Operator* CreateElementWiseBinaryOp(ElementWiseBinaryOpType type); +Operator* CreateElementWiseBinaryOp(elembinary::ElementWiseBinaryOpType type); #if DMLC_USE_CXX11 template @@ -173,14 +175,14 @@ class ElementWiseBinaryOpProp : public OperatorProperty { std::vector *aux_shape) const override { using namespace mshadow; CHECK_EQ(in_shape->size(), 2) << "Input:[lhs, rhs]"; - if (in_shape->at(kLhs).ndim() != 0) { - SHAPE_ASSIGN_CHECK(*in_shape, kRhs, in_shape->at(kLhs)); - } else if (in_shape->at(kRhs).ndim() != 0) { - in_shape->at(kLhs) = in_shape->at(kRhs); + if (in_shape->at(elembinary::kLhs).ndim() != 0) { + SHAPE_ASSIGN_CHECK(*in_shape, elembinary::kRhs, in_shape->at(elembinary::kLhs)); + } else if (in_shape->at(elembinary::kRhs).ndim() != 0) { + in_shape->at(elembinary::kLhs) = in_shape->at(elembinary::kRhs); } else { return false; } - const TShape &dshape = in_shape->at(kLhs); + const TShape &dshape = in_shape->at(elembinary::kLhs); out_shape->clear(); out_shape->push_back(dshape); return true; @@ -204,12 +206,12 @@ class ElementWiseBinaryOpProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data) const override { switch (GetOpType()) { - case kPlus: - case kMinus: - return {out_grad[kOut]}; - case kMul: - case kDiv: - return {out_grad[kOut], in_data[kLhs], in_data[kRhs]}; + case elembinary::kPlus: + case elembinary::kMinus: + return {out_grad[elembinary::kOut]}; + case elembinary::kMul: + case elembinary::kDiv: + return {out_grad[elembinary::kOut], in_data[elembinary::kLhs], in_data[elembinary::kRhs]}; } LOG(FATAL) << "not reached"; return {}; @@ -221,12 +223,12 @@ class ElementWiseBinaryOpProp : public OperatorProperty { const std::vector &out_data, const std::vector &in_grad) const override { switch (GetOpType()) { - case kPlus: - case kMinus: + case elembinary::kPlus: + case elembinary::kMinus: return {}; - case kMul: - case kDiv: - return {{out_grad[kOut], in_grad[kLhs]}}; + case elembinary::kMul: + case elembinary::kDiv: + return {{out_grad[elembinary::kOut], 
in_grad[elembinary::kLhs]}}; } LOG(FATAL) << "not reached"; return {}; @@ -235,7 +237,7 @@ class ElementWiseBinaryOpProp : public OperatorProperty { std::vector > ForwardInplaceOption( const std::vector &in_data, const std::vector &out_data) const override { - return {{in_data[kLhs], out_data[kOut]}}; + return {{in_data[elembinary::kLhs], out_data[elembinary::kOut]}}; } Operator* CreateOperator(Context ctx) const override; diff --git a/src/operator/elementwise_binary_op.cc b/src/operator/elementwise_binary_op.cc old mode 100644 new mode 100755 index 0485707ffc18..940dce2beec6 --- a/src/operator/elementwise_binary_op.cc +++ b/src/operator/elementwise_binary_op.cc @@ -8,7 +8,7 @@ namespace mxnet { namespace op { template<> -Operator* CreateElementWiseBinaryOp(ElementWiseBinaryOpType type) { +Operator* CreateElementWiseBinaryOp(elembinary::ElementWiseBinaryOpType type) { return CreateElementWiseBinaryOp_(type); } diff --git a/src/operator/elementwise_binary_op.cu b/src/operator/elementwise_binary_op.cu old mode 100644 new mode 100755 index ba8991707f12..90d85ae20a18 --- a/src/operator/elementwise_binary_op.cu +++ b/src/operator/elementwise_binary_op.cu @@ -8,7 +8,7 @@ namespace mxnet { namespace op { template<> -Operator* CreateElementWiseBinaryOp(ElementWiseBinaryOpType type) { +Operator* CreateElementWiseBinaryOp(elembinary::ElementWiseBinaryOpType type) { return CreateElementWiseBinaryOp_(type); } } // namespace op diff --git a/src/operator/elementwise_sum-inl.h b/src/operator/elementwise_sum-inl.h old mode 100644 new mode 100755 index 213add51357a..d9c4c0e36206 --- a/src/operator/elementwise_sum-inl.h +++ b/src/operator/elementwise_sum-inl.h @@ -21,8 +21,10 @@ namespace mxnet { namespace op { +namespace elemsum { enum ElementWiseSumOpInputs {kData0, kData1, kData2, kData3}; enum ElementWiseSumOpOutputs {kOut}; +} // namespace elemsum struct ElementWiseSumParam : public dmlc::Parameter { int num_args; @@ -47,35 +49,35 @@ class ElementWiseSumOp : public Operator { using namespace mshadow::expr; CHECK_EQ(static_cast(in_data.size()), size_); CHECK_EQ(out_data.size(), 1); - if (req[kOut] == kNullOp) return; + if (req[elemsum::kOut] == kNullOp) return; Stream *s = ctx.get_stream(); - Tensor out = out_data[kOut].FlatTo2D(s); + Tensor out = out_data[elemsum::kOut].FlatTo2D(s); switch (size_) { case 2: { - Tensor in_0 = in_data[kData0].FlatTo2D(s); - Tensor in_1 = in_data[kData1].FlatTo2D(s); - Assign(out, req[kOut], in_0 + in_1); + Tensor in_0 = in_data[elemsum::kData0].FlatTo2D(s); + Tensor in_1 = in_data[elemsum::kData1].FlatTo2D(s); + Assign(out, req[elemsum::kOut], in_0 + in_1); break; } case 3: { - Tensor in_0 = in_data[kData0].FlatTo2D(s); - Tensor in_1 = in_data[kData1].FlatTo2D(s); - Tensor in_2 = in_data[kData2].FlatTo2D(s); - Assign(out, req[kOut], in_0 + in_1 + in_2); + Tensor in_0 = in_data[elemsum::kData0].FlatTo2D(s); + Tensor in_1 = in_data[elemsum::kData1].FlatTo2D(s); + Tensor in_2 = in_data[elemsum::kData2].FlatTo2D(s); + Assign(out, req[elemsum::kOut], in_0 + in_1 + in_2); break; } case 4: { - Tensor in_0 = in_data[kData0].FlatTo2D(s); - Tensor in_1 = in_data[kData1].FlatTo2D(s); - Tensor in_2 = in_data[kData2].FlatTo2D(s); - Tensor in_3 = in_data[kData3].FlatTo2D(s); - Assign(out, req[kOut], in_0 + in_1 + in_2 + in_3); + Tensor in_0 = in_data[elemsum::kData0].FlatTo2D(s); + Tensor in_1 = in_data[elemsum::kData1].FlatTo2D(s); + Tensor in_2 = in_data[elemsum::kData2].FlatTo2D(s); + Tensor in_3 = in_data[elemsum::kData3].FlatTo2D(s); + Assign(out, req[elemsum::kOut], in_0 + in_1 
+ in_2 + in_3); break; } default: { - Tensor in_0 = in_data[kData0].FlatTo2D(s); - Assign(out, req[kOut], F(in_0)); + Tensor in_0 = in_data[elemsum::kData0].FlatTo2D(s); + Assign(out, req[elemsum::kOut], F(in_0)); for (int i = 1; i < size_; ++i) { out += in_data[i].FlatTo2D(s); } @@ -95,7 +97,7 @@ class ElementWiseSumOp : public Operator { using namespace mshadow::expr; CHECK_EQ(in_grad.size(), static_cast(size_)); Stream *s = ctx.get_stream(); - Tensor ograd = out_grad[kOut].FlatTo2D(s); + Tensor ograd = out_grad[elemsum::kOut].FlatTo2D(s); for (int i = 0; i < size_; ++i) { if (req[i] == kNullOp || req[i] == kWriteInplace) continue; Tensor igrad = in_grad[i].FlatTo2D(s); diff --git a/src/operator/fully_connected-inl.h b/src/operator/fully_connected-inl.h old mode 100644 new mode 100755 index 6fec9f5d13a5..262aba95d0fb --- a/src/operator/fully_connected-inl.h +++ b/src/operator/fully_connected-inl.h @@ -21,8 +21,10 @@ namespace op { // Declare enumeration of input order to make code more intuitive. // These enums are only visible within this header +namespace fullc { enum FullyConnectedOpInputs {kData, kWeight, kBias}; enum FullyConnectedOpOutputs {kOut}; +} // fullc struct FullyConnectedParam : public dmlc::Parameter { int num_hidden; @@ -55,7 +57,7 @@ class FullyConnectedOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(req[kOut], kWriteTo); + CHECK_EQ(req[fullc::kOut], kWriteTo); size_t expected = param_.no_bias ? 2 : 3; CHECK_EQ(in_data.size(), expected); CHECK_EQ(out_data.size(), 1); @@ -67,12 +69,12 @@ class FullyConnectedOp : public Operator { CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) << "Must init CuBLAS handle in stream"; #endif // __CUDACC__ - Tensor data = in_data[kData].FlatTo2D(s); - Tensor wmat = in_data[kWeight].get(s); - Tensor out = out_data[kOut].FlatTo2D(s); + Tensor data = in_data[fullc::kData].FlatTo2D(s); + Tensor wmat = in_data[fullc::kWeight].get(s); + Tensor out = out_data[fullc::kOut].FlatTo2D(s); out = dot(data, wmat.T()); if (!param_.no_bias) { - Tensor bias = in_data[kBias].get(s); + Tensor bias = in_data[fullc::kBias].get(s); out += repmat(bias, data.size(0)); } } @@ -93,26 +95,26 @@ class FullyConnectedOp : public Operator { // TODO(bing): check the BLAS Handle, be careful // maybe need blas handle from context Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].FlatTo2D(s); - Tensor wmat = in_data[kWeight].get(s); - Tensor grad = out_grad[kOut].FlatTo2D(s); + Tensor data = in_data[fullc::kData].FlatTo2D(s); + Tensor wmat = in_data[fullc::kWeight].get(s); + Tensor grad = out_grad[fullc::kOut].FlatTo2D(s); #if defined(__CUDACC__) CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) << "Must init CuBLAS handle in stream"; #endif // backprop - CHECK_NE(req[kWeight], kWriteInplace) << "cannot write weight inplace"; + CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace"; // gradient of weight - Tensor gwmat = in_grad[kWeight].get(s); - Assign(gwmat, req[kWeight], dot(grad.T(), data)); + Tensor gwmat = in_grad[fullc::kWeight].get(s); + Assign(gwmat, req[fullc::kWeight], dot(grad.T(), data)); // gradient of bias if (!param_.no_bias) { - Tensor gbias = in_grad[kBias].get(s); - Assign(gbias, req[kBias], sum_rows(grad)); + Tensor gbias = in_grad[fullc::kBias].get(s); + Assign(gbias, req[fullc::kBias], sum_rows(grad)); } // gradient of data - Tensor gdata = in_grad[kData].FlatTo2D(s); - Assign(gdata, req[kData], dot(grad, wmat)); + Tensor gdata = 
in_grad[fullc::kData].FlatTo2D(s); + Assign(gdata, req[fullc::kData], dot(grad, wmat)); } private: @@ -151,16 +153,16 @@ class FullyConnectedProp : public OperatorProperty { } else { CHECK_EQ(in_shape->size(), 2) << "Input:[data, weight]"; } - const TShape &dshape = (*in_shape)[kData]; + const TShape &dshape = (*in_shape)[fullc::kData]; // require data to be known if (dshape.ndim() == 0) return false; index_t num_input = 0; mshadow::Shape<2> ishape = dshape.FlatTo2D(); num_input = ishape[1]; - SHAPE_ASSIGN_CHECK(*in_shape, kWeight, Shape2(param_.num_hidden, num_input)); + SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param_.num_hidden, num_input)); if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, kBias, Shape1(param_.num_hidden)); + SHAPE_ASSIGN_CHECK(*in_shape, fullc::kBias, Shape1(param_.num_hidden)); } out_shape->clear(); out_shape->push_back(Shape2(dshape[0], param_.num_hidden)); @@ -182,7 +184,7 @@ class FullyConnectedProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - return {out_grad[kOut], in_data[kData], in_data[kWeight]}; + return {out_grad[fullc::kOut], in_data[fullc::kData], in_data[fullc::kWeight]}; } std::vector > BackwardInplaceOption( @@ -190,7 +192,7 @@ class FullyConnectedProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const override { - return {{in_data[kData], in_grad[kData]}}; + return {{in_data[fullc::kData], in_grad[fullc::kData]}}; } Operator* CreateOperator(Context ctx) const; diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h old mode 100644 new mode 100755 index dc2c45127a03..4bdb65ef415a --- a/src/operator/leaky_relu-inl.h +++ b/src/operator/leaky_relu-inl.h @@ -21,10 +21,12 @@ namespace mxnet { namespace op { +namespace leakyrelu { enum LeakyReLUOpInputs {kData, kGamma}; enum LeakyReLUOpOutputs {kOut, kMask}; enum LeakyReLUOpType {kLeakyReLU, kPReLU, kRReLU}; enum LeakyReLUOpResource {kRandom}; +} // namespace leakyrelu struct LeakyReLUParam : public dmlc::Parameter { // use int for enumeration @@ -33,10 +35,10 @@ struct LeakyReLUParam : public dmlc::Parameter { float lower_bound; float upper_bound; DMLC_DECLARE_PARAMETER(LeakyReLUParam) { - DMLC_DECLARE_FIELD(act_type).set_default(kLeakyReLU) - .add_enum("rrelu", kRReLU) - .add_enum("leaky", kLeakyReLU) - .add_enum("prelu", kPReLU) + DMLC_DECLARE_FIELD(act_type).set_default(leakyrelu::kLeakyReLU) + .add_enum("rrelu", leakyrelu::kRReLU) + .add_enum("leaky", leakyrelu::kLeakyReLU) + .add_enum("prelu", leakyrelu::kPReLU) .describe("Activation function to be applied."); DMLC_DECLARE_FIELD(slope).set_default(0.25f) .describe("Init slope for the activation. (For leaky only)"); @@ -67,46 +69,48 @@ class LeakyReLUOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; - size_t expected = param_.act_type == kPReLU ? 2 : 1; + size_t expected = param_.act_type == leakyrelu::kPReLU ? 
2 : 1; CHECK_EQ(in_data.size(), expected); Stream *s = ctx.get_stream(); Tensor data; Tensor out; Tensor mask; Tensor weight; - if (in_data[kData].ndim() == 2) { - Shape<4> dshape = Shape4(in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1); - data = in_data[kData].get_with_shape(dshape, s); - out = out_data[kOut].get_with_shape(dshape, s); - if (param_.act_type == kRReLU) { - mask = out_data[kMask].get_with_shape(dshape, s); + if (in_data[leakyrelu::kData].ndim() == 2) { + Shape<4> dshape = Shape4(in_data[leakyrelu::kData].shape_[0], + in_data[leakyrelu::kData].shape_[1], 1, 1); + data = in_data[leakyrelu::kData].get_with_shape(dshape, s); + out = out_data[leakyrelu::kOut].get_with_shape(dshape, s); + if (param_.act_type == leakyrelu::kRReLU) { + mask = out_data[leakyrelu::kMask].get_with_shape(dshape, s); } } else { - data = in_data[kData].get(s); - out = out_data[kOut].get(s); - if (param_.act_type == kRReLU) { - mask = out_data[kMask].get(s); + data = in_data[leakyrelu::kData].get(s); + out = out_data[leakyrelu::kOut].get(s); + if (param_.act_type == leakyrelu::kRReLU) { + mask = out_data[leakyrelu::kMask].get(s); } } switch (param_.act_type) { - case kLeakyReLU: { - Assign(out, req[kOut], F(data, param_.slope)); + case leakyrelu::kLeakyReLU: { + Assign(out, req[leakyrelu::kOut], F(data, param_.slope)); break; } - case kPReLU: { - weight = in_data[kGamma].get(s); - Assign(out, req[kOut], F(data, broadcast<1>(weight, out.shape_))); + case leakyrelu::kPReLU: { + weight = in_data[leakyrelu::kGamma].get(s); + Assign(out, req[leakyrelu::kOut], + F(data, broadcast<1>(weight, out.shape_))); break; } - case kRReLU: { + case leakyrelu::kRReLU: { if (ctx.is_train) { - Random* prnd = ctx.requested[kRandom].get_random(s); + Random* prnd = ctx.requested[leakyrelu::kRandom].get_random(s); mask = prnd->uniform(mask.shape_); mask = mask * (param_.upper_bound - param_.lower_bound) + param_.lower_bound; - Assign(out, req[kOut], F(data, mask)); + Assign(out, req[leakyrelu::kOut], F(data, mask)); } else { const float slope = (param_.lower_bound + param_.upper_bound) / 2.0f; - Assign(out, req[kOut], F(data, slope)); + Assign(out, req[leakyrelu::kOut], F(data, slope)); } break; } @@ -125,7 +129,7 @@ class LeakyReLUOp : public Operator { using namespace mshadow; using namespace mshadow::expr; // TODO(bing): double check - size_t expected = param_.act_type == kPReLU ? 2 : 1; + size_t expected = param_.act_type == leakyrelu::kPReLU ? 
2 : 1; CHECK_EQ(out_grad.size(), 1); CHECK_EQ(req.size(), expected); CHECK_EQ(in_data.size(), expected); @@ -137,42 +141,43 @@ class LeakyReLUOp : public Operator { Tensor mask; Tensor weight; Tensor grad_weight; - if (in_data[kData].ndim() == 2) { - Shape<4> dshape = Shape4(in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1); - grad = out_grad[kOut].get_with_shape(dshape, s); - gdata = in_grad[kData].get_with_shape(dshape, s); - output = out_data[kOut].get_with_shape(dshape, s); - if (param_.act_type == kRReLU) { - mask = out_data[kMask].get_with_shape(dshape, s); + if (in_data[leakyrelu::kData].ndim() == 2) { + Shape<4> dshape = Shape4(in_data[leakyrelu::kData].shape_[0], + in_data[leakyrelu::kData].shape_[1], 1, 1); + grad = out_grad[leakyrelu::kOut].get_with_shape(dshape, s); + gdata = in_grad[leakyrelu::kData].get_with_shape(dshape, s); + output = out_data[leakyrelu::kOut].get_with_shape(dshape, s); + if (param_.act_type == leakyrelu::kRReLU) { + mask = out_data[leakyrelu::kMask].get_with_shape(dshape, s); } - if (param_.act_type == kPReLU) { - data = in_data[kData].get_with_shape(dshape, s); + if (param_.act_type == leakyrelu::kPReLU) { + data = in_data[leakyrelu::kData].get_with_shape(dshape, s); } } else { - grad = out_grad[kOut].get(s); - gdata = in_grad[kData].get(s); - output = out_data[kOut].get(s); - if (param_.act_type == kRReLU) { - mask = out_data[kMask].get(s); + grad = out_grad[leakyrelu::kOut].get(s); + gdata = in_grad[leakyrelu::kData].get(s); + output = out_data[leakyrelu::kOut].get(s); + if (param_.act_type == leakyrelu::kRReLU) { + mask = out_data[leakyrelu::kMask].get(s); } - if (param_.act_type == kPReLU) { - data = in_data[kData].get(s); + if (param_.act_type == leakyrelu::kPReLU) { + data = in_data[leakyrelu::kData].get(s); } } switch (param_.act_type) { - case kLeakyReLU: { - Assign(gdata, req[kData], F(output, param_.slope) * grad); + case leakyrelu::kLeakyReLU: { + Assign(gdata, req[leakyrelu::kData], F(output, param_.slope) * grad); break; } - case kPReLU: { - weight = in_data[kGamma].get(s); - grad_weight = in_grad[kGamma].get(s); + case leakyrelu::kPReLU: { + weight = in_data[leakyrelu::kGamma].get(s); + grad_weight = in_grad[leakyrelu::kGamma].get(s); grad_weight = sumall_except_dim<1>(F(data) * grad); gdata = F(output, broadcast<1>(weight, data.shape_)) * grad; break; } - case kRReLU: { - Assign(gdata, req[kData], F(output, mask) * grad); + case leakyrelu::kRReLU: { + Assign(gdata, req[leakyrelu::kData], F(output, mask) * grad); break; } default: @@ -202,19 +207,19 @@ class LeakyReLUProp : public OperatorProperty { std::vector *out_shape, std::vector *aux_shape) const override { using namespace mshadow; - if (param_.act_type == kPReLU) { + if (param_.act_type == leakyrelu::kPReLU) { CHECK_EQ(in_shape->size(), 2) << "Input:[data, gamma]"; } else { CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; } - const TShape &dshape = in_shape->at(kData); + const TShape &dshape = in_shape->at(leakyrelu::kData); if (dshape.ndim() == 0) return false; - if (param_.act_type == kPReLU) { - in_shape->at(kGamma) = TShape(Shape1(dshape[1])); + if (param_.act_type == leakyrelu::kPReLU) { + in_shape->at(leakyrelu::kGamma) = TShape(Shape1(dshape[1])); } out_shape->clear(); out_shape->push_back(dshape); - if (param_.act_type == kRReLU) { + if (param_.act_type == leakyrelu::kRReLU) { out_shape->push_back(dshape); } return true; @@ -235,12 +240,15 @@ class LeakyReLUProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector 
&out_data) const override { - if (param_.act_type == kPReLU) { - return {out_grad[kOut], out_data[kOut], in_data[kData], in_data[kGamma]}; - } else if (param_.act_type == kRReLU) { - return {out_grad[kOut], out_data[kMask], out_data[kOut]}; + if (param_.act_type == leakyrelu::kPReLU) { + return {out_grad[leakyrelu::kOut], + out_data[leakyrelu::kOut], + in_data[leakyrelu::kData], + in_data[leakyrelu::kGamma]}; + } else if (param_.act_type == leakyrelu::kRReLU) { + return {out_grad[leakyrelu::kOut], out_data[leakyrelu::kMask], out_data[leakyrelu::kOut]}; } else { - return {out_grad[kOut], out_data[kData]}; + return {out_grad[leakyrelu::kOut], out_data[leakyrelu::kData]}; } } @@ -249,21 +257,21 @@ class LeakyReLUProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const override { - return {{out_grad[kOut], in_grad[kData]}}; + return {{out_grad[leakyrelu::kOut], in_grad[leakyrelu::kData]}}; } std::vector > ForwardInplaceOption( const std::vector &in_data, const std::vector &out_data) const override { - if (param_.act_type == kPReLU) { + if (param_.act_type == leakyrelu::kPReLU) { return {}; } else { - return {{in_data[kData], out_data[kOut]}}; + return {{in_data[leakyrelu::kData], out_data[leakyrelu::kOut]}}; } } std::vector ListArguments() const override { - if (param_.act_type == kPReLU) { + if (param_.act_type == leakyrelu::kPReLU) { return {"data", "gamma"}; } else { return {"data"}; @@ -271,7 +279,7 @@ class LeakyReLUProp : public OperatorProperty { } std::vector ListOutputs() const override { - if (param_.act_type == kRReLU) { + if (param_.act_type == leakyrelu::kRReLU) { return {"output", "mask"}; } else { return {"output"}; @@ -279,7 +287,7 @@ class LeakyReLUProp : public OperatorProperty { } int NumOutputs() const override { - if (param_.act_type == kRReLU) { + if (param_.act_type == leakyrelu::kRReLU) { return 2; } else { return 1; @@ -292,7 +300,7 @@ class LeakyReLUProp : public OperatorProperty { virtual std::vector ForwardResource( const std::vector &in_shape) const { - if (param_.act_type == kRReLU) { + if (param_.act_type == leakyrelu::kRReLU) { return {ResourceRequest::kRandom}; } else { return std::vector(); diff --git a/src/operator/lrn-inl.h b/src/operator/lrn-inl.h old mode 100644 new mode 100755 index 93c0e346de42..35aac8fe73ae --- a/src/operator/lrn-inl.h +++ b/src/operator/lrn-inl.h @@ -18,8 +18,11 @@ namespace mxnet { namespace op { + +namespace lrn_enum { enum LRNInputs {kData}; enum LRNOutputs {kOut, kTmpNorm}; +} // namespace lrn_enum struct LRNParam : public dmlc::Parameter { float alpha; @@ -58,11 +61,11 @@ class LocalResponseNormOp : public Operator { CHECK_EQ(param_.nsize % 2, 1) << "LRN only supports odd values for local_size"; const real_t salpha = param_.alpha / param_.nsize; Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].get(s); - Tensor out = out_data[kOut].get(s); - Tensor tmp_norm = out_data[kTmpNorm].get(s); + Tensor data = in_data[lrn_enum::kData].get(s); + Tensor out = out_data[lrn_enum::kOut].get(s); + Tensor tmp_norm = out_data[lrn_enum::kTmpNorm].get(s); tmp_norm = chpool(F(data) , param_.nsize) * salpha + param_.knorm; - Assign(out, req[kOut], data * F(tmp_norm, -param_.beta)); + Assign(out, req[lrn_enum::kOut], data * F(tmp_norm, -param_.beta)); } virtual void Backward(const OpContext &ctx, @@ -79,10 +82,10 @@ class LocalResponseNormOp : public Operator { CHECK_EQ(out_data.size(), 2); const real_t salpha = param_.alpha / param_.nsize; Stream *s = ctx.get_stream(); - 
Tensor grad = out_grad[kOut].get(s); - Tensor tmp_norm = out_data[kTmpNorm].get(s); - Tensor data = in_data[kData].get(s); - Tensor grad_in = in_grad[kData].get(s); + Tensor grad = out_grad[lrn_enum::kOut].get(s); + Tensor tmp_norm = out_data[lrn_enum::kTmpNorm].get(s); + Tensor data = in_data[lrn_enum::kData].get(s); + Tensor grad_in = in_grad[lrn_enum::kData].get(s); grad_in = grad * F(tmp_norm, -param_.beta); grad_in += (- 2.0f * param_.beta * salpha) * chpool(grad * data * @@ -138,9 +141,9 @@ class LocalResponseNormProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data) const override { #if MXNET_USE_CUDNN == 1 - return {out_grad[kOut], in_data[kData], out_data[kOut]}; + return {out_grad[lrn_enum::kOut], in_data[lrn_enum::kData], out_data[lrn_enum::kOut]}; #else - return {out_grad[kOut], in_data[kData], out_data[kTmpNorm]}; + return {out_grad[lrn_enum::kOut], in_data[lrn_enum::kData], out_data[lrn_enum::kTmpNorm]}; #endif } @@ -152,7 +155,7 @@ class LocalResponseNormProp : public OperatorProperty { #if MXNET_USE_CUDNN == 1 return {}; #else - return {{out_grad[kOut], in_grad[kData]}}; + return {{out_grad[lrn_enum::kOut], in_grad[lrn_enum::kData]}}; #endif } diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h old mode 100644 new mode 100755 index b7eb8e2f2634..1f3d76e1ab7a --- a/src/operator/pooling-inl.h +++ b/src/operator/pooling-inl.h @@ -20,9 +20,12 @@ namespace mxnet { namespace op { + +namespace pool_enum { enum PoolingOpInputs {kData}; enum PoolingOpOutputs {kOut}; enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling}; +} // namespace pool_enum struct PoolingParam : public dmlc::Parameter { TShape kernel; @@ -36,9 +39,9 @@ struct PoolingParam : public dmlc::Parameter { .describe("pooling kernel size: (y, x)"); DMLC_DECLARE_FIELD(pool_type) - .add_enum("max", kMaxPooling) - .add_enum("avg", kAvgPooling) - .add_enum("sum", kSumPooling) + .add_enum("max", pool_enum::kMaxPooling) + .add_enum("avg", pool_enum::kAvgPooling) + .add_enum("sum", pool_enum::kSumPooling) .describe("Pooling type to be applied."); int stride_shape[] = {1, 1}; @@ -70,23 +73,23 @@ class PoolingOp : public Operator { CHECK_EQ(in_data.size(), 1); CHECK_EQ(out_data.size(), 1); Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].get(s); - Tensor out = out_data[kOut].get(s); + Tensor data = in_data[pool_enum::kData].get(s); + Tensor out = out_data[pool_enum::kOut].get(s); mshadow::Shape<2> out_shape = Shape2(out.shape_[2], out.shape_[3]); // TODO(bing): dual stride in mshadow CHECK_EQ(param_.stride[0], param_.stride[1]) << "Only same stride is supported now"; - if (param_.pool_type == kMaxPooling || param_.pool_type == kSumPooling) { + if (param_.pool_type == pool_enum::kMaxPooling || param_.pool_type == pool_enum::kSumPooling) { Assign(out, - req[kOut], + req[pool_enum::kOut], pool(pad(data, param_.pad[0], param_.pad[1]), out_shape, param_.kernel[0], param_.kernel[1], param_.stride[0])); - } else if (param_.pool_type == kAvgPooling) { + } else if (param_.pool_type == pool_enum::kAvgPooling) { Assign(out, - req[kOut], + req[pool_enum::kOut], (1.0f / (param_.kernel[0] * param_.kernel[1])) * \ pool(pad(data, param_.pad[0], param_.pad[1]), out_shape, @@ -112,15 +115,15 @@ class PoolingOp : public Operator { CHECK_EQ(in_grad.size(), 1); // TODO(bing): remove pad (0,0) Stream *s = ctx.get_stream(); - Tensor grad = out_grad[kOut].get(s); - Tensor data = in_data[kData].get(s); - Tensor output_data = out_data[kOut].get(s); - Tensor input_grad = 
in_grad[kData].get(s); + Tensor grad = out_grad[pool_enum::kOut].get(s); + Tensor data = in_data[pool_enum::kData].get(s); + Tensor output_data = out_data[pool_enum::kOut].get(s); + Tensor input_grad = in_grad[pool_enum::kData].get(s); mshadow::Shape<2> in_shape = Shape2(data.shape_[2], data.shape_[3]); - if (param_.pool_type == kMaxPooling || param_.pool_type == kSumPooling) { - Assign(input_grad, req[kData], + if (param_.pool_type == pool_enum::kMaxPooling || param_.pool_type == pool_enum::kSumPooling) { + Assign(input_grad, req[pool_enum::kData], crop(unpool(pad(data, param_.pad[0], param_.pad[1]), pad(output_data, 0, 0), pad(grad, 0, 0), @@ -130,8 +133,8 @@ class PoolingOp : public Operator { in_shape, param_.pad[0], param_.pad[1])); - } else if (param_.pool_type == kAvgPooling) { - Assign(input_grad, req[kData], + } else if (param_.pool_type == pool_enum::kAvgPooling) { + Assign(input_grad, req[pool_enum::kData], (1.0f / param_.kernel[0] / param_.kernel[1]) *\ crop(unpool(pad(data, param_.pad[0], param_.pad[1]), pad(output_data, 0, 0), @@ -197,7 +200,7 @@ class PoolingProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - return {out_grad[kOut], in_data[kData], out_data[kOut]}; + return {out_grad[pool_enum::kOut], in_data[pool_enum::kData], out_data[pool_enum::kOut]}; } std::vector > BackwardInplaceOption( @@ -208,7 +211,7 @@ class PoolingProp : public OperatorProperty { #if MXNET_USE_CUDNN == 1 return {}; #else - return {{in_data[kData], in_grad[kData]}}; + return {{in_data[pool_enum::kData], in_grad[pool_enum::kData]}}; #endif } diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc old mode 100644 new mode 100755 index 44f80f854468..598fe176e5db --- a/src/operator/pooling.cc +++ b/src/operator/pooling.cc @@ -11,11 +11,11 @@ namespace op { template<> Operator *CreateOp(PoolingParam param) { switch (param.pool_type) { - case kMaxPooling: + case pool_enum::kMaxPooling: return new PoolingOp(param); - case kAvgPooling: + case pool_enum::kAvgPooling: return new PoolingOp(param); - case kSumPooling: + case pool_enum::kSumPooling: return new PoolingOp(param); default: LOG(FATAL) << "unknown activation type"; diff --git a/src/operator/pooling.cu b/src/operator/pooling.cu old mode 100644 new mode 100755 index 9b59ffaba6ad..57b92427dfe5 --- a/src/operator/pooling.cu +++ b/src/operator/pooling.cu @@ -18,11 +18,11 @@ Operator *CreateOp(PoolingParam param) { return new CuDNNPoolingOp(param); #else switch (param.pool_type) { - case kMaxPooling: + case pool_enum::kMaxPooling: return new PoolingOp(param); - case kAvgPooling: + case pool_enum::kAvgPooling: return new PoolingOp(param); - case kSumPooling: + case pool_enum::kSumPooling: return new PoolingOp(param); default: LOG(FATAL) << "unknown activation type"; diff --git a/src/operator/regression_output-inl.h b/src/operator/regression_output-inl.h old mode 100644 new mode 100755 index 4c4bf6ffb625..479579d4b472 --- a/src/operator/regression_output-inl.h +++ b/src/operator/regression_output-inl.h @@ -16,9 +16,12 @@ namespace mxnet { namespace op { + +namespace reg_enum { enum RegressionOutputOpInputs {kData, kLabel}; enum RegressionOutputOutputs {kOut}; enum RegressionOutputType {kLinear, kLogistic}; +} // reg_enum // Special Operator to output regression value in forward // And get gradient in calculation. 
@@ -35,9 +38,9 @@ class RegressionOutputOp : public Operator { CHECK_EQ(in_data.size(), 2) << "RegressionOutputOp Input: [data, label]"; CHECK_EQ(out_data.size(), 1) << "RegressionOutputOp Output: [output]"; Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].FlatTo2D(s); - Tensor out = out_data[kOut].FlatTo2D(s); - Assign(out, req[kOut], F(data)); + Tensor data = in_data[reg_enum::kData].FlatTo2D(s); + Tensor out = out_data[reg_enum::kOut].FlatTo2D(s); + Assign(out, req[reg_enum::kOut], F(data)); } virtual void Backward(const OpContext &ctx, @@ -54,19 +57,19 @@ class RegressionOutputOp : public Operator { CHECK_GE(in_grad.size(), 1); CHECK_GE(req.size(), 1); Stream *s = ctx.get_stream(); - Tensor label = in_data[kLabel].get(s); - Tensor out = out_data[kOut].FlatTo2D(s); - Tensor grad = in_grad[kData].FlatTo2D(s); - Assign(grad, req[kData], F(out, reshape(label, grad.shape_))); + Tensor label = in_data[reg_enum::kLabel].get(s); + Tensor out = out_data[reg_enum::kOut].FlatTo2D(s); + Tensor grad = in_grad[reg_enum::kData].FlatTo2D(s); + Assign(grad, req[reg_enum::kData], F(out, reshape(label, grad.shape_))); } }; // Declare Factory function, used for dispatch specialization template -Operator* CreateRegressionOutputOp(RegressionOutputType type); +Operator* CreateRegressionOutputOp(reg_enum::RegressionOutputType type); #if DMLC_USE_CXX11 -template +template class RegressionOutputProp : public OperatorProperty { public: std::vector ListArguments() const override { @@ -100,8 +103,8 @@ class RegressionOutputProp : public OperatorProperty { std::string TypeString() const override { switch (type) { - case kLinear: return "LinearRegressionOutput"; - case kLogistic: return "LogisticRegressionOutput"; + case reg_enum::kLinear: return "LinearRegressionOutput"; + case reg_enum::kLogistic: return "LogisticRegressionOutput"; default: LOG(FATAL) << "unknown type"; return ""; } } @@ -110,7 +113,7 @@ class RegressionOutputProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - return {in_data[kLabel], out_data[kOut]}; + return {in_data[reg_enum::kLabel], out_data[reg_enum::kOut]}; } std::vector > BackwardInplaceOption( @@ -118,13 +121,13 @@ class RegressionOutputProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const override { - return {{out_data[kOut], in_grad[kData]}}; + return {{out_data[reg_enum::kOut], in_grad[reg_enum::kData]}}; } std::vector > ForwardInplaceOption( const std::vector &in_data, const std::vector &out_data) const override { - return {{in_data[kData], out_data[kOut]}}; + return {{in_data[reg_enum::kData], out_data[reg_enum::kOut]}}; } Operator* CreateOperator(Context ctx) const; diff --git a/src/operator/regression_output.cc b/src/operator/regression_output.cc old mode 100644 new mode 100755 index e10888d624e3..5d729c57f730 --- a/src/operator/regression_output.cc +++ b/src/operator/regression_output.cc @@ -10,11 +10,11 @@ namespace mxnet { namespace op { template<> -Operator *CreateRegressionOutputOp(RegressionOutputType type) { +Operator *CreateRegressionOutputOp(reg_enum::RegressionOutputType type) { switch (type) { - case kLinear: + case reg_enum::kLinear: return new RegressionOutputOp(); - case kLogistic: + case reg_enum::kLogistic: return new RegressionOutputOp(); default: LOG(FATAL) << "unknown activation type " << type; @@ -23,17 +23,17 @@ Operator *CreateRegressionOutputOp(RegressionOutputType type) { } // DO_BIND_DISPATCH 
comes from operator_common.h -template +template Operator *RegressionOutputProp::CreateOperator(Context ctx) const { DO_BIND_DISPATCH(CreateRegressionOutputOp, type); } -MXNET_REGISTER_OP_PROPERTY(LinearRegressionOutput, RegressionOutputProp) +MXNET_REGISTER_OP_PROPERTY(LinearRegressionOutput, RegressionOutputProp) .describe("Use linear regression for final output, this is used on final output of a net.") .add_argument("data", "Symbol", "Input data to function.") .add_argument("label", "Symbol", "Input label to function."); -MXNET_REGISTER_OP_PROPERTY(LogisticRegressionOutput, RegressionOutputProp) +MXNET_REGISTER_OP_PROPERTY(LogisticRegressionOutput, RegressionOutputProp) .describe("Use Logistic regression for final output, this is used on final output of a net.\n" "Logistic regression is suitable for binary classification " "or probability prediction tasks.") diff --git a/src/operator/regression_output.cu b/src/operator/regression_output.cu old mode 100644 new mode 100755 index c653b556278d..64968f2e968b --- a/src/operator/regression_output.cu +++ b/src/operator/regression_output.cu @@ -10,11 +10,11 @@ namespace mxnet { namespace op { template<> -Operator *CreateRegressionOutputOp(RegressionOutputType type) { +Operator *CreateRegressionOutputOp(reg_enum::RegressionOutputType type) { switch (type) { - case kLinear: + case reg_enum::kLinear: return new RegressionOutputOp(); - case kLogistic: + case reg_enum::kLogistic: return new RegressionOutputOp(); default: LOG(FATAL) << "unknown activation type " << type; diff --git a/src/operator/reshape-inl.h b/src/operator/reshape-inl.h old mode 100644 new mode 100755 index d02d1cedebd1..12c2071a8c97 --- a/src/operator/reshape-inl.h +++ b/src/operator/reshape-inl.h @@ -20,9 +20,10 @@ namespace mxnet { namespace op { +namespace reshape_enum { enum ReshapeOpInputs {kData}; enum ReshapeOpOutputs {kOut}; - +} // namespace reshape_enum struct ReshapeParam : public dmlc::Parameter { TShape target_shape; @@ -46,15 +47,15 @@ class ReshapeOp : public Operator { CHECK_EQ(in_data.size(), 1); CHECK_EQ(req.size(), 1); CHECK_EQ(out_data.size(), 1); - if (req[kOut] == kNullOp) return; + if (req[reshape_enum::kOut] == kNullOp) return; Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].FlatTo2D(s); - Tensor out = out_data[kOut].FlatTo2D(s); + Tensor data = in_data[reshape_enum::kData].FlatTo2D(s); + Tensor out = out_data[reshape_enum::kOut].FlatTo2D(s); CHECK_EQ(data.CheckContiguous(), true); CHECK_EQ(out.CheckContiguous(), true); if (data.dptr_ == out.dptr_) return; CHECK_EQ(data.shape_.Size(), out.shape_.Size()); - Assign(out, req[kOut], reshape(data, out.shape_)); + Assign(out, req[reshape_enum::kOut], reshape(data, out.shape_)); } virtual void Backward(const OpContext &ctx, @@ -67,17 +68,17 @@ class ReshapeOp : public Operator { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(req.size(), 1); - if (req[kData] == kNullOp) return; + if (req[reshape_enum::kData] == kNullOp) return; CHECK_EQ(out_grad.size(), 1); CHECK_EQ(in_grad.size(), 1); Stream *s = ctx.get_stream(); - Tensor grad_in = in_grad[kOut].FlatTo2D(s); - Tensor grad_out = out_grad[kData].FlatTo2D(s); + Tensor grad_in = in_grad[reshape_enum::kOut].FlatTo2D(s); + Tensor grad_out = out_grad[reshape_enum::kData].FlatTo2D(s); CHECK_EQ(grad_out.CheckContiguous(), true); CHECK_EQ(grad_in.CheckContiguous(), true); if (grad_out.dptr_ == grad_in.dptr_) return; CHECK_EQ(grad_out.shape_.Size(), grad_in.shape_.Size()); - Assign(grad_in, req[kData], reshape(grad_out, grad_in.shape_)); + 
Assign(grad_in, req[reshape_enum::kData], reshape(grad_out, grad_in.shape_)); } }; // class ReshapeOp @@ -101,7 +102,7 @@ class ReshapeProp : public OperatorProperty { std::vector *out_shape, std::vector *aux_shape) const override { CHECK_EQ(in_shape->size(), 1) << "Input: [data]"; - const TShape &dshape = in_shape->at(kData); + const TShape &dshape = in_shape->at(reshape_enum::kData); if (dshape.ndim() == 0) return false; CHECK(param_.target_shape.Size() == dshape.Size()) << "Target shape size is different to source. " @@ -126,13 +127,13 @@ class ReshapeProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - return {out_grad[kOut]}; + return {out_grad[reshape_enum::kOut]}; } std::vector > ForwardInplaceOption( const std::vector &in_data, const std::vector &out_data) const override { - return {{in_data[kData], out_data[kOut]}}; + return {{in_data[reshape_enum::kData], out_data[reshape_enum::kOut]}}; } std::vector > BackwardInplaceOption( @@ -140,7 +141,7 @@ class ReshapeProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const override { - return {{out_grad[kOut], in_grad[kData]}}; + return {{out_grad[reshape_enum::kOut], in_grad[reshape_enum::kData]}}; } Operator* CreateOperator(Context ctx) const; @@ -166,7 +167,7 @@ class FlattenProp : public ReshapeProp { std::vector *out_shape, std::vector *aux_shape) const override { CHECK_EQ(in_shape->size(), 1) << "Input: [data]"; - const TShape &dshape = in_shape->at(kData); + const TShape &dshape = in_shape->at(reshape_enum::kData); if (dshape.ndim() == 0) return false; out_shape->clear(); uint32_t target_dim = 1; diff --git a/src/operator/slice_channel-inl.h b/src/operator/slice_channel-inl.h old mode 100644 new mode 100755 index ad0910df8731..05e3da199bda --- a/src/operator/slice_channel-inl.h +++ b/src/operator/slice_channel-inl.h @@ -21,8 +21,10 @@ namespace mxnet { namespace op { +namespace slice_enum { enum SliceChannelOpInputs {kData}; enum SliceChannelOpOutputs {kOut0, kOut1, kOut2, kOut3, kOut4}; +} // namespace slice_enum struct SliceChannelParam : public dmlc::Parameter { int num_outputs; @@ -50,16 +52,17 @@ class SliceChannelOp : public Operator { Stream *s = ctx.get_stream(); std::vector > outputs(size_); Tensor data; - if (in_data[kData].ndim() == 2) { - Shape<4> dshape = Shape4(in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1); - data = in_data[kData].get_with_shape(dshape, s); + if (in_data[slice_enum::kData].ndim() == 2) { + Shape<4> dshape = Shape4(in_data[slice_enum::kData].shape_[0], + in_data[slice_enum::kData].shape_[1], 1, 1); + data = in_data[slice_enum::kData].get_with_shape(dshape, s); Shape<4> slice_shape = dshape; slice_shape[1] = dshape[1] / size_; for (int i = 0; i < size_; ++i) { outputs[i] = out_data[i].get_with_shape(slice_shape, s); } } else { - data = in_data[kData].get(s); + data = in_data[slice_enum::kData].get(s); for (int i = 0; i < size_; ++i) { outputs[i] = out_data[i].get(s); } @@ -81,19 +84,20 @@ class SliceChannelOp : public Operator { Stream *s = ctx.get_stream(); std::vector > grad_out(size_); Tensor grad; - if (out_grad[kOut0].ndim() == 2) { - Shape<4> slice_shape = Shape4(out_grad[kOut0].shape_[0], out_grad[kOut0].shape_[1], 1, 1); + if (out_grad[slice_enum::kOut0].ndim() == 2) { + Shape<4> slice_shape = Shape4(out_grad[slice_enum::kOut0].shape_[0], + out_grad[slice_enum::kOut0].shape_[1], 1, 1); for (int i = 0; i < size_; ++i) { grad_out[i] = 
out_grad[i].get_with_shape(slice_shape, s); } Shape<4> dshape = slice_shape; dshape[1] *= size_; - grad = in_grad[kData].get_with_shape(dshape, s); + grad = in_grad[slice_enum::kData].get_with_shape(dshape, s); } else { for (int i = 0; i < size_; ++i) { grad_out[i] = out_grad[i].get(s); } - grad = in_grad[kData].get(s); + grad = in_grad[slice_enum::kData].get(s); } Concatenate(grad_out, &grad); } @@ -135,7 +139,7 @@ class SliceChannelProp : public OperatorProperty { std::vector *aux_shape) const override { using namespace mshadow; CHECK_EQ(in_shape->size(), 1); - TShape dshape = in_shape->at(kData); + TShape dshape = in_shape->at(slice_enum::kData); if (dshape.ndim() == 0) return false; CHECK_GT(dshape.ndim(), 1); CHECK_EQ(dshape[1] % param_.num_outputs, 0) diff --git a/src/operator/softmax-inl.h b/src/operator/softmax-inl.h old mode 100644 new mode 100755 index 4a178f19d0aa..d1e5331d9d06 --- a/src/operator/softmax-inl.h +++ b/src/operator/softmax-inl.h @@ -20,8 +20,10 @@ namespace mxnet { namespace op { +namespace softmax_enum { enum SoftmaxOpInputs {kData, kLabel}; enum SoftmaxOpOutputs {kOut}; +} // namespace softmax_enum struct SoftmaxParam : public dmlc::Parameter { float grad_scale; @@ -52,15 +54,15 @@ class SoftmaxOp : public Operator { CHECK_EQ(out_data.size(), 1) << "Softmax Output: [output]"; Stream *s = ctx.get_stream(); if (param_.multi_output) { - int n = in_data[kData].size(0); - int k = in_data[kData].size(1); - Shape<3> s3 = Shape3(n, k, static_cast(in_data[kData].Size()/n/k)); - Tensor data = in_data[kData].get_with_shape(s3, s); - Tensor out = out_data[kOut].get_with_shape(s3, s); + int n = in_data[softmax_enum::kData].size(0); + int k = in_data[softmax_enum::kData].size(1); + Shape<3> s3 = Shape3(n, k, static_cast(in_data[softmax_enum::kData].Size()/n/k)); + Tensor data = in_data[softmax_enum::kData].get_with_shape(s3, s); + Tensor out = out_data[softmax_enum::kOut].get_with_shape(s3, s); Softmax(out, data); } else { - Tensor data = in_data[kData].FlatTo2D(s); - Tensor out = out_data[kOut].FlatTo2D(s); + Tensor data = in_data[softmax_enum::kData].FlatTo2D(s); + Tensor out = out_data[softmax_enum::kOut].FlatTo2D(s); Softmax(out, data); } } @@ -80,20 +82,20 @@ class SoftmaxOp : public Operator { CHECK_GE(req.size(), 1); Stream *s = ctx.get_stream(); if (param_.multi_output) { - int n = out_data[kOut].size(0); - int k = out_data[kOut].size(1); - Shape<3> s3 = Shape3(n, k, static_cast(out_data[kOut].Size()/n/k)); - Tensor label = in_data[kLabel].FlatTo2D(s); - Tensor out = out_data[kOut].get_with_shape(s3, s); - Tensor grad = in_grad[kData].get_with_shape(s3, s); + int n = out_data[softmax_enum::kOut].size(0); + int k = out_data[softmax_enum::kOut].size(1); + Shape<3> s3 = Shape3(n, k, static_cast(out_data[softmax_enum::kOut].Size()/n/k)); + Tensor label = in_data[softmax_enum::kLabel].FlatTo2D(s); + Tensor out = out_data[softmax_enum::kOut].get_with_shape(s3, s); + Tensor grad = in_grad[softmax_enum::kData].get_with_shape(s3, s); SoftmaxGrad(grad, out, label); if (param_.grad_scale < 1.0) { grad *= param_.grad_scale; } } else { - Tensor label = in_data[kLabel].get(s); - Tensor out = out_data[kOut].FlatTo2D(s); - Tensor grad = in_grad[kData].FlatTo2D(s); + Tensor label = in_data[softmax_enum::kLabel].get(s); + Tensor out = out_data[softmax_enum::kOut].FlatTo2D(s); + Tensor grad = in_grad[softmax_enum::kData].FlatTo2D(s); SoftmaxGrad(grad, out, label); if (param_.grad_scale < 1.0) { grad *= param_.grad_scale; @@ -132,9 +134,10 @@ class SoftmaxProp : public OperatorProperty { 
const TShape &dshape = in_shape->at(0); if (dshape.ndim() == 0) return false; if (param_.multi_output) { - SHAPE_ASSIGN_CHECK(*in_shape, kLabel, Shape2(dshape[0], dshape.Size()/dshape[0]/dshape[1])); + SHAPE_ASSIGN_CHECK(*in_shape, softmax_enum::kLabel, + Shape2(dshape[0], dshape.Size()/dshape[0]/dshape[1])); } else { - SHAPE_ASSIGN_CHECK(*in_shape, kLabel, Shape1(dshape[0])); + SHAPE_ASSIGN_CHECK(*in_shape, softmax_enum::kLabel, Shape1(dshape[0])); } out_shape->clear(); out_shape->push_back(dshape); @@ -155,7 +158,7 @@ class SoftmaxProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - return {in_data[kLabel], out_data[kOut]}; + return {in_data[softmax_enum::kLabel], out_data[softmax_enum::kOut]}; } std::vector > BackwardInplaceOption( @@ -163,13 +166,13 @@ class SoftmaxProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const override { - return {{out_data[kOut], in_grad[kData]}}; + return {{out_data[softmax_enum::kOut], in_grad[softmax_enum::kData]}}; } std::vector > ForwardInplaceOption( const std::vector &in_data, const std::vector &out_data) const override { - return {{in_data[kData], out_data[kOut]}}; + return {{in_data[softmax_enum::kData], out_data[softmax_enum::kOut]}}; } Operator* CreateOperator(Context ctx) const; diff --git a/tests/python/multi-node/dist_async_inception.py b/tests/python/multi-node/dist_async_inception.py old mode 100755 new mode 100644 diff --git a/tests/python/multi-node/dist_async_lenet.py b/tests/python/multi-node/dist_async_lenet.py old mode 100755 new mode 100644 diff --git a/tests/python/multi-node/dist_async_mlp.py b/tests/python/multi-node/dist_async_mlp.py old mode 100755 new mode 100644 diff --git a/tests/python/multi-node/dist_imagenet_inception.py b/tests/python/multi-node/dist_imagenet_inception.py old mode 100755 new mode 100644 diff --git a/tests/python/multi-node/dist_sync_inception.py b/tests/python/multi-node/dist_sync_inception.py old mode 100755 new mode 100644 diff --git a/tests/python/multi-node/dist_sync_kvstore.py b/tests/python/multi-node/dist_sync_kvstore.py old mode 100755 new mode 100644 diff --git a/tests/python/multi-node/dist_sync_lenet.py b/tests/python/multi-node/dist_sync_lenet.py old mode 100755 new mode 100644 diff --git a/tests/python/multi-node/dist_sync_mlp.py b/tests/python/multi-node/dist_sync_mlp.py old mode 100755 new mode 100644 diff --git a/tests/python/multi-node/local_inception.py b/tests/python/multi-node/local_inception.py old mode 100755 new mode 100644 diff --git a/tests/python/multi-node/local_lenet.py b/tests/python/multi-node/local_lenet.py old mode 100755 new mode 100644 diff --git a/tests/python/multi-node/local_mlp.py b/tests/python/multi-node/local_mlp.py old mode 100755 new mode 100644 diff --git a/tests/python/multi-node/test_data.py b/tests/python/multi-node/test_data.py old mode 100755 new mode 100644 From 84405e6638d0942726719fb5f3bd1c8abf7bf910 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Wed, 28 Oct 2015 13:41:16 -0600 Subject: [PATCH 114/122] [Name] Convention --- example/imagenet/alexnet.py | 2 +- example/imagenet/inception-full.py | 4 +- example/imagenet/inception.py | 2 +- example/notebooks/cifar-100.ipynb | 8 +- example/notebooks/cifar-recipe.ipynb | 6 +- python/mxnet/model.py | 162 +++++++++++------------ python/mxnet/optimizer.py | 14 +- src/operator/activation-inl.h | 0 src/operator/activation.cc | 0 src/operator/activation.cu | 6 +- 
src/operator/batch_norm-inl.h | 0 src/operator/block_grad-inl.h | 0 src/operator/concat-inl.h | 0 src/operator/convolution-inl.h | 0 src/operator/cudnn_activation-inl.h | 0 src/operator/cudnn_convolution-inl.h | 0 src/operator/cudnn_lrn-inl.h | 0 src/operator/cudnn_pooling-inl.h | 0 src/operator/dropout-inl.h | 0 src/operator/elementwise_binary_op-inl.h | 0 src/operator/elementwise_binary_op.cc | 0 src/operator/elementwise_binary_op.cu | 0 src/operator/elementwise_sum-inl.h | 0 src/operator/fully_connected-inl.h | 0 src/operator/leaky_relu-inl.h | 0 src/operator/lrn-inl.h | 0 src/operator/pooling-inl.h | 0 src/operator/pooling.cc | 0 src/operator/pooling.cu | 0 src/operator/regression_output-inl.h | 0 src/operator/regression_output.cc | 0 src/operator/regression_output.cu | 0 src/operator/reshape-inl.h | 0 src/operator/slice_channel-inl.h | 0 src/operator/softmax-inl.h | 0 tests/python/train/test_conv.py | 4 +- tests/python/train/test_mlp.py | 10 +- 37 files changed, 109 insertions(+), 109 deletions(-) mode change 100755 => 100644 src/operator/activation-inl.h mode change 100755 => 100644 src/operator/activation.cc mode change 100755 => 100644 src/operator/batch_norm-inl.h mode change 100755 => 100644 src/operator/block_grad-inl.h mode change 100755 => 100644 src/operator/concat-inl.h mode change 100755 => 100644 src/operator/convolution-inl.h mode change 100755 => 100644 src/operator/cudnn_activation-inl.h mode change 100755 => 100644 src/operator/cudnn_convolution-inl.h mode change 100755 => 100644 src/operator/cudnn_lrn-inl.h mode change 100755 => 100644 src/operator/cudnn_pooling-inl.h mode change 100755 => 100644 src/operator/dropout-inl.h mode change 100755 => 100644 src/operator/elementwise_binary_op-inl.h mode change 100755 => 100644 src/operator/elementwise_binary_op.cc mode change 100755 => 100644 src/operator/elementwise_binary_op.cu mode change 100755 => 100644 src/operator/elementwise_sum-inl.h mode change 100755 => 100644 src/operator/fully_connected-inl.h mode change 100755 => 100644 src/operator/leaky_relu-inl.h mode change 100755 => 100644 src/operator/lrn-inl.h mode change 100755 => 100644 src/operator/pooling-inl.h mode change 100755 => 100644 src/operator/pooling.cc mode change 100755 => 100644 src/operator/pooling.cu mode change 100755 => 100644 src/operator/regression_output-inl.h mode change 100755 => 100644 src/operator/regression_output.cc mode change 100755 => 100644 src/operator/regression_output.cu mode change 100755 => 100644 src/operator/reshape-inl.h mode change 100755 => 100644 src/operator/slice_channel-inl.h mode change 100755 => 100644 src/operator/softmax-inl.h diff --git a/example/imagenet/alexnet.py b/example/imagenet/alexnet.py index 9a74631a2174..dbf5e9a28ba4 100644 --- a/example/imagenet/alexnet.py +++ b/example/imagenet/alexnet.py @@ -59,4 +59,4 @@ wd = 0.00001) logging.basicConfig(level = logging.DEBUG) model.fit(X = train, eval_data = val, - epoch_end_callback = mx.callback.Speedometer(batch_size=batch_size)) + batch_end_callback = mx.callback.Speedometer(batch_size=batch_size)) diff --git a/example/imagenet/inception-full.py b/example/imagenet/inception-full.py index 71a7cfd16ef0..d703a6db59a2 100644 --- a/example/imagenet/inception-full.py +++ b/example/imagenet/inception-full.py @@ -97,5 +97,5 @@ def inception(nhidden, grad_scale): model.fit(X=train, eval_metric="acc", - epoch_end_callback=[mx.callback.Speedometer(batch_size), mx.callback.log_train_metric(100)], - iter_end_callback=mx.callback.do_checkpoint(model_prefix)) + 
batch_end_callback=[mx.callback.Speedometer(batch_size), mx.callback.log_train_metric(100)], + epoch_end_callback=mx.callback.do_checkpoint(model_prefix)) diff --git a/example/imagenet/inception.py b/example/imagenet/inception.py index 6d0dc36bddad..263f3a22733f 100644 --- a/example/imagenet/inception.py +++ b/example/imagenet/inception.py @@ -95,4 +95,4 @@ def inception(nhidden, grad_scale): model.fit(X=train, eval_data=val, eval_metric="acc", - epoch_end_callback=mx.callback.Speedometer(batch_size)) + batch_end_callback=mx.callback.Speedometer(batch_size)) diff --git a/example/notebooks/cifar-100.ipynb b/example/notebooks/cifar-100.ipynb index 48d6e00c5464..8e8c53a2d75b 100644 --- a/example/notebooks/cifar-100.ipynb +++ b/example/notebooks/cifar-100.ipynb @@ -503,8 +503,8 @@ "model.fit(X=train_dataiter,\n", " eval_data=test_dataiter,\n", " eval_metric=\"accuracy\",\n", - " epoch_end_callback=mx.callback.Speedometer(batch_size, 200),\n", - " iter_end_callback=mx.callback.do_checkpoint(model_prefix))\n" + " batch_end_callback=mx.callback.Speedometer(batch_size, 200),\n", + " epoch_end_callback=mx.callback.do_checkpoint(model_prefix))\n" ] }, { @@ -597,8 +597,8 @@ "model.fit(X=train_dataiter,\n", " eval_data=test_dataiter,\n", " eval_metric=\"accuracy\",\n", - " epoch_end_callback=mx.callback.Speedometer(batch_size, 200),\n", - " iter_end_callback=mx.callback.do_checkpoint(model_prefix))" + " batch_end_callback=mx.callback.Speedometer(batch_size, 200),\n", + " epoch_end_callback=mx.callback.do_checkpoint(model_prefix))" ] }, { diff --git a/example/notebooks/cifar-recipe.ipynb b/example/notebooks/cifar-recipe.ipynb index 832b68687572..eae38dab736c 100644 --- a/example/notebooks/cifar-recipe.ipynb +++ b/example/notebooks/cifar-recipe.ipynb @@ -269,15 +269,15 @@ "model.fit(X=train_dataiter,\n", " eval_data=test_dataiter,\n", " eval_metric=\"accuracy\",\n", - " epoch_end_callback=mx.callback.Speedometer(batch_size))\n", + " batch_end_callback=mx.callback.Speedometer(batch_size))\n", "\n", "# if we want to save model after every round, we can add check_point call back\n", "# model_prefix = './cifar_'\n", "# model.fit(X=train_dataiter,\n", "# eval_data=test_dataiter,\n", "# eval_metric=\"accuracy\",\n", - "# epoch_end_callback=mx.helper.Speedometer(batch_size),\n", - "# iter_end_callback=mx.callback.do_checkpoint(model_prefix))\n" + "# batch_end_callback=mx.helper.Speedometer(batch_size),\n", + "# epoch_end_callback=mx.callback.do_checkpoint(model_prefix))\n" ] }, { diff --git a/python/mxnet/model.py b/python/mxnet/model.py index d09f37eb8d90..e4cfbfbed35d 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -25,9 +25,9 @@ except ImportError: SKLEARN_INSTALLED = False -# Parameter to pass to epoch_end_callback -EpochEndParam = namedtuple('EpochEndParams', - ['iteration', +# Parameter to pass to batch_end_callback +BatchEndParam = namedtuple('BatchEndParams', + ['epoch', 'nbatch', 'eval_metric']) @@ -170,10 +170,10 @@ def _create_kvstore(kvstore, num_device, arg_params): def _train_multi_device(symbol, ctx, input_shape, arg_params, aux_params, - begin_round, end_round, optimizer, + begin_epoch, end_epoch, optimizer, kvstore, update_on_kvstore, train_data, eval_data=None, eval_metric=None, - iter_end_callback=None, epoch_end_callback=None, + epoch_end_callback=None, batch_end_callback=None, logger=None): """Internal training function on multiple devices. 
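The model.py hunks in this commit rename `EpochEndParam` to `BatchEndParam` (a namedtuple with fields `epoch`, `nbatch`, `eval_metric`) and swap the callback keywords, so that `batch_end_callback` now fires after every batch while `epoch_end_callback` fires once per epoch. A minimal sketch of a user-side callback written against the renamed interface; the printing logic is illustrative, not part of the patch:

```python
# Hypothetical callback for the renamed batch_end_callback hook; it is
# called with a BatchEndParam namedtuple (epoch, nbatch, eval_metric).
def log_progress(param):
    # eval_metric accumulates over the epoch; get() returns (name, value)
    name, value = param.eval_metric.get()
    print('Epoch[%d] Batch[%d] Train-%s=%f'
          % (param.epoch, param.nbatch, name, value))

# Either a single callable or a list of callables is accepted; the training
# loop below iterates over batch_end_callback when it is given as a list.
# model.fit(X=train_dataiter, batch_end_callback=log_progress)
```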
@@ -196,11 +196,11 @@ def _train_multi_device(symbol, ctx, input_shape, aux_params : dict of str to NDArray Model parameter, dict of name to NDArray of net's auxiliary states. - begin_round : int - The begining training iteration. + begin_epoch : int + The beginning training epoch. - end_round : int - The end training iteration. + end_epoch : int + The end training epoch. optimizer : Optimizer The optimization algorithm @@ -214,11 +214,11 @@ def _train_multi_device(symbol, ctx, input_shape, eval_metric : EvalMetric An evaluation function. - iter_end_callback : callable(iteration, symbol, arg_params, aux_states) - A callback that is invoked at end of each iteration. - This can be used to checkpoint model each iteration. + epoch_end_callback : callable(epoch, symbol, arg_params, aux_states) + A callback that is invoked at end of each epoch. + This can be used to checkpoint model each epoch. - epoch_end_callback : callable(EpochEndParams) + batch_end_callback : callable(BatchEndParams) A callback that is invoked at end of each batch. This can be used to measure speed, get result from evaluation metric. etc. @@ -261,7 +261,7 @@ def _train_multi_device(symbol, ctx, input_shape, texec.copy_params_from(arg_params, aux_params) # init optimizer - optimizer.begin_round(begin_round) + optimizer.begin_epoch(begin_epoch) if not update_on_kvstore: updater = get_updater(optimizer) @@ -290,7 +290,7 @@ def _train_multi_device(symbol, ctx, input_shape, out_cpu_array = nd.zeros(merged_shape, cpu()) # Now start training - for iteration in range(begin_round, end_round): + for epoch in range(begin_epoch, end_epoch): # Training phase tic = time.time() eval_metric.reset() @@ -332,25 +332,25 @@ def _train_multi_device(symbol, ctx, input_shape, updater(index*num_device+k, g, w) nbatch += 1 - # epoch callback (for print purpose) - if epoch_end_callback != None: - epoch_end_params = EpochEndParam(iteration=iteration, + # batch callback (for print purpose) + if batch_end_callback != None: + batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch, eval_metric=eval_metric) - if isinstance(epoch_end_callback, list): - for call in epoch_end_callback: - call(epoch_end_params) + if isinstance(batch_end_callback, list): + for call in batch_end_callback: + call(batch_end_params) else: - epoch_end_callback(epoch_end_params) + batch_end_callback(batch_end_params) # evaluate at end, so out_cpu_array can lazy copy eval_metric.update(label, out_cpu_array) - # reset training data after iteration finish + # reset training data after epoch finish train_data.reset() name, value = eval_metric.get() - logger.info('Iteration[%d] Train-%s=%f', iteration, name, value) + logger.info('Epoch[%d] Train-%s=%f', epoch, name, value) toc = time.time() - logger.info('Iteration[%d] Time cost=%.3f', iteration, (toc - tic)) + logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic)) # evaluation if eval_data: eval_metric.reset() @@ -367,9 +367,9 @@ def _train_multi_device(symbol, ctx, input_shape, eval_metric.update(label, out_cpu_array) eval_data.reset() name, value = eval_metric.get() - logger.info('Iteration[%d] Validation-%s=%f', iteration, name, value) + logger.info('Epoch[%d] Validation-%s=%f', epoch, name, value) - if iter_end_callback or iteration + 1 == end_round: + if epoch_end_callback or epoch + 1 == end_epoch: # copy data back to cpu for name, block in zip(arg_names, arg_blocks): if name in arg_params: @@ -379,17 +379,17 @@ def _train_multi_device(symbol, ctx, input_shape, if name in aux_params: weight = sum(w.copyto(cpu()) for w in
block) / len(block) weight.copyto(aux_params[name]) - if iter_end_callback != None: - if isinstance(iter_end_callback, list): - for call in iter_end_callback: - call(iteration, symbol, arg_params, aux_params) + if epoch_end_callback != None: + if isinstance(epoch_end_callback, list): + for call in epoch_end_callback: + call(epoch, symbol, arg_params, aux_params) else: - iter_end_callback(iteration, symbol, arg_params, aux_params) - # end of all iterations + epoch_end_callback(epoch, symbol, arg_params, aux_params) + # end of all epochs return -def save_checkpoint(prefix, iteration, symbol, arg_params, aux_params): +def save_checkpoint(prefix, epoch, symbol, arg_params, aux_params): """Checkpoint the model data into file. Parameters @@ -397,8 +397,8 @@ def save_checkpoint(prefix, iteration, symbol, arg_params, aux_params): prefix : str Prefix of model name. - iteration : int - The iteration number of the model. + epoch : int + The epoch number of the model. symbol : Symbol The input symbol @@ -412,17 +412,17 @@ def save_checkpoint(prefix, iteration, symbol, arg_params, aux_params): Notes ----- - ``prefix-symbol.json`` will be saved for symbol. - - ``prefix-iteration.params`` will be saved for parameters. + - ``prefix-epoch.params`` will be saved for parameters. """ symbol.save('%s-symbol.json' % prefix) save_dict = {('arg:%s' % k) : v for k, v in arg_params.items()} save_dict.update({('aux:%s' % k) : v for k, v in aux_params.items()}) - param_name = '%s-%04d.params' % (prefix, iteration) + param_name = '%s-%04d.params' % (prefix, epoch) nd.save(param_name, save_dict) logging.info('Saved checkpoint to \"%s\"', param_name) -def load_checkpoint(prefix, iteration): +def load_checkpoint(prefix, epoch): """Load model checkpoint from file. Parameters @@ -430,8 +430,8 @@ def load_checkpoint(prefix, iteration): prefix : str Prefix of model name. - iteration : int - Iteration number of model we would like to load. + epoch : int + Epoch number of model we would like to load. Returns ------- @@ -447,10 +447,10 @@ def load_checkpoint(prefix, iteration): Notes ----- - ``prefix-symbol.json`` will be saved for symbol. - - ``prefix-iteration.params`` will be saved for parameters. + - ``prefix-epoch.params`` will be saved for parameters. """ symbol = sym.load('%s-symbol.json' % prefix) - save_dict = nd.load('%s-%04d.params' % (prefix, iteration)) + save_dict = nd.load('%s-%04d.params' % (prefix, epoch)) arg_params = {} aux_params = {} for k, v in save_dict.items(): @@ -476,8 +476,8 @@ class FeedForward(BASE_ESTIMATOR): The device context of training and prediction. To use multi GPU training, pass in a list of gpu contexts. - num_round : int, optional - Training parameter, number of training rounds(iterations). + num_epoch : int, optional + Training parameter, number of training epochs. optimizer : str or Optimizer, optional Training parameter, name or optimizer object for training. @@ -501,19 +501,19 @@ class FeedForward(BASE_ESTIMATOR): If this is True, no error will be thrown when aux_params and arg_params contain extra parameters than needed. - begin_round : int,optional - The begining training iteration. + begin_epoch : int, optional + The beginning training epoch. **kwargs : dict The additional keyword arguments passed to optimizer.
""" def __init__(self, symbol, ctx=None, - num_round=None, optimizer='sgd', + num_epoch=None, optimizer='sgd', initializer=Uniform(0.01), numpy_batch_size=128, arg_params=None, aux_params=None, allow_extra_params=False, - begin_round=0, + begin_epoch=0, **kwargs): # check if symbol contain duplicated names. _check_arguments(symbol) @@ -535,7 +535,7 @@ def __init__(self, symbol, ctx=None, ctx = [ctx] self.ctx = ctx # training parameters - self.num_round = num_round + self.num_epoch = num_epoch self.kwargs = kwargs.copy() self.optimizer = optimizer self.initializer = initializer @@ -546,7 +546,7 @@ def __init__(self, symbol, ctx=None, # internal helper state self._pred_exec = None self._pred_exec_input = None - self.begin_round = begin_round + self.begin_epoch = begin_epoch @staticmethod def _is_data_arg(name): @@ -667,7 +667,7 @@ def predict(self, X): return np.concatenate(outputs) def fit(self, X, y=None, eval_data=None, eval_metric='acc', - iter_end_callback=None, epoch_end_callback=None, + epoch_end_callback=None, batch_end_callback=None, kvstore='local', logger=None): """Fit the model. @@ -691,11 +691,11 @@ def fit(self, X, y=None, eval_data=None, eval_metric='acc', Or a customize evaluation function that returns the statistics based on minibatch. - iter_end_callback : callable(iteration, symbol, arg_params, aux_states) - A callback that is invoked at end of each iteration. - This can be used to checkpoint model each iteration. + epoch_end_callback : callable(epoch, symbol, arg_params, aux_states) + A callback that is invoked at end of each epoch. + This can be used to checkpoint model each epoch. - epoch_end_callback: callable(iteration) + batch_end_callback: callable(epoch) A callback that is invoked at end of each batch For print purpose @@ -716,7 +716,7 @@ def fit(self, X, y=None, eval_data=None, eval_metric='acc', X = self._init_iter(X, y, is_train=True) eval_data = self._init_eval_iter(eval_data) # Simply ignore the first example to get input_shape - # in first training round. + # in first training epoch. if not X.iter_next(): X.reset() assert X.iter_next() @@ -744,16 +744,16 @@ def fit(self, X, y=None, eval_data=None, eval_metric='acc', # do training _train_multi_device(self.symbol, self.ctx, input_shape, self.arg_params, self.aux_params, - begin_round=self.begin_round, end_round=self.num_round, + begin_epoch=self.begin_epoch, end_epoch=self.num_epoch, optimizer=optimizer, train_data=X, eval_data=eval_data, eval_metric=eval_metric, - iter_end_callback=iter_end_callback, epoch_end_callback=epoch_end_callback, + batch_end_callback=batch_end_callback, kvstore=kvstore, update_on_kvstore=update_on_kvstore, logger=logger) - def save(self, prefix, iteration=None): + def save(self, prefix, epoch=None): """Checkpoint the model checkpoint into file. You can also use pickle to do the job if you only work on python. @@ -773,15 +773,15 @@ def save(self, prefix, iteration=None): Notes ----- - ``prefix-symbol.json`` will be saved for symbol. - - ``prefix-iteration.params`` will be saved for parameters. + - ``prefix-epoch.params`` will be saved for parameters. 
""" - if iteration is None: - iteration = self.num_round - assert iteration is not None - save_checkpoint(prefix, iteration, self.symbol, self.arg_params, self.aux_params) + if epoch is None: + epoch = self.num_epoch + assert epoch is not None + save_checkpoint(prefix, epoch, self.symbol, self.arg_params, self.aux_params) @staticmethod - def load(prefix, iteration, ctx=None, **kwargs): + def load(prefix, epoch, ctx=None, **kwargs): """Load model checkpoint from file. Parameters @@ -789,13 +789,13 @@ def load(prefix, iteration, ctx=None, **kwargs): prefix : str Prefix of model name. - iteration : int - Iteration number of model we would like to load. + epoch : int + epoch number of model we would like to load. ctx : Context or list of Context, optional The device context of training and prediction. kwargs : dict - other parameters for model, including num_round, optimizer and numpy_batch_size + other parameters for model, including num_epoch, optimizer and numpy_batch_size Returns ------- @@ -805,18 +805,18 @@ def load(prefix, iteration, ctx=None, **kwargs): Notes ----- - ``prefix-symbol.json`` will be saved for symbol. - - ``prefix-iteration.params`` will be saved for parameters. + - ``prefix-epoch.params`` will be saved for parameters. """ - symbol, arg_params, aux_params = load_checkpoint(prefix, iteration) + symbol, arg_params, aux_params = load_checkpoint(prefix, epoch) return FeedForward(symbol, ctx=ctx, arg_params=arg_params, aux_params=aux_params, - begin_round=iteration, + begin_epoch=epoch, **kwargs) @staticmethod def create(symbol, X, y=None, ctx=None, - num_round=None, optimizer='sgd', initializer=Uniform(0.01), - eval_data=None, eval_metric='acc', iter_end_callback=None, + num_epoch=None, optimizer='sgd', initializer=Uniform(0.01), + eval_data=None, eval_metric='acc', epoch_end_callback=None, kvstore='local', logger=None, **kwargs): """Functional style to create a model. @@ -838,8 +838,8 @@ def create(symbol, X, y=None, ctx=None, The device context of training and prediction. To use multi GPU training, pass in a list of gpu contexts. - num_round : int, optional - Training parameter, number of training rounds(iterations). + num_epoch : int, optional + Training parameter, number of training epochs(epochs). optimizer : str or Optimizer, optional Training parameter, name or optimizer object for training. @@ -855,9 +855,9 @@ def create(symbol, X, y=None, ctx=None, Or a customize evaluation function that returns the statistics based on minibatch. - iter_end_callback : callable(iteration, symbol, arg_params, aux_states) - A callback that is invoked at end of each iteration. - This can be used to checkpoint model each iteration. + epoch_end_callback : callable(epoch, symbol, arg_params, aux_states) + A callback that is invoked at end of each epoch. + This can be used to checkpoint model each epoch. kvstore: KVStore or str, optional The KVStore or a string kvstore type: @@ -869,10 +869,10 @@ def create(symbol, X, y=None, ctx=None, In default uses 'local', often no need to change for single machiine. 
""" - model = FeedForward(symbol, ctx=ctx, num_round=num_round, + model = FeedForward(symbol, ctx=ctx, num_epoch=num_epoch, optimizer=optimizer, initializer=initializer, **kwargs) model.fit(X, y, eval_data=eval_data, eval_metric=eval_metric, - iter_end_callback=iter_end_callback, + epoch_end_callback=epoch_end_callback, kvstore=kvstore, logger=logger) return model diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index 105fc5646f75..ccfb99eb6019 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -49,18 +49,18 @@ def create_optimizer(name, rescale_grad=1, **kwargs): raise ValueError('Cannot find optimizer %s' % name) def __init__(self, rescale_grad=1): - self.iteration = 0 + self.epoch = 0 self.rescale_grad = rescale_grad - def begin_round(self, iteration): - """Function called to notify beginning of iteration. + def begin_epoch(self, epoch): + """Function called to notify beginning of epoch. Parameters ---------- - iteration : int - The iteration number. + epoch : int + The epoch number. """ - self.iteration = iteration + self.epoch = epoch def create_state(self, index, weight): """Create additional optimizer state such as momentum. @@ -141,7 +141,7 @@ def update(self, index, weight, grad, state): assert(isinstance(weight, NDArray)) assert(isinstance(grad, NDArray)) if self.lr_scheduler != None: - lr = self.lr_scheduler(self.iteration) + lr = self.lr_scheduler(self.epoch) else: lr = self.lr diff --git a/src/operator/activation-inl.h b/src/operator/activation-inl.h old mode 100755 new mode 100644 diff --git a/src/operator/activation.cc b/src/operator/activation.cc old mode 100755 new mode 100644 diff --git a/src/operator/activation.cu b/src/operator/activation.cu index 2c9c29c04f45..51cac51c70f4 100644 --- a/src/operator/activation.cu +++ b/src/operator/activation.cu @@ -18,11 +18,11 @@ Operator *CreateOp(ActivationParam param) { return new CuDNNActivationOp(param); #else switch(param.act_type) { - case kReLU: + case activation::kReLU: return new ActivationOp(); - case kSigmoid: + case activation::kSigmoid: return new ActivationOp(); - case kTanh: + case activation::kTanh: return new ActivationOp(); default: LOG(FATAL) << "unknown activation"; diff --git a/src/operator/batch_norm-inl.h b/src/operator/batch_norm-inl.h old mode 100755 new mode 100644 diff --git a/src/operator/block_grad-inl.h b/src/operator/block_grad-inl.h old mode 100755 new mode 100644 diff --git a/src/operator/concat-inl.h b/src/operator/concat-inl.h old mode 100755 new mode 100644 diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h old mode 100755 new mode 100644 diff --git a/src/operator/cudnn_activation-inl.h b/src/operator/cudnn_activation-inl.h old mode 100755 new mode 100644 diff --git a/src/operator/cudnn_convolution-inl.h b/src/operator/cudnn_convolution-inl.h old mode 100755 new mode 100644 diff --git a/src/operator/cudnn_lrn-inl.h b/src/operator/cudnn_lrn-inl.h old mode 100755 new mode 100644 diff --git a/src/operator/cudnn_pooling-inl.h b/src/operator/cudnn_pooling-inl.h old mode 100755 new mode 100644 diff --git a/src/operator/dropout-inl.h b/src/operator/dropout-inl.h old mode 100755 new mode 100644 diff --git a/src/operator/elementwise_binary_op-inl.h b/src/operator/elementwise_binary_op-inl.h old mode 100755 new mode 100644 diff --git a/src/operator/elementwise_binary_op.cc b/src/operator/elementwise_binary_op.cc old mode 100755 new mode 100644 diff --git a/src/operator/elementwise_binary_op.cu b/src/operator/elementwise_binary_op.cu old mode 
100755 new mode 100644 diff --git a/src/operator/elementwise_sum-inl.h b/src/operator/elementwise_sum-inl.h old mode 100755 new mode 100644 diff --git a/src/operator/fully_connected-inl.h b/src/operator/fully_connected-inl.h old mode 100755 new mode 100644 diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h old mode 100755 new mode 100644 diff --git a/src/operator/lrn-inl.h b/src/operator/lrn-inl.h old mode 100755 new mode 100644 diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h old mode 100755 new mode 100644 diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc old mode 100755 new mode 100644 diff --git a/src/operator/pooling.cu b/src/operator/pooling.cu old mode 100755 new mode 100644 diff --git a/src/operator/regression_output-inl.h b/src/operator/regression_output-inl.h old mode 100755 new mode 100644 diff --git a/src/operator/regression_output.cc b/src/operator/regression_output.cc old mode 100755 new mode 100644 diff --git a/src/operator/regression_output.cu b/src/operator/regression_output.cu old mode 100755 new mode 100644 diff --git a/src/operator/reshape-inl.h b/src/operator/reshape-inl.h old mode 100755 new mode 100644 diff --git a/src/operator/slice_channel-inl.h b/src/operator/slice_channel-inl.h old mode 100755 new mode 100644 diff --git a/src/operator/softmax-inl.h b/src/operator/softmax-inl.h old mode 100755 new mode 100644 diff --git a/tests/python/train/test_conv.py b/tests/python/train/test_conv.py index df2b7b98afb2..bc068153c24e 100644 --- a/tests/python/train/test_conv.py +++ b/tests/python/train/test_conv.py @@ -26,9 +26,9 @@ fc2 = mx.symbol.FullyConnected(data = fl, name='fc2', num_hidden=10) softmax = mx.symbol.Softmax(data = fc2, name = 'sm') -num_round = 1 +num_epoch = 1 model = mx.model.FeedForward(softmax, mx.cpu(), - num_round=num_round, + num_epoch=num_epoch, learning_rate=0.1, wd=0.0001, momentum=0.9) # check data diff --git a/tests/python/train/test_mlp.py b/tests/python/train/test_mlp.py index 85266e12df52..5f1c27062066 100644 --- a/tests/python/train/test_mlp.py +++ b/tests/python/train/test_mlp.py @@ -20,7 +20,7 @@ def accuracy(label, pred): py = np.argmax(pred, axis=1) return np.sum(py == label) / float(label.size) -num_round = 4 +num_epoch = 4 prefix = './mlp' #check data @@ -46,9 +46,9 @@ def test_mlp(): X=train_dataiter, eval_data=val_dataiter, eval_metric=mx.metric.np(accuracy), - iter_end_callback=mx.callback.do_checkpoint(prefix), + epoch_end_callback=mx.callback.do_checkpoint(prefix), ctx=[mx.cpu(i) for i in range(2)], - num_round=num_round, + num_epoch=num_epoch, learning_rate=0.1, wd=0.0004, momentum=0.9) @@ -78,7 +78,7 @@ def test_mlp(): assert np.sum(np.abs(prob - prob2)) == 0 # load model from checkpoint - model3 = mx.model.FeedForward.load(prefix, num_round) + model3 = mx.model.FeedForward.load(prefix, num_epoch) prob3 = model3.predict(val_dataiter) assert np.sum(np.abs(prob - prob3)) == 0 @@ -88,7 +88,7 @@ def test_mlp(): prob4 = model4.predict(val_dataiter) assert np.sum(np.abs(prob - prob4)) == 0 - for i in range(num_round): + for i in range(num_epoch): os.remove('%s-%04d.params' % (prefix, i + 1)) os.remove('%s-symbol.json' % prefix) os.remove('%s-0128.params' % prefix) From 85225c87847a3e83035a5866788e1710199e2553 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 28 Oct 2015 21:07:31 -0700 Subject: [PATCH 115/122] [API] make predict more standalone without relying on filesys --- include/mxnet/c_predict_api.h | 16 +++++++----- include/mxnet/ndarray.h | 12 ++++----- 
predict/python/mxnet_predict.py | 26 ++++++++++++------- src/c_api/c_api.cc | 10 +++++-- src/c_api/c_predict_api.cc | 22 ++++++++++------ src/io/iter_normalize.h | 14 +++++++--- src/ndarray/ndarray.cc | 6 ++--- tests/python/predict/mxnet_predict_example.py | 10 ++++--- 8 files changed, 73 insertions(+), 43 deletions(-) diff --git a/include/mxnet/c_predict_api.h b/include/mxnet/c_predict_api.h index 26ca23247627..e5671da33cbc 100644 --- a/include/mxnet/c_predict_api.h +++ b/include/mxnet/c_predict_api.h @@ -37,8 +37,9 @@ typedef void *NDListHandle; MXNET_DLL const char* MXGetLastError(); /*! * \brief create a predictor - * \param symbol_file The path to the symbol file. - * \param param_file the path to the parameter file. + * \param symbol_json_str The JSON string of the symbol. + * \param param_bytes The in-memory raw bytes of parameter ndarray file. + * \param param_size The size of parameter ndarray file. * \param dev_type The device type, 1: cpu, 2:gpu * \param dev_id The device id of the predictor. * \param num_input_nodes Number of input nodes to the net, @@ -53,8 +54,9 @@ MXNET_DLL const char* MXGetLastError(); * \param out The created predictor handle. * \return 0 when success, -1 when failure. */ -MXNET_DLL int MXPredCreate(const char* symbol_file, - const char* param_file, +MXNET_DLL int MXPredCreate(const char* symbol_json_str, + const char* param_bytes, + size_t param_size, int dev_type, int dev_id, mx_uint num_input_nodes, const char** input_keys, @@ -114,12 +116,14 @@ MXNET_DLL int MXPredFree(PredictorHandle handle); /*! * \brief Create a NDArray List by loading from ndarray file. * This can be used to load mean image file. - * \param nd_file The path to the ndarray file to load. + * \param nd_file_bytes The byte contents of nd file to be loaded. + * \param nd_file_size The size of the nd file to be loaded. * \param out The output NDListHandle * \param out_length Length of the list. * \return 0 when success, -1 when failure. */ -MXNET_DLL int MXNDListCreate(const char* nd_file, +MXNET_DLL int MXNDListCreate(const char* nd_file_bytes, + size_t nd_file_size, NDListHandle *out, mx_uint* out_length); /*! diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index fce3d61f6855..ec8c856d84c1 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -243,21 +243,21 @@ class NDArray { ptr_->CheckAndAlloc(); } /*! - * \brief Save list of narray into the file. - * \param fname name of the file. + * \brief Save list of narray into the stream. + * \param fo The stream of output. * \param data the NDArrays to be saved. * \param names the name of the NDArray, optional, can be zero length. */ - static void Save(const std::string& fname, + static void Save(dmlc::Stream* fo, const std::vector& data, const std::vector& names); /*! - * \brief Load list of narray into from the file. - * \param fname name of the file. + * \brief Load list of narray from the stream. + * \param fi The stream of the input file. * \param data the NDArrays to be loaded * \param keys the name of the NDArray, if saved in the file. */ - static void Load(const std::string& fname, + static void Load(dmlc::Stream* fi, std::vector* data, std::vector* keys); diff --git a/predict/python/mxnet_predict.py b/predict/python/mxnet_predict.py index 69e83bae011c..4abeefcb77d8 100644 --- a/predict/python/mxnet_predict.py +++ b/predict/python/mxnet_predict.py @@ -69,11 +69,11 @@ class Predictor(object): Parameters ---------- - symbol_json_str : str Path to the symbol file.
- param_file : str - Path to the parameter file. + param_raw_bytes : str, bytes + The raw parameter bytes. input_shapes : dict of str to tuple The shape of input data @@ -84,7 +84,8 @@ class Predictor(object): dev_id : int, optional The device id of the predictor. """ - def __init__(self, symbol_file, param_file, input_shapes, + def __init__(self, symbol_file, + param_raw_bytes, input_shapes, dev_type="cpu", dev_id=0): dev_type = devstr2type[dev_type] indptr = [0] @@ -97,8 +98,11 @@ def __init__(self, symbol_file, param_file, input_shapes, sdata.extend(v) indptr.append(len(sdata)) handle = PredictorHandle() + param_raw_bytes = bytearray(param_raw_bytes) + ptr = (ctypes.c_char * len(param_raw_bytes)).from_buffer(param_raw_bytes) _check_call(_LIB.MXPredCreate( - c_str(symbol_file), c_str(param_file), + c_str(symbol_file), + ptr, len(param_raw_bytes), ctypes.c_int(dev_type), ctypes.c_int(dev_id), mx_uint(len(indptr) - 1), c_array(ctypes.c_char_p, keys), @@ -162,13 +166,13 @@ def get_output(self, index): return data -def load_ndarray_file(nd_file): +def load_ndarray_file(nd_bytes): """Load ndarray file and return as list of numpy array. Parameters ---------- - nd_file : str - The name to the ndarray file. + nd_bytes : str or bytes + The internal ndarray bytes Returns ------- @@ -177,8 +181,11 @@ def load_ndarray_file(nd_file): """ handle = NDListHandle() olen = mx_uint() + nd_bytes = bytearray(nd_bytes) + ptr = (ctypes.c_char * len(nd_bytes)).from_buffer(nd_bytes) _check_call(_LIB.MXNDListCreate( - c_str(nd_file), ctypes.byref(handle), ctypes.byref(olen))) + ptr, len(nd_bytes), + ctypes.byref(handle), ctypes.byref(olen))) keys = [] arrs = [] @@ -193,6 +200,7 @@ def load_ndarray_file(nd_file): shape = tuple(pdata[:ndim.value]) dbuffer = (mx_float * np.prod(shape)).from_address(ctypes.addressof(cptr.contents)) ret = np.frombuffer(dbuffer, dtype=np.float32).reshape(shape) + ret = np.array(ret, dtype=np.float32) keys.append(py_str(key.value)) arrs.append(ret) _check_call(_LIB.MXNDListFree(handle)) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 1d5fef33768a..1f12c7943c7c 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -196,7 +196,10 @@ int MXNDArraySave(const char* fname, names[i] = keys[i]; } } - mxnet::NDArray::Save(fname, data, names); + { + std::unique_ptr fo(dmlc::Stream::Create(fname, "w")); + mxnet::NDArray::Save(fo.get(), data, names); + } API_END(); } @@ -210,7 +213,10 @@ int MXNDArrayLoad(const char* fname, API_BEGIN(); std::vector data; std::vector &names = ret->ret_vec_str; - mxnet::NDArray::Load(fname, &data, &names); + { + std::unique_ptr fi(dmlc::Stream::Create(fname, "r")); + mxnet::NDArray::Load(fi.get(), &data, &names); + } ret->ret_handles.resize(data.size()); for (size_t i = 0; i < data.size(); ++i) { NDArray *ptr = new NDArray(); diff --git a/src/c_api/c_predict_api.cc b/src/c_api/c_predict_api.cc index c0d6deafcb60..27f63d69944a 100644 --- a/src/c_api/c_predict_api.cc +++ b/src/c_api/c_predict_api.cc @@ -3,10 +3,13 @@ * \file c_predict_api.cc * \brief C predict API of mxnet */ +#include +#include #include #include #include #include + #include "./c_api_error.h" using namespace mxnet; @@ -32,8 +35,9 @@ struct MXAPINDList { std::vector data; }; -int MXPredCreate(const char* symbol_file, - const char* param_file, +int MXPredCreate(const char* symbol_json_str, + const char* param_bytes, + size_t param_size, int dev_type, int dev_id, mx_uint num_input_nodes, const char** input_keys, @@ -45,18 +49,18 @@ int MXPredCreate(const char* symbol_file, Symbol sym; // 
load in the symbol. { - std::unique_ptr fi(dmlc::Stream::Create(symbol_file, "r")); - dmlc::istream is(fi.get()); + std::string json = symbol_json_str; + std::istringstream is(json); dmlc::JSONReader reader(&is); sym.Load(&reader); - is.set_stream(nullptr); } // load the parameters std::unordered_map arg_params, aux_params; { std::vector data; std::vector names; - NDArray::Load(param_file, &data, &names); + dmlc::MemoryFixedSizeStream fi((void*)param_bytes, param_size); // NOLINT(*) + NDArray::Load(&fi, &data, &names); CHECK_EQ(names.size(), data.size()) << "Invalid param file format"; for (size_t i = 0; i < names.size(); ++i) { @@ -178,13 +182,15 @@ int MXPredFree(PredictorHandle handle) { API_END(); } -int MXNDListCreate(const char* nd_file, +int MXNDListCreate(const char* nd_file_bytes, + size_t nd_file_size, NDListHandle *out, mx_uint* out_length) { MXAPINDList* ret = new MXAPINDList(); API_BEGIN(); std::vector arrays; - NDArray::Load(nd_file, + dmlc::MemoryFixedSizeStream fi((void*)nd_file_bytes, nd_file_size); // NOLINT(*) + NDArray::Load(&fi, &(arrays), &(ret->keys)); if (ret->keys.size() == 0) { diff --git a/src/io/iter_normalize.h b/src/io/iter_normalize.h index add700095892..19d3050696e1 100644 --- a/src/io/iter_normalize.h +++ b/src/io/iter_normalize.h @@ -100,7 +100,10 @@ class ImageNormalizeIter : public IIterator { // use python compatible ndarray store format std::vector data; std::vector keys; - NDArray::Load(param_.mean_img, &data, &keys); + { + std::unique_ptr fi(dmlc::Stream::Create(param_.mean_img.c_str(), "r")); + NDArray::Load(fi.get(), &data, &keys); + } CHECK_EQ(data.size(), 1) << "Invalid mean image file format"; data[0].WaitToRead(); @@ -220,9 +223,12 @@ class ImageNormalizeIter : public IIterator { meanimg_ *= (1.0f / imcnt); // save as mxnet python compatible format. 
TBlob tmp = meanimg_; - NDArray::Save(param_.mean_img, - {NDArray(tmp, 0)}, - {"mean_img"}); + { + std::unique_ptr fo(dmlc::Stream::Create(param_.mean_img.c_str(), "w")); + NDArray::Save(fo.get(), + {NDArray(tmp, 0)}, + {"mean_img"}); + } if (param_.verbose) { LOG(INFO) << "Save mean image to " << param_.mean_img << ".."; } diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 26a62fb60264..404c0891f984 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -507,10 +507,9 @@ bool NDArray::Load(dmlc::Stream *strm) { const uint64_t kMXAPINDArrayListMagic = 0x112; -void NDArray::Save(const std::string& fname, +void NDArray::Save(dmlc::Stream* fo, const std::vector& data, const std::vector& names) { - std::unique_ptr fo(dmlc::Stream::Create(fname.c_str(), "w")); uint64_t header = kMXAPINDArrayListMagic, reserved = 0; fo->Write(&header, sizeof(header)); fo->Write(&reserved, sizeof(reserved)); @@ -518,10 +517,9 @@ void NDArray::Save(const std::string& fname, fo->Write(names); } -void NDArray::Load(const std::string& fname, +void NDArray::Load(dmlc::Stream* fi, std::vector* data, std::vector* keys) { - std::unique_ptr fi(dmlc::Stream::Create(fname.c_str(), "r")); uint64_t header, reserved; CHECK(fi->Read(&header)) << "Invalid NDArray file format"; diff --git a/tests/python/predict/mxnet_predict_example.py b/tests/python/predict/mxnet_predict_example.py index 8760c87a50f7..7eed3c72ceb8 100644 --- a/tests/python/predict/mxnet_predict_example.py +++ b/tests/python/predict/mxnet_predict_example.py @@ -1,7 +1,7 @@ import sys, os curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) -sys.path.append("../../predict/python/") -sys.path.append("../../python/") +sys.path.append("../../../predict/python/") +sys.path.append("../../../python/") from mxnet_predict import Predictor, load_ndarray_file import mxnet as mx @@ -14,8 +14,10 @@ num_round = 39 symbol_file = "%s-symbol.json" % prefix param_file = "%s-0039.params" % prefix -predictor = Predictor(symbol_file, param_file, {'data':(1, 3, 224, 224)}) -mean_img = load_ndarray_file("Inception/mean_224.nd")["mean_img"] +predictor = Predictor(open(symbol_file).read(), + open(param_file).read(), + {'data':(1, 3, 224, 224)}) +mean_img = load_ndarray_file(open("Inception/mean_224.nd").read())["mean_img"] synset = [l.strip() for l in open('Inception/synset.txt').readlines()] From ed1aa8452a77642b8d921630adccaf6b274472ce Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 28 Oct 2015 21:27:26 -0700 Subject: [PATCH 116/122] [OP] Fix Operator norm --- src/ndarray/unary_function-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ndarray/unary_function-inl.h b/src/ndarray/unary_function-inl.h index c084bb4a4e95..1082fd826057 100644 --- a/src/ndarray/unary_function-inl.h +++ b/src/ndarray/unary_function-inl.h @@ -78,7 +78,7 @@ void L2Norm(const TBlob &src, mshadow::Stream *s = ctx.get_stream(); mshadow::Tensor out = ret->get(s); mshadow::Tensor in = - src.get_with_shape(mshadow::Shape1(src.shape_.Size())); + src.get_with_shape(mshadow::Shape1(src.shape_.Size()), s); mshadow::VectorDot(out, in, in); out = mshadow::expr::F(out); } From dcb2a1e95ee33f5547c3d57b8e44420eaaec3adc Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 28 Oct 2015 22:42:07 -0700 Subject: [PATCH 117/122] [EXECUTOR] Fix destructor --- ps-lite | 2 +- src/symbol/graph_executor.cc | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ps-lite b/ps-lite index 0cc04093f7c9..7121aa1bdb67 160000 --- a/ps-lite +++ b/ps-lite @@ -1 +1 
@@ -Subproject commit 0cc04093f7c9e07155f585552f31a90715bacef6 +Subproject commit 7121aa1bdb673f047c7600eb4347fd2911021710 diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc index 59e821e372db..847d3d47adad 100644 --- a/src/symbol/graph_executor.cc +++ b/src/symbol/graph_executor.cc @@ -247,6 +247,7 @@ GraphExecutor::GetOpExecEntry(uint32_t nid) { } GraphExecutor::~GraphExecutor() { + Engine::Get()->WaitForAll(); // need to delete the operators before delete the NDArray they referenced. for (OpNode& node : op_nodes_) { node.DeleteOperator(); From 40600f19b23998d69fc996d6e16b0628648333f3 Mon Sep 17 00:00:00 2001 From: Junyuan Xie Date: Mon, 26 Oct 2015 20:59:16 -0700 Subject: [PATCH 118/122] quality option for im2rec --- tools/im2rec.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/im2rec.cc b/tools/im2rec.cc index 89410bae2e6a..471287a15182 100644 --- a/tools/im2rec.cc +++ b/tools/im2rec.cc @@ -29,7 +29,8 @@ int main(int argc, char *argv[]) { "\tlabel_width=WIDTH[default=1] specify the label_width in the list, by default set to 1\n"\ "\tnsplit=NSPLIT[default=1] used for part generation, logically split the image.list to NSPLIT parts by position\n"\ "\tpart=PART[default=0] used for part generation, pack the images from the specific part in image.list\n" - "\tcenter_crop=CENTER_CROP[default=0] specify whether to crop the center image to make it rectangular.\n"); + "\tcenter_crop=CENTER_CROP[default=0] specify whether to crop the center image to make it rectangular.\n" + "\tquality=QUALITY[default=80] JPEG quality for encoding, 1-100.\n"); return 0; } int label_width = 1; @@ -37,6 +38,7 @@ int main(int argc, char *argv[]) { int nsplit = 1; int partid = 0; int center_crop = 0; + int quality = 80; for (int i = 4; i < argc; ++i) { char key[128], val[128]; if (sscanf(argv[i], "%[^=]=%s", key, val) == 2) { @@ -45,6 +47,7 @@ int main(int argc, char *argv[]) { if (!strcmp(key, "nsplit")) nsplit = atoi(val); if (!strcmp(key, "part")) partid = atoi(val); if (!strcmp(key, "center_crop")) center_crop = atoi(val); + if (!strcmp(key, "quality")) quality = atoi(val); } } if (new_size > 0) { @@ -79,7 +82,8 @@ int main(int argc, char *argv[]) { std::vector encode_buf; std::vector encode_params; encode_params.push_back(CV_IMWRITE_JPEG_QUALITY); - encode_params.push_back(80); + encode_params.push_back(quality); + LOG(INFO) << "JPEG encoding quality: " << quality; dmlc::InputSplit::Blob line; while (flist->NextRecord(&line)) { From d8129fdda433f1c95b26e14ddf6981d7e44f78c4 Mon Sep 17 00:00:00 2001 From: Junyuan Xie Date: Mon, 26 Oct 2015 23:11:37 -0700 Subject: [PATCH 119/122] python interface for RecordIO --- include/mxnet/c_api.h | 51 ++++++++++++++++++++++++++++++ python/mxnet/base.py | 1 + python/mxnet/io.py | 42 ++++++++++++++++++++++++- src/c_api/c_api.cc | 73 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 166 insertions(+), 1 deletion(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index dd3c225f66bc..2bbda3ddbf0e 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -46,6 +46,8 @@ typedef void *DataIterCreator; typedef void *DataIterHandle; /*! \brief handle to KVStore */ typedef void *KVStoreHandle; +/*! \brief handle to RecordIO */ +typedef void *RecordIOHandle; /*! 
* \brief return str message of the last error * all function in this file will return 0 when success @@ -883,4 +885,53 @@ MXNET_DLL int MXKVStoreSendCommmandToServers(KVStoreHandle handle, int cmd_id, const char* cmd_body); +/** + * \brief Create a RecordIO writer object + * \param uri path to file + * \param out handle pointer to the created object + * \return 0 when success, -1 when failure happens +*/ +MXNET_DLL int MXRecordIOWriterCreate(const char *uri, RecordIOHandle *out); + +/** + * \brief Delete a RecordIO writer object + * \param handle handle to RecordIO object + * \return 0 when success, -1 when failure happens +*/ +MXNET_DLL int MXRecordIOWriterFree(RecordIOHandle handle); + +/** + * \brief Write a record to a RecordIO object + * \param handle handle to RecordIO object + * \param buf buffer to write + * \param size size of buffer + * \return 0 when success, -1 when failure happens +*/ +MXNET_DLL int MXRecordIOWriterWriteRecord(RecordIOHandle *handle, + const char *buf, size_t size); + +/** + * \brief Create a RecordIO reader object + * \param uri path to file + * \param out handle pointer to the created object + * \return 0 when success, -1 when failure happens +*/ +MXNET_DLL int MXRecordIOReaderCreate(const char *uri, RecordIOHandle *out); + +/** + * \brief Delete a RecordIO reader object + * \param handle handle to RecordIO object + * \return 0 when success, -1 when failure happens +*/ +MXNET_DLL int MXRecordIOReaderFree(RecordIOHandle *handle); + +/** + * \brief Read a record from a RecordIO object + * \param handle handle to RecordIO object + * \param buf pointer to return buffer + * \param size point to size of buffer + * \return 0 when success, -1 when failure happens +*/ +MXNET_DLL int MXRecordIOReaderReadRecord(RecordIOHandle *handle, + char const **buf, size_t *size); #endif // MXNET_C_API_H_ diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 7b0ce9d5a9bd..d6aec6509b85 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -54,6 +54,7 @@ def _load_lib(): DataIterCreatorHandle = ctypes.c_void_p DataIterHandle = ctypes.c_void_p KVStoreHandle = ctypes.c_void_p +RecordIOHandle = ctypes.c_void_p #---------------------------- # helper function definition #---------------------------- diff --git a/python/mxnet/io.py b/python/mxnet/io.py index 5ec7de10bd82..0954deb9c993 100644 --- a/python/mxnet/io.py +++ b/python/mxnet/io.py @@ -11,7 +11,7 @@ import logging from .base import _LIB from .base import c_array, c_str, mx_uint, py_str -from .base import DataIterHandle, NDArrayHandle +from .base import DataIterHandle, NDArrayHandle, RecordIOHandle from .base import check_call, ctypes2docstring from .ndarray import NDArray from .ndarray import array @@ -258,6 +258,46 @@ def getpad(self): check_call(_LIB.MXDataIterGetPadNum(self.handle, ctypes.byref(pad))) return pad.value +class MXRecordIO(object): + """Python interface for read/write RecordIO data format""" + def __init__(self, uri, flag): + uri = ctypes.c_char_p(uri) + self.handle = RecordIOHandle() + if flag == "w": + check_call(_LIB.MXRecordIOWriterCreate(uri, ctypes.byref(self.handle))) + self.writable = True + elif flag == "r": + check_call(_LIB.MXRecordIOReaderCreate(uri, ctypes.byref(self.handle))) + self.writable = False + else: + raise ValueError("Invalid flag %s"%flag) + + def __del__(self): + if self.writable: + check_call(_LIB.MXRecordIOWriterFree(self.handle)) + else: + check_call(_LIB.MXRecordIOReaderFree(self.handle)) + + def write(self, buf): + """Write a string buffer as a record""" + assert
self.writable + check_call(_LIB.MXRecordIOWriterWriteRecord(self.handle, + ctypes.c_char_p(buf), + ctypes.c_size_t(len(buf)))) + + def read(self): + """Read a record as string""" + assert not self.writable + buf = ctypes.c_char_p() + size = ctypes.c_size_t() + check_call(_LIB.MXRecordIOReaderReadRecord(self.handle, + ctypes.byref(buf), + ctypes.byref(size))) + buf = ctypes.cast(buf, ctypes.POINTER(ctypes.c_char*size.value)) + return buf.contents.raw + + + def _make_io_iterator(handle): """Create an io iterator by handle.""" diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 1f12c7943c7c..aaac3ee61d08 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -993,3 +994,75 @@ int MXKVStoreGetType(KVStoreHandle handle, *CHECK_NOTNULL(type) = static_cast(handle)->type().c_str(); API_END(); } + +struct MXRecordIOContext { + dmlc::RecordIOWriter *writer; + dmlc::RecordIOReader *reader; + dmlc::Stream *stream; + std::string *read_buff; +}; + +int MXRecordIOWriterCreate(const char *uri, + RecordIOHandle *out) { + API_BEGIN(); + dmlc::Stream *stream = dmlc::Stream::Create(uri, "w"); + MXRecordIOContext *context = new MXRecordIOContext; + context->writer = new dmlc::RecordIOWriter(stream); + context->reader = NULL; + context->stream = stream; + context->read_buff = NULL; + *out = reinterpret_cast(context); + API_END(); +} + +int MXRecordIOWriterFree(RecordIOHandle handle) { + API_BEGIN(); + MXRecordIOContext *context = + reinterpret_cast(handle); + delete context->writer; + delete context->stream; + API_END(); +} + +int MXRecordIOWriterWriteRecord(RecordIOHandle *handle, + const char *buf, size_t size) { + API_BEGIN(); + MXRecordIOContext *context = + reinterpret_cast(handle); + context->writer->WriteRecord(reinterpret_cast(buf), size); + API_END(); +} + +int MXRecordIOReaderCreate(const char *uri, + RecordIOHandle *out) { + API_BEGIN(); + dmlc::Stream *stream = dmlc::Stream::Create(uri, "r"); + MXRecordIOContext *context = new MXRecordIOContext; + context->reader = new dmlc::RecordIOReader(stream); + context->writer = NULL; + context->stream = stream; + context->read_buff = new std::string(); + *out = reinterpret_cast(context); + API_END(); +} + +int MXRecordIOReaderFree(RecordIOHandle *handle) { + API_BEGIN(); + MXRecordIOContext *context = + reinterpret_cast(handle); + delete context->reader; + delete context->stream; + delete context->read_buff; + API_END(); +} + +int MXRecordIOReaderReadRecord(RecordIOHandle *handle, + char const **buf, size_t *size) { + API_BEGIN(); + MXRecordIOContext *context = + reinterpret_cast(handle); + context->reader->NextRecord(context->read_buff); + *buf = context->read_buff->c_str(); + *size = context->read_buff->size(); + API_END(); +} From 052471e72cee530573e348d11754c1b7268b601e Mon Sep 17 00:00:00 2001 From: Junyuan Xie Date: Tue, 27 Oct 2015 19:38:59 -0700 Subject: [PATCH 120/122] move recordio interface to recordio.py --- python/mxnet/__init__.py | 1 + python/mxnet/io.py | 41 ------------------------ python/mxnet/recordio.py | 67 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 41 deletions(-) create mode 100644 python/mxnet/recordio.py diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index 5b216a53596b..a036e003ba77 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -12,6 +12,7 @@ from . import symbol as sym from . import symbol from . import io +from . 
import recordio # use mx.nd as short for mx.ndarray from . import ndarray as nd # use mx.rnd as short for mx.random diff --git a/python/mxnet/io.py b/python/mxnet/io.py index 0954deb9c993..c11c3d21194f 100644 --- a/python/mxnet/io.py +++ b/python/mxnet/io.py @@ -258,47 +258,6 @@ def getpad(self): check_call(_LIB.MXDataIterGetPadNum(self.handle, ctypes.byref(pad))) return pad.value -class MXRecordIO(object): - """Python interface for read/write RecordIO data format""" - def __init__(self, uri, flag): - uri = ctypes.c_char_p(uri) - self.handle = RecordIOHandle() - if flag == "w": - check_call(_LIB.MXRecordIOWriterCreate(uri, ctypes.byref(self.handle))) - self.writable = True - elif flag == "r": - check_call(_LIB.MXRecordIOReaderCreate(uri, ctypes.byref(self.handle))) - self.writable = False - else: - raise ValueError("Invalid flag %s"%flag) - - def __del__(self): - if self.writable: - check_call(_LIB.MXRecordIOWriterFree(self.handle)) - else: - check_call(_LIB.MXRecordIOReaderFree(self.handle)) - - def write(self, buf): - """Write a string buffer as a record""" - assert self.writable - check_call(_LIB.MXRecordIOWriterWriteRecord(self.handle, - ctypes.c_char_p(buf), - ctypes.c_size_t(len(buf)))) - - def read(self): - """Read a record as string""" - assert not self.writable - buf = ctypes.c_char_p() - size = ctypes.c_size_t() - check_call(_LIB.MXRecordIOReaderReadRecord(self.handle, - ctypes.byref(buf), - ctypes.byref(size))) - buf = ctypes.cast(buf, ctypes.POINTER(ctypes.c_char*size.value)) - return buf.contents.raw - - - - def _make_io_iterator(handle): """Create an io iterator by handle.""" diff --git a/python/mxnet/recordio.py b/python/mxnet/recordio.py new file mode 100644 index 000000000000..dc07af796bde --- /dev/null +++ b/python/mxnet/recordio.py @@ -0,0 +1,67 @@ +# coding: utf-8 +# pylint: disable=invalid-name, protected-access, fixme, too-many-arguments + +from __future__ import absolute_import + +import ctypes +from .base import _LIB +from .base import RecordIOHandle +from .base import check_call + +class MXRecordIO(object): + """Python interface for read/write RecordIO data format + + Parameters + ---------- + uri : string + uri path to recordIO file. + flag : string + "r" for reading or "w" for writing. + """ + def __init__(self, uri, flag): + uri = ctypes.c_char_p(uri) + self.handle = RecordIOHandle() + if flag == "w": + check_call(_LIB.MXRecordIOWriterCreate(uri, ctypes.byref(self.handle))) + self.writable = True + elif flag == "r": + check_call(_LIB.MXRecordIOReaderCreate(uri, ctypes.byref(self.handle))) + self.writable = False + else: + raise ValueError("Invalid flag %s"%flag) + + def __del__(self): + if self.writable: + check_call(_LIB.MXRecordIOWriterFree(self.handle)) + else: + check_call(_LIB.MXRecordIOReaderFree(self.handle)) + + def write(self, buf): + """Write a string buffer as a record + + Parameters + ---------- + buf : string + buffer to write. + """ + assert self.writable + check_call(_LIB.MXRecordIOWriterWriteRecord(self.handle, + ctypes.c_char_p(buf), + ctypes.c_size_t(len(buf)))) + + def read(self): + """Read a record as string + + Returns + ---------- + buf : string + buffer read.
+ """ + assert not self.writable + buf = ctypes.c_char_p() + size = ctypes.c_size_t() + check_call(_LIB.MXRecordIOReaderReadRecord(self.handle, + ctypes.byref(buf), + ctypes.byref(size))) + buf = ctypes.cast(buf, ctypes.POINTER(ctypes.c_char*size.value)) + return buf.contents.raw \ No newline at end of file From 7fb08dcfb3e18019e4330d64c3c441f4a91c6524 Mon Sep 17 00:00:00 2001 From: Junyuan Xie Date: Thu, 29 Oct 2015 00:14:37 -0700 Subject: [PATCH 121/122] lint fix --- python/mxnet/io.py | 2 +- python/mxnet/recordio.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/mxnet/io.py b/python/mxnet/io.py index c11c3d21194f..c9c96b57d0a9 100644 --- a/python/mxnet/io.py +++ b/python/mxnet/io.py @@ -11,7 +11,7 @@ import logging from .base import _LIB from .base import c_array, c_str, mx_uint, py_str -from .base import DataIterHandle, NDArrayHandle, RecordIOHandle +from .base import DataIterHandle, NDArrayHandle from .base import check_call, ctypes2docstring from .ndarray import NDArray from .ndarray import array diff --git a/python/mxnet/recordio.py b/python/mxnet/recordio.py index dc07af796bde..5346230f5101 100644 --- a/python/mxnet/recordio.py +++ b/python/mxnet/recordio.py @@ -1,6 +1,7 @@ # coding: utf-8 # pylint: disable=invalid-name, protected-access, fixme, too-many-arguments +"""Python interface for DLMC RecrodIO data format""" from __future__ import absolute_import import ctypes @@ -64,4 +65,4 @@ def read(self): ctypes.byref(buf), ctypes.byref(size))) buf = ctypes.cast(buf, ctypes.POINTER(ctypes.c_char*size.value)) - return buf.contents.raw \ No newline at end of file + return buf.contents.raw From b2a1d133dd91131873ff042e9aa58e449d2caa78 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Thu, 29 Oct 2015 10:31:26 -0600 Subject: [PATCH 122/122] [Example] fix convention --- example/cifar10/cifar10.py | 6 +++--- example/mnist/lenet.py | 4 ++-- example/mnist/mlp.py | 2 +- example/mnist/mlp_numpy.py | 2 +- ps-lite | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/example/cifar10/cifar10.py b/example/cifar10/cifar10.py index ce51758ba962..efb1122504a0 100644 --- a/example/cifar10/cifar10.py +++ b/example/cifar10/cifar10.py @@ -61,7 +61,7 @@ def SimpleFactory(data, ch_1x1, ch_3x3): get_data.GetCifar10() batch_size = 128 -num_round = 10 +num_epoch = 10 num_gpus = 1 train_dataiter = mx.io.ImageRecordIter( @@ -84,12 +84,12 @@ def SimpleFactory(data, ch_1x1, ch_3x3): def test_cifar(): logging.basicConfig(level=logging.DEBUG) gpus = [mx.gpu(i) for i in range(num_gpus)] - model = mx.model.FeedForward(ctx=gpus, symbol=softmax, num_round = num_round, + model = mx.model.FeedForward(ctx=gpus, symbol=softmax, num_epoch=num_epoch, learning_rate=0.05, momentum=0.9, wd=0.0001, initializer=mx.init.Uniform(0.07)) model.fit(X=train_dataiter, eval_data=test_dataiter, - epoch_end_callback=mx.callback.Speedometer(batch_size)) + batch_end_callback=mx.callback.Speedometer(batch_size)) if __name__ == "__main__": test_cifar() diff --git a/example/mnist/lenet.py b/example/mnist/lenet.py index 6d185f8d278c..40779150ccfb 100644 --- a/example/mnist/lenet.py +++ b/example/mnist/lenet.py @@ -33,7 +33,7 @@ # dev = [mx.gpu(i) for i in range(2)] dev = mx.gpu() model = mx.model.FeedForward( - ctx = dev, symbol = lenet, num_round = 20, + ctx = dev, symbol = lenet, num_epoch = 20, learning_rate = 0.05, momentum = 0.9, wd = 0.00001) model.fit(X=train, eval_data=val, - epoch_end_callback=mx.callback.Speedometer(100)) + batch_end_callback=mx.callback.Speedometer(100)) diff --git 
a/example/mnist/mlp.py b/example/mnist/mlp.py index 7facf2d3bc50..0cfffe55cbe4 100644 --- a/example/mnist/mlp.py +++ b/example/mnist/mlp.py @@ -22,7 +22,7 @@ logging.basicConfig(level=logging.DEBUG) model = mx.model.FeedForward( - ctx = mx.cpu(), symbol = mlp, num_round = 20, + ctx = mx.cpu(), symbol = mlp, num_epoch = 20, learning_rate = 0.1, momentum = 0.9, wd = 0.00001) model.fit(X=train, eval_data=val) diff --git a/example/mnist/mlp_numpy.py b/example/mnist/mlp_numpy.py index af1cd011b148..114a6bf257d5 100644 --- a/example/mnist/mlp_numpy.py +++ b/example/mnist/mlp_numpy.py @@ -39,7 +39,7 @@ logging.basicConfig(level=logging.DEBUG) model = mx.model.FeedForward( - ctx = mx.cpu(), symbol = mlp, num_round = 20, + ctx = mx.cpu(), symbol = mlp, num_epoch = 20, learning_rate = 0.1, momentum = 0.9, wd = 0.00001) # train by using Numpy ndarray directly diff --git a/ps-lite b/ps-lite index 7121aa1bdb67..0cc04093f7c9 160000 --- a/ps-lite +++ b/ps-lite @@ -1 +1 @@ -Subproject commit 7121aa1bdb673f047c7600eb4347fd2911021710 +Subproject commit 0cc04093f7c9e07155f585552f31a90715bacef6
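The RecordIO patches above add a self-contained Python read/write path on top of the C API. A round-trip sketch of that interface, assuming the patched package is importable as `mx`; 'tmp.rec' is a hypothetical file name, and the handle is only freed (and the output stream closed) when the object is deleted:

    import mxnet as mx

    # write two records
    record = mx.recordio.MXRecordIO('tmp.rec', 'w')
    record.write('record 0')
    record.write('record 1')
    del record  # __del__ frees the writer handle and closes the underlying stream

    # read them back in order; each read() returns the next record's bytes
    record = mx.recordio.MXRecordIO('tmp.rec', 'r')
    print(record.read())  # 'record 0'
    print(record.read())  # 'record 1'
    del record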