From 0c77f9ff6da8854bda95bf95df7b8b369676d6a4 Mon Sep 17 00:00:00 2001
From: qcl6355 <qcl6355@gmail.com>
Date: Mon, 30 May 2016 12:37:26 +0800
Subject: [PATCH 001/126] update text_cnn_classification (auto learn word
 embedding)

---
 example/cnn_text_classification/text_cnn.py | 87 +++++++++++++++++----
 1 file changed, 72 insertions(+), 15 deletions(-)

diff --git a/example/cnn_text_classification/text_cnn.py b/example/cnn_text_classification/text_cnn.py
index c944ec5c9270..4ce48a94ac77 100644
--- a/example/cnn_text_classification/text_cnn.py
+++ b/example/cnn_text_classification/text_cnn.py
@@ -13,20 +13,28 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__) # get a logger to accuracies are printed
 
+logs = sys.stderr
+
 CNNModel = namedtuple("CNNModel", ['cnn_exec', 'symbol', 'data', 'label', 'param_blocks'])
 
-def make_text_cnn(sentence_size, num_embed, batch_size, num_label=2, filter_list=[3, 4, 5], num_filter=100, dropout=0.):
+def make_text_cnn(sentence_size, num_embed, batch_size, vocab_size,
+        num_label=2, filter_list=[3, 4, 5], num_filter=100,
+        dropout=0., with_embedding=True):
+
     input_x = mx.sym.Variable('data') # placeholder for input
     input_y = mx.sym.Variable('softmax_label') # placeholder for output
 
     # embedding layer
-    # embed_layer = mx.sym.Embedding(data=input_x, input_dim=vocab_size, output_dim=num_embed, name='vocab_embed')
-    # embed_layer = mx.sym.Reshape(data=embed_layer, target_shape=(1, 1, sentence_size, num_embed))
+    if not with_embedding:
+        embed_layer = mx.sym.Embedding(data=input_x, input_dim=vocab_size, output_dim=num_embed, name='vocab_embed')
+        conv_input = mx.sym.Reshape(data=embed_layer, target_shape=(batch_size, 1, sentence_size, num_embed))
+    else:
+        conv_input = input_x
 
     # create convolution + (max) pooling layer for each filter operation
     pooled_outputs = []
     for i, filter_size in enumerate(filter_list):
-        convi = mx.sym.Convolution(data=input_x, kernel=(filter_size, num_embed), num_filter=num_filter)
+        convi = mx.sym.Convolution(data=conv_input, kernel=(filter_size, num_embed), num_filter=num_filter)
         relui = mx.sym.Activation(data=convi, act_type='relu')
         pooli = mx.sym.Pooling(data=relui, pool_type='max', kernel=(sentence_size - filter_size + 1, 1), stride=(1,1))
         pooled_outputs.append(pooli)
@@ -54,12 +62,18 @@ def make_text_cnn(sentence_size, num_embed, batch_size, num_label=2, filter_list
     return sm
 
 
-def setup_cnn_model(ctx, batch_size, sentence_size, num_embed, dropout=0.5, initializer=mx.initializer.Uniform(0.1)):
-    cnn = make_text_cnn(sentence_size, num_embed, batch_size=batch_size, dropout=dropout)
+def setup_cnn_model(ctx, batch_size, sentence_size, num_embed, vocab_size,
+        dropout=0.5, initializer=mx.initializer.Uniform(0.1), with_embedding=True):
+
+    cnn = make_text_cnn(sentence_size, num_embed, batch_size=batch_size,
+            vocab_size=vocab_size, dropout=dropout, with_embedding=with_embedding)
     arg_names = cnn.list_arguments()
 
     input_shapes = {}
-    input_shapes['data'] = (batch_size, 1, sentence_size, num_embed)
+    if with_embedding:
+        input_shapes['data'] = (batch_size, 1, sentence_size, num_embed)
+    else:
+        input_shapes['data'] = (batch_size, sentence_size)
 
     arg_shape, out_shape, aux_shape = cnn.infer_shape(**input_shapes)
     arg_arrays = [mx.nd.zeros(s, ctx) for s in arg_shape]
@@ -88,7 +102,8 @@ def setup_cnn_model(ctx, batch_size, sentence_size, num_embed, dropout=0.5, init
     return CNNModel(cnn_exec=cnn_exec, symbol=cnn, data=data, label=label, param_blocks=param_blocks)
 
 
-def train_cnn(model, X_train_batch, y_train_batch, X_dev_batch, y_dev_batch, batch_size, optimizer='rmsprop', max_grad_norm=5.0, learning_rate=0.001, epoch=200):
+def train_cnn(model, X_train_batch, y_train_batch, X_dev_batch, y_dev_batch, batch_size,
+        optimizer='rmsprop', max_grad_norm=5.0, learning_rate=0.0005, epoch=200):
     m = model
     # create optimizer
     opt = mx.optimizer.create(optimizer)
@@ -139,13 +154,25 @@ def train_cnn(model, X_train_batch, y_train_batch, X_dev_batch, y_dev_batch, bat
         # decay learning rate
         if iteration % 50 == 0 and iteration > 0:
             opt.lr *= 0.5
-            print >> sys.stderr, 'reset learning rate to %g' % opt.lr
+            print >> logs, 'reset learning rate to %g' % opt.lr
 
         # end of training loop
         toc = time.time()
-        print >> sys.stderr, 'Iter [%d] Train: Time: %.3f, Training Accuracy: %.3f' % (iteration, toc - tic, num_correct * 100 / float(num_total))
+        train_time = toc - tic
+        train_acc = num_correct * 100 / float(num_total)
+
+        # saving checkpoint
+        if (iteration + 1) % 10 == 0:
+            prefix = 'cnn'
+            m.symbol.save('checkpoint/%s-symbol.json' % prefix)
+            save_dict = {('arg:%s' % k) :v  for k, v in m.cnn_exec.arg_dict.items()}
+            save_dict.update({('aux:%s' % k) : v for k, v in m.cnn_exec.aux_dict.items()})
+            param_name = 'checkpoint/%s-%04d.params' % (prefix, iteration)
+            mx.nd.save(param_name, save_dict)
+            print >> logs, 'Saved checkpoint to %s' % param_name
+
 
-        # eval on dev set
+        # evaluate on dev set
         num_correct = 0
         num_total = 0
         for begin in range(0, X_dev_batch.shape[0], batch_size):
@@ -161,7 +188,9 @@ def train_cnn(model, X_train_batch, y_train_batch, X_dev_batch, y_dev_batch, bat
             num_correct += sum(batchY == np.argmax(m.cnn_exec.outputs[0].asnumpy(), axis=1))
             num_total += len(batchY)
 
-        print >> sys.stderr, 'Dev Accuracy thus far: %.3f' % ( num_correct * 100 / float(num_total) )
+        dev_acc = num_correct * 100 / float(num_total)
+        print >> logs, 'Iter [%d] Train: Time: %.3fs, Training Accuracy: %.3f \
+                --- Dev Accuracy thus far: %.3f' % (iteration, train_time, train_acc, dev_acc)
 
 
 def main():
@@ -170,7 +199,6 @@ def main():
     word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec')
     x, y = data_helpers.load_data_with_word2vec(word2vec)
 
-
     # randomly shuffle data
     np.random.seed(10)
     shuffle_indices = np.random.permutation(np.arange(len(y)))
@@ -194,9 +222,38 @@ def main():
     print 'embedding size', num_embed
     batch_size = 50
 
-    cnn_model = setup_cnn_model(mx.gpu(0), batch_size, sentence_size, num_embed, dropout=0.5)
+    cnn_model = setup_cnn_model(mx.gpu(1), batch_size, sentence_size, num_embed, vocab_size=None, dropout=0.5)  # vocab_size is required by the new signature but only read when with_embedding=False
+    train_cnn(cnn_model, x_train, y_train, x_dev, y_dev, batch_size)
+
+def train_without_pretrained_embedding():
+    """Train the text CNN on data_helpers.load_data() with a learned embedding layer."""
+    x, y, vocab, vocab_inv = data_helpers.load_data()
+    vocab_size = len(vocab)
+    # randomly shuffle data (fixed seed, so the permutation is reproducible)
+    np.random.seed(10)
+    shuffle_indices = np.random.permutation(np.arange(len(y)))
+    x_shuffled = x[shuffle_indices]
+    y_shuffled = y[shuffle_indices]
+
+    # split train/dev set: the last 1000 shuffled examples form the dev set
+    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
+    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
+    print 'Train/Dev split: %d/%d' % (len(y_train), len(y_dev))
+    print 'train shape:', x_train.shape
+    print 'dev shape:', x_dev.shape
+    print 'vocab_size', vocab_size
+    # hyper-parameters; sentence_size is read from the second axis of x_train
+    batch_size = 50
+    num_embed = 300
+    sentence_size = x_train.shape[1]
+
+    print 'batch size', batch_size
+    print 'sentence max words', sentence_size
+    print 'embedding size', num_embed
+    # with_embedding=False: the network gets an mx.sym.Embedding layer and (batch_size, sentence_size) input
+    cnn_model = setup_cnn_model(mx.gpu(0), batch_size, sentence_size, num_embed, vocab_size, dropout=0.5, with_embedding=False)
+    train_cnn(cnn_model, x_train, y_train, x_dev, y_dev, batch_size)
 
 
 if __name__ == '__main__':
-    main()
+    train_without_pretrained_embedding()

From 7b5bb424c9f7a44c2f6a9fe8e07e6d5bf7844f26 Mon Sep 17 00:00:00 2001
From: qcl6355 <qcl6355@gmail.com>
Date: Mon, 30 May 2016 12:39:58 +0800
Subject: [PATCH 002/126] add chinese word segment module using lstm model

---
 example/lstm-word-segment/data_helper.py      | 102 ++++++
 example/lstm-word-segment/lstm.py             | 295 ++++++++++++++++++
 .../lstm-word-segment/predict/cpp/Makefile    |  31 ++
 .../predict/cpp/lstm-word-segment-predict     | Bin 0 -> 36554 bytes
 .../predict/cpp/lstm-word-segment-predict.cc  | 205 ++++++++++++
 .../predict/lstm-word-segment-predict.cc      | 220 +++++++++++++
 .../lstm-word-segment/predict/lstm_predict.py |  68 ++++
 .../predict/mxnet_predict.py                  | 210 +++++++++++++
 example/lstm-word-segment/train.py            |  54 ++++
 9 files changed, 1185 insertions(+)
 create mode 100755 example/lstm-word-segment/data_helper.py
 create mode 100644 example/lstm-word-segment/lstm.py
 create mode 100644 example/lstm-word-segment/predict/cpp/Makefile
 create mode 100755 example/lstm-word-segment/predict/cpp/lstm-word-segment-predict
 create mode 100644 example/lstm-word-segment/predict/cpp/lstm-word-segment-predict.cc
 create mode 100644 example/lstm-word-segment/predict/lstm-word-segment-predict.cc
 create mode 100755 example/lstm-word-segment/predict/lstm_predict.py
 create mode 100644 example/lstm-word-segment/predict/mxnet_predict.py
 create mode 100755 example/lstm-word-segment/train.py

diff --git a/example/lstm-word-segment/data_helper.py b/example/lstm-word-segment/data_helper.py
new file mode 100755
index 000000000000..0632a7e7aba6
--- /dev/null
+++ b/example/lstm-word-segment/data_helper.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+import sys
+import codecs
+import numpy as np
+
+LabelVocab = {'B':0, 'M':1, 'E':2, 'S':3}
+
+def gold_to_conll(infile):
+    """Print a whitespace-segmented UTF-8 corpus to stdout in CoNLL form.
+
+    One "<char>TAB<tag>" line per character (S = one-character word, else
+    B/M/E = first/inner/last character), then a blank line per input line.
+    """
+    with codecs.open(infile, 'r', 'utf-8') as fin:
+        for line in fin:
+            for word in line.strip().split():
+                last = len(word) - 1
+                for idx, char in enumerate(word):
+                    if last == 0:
+                        tag = 'S'
+                    else:
+                        tag = 'B' if idx == 0 else ('E' if idx == last else 'M')
+                    print '%s\t%s' % (char.encode('utf-8'), tag)
+            print
+
+def load_data(infile, vocab=None, train=True):
+    """Read a CoNLL file of "<char>TAB<tag>" lines; blank lines separate sentences.
+
+    Returns (X_data, y_data, vocab): per-sentence lists of character ids and of
+    LabelVocab ids, plus the character vocabulary. When `vocab` is None a new
+    one holding the three reserved symbols is created. With train=True unseen
+    characters are added to `vocab` (mutated in place); otherwise they are
+    mapped to the id of '#_unknown_#'.
+    """
+    if vocab is None:
+        vocab = {'#_beg_#': 0, '#_end_#': 1, '#_unknown_#': 2}
+    X_data = []
+    y_data = []
+    x = []
+    y = []
+    with open(infile) as fin:  # closes the file even if a line is malformed
+        for line in fin:
+            line = line.strip()
+            if line == "":  # sentence boundary
+                if x:
+                    X_data.append(x)
+                    y_data.append(y)
+                    x = []
+                    y = []
+                continue
+            w, label = line.split('\t')
+            y.append(LabelVocab[label])
+            if train and w not in vocab:
+                vocab[w] = len(vocab)
+            x.append(vocab[w] if w in vocab else vocab['#_unknown_#'])
+    if x:  # last sentence when the file does not end with a blank line
+        X_data.append(x)
+        y_data.append(y)
+    return X_data, y_data, vocab
+
+def reshape_data(sentences, labels, vocab, context_size=5, step=10):
+    """Cut sentences into chunks of `step` per-character context windows.
+
+    Each sentence is right-padded with '#_end_#' (label 'S') to the next
+    multiple of `step`; a length that is already a multiple still receives a
+    full extra chunk of padding. Every position then yields the window of
+    2 * padding_num + 1 ids centred on it, with '#_beg_#' / '#_end_#' filling
+    the positions beyond either end of the sentence.
+
+    The input lists are not modified. Returns (x, y) as numpy arrays built
+    from nested lists shaped (chunks, step, window) and (chunks, step).
+    """
+    padding_num = int((context_size - 1) / 2)
+    width = 2 * padding_num + 1
+    beg, end, pad_label = vocab['#_beg_#'], vocab['#_end_#'], LabelVocab['S']
+
+    x = []
+    y = []
+    for sen, label in zip(sentences, labels):
+        add_num = step - len(sen) % step
+        total = len(sen) + add_num
+        # pad copies rather than the caller's lists, so repeated calls on the
+        # same data do not keep growing it
+        label = list(label) + [pad_label] * add_num
+        sen = [beg] * padding_num + list(sen) + [end] * (add_num + padding_num)
+        # `total` is a multiple of `step`; position i sees sen[i:i + width]
+        for start in range(0, total, step):
+            x.append([sen[i:i + width] for i in range(start, start + step)])
+            y.append(label[start:start + step])
+
+    return np.array(x), np.array(y)
+    
+
+if __name__ == '__main__':
+    # manual check: load test.conll, reshape it and print sizes plus the first chunk
+    test_path = "test.conll"
+    x, y, vocab = load_data(test_path)
+    print 'vocab size %d' % (len(vocab))
+    X_data, y_data = reshape_data(x, y, vocab)
+    print X_data.shape, y_data.shape
+    print X_data[0]
+    print y_data[0]
diff --git a/example/lstm-word-segment/lstm.py b/example/lstm-word-segment/lstm.py
new file mode 100644
index 000000000000..20aced9f58bd
--- /dev/null
+++ b/example/lstm-word-segment/lstm.py
@@ -0,0 +1,295 @@
+#!/usr/bin/env python
+
+import sys
+import mxnet as mx
+import numpy as np
+import time
+import math
+from collections import namedtuple
+
+logs = sys.stderr
+
+LSTMState = namedtuple("LSTMState", ['c', 'h'])
+LSTMParam = namedtuple('LSTMParam', ['i2h_weight', 'i2h_bias', 'h2h_weight', 'h2h_bias'])
+LSTMModel = namedtuple('LSTMModel', ['lstm_exec', 'symbol', 'init_states', 'last_states', 'seq_data', 'seq_labels', 'seq_outputs', 'param_blocks'])
+
+
+def lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout):
+    """LSTM memory unit: build one step and return the new LSTMState(c, h); `dropout` is not used here."""
+    i2h = mx.sym.FullyConnected(data=indata, weight=param.i2h_weight, bias=param.i2h_bias,
+                                num_hidden=num_hidden * 4, name='t%d_l%d_i2h' % (seqidx, layeridx))
+    h2h = mx.sym.FullyConnected(data=prev_state.h, weight=param.h2h_weight, bias=param.h2h_bias,
+                                num_hidden=num_hidden * 4, name='t%d_l%d_h2h' % (seqidx, layeridx))
+    gates = i2h + h2h
+    slice_gates = mx.sym.SliceChannel(gates, num_outputs=4, name='t%d_l%d_slice' % (seqidx, layeridx))
+    # the summed projection is split into four outputs, consumed in order below
+    # input gate and input transform
+    input_gate = mx.sym.Activation(slice_gates[0], act_type='sigmoid')
+    input_transform = mx.sym.Activation(slice_gates[1], act_type='tanh')
+    # forget gate
+    forget_gate = mx.sym.Activation(slice_gates[2], act_type='sigmoid')
+    # output gate
+    output_gate = mx.sym.Activation(slice_gates[3], act_type='sigmoid')
+    next_c = (forget_gate * prev_state.c) + (input_gate * input_transform)
+    next_h = output_gate * mx.sym.Activation(next_c, act_type='tanh')
+    # package the new cell state and hidden output
+    return LSTMState(c=next_c, h=next_h)
+
+
+def unroll_lstm(num_lstm_layer, num_hidden, step_size, context_size, vocab_size, num_embed, num_label, dropout=0.):
+    """Build the LSTM tagger symbol unrolled over `step_size` time steps.
+
+    Time step t reads the variable 't<t>_data', embeds it with the shared
+    'embed_weight', passes it through `num_lstm_layer` stacked LSTM cells and
+    classifies the top hidden state with the shared 'cls_weight'/'cls_bias'
+    into `num_label` classes against the variable 't<t>_label'.
+
+    `context_size` is not used while building the graph; the width of the
+    data inputs comes from the input shapes the caller supplies.
+
+    Returns an mx.sym.Group holding the `step_size` softmax outputs followed
+    by the gradient-blocked final cell states and final hidden states of
+    every layer.
+    """
+    # initialize the parameter symbols
+    embed_weight = mx.sym.Variable('embed_weight')
+    cls_weight = mx.sym.Variable('cls_weight')
+    cls_bias = mx.sym.Variable('cls_bias')
+
+    param_cells = []
+    last_states = []
+    for i in range(num_lstm_layer):
+        param_cells.append(LSTMParam(i2h_weight=mx.sym.Variable('l%d_i2h_weight' % i),
+                                     i2h_bias=mx.sym.Variable('l%d_i2h_bias' % i),
+                                     h2h_weight=mx.sym.Variable('l%d_h2h_weight' % i),
+                                     h2h_bias=mx.sym.Variable('l%d_h2h_bias' % i)))
+        state = LSTMState(c=mx.sym.Variable('l%d_init_c' % i), h=mx.sym.Variable('l%d_init_h' % i))
+        last_states.append(state)
+
+    last_hidden = []
+    for seqidx in range(step_size):
+        # embedding layer
+        data = mx.sym.Variable("t%d_data" % seqidx)
+        hidden = mx.sym.Embedding(data=data, weight=embed_weight,
+                input_dim=vocab_size, output_dim=num_embed, name='t%d_embed' % seqidx)
+
+        # stack LSTM
+        for i in range(num_lstm_layer):
+            # the first layer gets a dropout ratio of 0; lstm() currently
+            # ignores the value, so passing `dp` does not change the graph
+            if i == 0:
+                dp = 0.
+            else:
+                dp = dropout
+            next_state = lstm(num_hidden, indata=hidden, prev_state=last_states[i],
+                              param=param_cells[i], seqidx=seqidx, layeridx=i, dropout=dp)
+            hidden = next_state.h
+            last_states[i] = next_state
+
+        # decoder
+        if dropout > 0.:
+            hidden = mx.sym.Dropout(data=hidden, p=dropout)
+        last_hidden.append(hidden)
+
+    # one softmax classifier per time step, all sharing cls_weight / cls_bias
+    out_prob = []
+    for seqidx in range(step_size):
+        fc = mx.sym.FullyConnected(data=last_hidden[seqidx], weight=cls_weight,
+                bias=cls_bias, num_hidden=num_label, name='t%d_cls' % seqidx)
+        label = mx.sym.Variable('t%d_label' % seqidx)
+        sm = mx.sym.SoftmaxOutput(data=fc, label=label, name='t%d_sm' % seqidx)
+        out_prob.append(sm)
+
+    # expose the final states as outputs, wrapped in BlockGrad
+    for i in range(num_lstm_layer):
+        state = last_states[i]
+        state = LSTMState(c=mx.sym.BlockGrad(state.c, name='l%d_last_c' % i),
+                          h=mx.sym.BlockGrad(state.h, name='l%d_last_h' % i))
+        last_states[i] = state
+
+    unpack_c = [state.c for state in last_states]
+    unpack_h = [state.h for state in last_states]
+    list_all = out_prob + unpack_c + unpack_h
+    return mx.sym.Group(list_all)
+
+
+def is_param_name(name):
+    """Return True if `name` ends with 'weight', 'bias', 'gamma' or 'beta'."""
+    return name.endswith(('weight', 'bias', 'gamma', 'beta'))
+
+def setup_lstm_model(ctx, num_lstm_layer, step_size, context_size, num_hidden, num_embed, num_label,
+        batch_size, vocab_size, initializer, dropout=0.):
+    """Unroll, bind and initialize the LSTM; returns an LSTMModel of the executor and its arrays."""
+    lstm_sym = unroll_lstm(num_lstm_layer=num_lstm_layer, num_hidden=num_hidden, step_size=step_size,
+                           context_size=context_size, vocab_size=vocab_size,
+                           num_embed=num_embed, num_label=num_label, dropout=dropout)
+
+    arg_names = lstm_sym.list_arguments()
+    # shapes are given only for state, data and label inputs; the rest come from infer_shape
+    input_shapes = {}
+    for name in arg_names:
+        if name.endswith('init_c') or name.endswith('init_h'):
+            input_shapes[name] = (batch_size, num_hidden)
+        elif name.endswith('data'):
+            input_shapes[name] = (batch_size, context_size)
+        elif name == 'label':
+            input_shapes[name] = (batch_size * step_size, )
+        elif name.endswith('label'):
+            input_shapes[name] = (batch_size,)
+        else:
+            pass
+
+    arg_shape, out_shape, aux_shape = lstm_sym.infer_shape(**input_shapes)
+    arg_arrays = [mx.nd.zeros(s, ctx) for s in arg_shape]
+    args_grad = {}
+    for shape, name, in zip(arg_shape, arg_names):
+        if is_param_name(name):
+            print >> logs, 'parameter argument', name, shape
+            args_grad[name] = mx.nd.zeros(shape, ctx)
+        else:
+            print >> logs, 'input argument', name, shape
+    # bound with grad_req='add'; train_lstm zeroes each gradient array after every update
+    lstm_exec = lstm_sym.bind(ctx=ctx, args=arg_arrays, args_grad=args_grad, grad_req='add')
+    # run the initializer on every parameter and record (index, weight, grad, name)
+    param_blocks = []
+    arg_dict = dict(zip(arg_names, lstm_exec.arg_arrays))
+    for i, name in enumerate(arg_names):
+        if is_param_name(name):
+            initializer(name, arg_dict[name])
+            param_blocks.append( (i, arg_dict[name], args_grad[name], name) )
+        else:
+            assert name not in args_grad
+
+    out_dict = dict(zip(lstm_sym.list_outputs(), lstm_exec.outputs))
+    # look up the input/output arrays by the names assigned in unroll_lstm
+    init_states = [LSTMState(c=arg_dict['l%d_init_c' % i],
+                             h=arg_dict['l%d_init_h' % i]) for i in range(num_lstm_layer)]
+    seq_data = [arg_dict['t%d_data' % i] for i in range(step_size)]
+    last_states = [LSTMState(c=out_dict['l%d_last_c_output' % i],
+                             h=out_dict['l%d_last_h_output' % i]) for i in range(num_lstm_layer)]
+    seq_outputs = [out_dict['t%d_sm_output' % i] for i in range(step_size)]
+    seq_labels = [arg_dict['t%d_label' % i] for i in range(step_size)]
+
+    return LSTMModel(lstm_exec=lstm_exec, symbol=lstm_sym, init_states=init_states,
+                     last_states=last_states, seq_data=seq_data, seq_labels=seq_labels,
+                     seq_outputs=seq_outputs, param_blocks=param_blocks)
+
+
+def set_lstm_inputs(m, x_batch, y_batch):
+    """Copy one mini-batch into the per-time-step input arrays of model `m`.
+
+    For every step t, x_batch[:, t, :] and y_batch[:, t] are written in place.
+    """
+    for seqidx in range(len(m.seq_data)):
+        m.seq_data[seqidx][:] = x_batch[:, seqidx, :]
+        m.seq_labels[seqidx][:] = y_batch[:, seqidx]
+
+
+def train_lstm(model, X_train_batch, y_train_batch, X_val_batch, y_val_batch,
+        num_epoch, optimizer='RMSProp', max_grad_norm=5.0, learning_rate=0.001, **kwargs):
+    """Train `model`, saving a checkpoint and scoring the dev set after every epoch.
+
+    The batches are indexed as x[:, t, :] / y[:, t] per time step (see
+    set_lstm_inputs); a trailing batch smaller than the bound batch size is
+    skipped. Gradients are divided by the batch size and rescaled when their
+    global L2 norm exceeds `max_grad_norm`. **kwargs go to mx.optimizer.create.
+    """
+    import os  # local import: only needed to create the checkpoint directory
+    print >> logs, 'Training with train shape=%s' % str(X_train_batch.shape)
+    print >> logs, 'Training with dev shape=%s' % str(X_val_batch.shape)
+    m = model
+    batch_size = m.seq_data[0].shape[0]
+    step_size = len(m.seq_data)
+    print >> logs, 'batch_size=%d' % batch_size
+    print >> logs, 'step_size=%d' % step_size
+
+    opt = mx.optimizer.create(optimizer, **kwargs)
+    opt.lr = learning_rate
+    updater = mx.optimizer.get_updater(opt)
+
+    # the checkpoint files written below go into this directory, so create it up front
+    if not os.path.isdir('checkpoint'):
+        os.makedirs('checkpoint')
+
+    for iteration in range(num_epoch):
+        # reset states
+        for state in m.init_states:
+            state.c[:] = 0.0
+            state.h[:] = 0.0
+
+        tic = time.time()
+        num_correct = 0.
+        num_total = 0.
+        for begin in range(0, X_train_batch.shape[0], batch_size):
+            batchX = X_train_batch[begin:begin+batch_size]
+            batchY = y_train_batch[begin:begin+batch_size]
+            if batchX.shape[0] != batch_size:
+                continue
+            set_lstm_inputs(m, batchX, batchY)
+
+            m.lstm_exec.forward(is_train=True)
+
+            m.lstm_exec.backward()
+            # transfer the states
+            for init, last in zip(m.init_states, m.last_states):
+                last.c.copyto(init.c)
+                last.h.copyto(init.h)
+
+            # update parameters
+            norm = 0.
+            for idx, weight, grad, name in m.param_blocks:
+                grad /= batch_size
+                l2_norm = mx.nd.norm(grad).asscalar()
+                norm += l2_norm * l2_norm
+            norm = math.sqrt(norm)
+            for idx, weight, grad, name in m.param_blocks:
+                if norm > max_grad_norm:
+                    grad *= (max_grad_norm / norm)
+                updater(idx, grad, weight)
+                # reset gradient to zero
+                grad[:] = 0.0
+
+            pred = np.array([np.argmax(ypred.asnumpy(), axis=1) for ypred in m.seq_outputs])
+            pred = pred.transpose()
+            num_correct += sum((batchY == pred).flatten())
+            num_total += batch_size * step_size
+
+        # end of training epoch
+        toc = time.time()
+        train_acc = num_correct * 100.0 / num_total
+
+        # saving checkpoint
+        prefix = 'lstm'
+        m.symbol.save('checkpoint/%s-symbol.json' % prefix)
+        save_dict = { ('arg:%s' % k) :v  for k, v in m.lstm_exec.arg_dict.items() if is_param_name(k) }
+        save_dict.update({('aux:%s' % k) : v for k, v in m.lstm_exec.aux_dict.items()})
+        param_name = 'checkpoint/%s-%04d.params' % (prefix, iteration)
+        mx.nd.save(param_name, save_dict)
+        print >> logs, 'Saved checkpoint to %s' % param_name
+
+        # evaluate on dev data
+        num_correct = 0.
+        num_total = 0.
+        for begin in range(0, X_val_batch.shape[0], batch_size):
+            batchX = X_val_batch[begin:begin+batch_size]
+            batchY = y_val_batch[begin:begin+batch_size]
+            if batchX.shape[0] != batch_size:
+                continue
+            set_lstm_inputs(m, batchX, batchY)
+            m.lstm_exec.forward(is_train=False)
+            pred = np.array([np.argmax(ypred.asnumpy(), axis=1) for ypred in m.seq_outputs])
+            pred = pred.transpose()
+            num_correct += sum((batchY == pred).flatten())
+            num_total += batch_size * step_size
+
+        dev_acc = num_correct * 100 / float(num_total)
+        print >> logs, 'Iter [%d] Train: Time: %.3fs, Training Accuracy:%.3f---Dev Accuracy thus far: %.3f' \
+            % (iteration, toc - tic, train_acc, dev_acc)
+    
+
+if __name__ == '__main__':
+    # manual check that the model builds and binds; step_size is a required argument
+    lstm_model = setup_lstm_model(ctx=mx.cpu(0), num_lstm_layer=1,
+                                  step_size=10, context_size=7,
+                                  num_hidden=100, num_embed=300,
+                                  num_label=4, batch_size=50, vocab_size=1000,
+                                  initializer=mx.initializer.Uniform(0.1),
+                                  dropout=0.5)
diff --git a/example/lstm-word-segment/predict/cpp/Makefile b/example/lstm-word-segment/predict/cpp/Makefile
new file mode 100644
index 000000000000..5047a15bba65
--- /dev/null
+++ b/example/lstm-word-segment/predict/cpp/Makefile
@@ -0,0 +1,31 @@
+# Special thanks to https://github.com/pertusa for the Makefile
+#
+# Builds the lstm-word-segment-predict example against the MXNet predict API.
+# The MXNet checkout can be overridden from the environment or command line:
+#     make MXNET_ROOT=/path/to/mxnet
+CFLAGS=-std=c++11 -Wno-unknown-pragmas -Wall
+
+# Added for openblas
+# export OPENBLAS_ROOT=/usr/local/opt/openblas
+
+# CFLAGS+= -I${OPENBLAS_ROOT}/include
+# LDFLAGS=-L${OPENBLAS_ROOT}/lib -lopenblas
+
+# Added for opencv
+# CFLAGS+= `pkg-config --cflags opencv`
+# LDFLAGS+=`pkg-config --libs opencv`
+
+# Added for mxnet
+# Default to the repository root, four levels above this directory
+# (example/lstm-word-segment/predict/cpp), instead of a personal home path.
+MXNET_ROOT ?= ../../../..
+export MXNET_ROOT
+
+CFLAGS+= -I$(MXNET_ROOT)/include
+LDFLAGS+=$(MXNET_ROOT)/lib/libmxnet.so
+
+# These targets do not produce files of the same name.
+.PHONY: all clean lint
+
+all: lstm-word-segment-predict
+
+lstm-word-segment-predict: lstm-word-segment-predict.o
+	g++ -O3 -o lstm-word-segment-predict lstm-word-segment-predict.o $(LDFLAGS)
+
+lstm-word-segment-predict.o: lstm-word-segment-predict.cc
+	g++ -O3 -c lstm-word-segment-predict.cc $(CFLAGS)
+
+# -f so that cleaning an already-clean tree is not an error.
+clean:
+	rm -f lstm-word-segment-predict
+	rm -f *.d *.o
+
+# dmlc-core lives at the top of the MXNet checkout.
+lint:
+	python $(MXNET_ROOT)/dmlc-core/scripts/lint.py mxnet "cpp" ./
diff --git a/example/lstm-word-segment/predict/cpp/lstm-word-segment-predict b/example/lstm-word-segment/predict/cpp/lstm-word-segment-predict
new file mode 100755
index 0000000000000000000000000000000000000000..950408da9309b5681c56777f595be9f8f4087810
GIT binary patch
literal 36554
zcmeHw3wTu3wf~u!Oo#}X0Z}6gjA>9%i5Uo(AeIS9;0#VMCE-zT>0~mQKqN0SGZ1`~
z*d*w27^T#|Ra^V#dRxoYwsKo*f!YwBL9dU1Lh(_FigJcX4Fw@SbAM}}y(ecTGgo@Q
z{`-CZ`}M%g-fQi(*Is+=wb$NfpUGKki(PY#27@Fnqx4OQpd(Wq45fWblEceDj?0rS
zkQ~wk=?W<o7&AT&Rzj(k6KOwZEz&ejXF{5SPo7aj<QW|-Bho80G+NLuAt9yI-r(in
zSuLHrL<2=CNj8R{dL--CWeh}G!7?J9MZ(}}yg*AuJra-V9pUwk@OmP(ars3m`bX^O
zQ_joFjSfHrJ1CKg*!N({o0m#I2vkwNZ-GY8x4^+NBHg2*C6S7H-$Ff-^L#HWCtS$e
zOQsi@%P&$f4%gS#RL;n~zP4&gZB2c!c}jEMj43m6v;7U(Q(3=>Khe%xu!yxJ^k@S_
zGLONBY=FvtanlC_hrT>^QA)b?^cTzTyX3RU`xlHt*(`jBmUKF4SB~L{D>oT8XF8+`
zX~ZZD>%uhKD5qpd`<69D6O!m1_{_nl2p<`r`S_IJa|=FneM?KEg-FYIzMQ8gUi&!R
z_)6upYTs6;|KVZAt>IJY|8cUae9XBE{cqg9^on<~c0RQI{f&3+e|y60iFG3i)83sh
z+w$qz13$f@ack}`-uvSk=gQ~i6d(F%a`7S8`foc<m2|z-?SDRX=A+-+JL2L;Mil<;
z(F_tT;q@#!Iyyngv#`fR_zN%z65)%J@cGfO#Pl~M;h&y_{$G>O+mgsLkc7|S;fdur
zl>~o0NxLVK(7&04K0k?^tCHYrlkn+F(y#xMME`$IBIh44X%mg(?MdiwNFt9LlOz$J
z%}MzGTax~&N#uVb3I9p3Sz7|T9Z1q{TatEvk_5jjiJoiWToUznVUl*2CgJ~T5<UMY
z37>;W<QbQQo}76iIWJ1m?zKtyY)`@`9Zob6|MDdC{Yl#GOk!78Ch6~#B=qJa{K+>b
zvXkeM&@WDcmo7})-=#_T-vfLi{VY#He?OLz(Nem!r^Eq@!RwpQbEY&>YMkd_$YG$P
zs|9+_MC!<N;Ct|T0eTxHjg>m9Di|^Y_Da`!d`6=a{o@@3Mz3qp-z!1y;PfrLWCPlv
zb~oMZV3;R3pUKdJ74&C36`WE_?m#)ww>TXPaS5lN&gmyDuh0nj_Y0hVcY%Xpp62*&
zj4#RG<Evm8E9dhKu7`N~Q<z^Qe`lpeU&{Glqp0<Z<6Ag>BT|yHXO4qmy4klRg}I-}
z$<**1EoCzKFB{U|nS8!<$I;&eJ|w@5w`=2!MgdQ9N?Z>D-^}$d#nB(*`q@(GU>qC<
zjnP@b-&?^DLO<W)^zq{*UB9BC&WBx-e}%7pd3LS;`nu+NUx2nz^kD#YPm+87D)&O)
z@*01@*Hl>R@%w##$?YsJaaZ}8IJ&%~u(qMzSMI5-^+|5`^16n4w?E)%3b@@GX+px1
zrKL^2s<}-*)Tybj2}tf+7nB9;Id0kGUlAZ)&N6>lAh*#|)8xFl%wJPn=5dxybr*vw
z&yD5-o|;;am-%Xa-hka7C<fkld(czs4m6-Dz}yvJMcD;qxo&%&yQHDY-Q=lX?sHQ?
zP5p9KFg3?r<g4}sYXe$g6)P1*rA;+;K6jIkYPcI4YHPe}G<G)(3hE9td1?ZFr>oLl
z?5fPc$6kzUy0_NnX)0bN@vb&Bt@bokY5k1$Am3f$X>?b5{0X}|jk@bT?_MVBq&sOJ
zsU&e1-S)5TA?s&NbMZn9i`K0ft1uP~O*C>CEPrva9sP6r>Kbc3ULShtUFr4&o1Ltv
zcvVs3E#;_Kjp6^Bs?R6R(duM2L)tB}L+{L@L{T*)s)j%}K%go<-k!!rUwu_^9Uq~2
zzQ8TPKw~gaw!+g0vrNo+s^I*;$yTS@p_ZC@zYnuESYLB{(C4misPYxp)s?czqfJYO
zqe-Y)I9FSlFW{_)*5b9u;ld`LC*YIdRQ!P|@3q&`J=qNSiN?SRlvKG_`qsF;8VzaO
zp9_D}2;I^!`2$T2Yl;`%jIr|;V?gRXb=t7lr(w#}cwsxWzRF;=(+k&O_pb0XiMi-4
zE*3U$b6H@zH?XGB2Ojom;Jdn}%2&^v4jW3U$|keBa!tVJFTHtH9kl}V4Q@AtY*-D;
zt#W&6Ya6_pQR!u@@i#X3eL5Kzl+{f4`+O^x7uSNtRJXIi4=3eYSyj`-WN(;WT^sbT
z5KYg}x8e2p1Jp#7r<4-ZM{xtR;Pce+;Yrxlg`ngb;ndD@cX^JxxNL^I^yac@?pkmw
zUks{TO>Z^T8q<W_FgV_351bzBJQLbqXs0gvnP$iX*PvJAwQ`;C-9po4fqc%s!Jm+G
zE_wRmQphn~Bh;kOM$qo2A*R_3MSE&}QLfZ@>CN6jGXRa~?tf<hxeb`!RFlZrINSn6
z9qgL5Q<F>fS6u2P<>>oV7@gu@(;e>N!Co6?40V8P9qeFqQ6Hb!$J%pYiuFDp)+DyN
zV6OcQ#*L=Q>@+Kx=B=qmqh$fK#4WZV7;raKYbzQwpo!$qg|DqD_M#m&!j)LT>)l>Y
zz`Ftj<>lze`PoNwo}$4r^6!B)=tU!z{e~vbavvYZ^BE%yE&AYI-rVeN^fmb#>OHkJ
zV74lU_2-7=z5vd1FmoYV-27%f0c%gx6tG93c_J+T2KU0UN+(wSvgz(}*cNh(XRa5z
zq3+UjNMMFT8q#{xQ0p(IWvYI)7^U-3m9XLSFbDF$^}gnSyK-d!OH!;!773GVm<CT<
zjPYaxh}A1PeAw@JeNCE=5!01*c+e<nvDFK#aD%F$i5mr+RlrwUyPR2)eiE>Wt&NVR
zs`K!;n$VaG<QlklhSTLrsH$a7jr9IC^NJRWI&GamZn@X%_7h~6+>7d0!-BCw_=9z9
zP4Q}E82Ug%t%L-9<jTQJY{tH_&R6I6vBX=~NKjqFDojx~ID;?Sr`CBee*p6(iCDt^
zd{v%+2ZWV=zeWdCeU&t?u+TjX<?~$5Ifd@2*;BKpYx$*1fzQph+hf4|7|5O{6)s&`
zFvsb(XHU~*V|ic)Z^S>`+OU-JMw~pF@z2OU1gA1AVZ^^-IGMzDpToEjVL6Qgw}l*!
z40xnOq2b8kW{F(}oRMAt*uXvn(LW8Q?-T}U?G4gw#=Q!iuiEe_y=o2ajD|}y@l6-i
zAI<KSwDREC8r*?hAkER0@72}U%FVj+ROxmIPyFff^#2H5Op_LHp32(EfQL!TIJ|<D
zr%Ee1{Ms`s(YX}q4h{zoJcFD`YUglZ@<!Y_8KsSY>FRe^u#~iy&vN{W--nT=#F;IX
zi2FIhi!)uqJ9PB(3Bj*U9o{LBNNwH<1`!`|4onDf|4sM;Lhwu6#}j_GKq3|Q$Aq7!
z!;AY%!bk6gk<X~8U{p9Bz(@^VHXWX9N?en4c#}XP&C%h>*2I;k!&BSha_I1+YjMds
zJjo=kQXL)x7QL40@Gz|CRiVSfyrS0%9i9(4D{Iu@M?|S5sab~~sl&JE@aa1IeLB2F
zhu@&X3tOh*M|Aj6I{HmIy#Bmsvkrfej(&>{PtP*MwN;0|SRj#h>hPE7@cVW6OLh2e
z9X>;cKcd55ro;E>@HpLwUaAg%c@&H1OFH}&I()wlKTe02JQWNg8=0WPr|IxE9p0+L
zU#Y`q=<u03yiJGyh7LbThtJaCb9DHLI((iEf0YjJ(BZGv;bk3uk`7<0!%x=Xm+J7>
z=<pRf{Ixp#3LXAB9llYApQ6Jz>+sn+e2Wf$y$*k$4xgjLZ_wfGI{YI#yi14Qq{Dwx
zhu^HjPu1bK=<w5Y_^mp8t`6U+!_Uy+_v`RC=<wY-e4Y+}M2FX}7d<-sOdY+d!}CX~
ztW<9OC`}HTKXKS3xxF)BR{za}2ZpVC(st`ABReqtJ&+7cJce(}1P5{iuOOmG&j7B8
zZxU=ExSPQT2{sbk$>0|W26HL0g~7WBCYK%A#Nh1&ldFzwVDN7UCKnxPVenG~)3l2;
zGWZFC$t6cB82lK)<ccGu4BkjEx!{O{!S@qPt~Zjy;O`SmE;nLh@Ert`tBqJ0+(a<B
z*oeg7+XyDt8tMBAh>4X1lS_^CF!)x2$(2UB8N85Sa-oq<2G1v$TxVnpgXa)TE;F);
z!7~XaR~gyB;Hd<Yi;T1|_!@%AHAWg4oJlab#7G5$#}Z7gFjC6k(FBtVj5ruPoM3W&
zksJma2_}~pu`&3w*?`H_MXU_|E5YLlmKc13U~+kpzAvf&?-6VxxQD@S5_~1W-3&fR
za3;Z>41SSda%qt*4BkaBxw6P625%>rTv%iSgMULXxvoeHgP$UpTvnu!!A}rOt}0T&
z;Kv9i7ZoXG@J52kHANf@zMo)nNs$}|f1hA-MG+f=?;x05P{hjMCW6WJL?i~^MliXY
zNZ%K1{0Syk6X{{_tpt;ciF7k~A;IJ-BApDLPcXTN$QB0AA(&i4WD|pD5<H#Y4Gf-2
zFu90G3xlsAm|R1ok-?b+lS_zHFnBD%<O(9C3?5A|xqyfRaH$*`CYzeo&}v)jD+YBL
zdT{IR>TJt|Hc&{?hJqy>G*=217c43$UsNXF^U+ewT{--GmX*@3wyb^wqhxt(T-inV
zG@eJ6NF}LdHlZ^(T73_#%iGZ&T@z$Q<0R*ITkhS1=_QA++e@`%<zw|yDB6BjR=Q;6
zgskjVms8d7vspIqB2oGvf~@?6rD%fCbg2WGR31yWPnyF?NN)A*gd`=LgczyY5|R{i
z67pQ?!ywrwmC0ePai%ir>?C4<i}E=lA8xi0EPmO8tb-M4`Vl`QQFnbFuOH&Z^z#th
zksQ8)>*uI?j!2a^xr+9x^9T+<l(mKQ622?T(ebMV4ru6?Of{;?iiKKGtR9B{l0!vV
z4$@XPtJSQkHqJ&v+?7?@kp?#EzsI2~I&uiTk)Z=wrPN}h`r2nCLm;b>2B;f4Q0;I^
z)Ce%TM*hI*Du}Ly=pJR5XBogomyl=R;p9*oN9yX|$*9}1Hh?PhP*y(@IrMCnM4bpd
zpVb4nU>glmvAt7Pin20jkaAGEjg(FUVFj?A^n-i`@;jKWin6Rk*~y7)D0l${pdl1p
z$nHVH*nn{wkrOZA_XAH$7_u)Rq16gmY8C%65Gh#zbCIWY3)wtgg6u40ufqpf#~`+d
zV@XEpDP5{HW+0x=Dg{vo)m7KQzmoxU10WSPMoVhrN;XglNue&J60qL{Ag{85<+q@e
zF2Zls<vT?_Wd89@HYnPm%ArTL_)x3;Elb<2a2YLk=1G>e&0uYLuCS1Xd=nU1I=Xkz
zt)lwSr!bF@RSqpw&lXv2W=s3kSfgE`pJ_%L`Xy`6ng3BBd#fYUA-R<Iu@AAd9Rh1P
zWRk<ps(jA?-R)Z1A17Sv?ld|7&%yVpB@&yuO)a{VpD|YILaezmETohQVM&chw~@BU
zOh~64+Q2`QwUm(XN6kppe>q2jKE%icC9Bp!=Ah1E#hP~HYumJ@clMi@ux>LLbrogN
zYQ!?+ma$DLqDC>0)DFa?l#Ky3`Uo{T1M9IILPh&qu5bl(Ta=Y2hgW>r+eD+<0wz>%
z3(}6)Aca$T$7On!@lh}Ar+Eh*a3XSs>ftLT>9pk<)+!a8F0hPBjnUzJ3>0*jBDd}}
zII+!>TR*?T()J{TaiQ~Dkdnk?1Fp&4GIcOtwXCa$>t=FpA68@ODhbarIzzMN&}vm3
zbCychGY@D#=v4N~=S~p^5=RcDbUT#;w6C1+3gyU3ZI(?|{_IjdFHt^rDla=jt1`p_
z<jg;3Y5P4yE(y&jX+IknMTPc*veM@~_lYzAK;Ru&>6JsoL8qB+$k}lSnsLDukb2cK
zBu~`uJg+&z9kxNWP!&>3%6PqA+EnauwmC8#Q1}w7-2Q^4&5GH)mDa32%T6Oq^1;0z
z5gJ#oLpzb*0%Mub0;$p08zC{a(o0nN#>J|oJr^}yiYqHez2}pGf%Z;U2zg8UIEMaE
z#xOEvV|baWc<1yZ92u`k%?2?tkF|w%9@6ZI8}l}r(;ej2)JM<IP-r%;*0OQY06Dxq
ztA%8wr2;9|56pAQ(WRaODMqXXd760-!iiEV%&?RLt?ydooe`sa?v2)WP5JvRbN9FQ
zo0V6TotAsA24(A~hR|3`TPd=krWD!o%sZ{08ZB)z7@VfTCQI8D3?8AuZ~{gKTM6#K
z<dML*?JA5+fwD_^wP5m##ma#Kv^qDO5=N<1OoF|OOtt0XV{&-bcKZv}mIm|H-ylqb
z+Pc#qw{}^PUD8*Ok0Spe8-UNMEw{a1ZK?bM`J?!}iqE-f%b7nTJ%MyT2oHns%)V;N
z>$?H;0eBtQhFyZi8z|k0&&xo(Mx?KS;za<bh;--bdqCWe&ygYnnD~#TA^#_QsHx+k
z4meUh|LeqH59$$n(bi!XOxp6ycd~57`Q3qu%Bj$Z`vJcZUSP=I>Ax6I>pmkP^7~c|
z3#I%9*&}eiG=gNxbW#3@F5koRmiFn$QK`V#PZryc2^&$~a4GN6UgDRmw)!dA52j8Z
zRf1g-d-Yk&f?36Z(Uu4Ph?ZuRSlU=x7PQkWXC@58p0vZ}t$@|!Vgf8y)wQPw2DXzB
z9ThN4?B~=vV5nW<DYp;cKJ#*hiF2r}@P8OeIK(u~_W-the=9BhA@iE)HZ;LjAS;#@
zwgTM*H>0gU7lJdbK$ea^CX9O9DbzyK3;&9ldLx#hS#vDy&6s}wjo|x;ww!3#%tvOn
zoWI+0?*S4!eBI;d61wxTS`RuoJVt{Jd)bD}sq7)Y&Uy%ylFHR5Awy(6W^VggOZ!CF
zy!{3H**~B}ij2lyw}bW$?DnDqWm4oU_PZUl&xm|PIY!5}&q)URk5pejz8dRWsRPPU
zHI1YUnY3N1a&(|HC-M&YFch66<-`^h(1Id{NZGB8Qjq9q38LO?W-F_<b09-$q1LfL
zt4I5^svAJ1)w_xD*ES>?=30&^jnZ7kdG~>h>cX8fGdcoh6M(g_f;N^8*2G<`i6Yu{
z3RDfDxc^kIz@0To&z8X*yrhq$eu7D@-8`s2rxNr(4ArPxKgQQZB!#j&atTw~ZZeK;
z5;3~8qJIM*jC=ysg-ZLw^9K~UU+IoSU{d-OBTy8xTKtIA$rmE^D`L=27$}zJ4bz-%
zcl$UI{4lnK-mPM$s^gix@nN`<X=iX4G;ty3QZQjvqAGrlV;ZBFpK(lS6tj^qP$L_L
zy>x>>bDxG`E5BR7Fz7`6>tCo9HWE7Q+k_Q^@gS3s55`o%A1$gE^O8OSGN|O9J#?dK
z@BEi#1>Y%HR`A^d_ip7><mXVX+Dzpbn%`ooQ2Ixnc4;XovL#-T<&Yy<;b}n_e>0Lz
zRF83X8+rQpAT?}G*Efl%1=anavP_WFJ5OptJ;dRafVabjXuD9_-+s{Y;J2{u3qI->
zD2~h~Ow6V>m@YQ7$0OIER5P&lgD|nRA4DtfK;_=U)wrTJ>~wCoBWrnuq#B{6?MF%+
zQU`%Mx-%W@{4VksDphZ=pTrLAyR!10+<L69wA|U%*$Gm)YiB7-x}<M7B=!G-#hL02
zgWRh(Y#s1aNQ>kt{Yd22{&ZRSV6}1C?yb-_rg&%I%(5L+G|<EMKU)bB^t5eMK9H4N
z^3D^pWka`opg)Mp+qKFEQCU{@Y4zXZ^;>Rwbec_)f}<9pruseTaoIleI-m@vchh;J
z+<McbYd{bf9~EKDq8;7KAf58TK=-o9ov4CwiTJgY9b?GKCuQ0J=cRIJUYg8yw6vFd
zU<X7|cG};vcUqQo_L{3VSSIZ{ZLyB{z9oCVWzvDu7I(L0-VwRA6N~WL_g&szSEwY-
ze$b_;PTJoV!`IN>_H|OlHF7AeC{&Qf_WMQoy8~r%=mKmZ1EUCwO|iEVm7Mvn2j1IZ
zMZ0RwM=%3aOqaU=LXf3t`y3<Ck)1B(giHC5R+#UAQVz=_dWTnQ_Yti-M_8W2+PEi8
zn<=}C?I-PTMXFeL9qi!7seHoDZj{e)d{b(qE&UBNCml4|o<2dgH|2_{HfbL}I+4RT
z6L5tlT7jcggy^i56?}gv4sbwyC3vC7#VS!HJnH~noO3WvRDtSX%kOY0RVfa-J(3y9
zbhoO$(@Rr>&LMzy96_6+eaVY_xY?>+9wiAk;tqYQM4fn2Ah)N1mrFUzNIR)(?VYSU
z<e40nw$~s49e3H!s!h=L*|w|-)W^p2Ix0Y;R=V}=T>#~_kyQNVH9&^}Pzr8eU07#q
z>LI4J!Q0p1aw@ml53<Esy#&Rr{R7HwOFQj9SmW9j<tHgQgrK|R@1aTBqXd79l}BS+
z&)Aw6pkJQvz-aREChf3t*2R}q)<sKOI?0bdLQegiF<~KCX<~h%zon(vGO6o`4J?A@
zkoqW7=JQ$i0ib%NrBI3LXQE-Kx#1)lLY=LsUSfIA9x^xK!ZBebE;=dH`=2y%&Z#ww
zGei%U8D#H4`NmS!a&H@xoa;RYehC^FP`}AY@5;)tpWRN!lIqJ5v_flVQC3b&&uM1{
zJ8)sjY{S~3WDto>PtsP6WmUgWTiX7FN->LdAxLOrk)xKQJCR&~+%K`_JKC^~-9eAI
z)t^y!L(>?*_PRd=f>~q7gW6sfVyfR`R3<V7SLj(2Ev>kz#W4vQE@=5)o)j2YV0msX
zZB}VFvg1SQU6=Zc<7oW5SWrk^^TERzN@eB&7`wOL?&t|_iuz+{B$7o-7Fj&AT21HT
zkT*qbCX!c)9kb=hB$TE@t2T91bs}11b=5nW3`bQzX*#q>W#bNV%-%6<nrgFGScTpM
zDtOgAZIF83)Jze2VD_dQ>HW1{|A7TDx-X(WFx?*kpU8B8G1NE3P}8#sqPD0{5w!uJ
zZd2C#YaI~Dj4Ry1dk^yf1F{iyA~87ux*d<uE%Z842X-0}VAM}h7WEI>!MIcT0_zi7
zRBW!$3|FW=tt9_1f$RCv)oFMkOWSQQ=?$^Vyrq2%**VUxoOE{OQi5sD(1<MiTLph~
zWI8aRZ&$@Rg7-B!|F8ZAOGnD6w$A2eQl)hp%m!Bep?Vd`7b;GZZ!ET2+GL=~?}f^<
ztbtLoVvRbs?d09CdZoz<h{Jabo}n#99csFDH|F4L)G(TB`T^Dzn{p&NfvYztUH2UC
zL96!+bmBXZF_CQerXI=%P?y~#v2~frht4ti)<I>1^3fvux61c}sC-5eF7gpZVz3>$
zOCmjt*R>O9`dizGfQzOB%uuv(AKD<3u+qZ&L$>LKMk9Ol=)0om5;&+RwNIdXA1`S?
zDTn7_SbhigVkc--x?vIz-p^TC)HCnH;O=CdY5`r8P}?%T$>)MqeTnG)lR!RW2W|Ft
z6p`0>;REandtwbOxK<e1@yNrd-P_5<3gpCz^m$Yy4>d)44$bKg%LB^%{)mh|Mxowr
z!DCJwkFqF_am1sDc(6Mpwt=-g*eFz`e)%5CJuzH62Dh77FU0&hgJ(}{|3c4_)K#cP
z3lS~;uFy2~!*>CPrYxDvHZeWGVzc53O<4d0?I4Jbb}*C|CnDItyi0^)|DqB=FlgvD
zMCAK*d5H`fm#kcu<x;Lop^25o5VQ#j&q`6Rpg|zpm%CE3#0G|%rwxqyT_!hM)imMN
zj~E8@+ed%~?pGa77@EB)S!~Myu9miTaKN6W1KhD^u>mp%^0sH9N=FYi?AjV+LrrHW
zH*0VaN(W{;O^%MG+9ITW0ZnbEUeMXZarJI$3~4(UD}~sUYUpa9U7>Fjh5TtH?Z+%_
z^jA_YkahIX8ibALF0I916E`}i>-Y}%Yq-aND^kvC0FD24b`)931-O%j%6n&;dhjS^
z%HWC$F~Rnc6c17vDNp@`3dw2i0rRB#WPXipVD=I{YBsBHQIC+5wZckheJinL1EeW^
z=;-N8M16`_-Lr@6Y|!sT@2}W%(iC+%T8T7~nTO2VCfZ;`wCn1ktpI<LdJ$3dfP(Ma
zbi>JcoPt>FHwN0WdR>;R?1fQ{88j+x;d%>JFC_6oH_8}=){isPy9k7PdY$^H90v>a
zg?BK&-$t&tgJ}*2Q0m1fb}2o*53yO)`Vn@nYjK=S2W~SlrsOQT3h5>U%8u?IgQ;-;
z)_>U@v+Q2lx}TqXuYH&K9yK2#^d8bJu)XgOPT_)bx&5p*ce}_AFumq!Gp1`EYcDv?
zeiAsXZDl39i_txR)*ra-TRq(s&LTI;jM0F0=ZCMe$qMc&3)A?aC7uzlw5r9>0Wuml
z_4uEo`?i8|TC3^q4=QNDvE&3iH3}54g&GDISS;4t*xT`m(c5unXjYb{xZ9}>QV>ok
zc4G{B)olZ~aGx*tx4*FVWlcMm(`f9B^TroO+4Vl7^NefZLud;_^7g*X?(eXz_>$kN
zvH=TT&^v~$d&*JBuAT%XZYF|#Y;_IYP18C&W-qBoEugZl;tT?iY8qzui8}qp=)(<@
zcq@=2lo2%d-p-z`6hcqe>J7s|>&)$}?UTW`o!Dt&-!dyp*Z7{d;fvrMS?iP$&1>q4
z(bR9WUY|mh-Y4bIkk&?1YbLF=z(z>a`=(xL+zfw1)ejXciT+eUInD0z@zCgcS7<r9
zFb{s_OV{Kc93dO!{C%shm3O{x#5zIK=1rI3eboXMva$>hzr@`%J9i>|yYO6#t#TOQ
z*|O4IGWiSV&M%C_Eq|Zo-d{t<&ipg<SgLE^7oc%k3ir4SJ^r<{WnTimE({JPdKw;u
zxh8)m@BGY&V*=;SV@A3CU|_UM`3m++cPl?eKZ&-)@R24S$)+49@fzP67%0J<g{cpy
zrvbPW{Hd7X6A~F&>kzSvF!1zyie?A|;|^YoHj3?MaYE4BzT-AHWOXEQ3Fo$xjYPf$
zvVtW}<s7NdsT?BPc!}*X$qME0S{t{-CH?e_&lOsorj9_@+Tf?pkdd(+Ihah473@}b
zS?<lIbt63M5V0qr7oa&>VY*z(XHI3G`qMWdSSV<<lc6#TafKJb2+syD?_I>~-x)H<
zN~%oXIVG5%vt3v!ymRTyC6p?kJM9dQIP3~tP}2TF@O8`qjBu)~1aTiTN^bq8WNG^u
zT1GvmlKYjc{3(cuc(<hf9?h~t8*I2GFd+pxl|MOm9yi?_PWdLzZRnULPY#W8Dr2w;
zLw1|oTAaa-S>Q&+x_~oDS12gO)(h?@Ih;xgb_hPJll(qA)A2eWw0tg@R-$Ab1_?av
zBI|r*OkVHLu+M>Tkt?7&^~N{I2D{Wkq!4eCIu<&^W>4M7(xd7%tl+r%SETBfq(Bq~
zdEv`Q2c0kNpUUkAg%7b7g)VW0Zb>Wg!qR%!fz>cNucFa{h%x6?#reAe-bCkBa{g=l
zyh>IAS!wE>Brj@TEQbK{ldI4rba17VVsXdW)qApnr?eldo4y2TYv+jG^g$8jaKVVE
z6D4`S*8XrOud~1PXrAovi}SL-SB}Km-*~|1V}DNkQbCzYMloAfUXep{=@E1&9i~$f
zPB~l>!Y(P+{z`Du6}3Oqi?zQS&@7G&=`l8(&AHQLhTQ&2a59#mGs5c=8evAP5khZd
zhrJ!uxW8Nk+Z)4e55^1cf}xp10>k!PN<T9{lV*OnH|%{{J4Y1u_lLt|QC;e8q|E*v
zBOSrSM$<sU5kG|j=JE9l!bsX3R{RMTxf0|t8-}X~Hw-ucpye$MbiJ<-&kn}>sCo%_
z>o*X1D$38_m_4vx7ZfaZDqrzmp|KyQVxHoM22UR%aq2O*=s_EP69Bis&R`FbRc6Z}
z*vrisQGU++mjWy0(B-tyjiM47g#A=6{}Alj1~XDOlM1i{S9S+7(Me&{@X_j*XiaRC
zx(c<Wl|3(YDSshPf0Jh7*51i{AHjYP;m6{mJ^v53srQ2gcDC)S$tj1i_l;d_NcZzV
zLaMev5OplHMY`I^(xd7Lq=WP4YxNU){qL@y`$#+}H_I8iluvjWbKTiBjE)z&9NJRf
z<>2Ze55Uzk9!n!UfVKA;F}_3fa~lNyZ}c<Rb!NA0z3-qMC7FiU+f8D}`qvI)4~vP}
zNf2OsG?%rXsiZb5tpv`Q)?*rDMRiT?lv{iHT?Q5RH-XWbKZ7+XM!74MdgjDWUfQ-Y
zPP<%6M4LD82_xmuR9UghT}JJAWfwLOJMn9z6Tx?#;ibe{o_s{!shUc{ckQ)CcURWP
zli(%xn{);QFEe;16I?9k_d1m$!DEA4&aYm>4gmL_SoBMjyU+o63o@Wdx)q7vZ-mFt
z+P~^r>__@xX_2qKqWRp_``lJQv`L-Q-`=_QCp04`sH?!YcZ0l*ig001{Nn0;SfHId
zKTKig9nMh9jlpP5G)tQY)U=oFeP_syY=UYdu7~skhUXDW3X}nw%4hi&!xf%bYPI1J
zQFzKyq-CzjAK2fLcYbN2&6^rLK&EywFkaXTdlGWC2s{SHyp5I(kR*5_ZVu_5hn?OH
zpGfWkvuroiO@Ae(zY0C|R|jnLhq<=E3ZISsd^g)xiWepLeKzk3pLeBAqt4F09{(Ap
zv7x3OFQ)JZ>ZbVD)KxarX5WUt6Xi@3)8*vk=Vv#1nml!WL1!B-*|M{<ZTO2V`YZ3@
zlJ=^CnKRiRnrrWW(B20iG3iJ$5Le?Z6TYf!Df;Kicx}YYnf@UDn7v6t<E!we%9ZXq
zPou>CTCg^U{av}+D~a3+sZ?6Tg+PK08oQ$D<)$0=07E}0&*>f*u%e(3<Curkih0wC
zyZuhw)Muo^Q6lXDUC*(Bfo7aCbmQRZz7*8M1B7PKZNT%<M%3%Uqx4RsrSx!}V5Ic4
zXv67&feNI3Nbf`1c?NVeJZDi({LT#wY(?4v16Gl4_zZG@|CY}O298h)Z!!t(RKT=z
zAWt{aW~6k{59sMK+_6wHG+Paqj~JG=!7$89c=q=^ko_7)p?xK$m!vFCpYbYWgA7u-
zb#8jbe9MKa(^{n2V{g1>T2>};p!!AlY&keEa4{hY(ye9bX@l?QyQ9>_BpGJP$#h62
zO&&NUkWw2Ncq&UjmP%<`<dEj1TOTmaNzVwI=A_%!rxc}6T4ydu&uLAapWc~bT45ZX
zo>P!MsUY138qm&3PrI4^fPpUJ^%6cA_(4N8As|PqabCLZw&Cg4g7mZkPB;riXYi>2
zokDcrv))*gp0UmZzO5-Mho@(tDr0SQL;+D<%0(6W0qYjje>y?^qVyc2JUrc2AlUGl
zBp<z-Y7%T>G1WsO>!{sUQ&GCjIG>S2PU8O%=qf<hO>}Yc5&z{0_*a4SCDiLjJqJ>%
z_aN!=0n%kSWls8}^`y&nsRikIt;6P}JDP{5=K%qQf*`8U;tP$N6Bf+X6iDOtHu!hL
zmUkmf*{Y4(gT~@$TgB9tliHH0t)g^?aR59adR$}B4UGpq^Qyo>%}2<S++mV?J&ngY
zvVm4Jb^4XzY~-O=P+)ktUav!yJ*aQR9QqZnzus6#;|aaCqJEC?$GYJl`F{fX4WOrY
zHKp{Nhy10c6hp!e6kSOCJ_4Uo%w0MyOi5ep&~%R>eZW+lK=-7NmlNt3eIkCdFdrPy
z#}dr(lpKv8pBLb_J_TBg&WjeasUg9<C=&C6_^k)OZpe2E`INiPPrgL9f$B`Y+!XjG
z_$c<ysX;rFqkV<axt$fIR~Y*e_H~{nVG(#<2Yy@NL(7;Si0SX39{|tR)S~oK;{kYq
z!dNd*c%f7dJ^}ENUl|zqH#~$$`MK!lyr^9cnFAZlrkZp}SD45$skfSc%fdKp!tuma
zgeXe4E`xs~xn}`C2|n6Kc%}z72iTaiIUq;pKuNlzHla$#penult`vM=MlsjGfFb*0
zls#iGZ%dV4Hkgl@rK5&v9jVgO#%{weQ>BN^W&m5wS0n$6R8ZpY(o7Q#mtT+UvdzYJ
zli>#@^TQ_7uTAFPnxqaB&@j=vlhSpX%qL9JC-HP+{iyhW!F<YK__YDqdyM9FM(GDe
zjh!YkWR-tqFyD89)NL@ooF;u>$PZs29X5syzetmwFq`rHvU%puhfBv(?|yN()HlqG
z;vc4s0e*j)Lz4CmH>2?2a5D<QE5$z>eHk!}Yd3_U7S&|_y%~Ko|Hv#ooMQeoMcQaa
z>mc}v;oCWq^t|Du;dkSu*G%T0kC#3$neQDhZA_v3zo(c7#z|XK%;&~QFQ%B^7$^NT
z#e86#bg$XGWt{ZmVG}lwlU^NW-ZV~%3^PA6PWn^YJH{_BllER{K0a3J8fixU?UCm8
zvC>9M9rE8F<5`y>9k_TT^6i(Hk^j{tuNwgT<uZtK`m&#!kl!-i3|TtI1M|=8pBnh5
z2L7pme`?^L8u-U(VAC`QYiP?<Ep6ejXdUZ6yW&%Nw+meY|5N&znGU9R%jjWxSBxH}
zcgxT<LQA6WF&)Wa@!nMWgKWCQ`_;v}C)n8;@c3gqcJ0S6S?OZ;tVr3jQ>6ISc?01e
zF*+!TzA<OTg&NFngQZ5326Hi`pK|%czL=fSp+383MJg_ALA9%*f~8_lb{}8OX}?An
zd!~z&&I{;jyv)H;fp5vsU?F!OE2TH?XqRXox9+SzTQ1e;ajwK*J{%HmGZ~!9%X#(x
z=ZScaeSAYR@Jx`dxjbFW(`ufs;^{h`KFrf6dHNhr_wn>FPe0`8S)Qiaw017x=~X<P
z!PB`sUCh&Jp048QI-Wkv(<gcQ98dT0^e|69<mp+Sre4Y0=jl~Eox#(&JYCGwYM!p*
z={lZ1%+n`%`W#R9@$@iHKji6Io~CB<_IY|0PiOFSE>9Qpw3?@@c)E_K5A*a%o<7IZ
zeLOwPQwbN|s5R3z32)w?Y|G8g%bsSNnq$w+$+z2WlNS1_Y_cby;iu$JmZWT)0pj(r
zNb$-qmaY&v925GQ8l~*|hJY^{@7S6W@GR%h^7>$QWv~X}0cxtGY=Y>8u2Oc@ntJfl
zQaaG1l2v#QT@4Nx^?(~?O}<(WRp7bC+JKZz?=zIL@uD$&v-g;0H#M+>z--?NcQwu*
z@vhk_)D;kz;%#QWX0NX?;AU^R)p)o)O--IP8V`}b&5Md?&r?_9MZE?zfv;@VuGR+L
zOeSS}8|vzO^#M}k-{B&^NWR#}dF2@$EGK?dF8m@dj{0o@e*>9e_=x)DMhBq5xDz%8
z4SorKBQSPk^z{Qq2P+D2hFJ_6{EDw%gd!X{G>G<vzkY-xj_|7DCQ0~7!EpgH<f}z}
z;m;dQ8bv<|hl~6@=}FWVewEJHh*Z=U>jQh^3GiY)K$cx%-4pP4ks)6!>I?sydZ|Y7
z6ekq&3p$amL^=6(0So`Wh1c)q^+|rxosgfSr8_uY)EDbX3-6#<Pel8ozUY5jTz#=V
zHSz^mtY=i8*6jHD4**9x5$maVFM0oXAaRLxT7$Gch&-iqwyLjh<9^*nZ%o0}$;VNH
zaDK-#B7F$3zJ5=pgHiNkvP3#|jh2dUQBS~+A)~J^_8)snGzzf}3;KBb-Nfq){dbo*
z7-=`JZzJZqE57~BC?gV4U&Lt<@mu!j>QXko{pWzw*WY@JgUF<<x3ENdNesLOM#qnX
zQYSL{`gFWbSKA_%NW0?Xk8l4V$kOo<!zA`=A|6b9e}(;M+$HH9WGuS+Vn4TknMM&$
z9}k}Z8Hp&`r@w5XtNT3be+*i(T~S}`XGFXn8y^^9-=clduYT0kw=dpD*zZOGE<vwB
zgRsDVi2{9nsiHz7<w;+h{zW~JrlMS5zt5vp?DJ@B;>Rl<9)ZflOY~o=)asw%_38PM
z{<7%-<VT@C{!2M=cv1hlI6TqPCDKvIN84s4+Lx_TOUDXAUKR&3K4R|Zs!I|%R$TXy
zapEF(L~E7MoA7@$zZ128-x?I)niIG2k8fY2O8jktba_U{kU|q?yI5!Sg(gTM*3Vdc
zO7uP^7H^K;zr^BG@y0HP9>ww|tkhzC(ifVb6|wHb;xCZIdJ>Bt9=(r=#Se{3LFFc_
z9Kx^1(j$IDrbCZnc~gS;8B}0`rx5;CU&(}3LHNyB{3!TAhaSc9CPdH>eokL#k}itg
zZ^q)sNWw40;xC4usnDZX-XvXucwdR|mrBAv=qs5dSfiNtx&o7QnIz`34h=nERjo|K
zU)&`RQj@)=NT&t{F<Or1gOTEAWI7l^m^ryk!i#f(t5~Tdy@u3)cIj_r=_0;#y@yXC
z{wIMq;I27dKc4|l{H<~44HhsYydw^u$?@^$6ZE{D<gvxcGmqhij#oYBvnh`M9gKeH
zxF_q^ql|u()WX+EbcbEPVECc^rNTt|`7NU#YA4SFFToDu?d@fb7waog(bdEF47H!L
zz~c}2hg=uHSti0~GW<xXGuOfD&@)B4nv>8!o&^6(;8}c>$_iHB#=a%#XcGGIaK4!s
z_s+QvM)@mF|1t1Z*#8+%1;=Q~S>O}NKNSmGqIfMH;1ki`1H3K4{92!c{+EpZp#K5L
zu0JH9e>Vw!APN2==sS^|R|8M_iI0~vhtZFe=<m?!qGy?O&Exd(@nc$&;I}b8T3!7&
z9w8=_ryuxHMvFv$IIq9V*dQg6e?k)c&A^kLIJlj(aAxgE=r;jRdWerpv>Nw!iQ0Xh
z;fKay+LeU<oh10-*f3DLjsgdhknUILn#A!fH#rzgzfGZw;szy>XGs$L<|O#%Isa~l
zgYm6q-`Ll3e9s&W-_7w)VIwmda*DW{94-9^@TBKXuIEx-aDvebs=@TjB=|fWkTU(x
zaWG=`+Z5!|ar2!h9@Q9zXQVM-xxkMyUMA6N5A>HK3H{?q@UJAnUyqyAMB`NgyiMOA
z&wGGRG+uWn!9M{!^E2KG#=&aP7;Q^J|8f%irI@;j+MSpLzd8y23E;_3#_toJVfe99
ze7vq_fhYaM$07Rz_Q{Flc{K_CB*TxCY%?890rZ;;y2ii{CE_zX3I5x_CyGNxe;~^I
zLsbRi+rqvj>G33do=Sp$mElK9^x6)(eC!+RQ4;#kli+9IhA&aSZU&ysi#R_}k%Yc6
z3BElE{$DvhC)2?t*OFbpXCmv!bZ7|veT?BpN_|BdemA$z<D)b=H_g}Zr#Sr|(J7KY
z-p?EWK2d+iUX&PbPl9&=Z{uz06MCa>BK|E5kNp52FC(wC0eB`S_rp(f{L{eGd=&53
z&*1&t$?4<cuJtnhdP%hWMPo=F=z;GKKjI{JF#J&ZM}fBuMN6BL(Em0G{ma0U{2~s3
z@F&NBXZxt-6|5D});Mg`NpIbm4t(n__Iuma4NX;3{J!OMz&xc9@!D#<fov~DlPL?-
z`kh0AvgqTnAQ%pd$KowV@RjL^L*%8O<OUGx!adJ*%bWt2+l}ARx}9$Kt@axbgsh%N
z-=g3iwE;x?iH7y!vAbBTmtZ|)LhQLJ`q8bk27zI;FmH$tr;GolMT-;B<7yBMilVrb
z1zZTqQ-j|V<J8{U6z~Uv_~|kQQ<-15(Cu=Tl_O-`U|>6Upw5jjJM{>8CqmfiP>4a-
z;9g$aP>G0hRfv1zr_g2+PU9PEeF6Mrc*eA8H-K{jA{OyXJLN}2kwDWLNqbUI6|AdU
zgDN^m`!z9pIKWBW?zsyKN{SH}u8896G0Azp=-UelN}Pp2QU{|AAmp4L!9t!yhv1gE
zbIXbmE2?0QtC-^B;Aw-y!9ts8;Y?Ua7anvbzQ8_pAp$5Zre8oKh#iH8N-#o*+$V+z
zVcYa$L?EKX@j)tc*N8zH8jj217>t?%`_TgGMdOZXv3$NJrcAu>h;?&5@rgv7r3BTZ
z@tt%r{HU*5#6s8{_3mpT_>t8L`=v;KiGvjW9pm>=bV3oKP{i-M1#yjdup5elbzU)`
zD5lR)ooIp4k_3esN<J7yj6!b3meUh15q5{9ibfoxP)M<)Zs#qatU@TOpx;LU7SV?U
zq1)mr#Ye!>+ZYcd#B1;%KL{O!2tu(qEnZoC)I%1ni3Jr(T%I^8;(3HMOk6QGoMKEE
zu(*g%u`19aBgJ*dUFB~;1kCy>3g!vLi%?RrRauEHQeSKoym;F4549Cvb3s`&T;9;4
z!3b-*8JHkw=-13a=9YA^LjM7?P#-5$%t8@G^KT9n7NeMKM3{<3)PzSC5#M5nw1L$P
zUu>Yop|tTUG2(*N!Qbc5im||+GY8A4uW2s+QU>$53;|-<+|uH2ab|oDvH5U5em8mS
zzIf><tXMoQMy6=&wEvyaWus2=yyH8@YYQRm;!)?hI#PVNL{;@se1|NF=NrQ}c5VCz
zMex<HkAo3>+5Rc&4zZKvu_)vE?nWqNu`#1X=<gN0Gp3pBd=dm+B6(2LSb7Acqb2e0
z97{2#4W^nQPGV}*NhAs)jkRSk0%troO?iXCS6N`xp+qc}V?3IIr^e&{hw(UL*pO>Y
z5X$*$_MKlB@-(IirpKh=W8+BO`YjSgCe)m(e(oh$49|D=#jlG0p!ug?8H@jCQHkm2
zU=6igS^9NH7gsh;mABe+5d3*j`P{t4KAPKfBO?MwM`KyDSu_|-GcLkv97nbS@(8*y
zJIk1OlII;+GnTt%g*>n_8};b!I2yJywgwM(>GcO=T^aSY?tkk2hu#Bdf(?zM9kWHS
z(-t9)QpDxtQj1ZG4RRW%$DuJ};|PXEn2ux5?soL^<^SWg>TjMaIoj<u-CE(4!WY0F
zI{$CjM&CmF&)7|=2*Ufdac7gnbB>!zuo@N&cqlp=OE<2_Sx&*s#TgPd33RAb?y00(
zteA+lsK=wr>Lx7vT`8O8LtGN?Sy_N0K*u#o#w{xF12o(flOXy(b{CO`69(;#=lG8~
zri+iv%cY3=gynS&_1dNv2Vi_bZt_8uq~^p*#`cE=M~?5%d564?uc$weh=xFqCHgM}
Cxwk_A

literal 0
HcmV?d00001

diff --git a/example/lstm-word-segment/predict/cpp/lstm-word-segment-predict.cc b/example/lstm-word-segment/predict/cpp/lstm-word-segment-predict.cc
new file mode 100644
index 000000000000..865bd1122df7
--- /dev/null
+++ b/example/lstm-word-segment/predict/cpp/lstm-word-segment-predict.cc
@@ -0,0 +1,205 @@
+/*!
+ * Copyright (c) 2016
+ * \file lstm-word-segment-predict.cc
+ * \brief C++ predict example of mxnet : lstm word segment
+ */
+
+#include <stdio.h>
+
+// Path for c_predict_api
+#include <mxnet/c_predict_api.h>
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <unordered_map>
+
+using namespace std;
+
+/*!
+ * \brief Reads a whole file into a heap buffer owned by this object.
+ *
+ * The buffer always carries one extra trailing '\0' byte beyond GetLength(),
+ * so it can be passed both to APIs that take an explicit byte count and to
+ * APIs that expect a NUL-terminated C string. If the file cannot be opened,
+ * the object holds an empty (but valid) buffer and GetLength() returns 0.
+ */
+class BufferFile {
+    public:
+        std::string file_path_;
+        int length_;
+        char *buffer_;
+
+        explicit BufferFile(std::string file_path)
+            : file_path_(file_path), length_(0), buffer_(NULL) {
+            std::ifstream ifs(file_path_.c_str(), std::ios::in | std::ios::binary);
+            if (!ifs) {
+                std::cerr << "Can't open the file. Please check " << file_path << ". \n";
+                // tellg() would return -1 here; never size an allocation from it.
+                buffer_ = new char[1];
+                buffer_[0] = '\0';
+                return;
+            }
+            ifs.seekg(0, std::ios::end);
+            const std::streamoff end_pos = ifs.tellg();
+            ifs.seekg(0, std::ios::beg);
+            length_ = end_pos > 0 ? static_cast<int>(end_pos) : 0;
+            std::cerr << file_path_.c_str() << " ... " << length_ << " bytes\n";
+
+            buffer_ = new char[length_ + 1];
+            ifs.read(buffer_, length_);
+            if (!ifs) {
+                // Short read: only report the bytes that actually arrived.
+                length_ = static_cast<int>(ifs.gcount());
+            }
+            buffer_[length_] = '\0';
+            ifs.close();
+        }
+
+        // Non-copyable: the destructor frees buffer_, so a copy would double-free it.
+        BufferFile(const BufferFile &) = delete;
+        BufferFile &operator=(const BufferFile &) = delete;
+
+        /*! \return number of file bytes held (excluding the trailing '\0') */
+        int GetLength() {
+            return length_;
+        }
+
+        /*! \return pointer to the NUL-terminated file contents; owned by this object */
+        char *GetBuffer() {
+            return buffer_;
+        }
+
+        ~BufferFile() {
+            delete [] buffer_;
+            buffer_ = NULL;
+        }
+};
+
+/*!
+ * \brief Pick the label whose score is highest (despite the name, nothing is printed).
+ * \param data per-label scores; position i is the score of label index i
+ * \param idx2label label index -> tag character; an index missing from the
+ *        map is inserted by operator[] and yields '\0'
+ * \return tag of the highest-scoring index. Ties resolve to the lowest index
+ *         and empty input falls back to index 0.
+ */
+char PrintOutputResult(const std::vector<float> &data, std::unordered_map<int, char> &idx2label) {
+    size_t best_idx = 0;
+    // Compare against the running best element rather than a 0.0 seed, so the
+    // result is a true argmax even when no score is positive.
+    for (size_t i = 1; i < data.size(); ++i) {
+        if (data[i] > data[best_idx]) {
+            best_idx = i;
+        }
+    }
+
+    return idx2label[static_cast<int>(best_idx)];
+}
+
+/*!
+ * \brief Load a word -> index vocabulary from a text file into dict.
+ *
+ * Each line is expected to look like <word><sep><index>, where <sep> is one
+ * separator character and <index> is a run of decimal digits ending the line.
+ * The line is split at the character just before the trailing digits, so the
+ * parser does not depend on which separator character the file uses.
+ * Lines without a trailing index are skipped.
+ *
+ * \param map_file path of the vocabulary file
+ * \param dict receives one entry per parsed line (existing entries are kept
+ *        unless overwritten by the same word)
+ */
+void ReadVocabMap(const std::string &map_file, std::unordered_map<std::string, int> &dict) {
+    std::ifstream m_file(map_file.c_str());
+    if (!m_file) {
+        std::cerr << "open file " << map_file << " failed." << std::endl;
+        return;
+    }
+
+    std::string line;
+    while (std::getline(m_file, line)) {
+        // Drop trailing blanks / CR so "word<sep>12\r" still ends in digits.
+        line.erase(line.find_last_not_of(" \t\r\n") + 1);
+
+        // The separator is the last character that is not a digit.
+        const std::string::size_type sep = line.find_last_not_of("0123456789");
+        if (sep == std::string::npos || sep + 1 >= line.size()) {
+            continue;  // no word part, or no index part
+        }
+
+        int index = 0;
+        for (std::string::size_type p = sep + 1; p < line.size(); ++p) {
+            index = index * 10 + (line[p] - '0');
+        }
+        dict[line.substr(0, sep)] = index;
+    }
+
+    m_file.close();
+}
+
+/*!
+ * \brief Split a UTF-8 encoded string into one std::string per character.
+ *
+ * The sequence length is taken from the lead byte (1 to 4 bytes). Bytes that
+ * cannot start a sequence (stray continuation bytes, invalid lead bytes) are
+ * skipped one at a time. A sequence cut short by the end of the input is
+ * emitted with whatever bytes remain.
+ *
+ * \param text UTF-8 input
+ * \param utf8_array cleared, then filled with the characters in order
+ * \return byte offset reached when scanning stopped (can exceed text.size()
+ *         when the last sequence is truncated)
+ */
+int GetUTF8Vec(const std::string &text, std::vector<std::string> &utf8_array) {
+    utf8_array.clear();
+    const size_t total = text.size();
+    size_t idx = 0;
+    while (idx < total) {
+        const unsigned char lead = static_cast<unsigned char>(text[idx]);
+        size_t len = 0;
+        if ((lead & 0x80) == 0) {           // single byte character
+            len = 1;
+        } else if ((lead & 0xE0) == 0xC0) {  // double bytes
+            len = 2;
+        } else if ((lead & 0xF0) == 0xE0) {  // triple bytes
+            len = 3;
+        } else if ((lead & 0xF8) == 0xF0) {  // four bytes (outside the BMP)
+            len = 4;
+        }
+
+        if (len == 0) {
+            ++idx;  // not a valid lead byte: skip it
+            continue;
+        }
+        // substr() clamps at the end of the string, so a truncated tail is safe.
+        utf8_array.push_back(text.substr(idx, len));
+        idx += len;
+    }
+    return static_cast<int>(idx);
+}
+
+/*!
+ * \brief Segment every line read from stdin with the saved LSTM tagger.
+ *
+ * Each UTF-8 character is classified as B, M, E or S from a window of
+ * context_size characters centred on it. The line is echoed to stdout with a
+ * space after every character tagged E or S.
+ *
+ * \return 0 on success, 1 when the model files are empty or the predictor
+ *         cannot be created
+ */
+int main(int argc, char *argv[]) {
+    unordered_map<int, char> idx2label;
+    idx2label[0] = 'B';
+    idx2label[1] = 'M';
+    idx2label[2] = 'E';
+    idx2label[3] = 'S';
+
+    // load vocabulary
+    unordered_map<string, int> vocab_dict;
+    ReadVocabMap("../vocab_map", vocab_dict);
+    // "P" fills window slots outside the sentence, "U" stands for characters
+    // that are not in the vocabulary.
+    const mx_float pad_id = vocab_dict["P"];
+    const mx_float unk_id = vocab_dict["U"];
+
+    string symbol_file = "../../checkpoint/lstm-symbol.json";
+    string param_file = "../../checkpoint/lstm-0099.params";
+    BufferFile symbol_data(symbol_file);
+    BufferFile param_data(param_file);
+    if (symbol_data.GetLength() <= 0 || param_data.GetLength() <= 0) {
+        cerr << "model files are missing or empty." << endl;
+        return 1;
+    }
+    // The symbol is passed as a C string; copy it so it is NUL-terminated.
+    const string symbol_json(symbol_data.GetBuffer(), symbol_data.GetLength());
+
+    int dev_type = 2; // 1: cpu, 2: gpu
+    int dev_id = 0; // arbitrary
+    mx_uint num_input_nodes = 3; // data, init_c, init_h
+    const char *input_key[3] = { "data", "l0_init_c", "l0_init_h" };
+    const char **input_keys = input_key;
+
+    mx_uint batch_size = 1;
+    mx_uint num_hidden = 300;
+    mx_uint context_size = 7;
+
+    // Shapes of the three inputs, flattened; indptr[i]..indptr[i+1] delimits input i.
+    const mx_uint input_shape_indptr[4] = {0, 2, 4, 6};
+    const mx_uint input_shape_data[6] = { batch_size, context_size, batch_size, num_hidden, batch_size, num_hidden };
+
+    PredictorHandle out = 0; // alias for void *
+
+    // Create Predictor
+    const int status = MXPredCreate(symbol_json.c_str(),
+                (const char *)param_data.GetBuffer(),
+                static_cast<size_t>(param_data.GetLength()),
+                dev_type,
+                dev_id,
+                num_input_nodes,
+                input_keys,
+                input_shape_indptr,
+                input_shape_data,
+                &out);
+    if (status != 0 || out == 0) {
+        cerr << "MXPredCreate failed: " << MXGetLastError() << endl;
+        return 1;
+    }
+
+    const mx_uint state_size = batch_size * num_hidden;
+    const mx_uint data_size = batch_size * context_size;
+    vector<mx_float> init_c(state_size);
+    vector<mx_float> init_h(state_size);
+    vector<mx_float> data(data_size);
+    string input_str;
+    vector<string> utf8_arr;
+    const int window = static_cast<int>(context_size - 1) / 2;
+    while (getline(cin, input_str)) {
+        GetUTF8Vec(input_str, utf8_arr);
+        const int num_chars = static_cast<int>(utf8_arr.size());
+
+        // Reset the initial LSTM state to zeros. assign() keeps the vectors at
+        // full size; clear() would leave data() with no valid elements to read.
+        init_c.assign(state_size, 0.0f);
+        init_h.assign(state_size, 0.0f);
+        MXPredSetInput(out, "l0_init_c", init_c.data(), state_size);
+        MXPredSetInput(out, "l0_init_h", init_h.data(), state_size);
+        for (int i = 0; i < num_chars; ++i) {
+            data.assign(data_size, 0.0f);
+            // Signed arithmetic: i + j is negative for slots left of the sentence.
+            for (int j = -window; j <= window; ++j) {
+                const int pos = i + j;
+                mx_float id = pad_id;
+                if (pos >= 0 && pos < num_chars) {
+                    // Look up the neighbour at pos, not the centre character.
+                    unordered_map<string, int>::const_iterator it = vocab_dict.find(utf8_arr[pos]);
+                    id = (it != vocab_dict.end()) ? static_cast<mx_float>(it->second) : unk_id;
+                }
+                data[j + window] = id;
+            }
+            MXPredSetInput(out, "data", data.data(), data_size);
+            // Do Predict
+            MXPredForward(out);
+            // Get Output
+            mx_uint output_index = 0;
+            mx_uint *shape = 0;
+            mx_uint shape_len = 0;
+
+            MXPredGetOutputShape(out, output_index, &shape, &shape_len);
+            size_t size = 1;
+            for (mx_uint k = 0; k < shape_len; ++k) size *= shape[k];
+            vector<float> result(size);
+
+            MXPredGetOutput(out, output_index, &(result[0]), size);
+
+            // Print Output Label
+            char char_label = PrintOutputResult(result, idx2label);
+            switch(char_label) {
+                case 'B': cout << utf8_arr[i]; break;
+                case 'M': cout << utf8_arr[i]; break;
+                case 'E': cout << utf8_arr[i] << " "; break;
+                case 'S': cout << utf8_arr[i] << " "; break;
+            }
+        }
+        cout << endl;
+    }
+    MXPredFree(out);
+    return 0;
+}
diff --git a/example/lstm-word-segment/predict/lstm-word-segment-predict.cc b/example/lstm-word-segment/predict/lstm-word-segment-predict.cc
new file mode 100644
index 000000000000..f1087c8260d8
--- /dev/null
+++ b/example/lstm-word-segment/predict/lstm-word-segment-predict.cc
@@ -0,0 +1,220 @@
+/*!
+ * Copyright (c) 2016
+ * \file lstm-word-segment-predict.cc
+ * \brief C++ predict example of mxnet : lstm word segment
+ */
+
+#include <stdio.h>
+
+// Path for c_predict_api
+#include <mxnet/c_predict_api.h>
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <unordered_map>
+
+using namespace std;
+
+/*!
+ * \brief Reads a whole file into a heap buffer owned by this object.
+ *
+ * The buffer always carries one extra trailing '\0' byte beyond GetLength(),
+ * so it can be passed both to APIs that take an explicit byte count and to
+ * APIs that expect a NUL-terminated C string. If the file cannot be opened,
+ * the object holds an empty (but valid) buffer and GetLength() returns 0.
+ */
+class BufferFile {
+    public:
+        std::string file_path_;
+        int length_;
+        char *buffer_;
+
+        explicit BufferFile(std::string file_path)
+            : file_path_(file_path), length_(0), buffer_(NULL) {
+            std::ifstream ifs(file_path_.c_str(), std::ios::in | std::ios::binary);
+            if (!ifs) {
+                std::cerr << "Can't open the file. Please check " << file_path << ". \n";
+                // tellg() would return -1 here; never size an allocation from it.
+                buffer_ = new char[1];
+                buffer_[0] = '\0';
+                return;
+            }
+            ifs.seekg(0, std::ios::end);
+            const std::streamoff end_pos = ifs.tellg();
+            ifs.seekg(0, std::ios::beg);
+            length_ = end_pos > 0 ? static_cast<int>(end_pos) : 0;
+            std::cerr << file_path_.c_str() << " ... " << length_ << " bytes\n";
+
+            buffer_ = new char[length_ + 1];
+            ifs.read(buffer_, length_);
+            if (!ifs) {
+                // Short read: only report the bytes that actually arrived.
+                length_ = static_cast<int>(ifs.gcount());
+            }
+            buffer_[length_] = '\0';
+            ifs.close();
+        }
+
+        // Non-copyable: the destructor frees buffer_, so a copy would double-free it.
+        BufferFile(const BufferFile &) = delete;
+        BufferFile &operator=(const BufferFile &) = delete;
+
+        /*! \return number of file bytes held (excluding the trailing '\0') */
+        int GetLength() {
+            return length_;
+        }
+
+        /*! \return pointer to the NUL-terminated file contents; owned by this object */
+        char *GetBuffer() {
+            return buffer_;
+        }
+
+        ~BufferFile() {
+            delete [] buffer_;
+            buffer_ = NULL;
+        }
+};
+
+/*!
+ * \brief Append one label per row of a row-major score matrix (despite the
+ *        name, nothing is printed).
+ *
+ * data is read as consecutive rows of `shape` scores each. For every complete
+ * row the column with the highest score is mapped through idx2label and
+ * appended to res. Ties resolve to the lowest column; a trailing partial row
+ * is ignored.
+ *
+ * \param data flattened scores, row after row
+ * \param idx2label column index -> tag character; a missing index is inserted
+ *        by operator[] and yields '\0'
+ * \param shape number of scores per row; 0 appends nothing
+ * \param res receives the labels, in row order
+ */
+void PrintOutputResult(const std::vector<float> &data, std::unordered_map<int, char> &idx2label, mx_uint shape, std::vector<char> &res) {
+    if (shape == 0) {
+        return;  // avoids a modulo/division by zero
+    }
+
+    const size_t width = shape;
+    for (size_t row = 0; row + width <= data.size(); row += width) {
+        // Track the best *column* so the label lookup stays within 0..width-1.
+        size_t best_col = 0;
+        for (size_t col = 1; col < width; ++col) {
+            if (data[row + col] > data[row + best_col]) {
+                best_col = col;
+            }
+        }
+        res.push_back(idx2label[static_cast<int>(best_col)]);
+    }
+}
+
+/*!
+ * \brief Load a word -> index vocabulary from a text file into dict.
+ *
+ * Each line is expected to look like <word><sep><index>, where <sep> is one
+ * separator character and <index> is a run of decimal digits ending the line.
+ * The line is split at the character just before the trailing digits, so the
+ * parser does not depend on which separator character the file uses.
+ * Lines without a trailing index are skipped.
+ *
+ * \param map_file path of the vocabulary file
+ * \param dict receives one entry per parsed line (existing entries are kept
+ *        unless overwritten by the same word)
+ */
+void ReadVocabMap(const std::string &map_file, std::unordered_map<std::string, int> &dict) {
+    std::ifstream m_file(map_file.c_str());
+    if (!m_file) {
+        std::cerr << "open file " << map_file << " failed." << std::endl;
+        return;
+    }
+
+    std::string line;
+    while (std::getline(m_file, line)) {
+        // Drop trailing blanks / CR so "word<sep>12\r" still ends in digits.
+        line.erase(line.find_last_not_of(" \t\r\n") + 1);
+
+        // The separator is the last character that is not a digit.
+        const std::string::size_type sep = line.find_last_not_of("0123456789");
+        if (sep == std::string::npos || sep + 1 >= line.size()) {
+            continue;  // no word part, or no index part
+        }
+
+        int index = 0;
+        for (std::string::size_type p = sep + 1; p < line.size(); ++p) {
+            index = index * 10 + (line[p] - '0');
+        }
+        dict[line.substr(0, sep)] = index;
+    }
+
+    m_file.close();
+}
+
+/*!
+ * \brief Split a UTF-8 encoded string into one std::string per character.
+ *
+ * The sequence length is taken from the lead byte (1 to 4 bytes). Bytes that
+ * cannot start a sequence (stray continuation bytes, invalid lead bytes) are
+ * skipped one at a time. A sequence cut short by the end of the input is
+ * emitted with whatever bytes remain.
+ *
+ * \param text UTF-8 input
+ * \param utf8_array cleared, then filled with the characters in order
+ * \return byte offset reached when scanning stopped (can exceed text.size()
+ *         when the last sequence is truncated)
+ */
+int GetUTF8Vec(const std::string &text, std::vector<std::string> &utf8_array) {
+    utf8_array.clear();
+    const size_t total = text.size();
+    size_t idx = 0;
+    while (idx < total) {
+        const unsigned char lead = static_cast<unsigned char>(text[idx]);
+        size_t len = 0;
+        if ((lead & 0x80) == 0) {           // single byte character
+            len = 1;
+        } else if ((lead & 0xE0) == 0xC0) {  // double bytes
+            len = 2;
+        } else if ((lead & 0xF0) == 0xE0) {  // triple bytes
+            len = 3;
+        } else if ((lead & 0xF8) == 0xF0) {  // four bytes (outside the BMP)
+            len = 4;
+        }
+
+        if (len == 0) {
+            ++idx;  // not a valid lead byte: skip it
+            continue;
+        }
+        // substr() clamps at the end of the string, so a truncated tail is safe.
+        utf8_array.push_back(text.substr(idx, len));
+        idx += len;
+    }
+    return static_cast<int>(idx);
+}
+
+/*!
+ * \brief Segment every whitespace-delimited token read from stdin with the
+ *        saved LSTM tagger, running the model on batch_size characters at a time.
+ *
+ * One window of context_size vocabulary ids is built per UTF-8 character.
+ * The windows are fed to the predictor in groups of batch_size rows; the last
+ * group is topped up with all-padding rows. Each character is echoed to
+ * stdout, followed by a space when it is tagged E or S.
+ *
+ * \return 0 on success, 1 when the model files are empty or the predictor
+ *         cannot be created
+ */
+int main(int argc, char *argv[]) {
+    unordered_map<int, char> idx2label;
+    idx2label[0] = 'B';
+    idx2label[1] = 'M';
+    idx2label[2] = 'E';
+    idx2label[3] = 'S';
+
+    // load vocabulary
+    unordered_map<string, int> vocab_dict;
+    ReadVocabMap("../vocab_map", vocab_dict);
+    // "P" fills window slots outside the sentence, "U" stands for characters
+    // that are not in the vocabulary.
+    const mx_float pad_id = vocab_dict["P"];
+    const mx_float unk_id = vocab_dict["U"];
+
+    string symbol_file = "../../checkpoint/lstm-symbol.json";
+    string param_file = "../../checkpoint/lstm-0099.params";
+    BufferFile symbol_data(symbol_file);
+    BufferFile param_data(param_file);
+    if (symbol_data.GetLength() <= 0 || param_data.GetLength() <= 0) {
+        cerr << "model files are missing or empty." << endl;
+        return 1;
+    }
+    // The symbol is passed as a C string; copy it so it is NUL-terminated.
+    const string symbol_json(symbol_data.GetBuffer(), symbol_data.GetLength());
+
+    int dev_type = 1; // 1: cpu, 2: gpu
+    int dev_id = 0; // arbitrary
+    mx_uint num_input_nodes = 3; // data, init_c, init_h
+    const char *input_key[3] = { "data", "l0_init_c", "l0_init_h" };
+    const char **input_keys = input_key;
+
+    mx_uint batch_size = 16;
+    mx_uint num_hidden = 300;
+    mx_uint context_size = 7;
+
+    // Shapes of the three inputs, flattened; indptr[i]..indptr[i+1] delimits input i.
+    const mx_uint input_shape_indptr[4] = {0, 2, 4, 6};
+    const mx_uint input_shape_data[6] = { batch_size, context_size, batch_size, num_hidden, batch_size, num_hidden };
+
+    PredictorHandle out = 0; // alias for void *
+
+    // Create Predictor
+    const int status = MXPredCreate(symbol_json.c_str(),
+                (const char *)param_data.GetBuffer(),
+                static_cast<size_t>(param_data.GetLength()),
+                dev_type,
+                dev_id,
+                num_input_nodes,
+                input_keys,
+                input_shape_indptr,
+                input_shape_data,
+                &out);
+    if (status != 0 || out == 0) {
+        cerr << "MXPredCreate failed: " << MXGetLastError() << endl;
+        return 1;
+    }
+
+    const mx_uint state_size = batch_size * num_hidden;
+    const size_t batch_elems = static_cast<size_t>(batch_size) * context_size;
+    vector<mx_float> init_c(state_size);
+    vector<mx_float> init_h(state_size);
+    string input_str;
+    vector<string> utf8_arr;
+    const int window = static_cast<int>(context_size - 1) / 2;
+    while (cin>>input_str) {
+        GetUTF8Vec(input_str, utf8_arr);
+        const int num_of_char = static_cast<int>(utf8_arr.size());
+        // Number of forward passes: characters / batch_size, rounded up.
+        const int parts = (num_of_char + static_cast<int>(batch_size) - 1) / static_cast<int>(batch_size);
+        // One row of context_size ids per character; rows past the end of the
+        // sentence stay filled with the padding id.
+        vector<mx_float> padding_data(parts * batch_elems, pad_id);
+
+        // Reset the initial LSTM state to zeros. assign() keeps the vectors at
+        // full size; clear() would leave data() with no valid elements to read.
+        init_c.assign(state_size, 0.0f);
+        init_h.assign(state_size, 0.0f);
+        MXPredSetInput(out, "l0_init_c", init_c.data(), state_size);
+        MXPredSetInput(out, "l0_init_h", init_h.data(), state_size);
+        for (int i = 0; i < num_of_char; ++i) {
+            // Signed arithmetic: i + j is negative for slots left of the sentence.
+            for (int j = -window; j <= window; ++j) {
+                const int pos = i + j;
+                mx_float id = pad_id;
+                if (pos >= 0 && pos < num_of_char) {
+                    // Look up the neighbour at pos, not the centre character.
+                    unordered_map<string, int>::const_iterator it = vocab_dict.find(utf8_arr[pos]);
+                    id = (it != vocab_dict.end()) ? static_cast<mx_float>(it->second) : unk_id;
+                }
+                // Row i, column j + window of the (rows x context_size) matrix.
+                padding_data[static_cast<size_t>(i) * context_size + (j + window)] = id;
+            }
+        }
+        vector<mx_float> data(batch_elems);
+        vector<char> label_result;
+        for (int k = 0; k < parts; ++k) {
+            // Copy the k-th group of batch_size rows.
+            const size_t offset = static_cast<size_t>(k) * batch_elems;
+            data.assign(padding_data.begin() + offset, padding_data.begin() + offset + batch_elems);
+
+            MXPredSetInput(out, "data", data.data(), batch_size * context_size);
+            // Do Predict
+            MXPredForward(out);
+            // Get Output
+            mx_uint output_index = 0;
+            mx_uint *shape = 0;
+            mx_uint shape_len = 0;
+
+            MXPredGetOutputShape(out, output_index, &shape, &shape_len);
+            size_t size = 1;
+            for (mx_uint d = 0; d < shape_len; ++d) size *= shape[d];
+            vector<float> result(size);
+
+            MXPredGetOutput(out, output_index, &(result[0]), size);
+            // Scores per row = size of the last output dimension (not the rank).
+            // NOTE(review): assumes output 0 is laid out as (rows, labels) - confirm against the symbol.
+            const mx_uint num_label = shape_len > 0 ? shape[shape_len - 1] : 0;
+            PrintOutputResult(result, idx2label, num_label, label_result);
+        }
+
+        // Padding rows produce labels too; only the first num_of_char are real.
+        for (size_t i = 0; i < utf8_arr.size() && i < label_result.size(); ++i) {
+            switch(label_result[i]) {
+                case 'B': cout << utf8_arr[i]; break;
+                case 'M': cout << utf8_arr[i]; break;
+                case 'E': cout << utf8_arr[i] << " "; break;
+                case 'S': cout << utf8_arr[i] << " "; break;
+            }
+        }
+        cout << endl;
+    }
+    MXPredFree(out);
+    return 0;
+}
diff --git a/example/lstm-word-segment/predict/lstm_predict.py b/example/lstm-word-segment/predict/lstm_predict.py
new file mode 100755
index 000000000000..abaeba8b646f
--- /dev/null
+++ b/example/lstm-word-segment/predict/lstm_predict.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import mxnet_predict
+import numpy as np
+import time
+
+symbol_file = '../checkpoint/lstm-symbol.json'
+param_file = '../checkpoint/lstm-0000.params'
+vocab_map = {}  # token -> integer id, loaded from the 'vocab_map' file in the working directory
+for line in open('vocab_map'):
+    w, idx = line.strip().split('')  # NOTE(review): split('') raises "ValueError: empty separator"; the delimiter appears to have been lost -- confirm it against the code that writes vocab_map
+    vocab_map[w] = int(idx)
+
+# lstm_segment input data format: (batch_size, context_size)
+batch_size = 8  # NOTE(review): equals the character count of input_str below; forward() is fed one window per character -- confirm other lengths are handled
+context_size = 7
+num_hidden = 300
+input_shapes = {'data': (batch_size, context_size),
+                'l0_init_c': (batch_size, num_hidden), 'l0_init_h': (batch_size, num_hidden)}
+# the .params file holds raw bytes, so read it in 'rb' mode to avoid newline/EOF translation where text mode differs
+lstm_predict_handle = mxnet_predict.Predictor(open(symbol_file).read(), open(param_file, 'rb').read(), input_shapes, dev_type='cpu')
+
+input_str = '至于计算机的使用'
+
+def reshape_input(s, context_size):
+    """Turn a UTF-8 sentence into one window of vocabulary ids per character.
+
+    Each character is looked up in vocab_map (using the 'U' entry when the
+    character is missing), the id sequence is padded on both sides with
+    int((context_size - 1) / 2) copies of the 'P' entry, and a slice of
+    context_size ids starting at every character position is collected.
+    Returns the collected windows as a numpy array.
+    """
+    half_window = int((context_size - 1) / 2)
+    chars = unicode(s, 'utf-8')
+    ids = []
+    for ch in chars:
+        token = ch.encode('utf-8')
+        # 'U' stands in for characters that are not in the vocabulary
+        ids.append(vocab_map[token] if token in vocab_map else vocab_map['U'])
+    # 'P' padding gives the windows at the sentence edges neighbours to draw from
+    left = [vocab_map['P'] for _ in range(half_window)]
+    right = [vocab_map['P'] for _ in range(half_window)]
+    padded = left + ids + right
+    return np.array([padded[k:k + context_size] for k in range(len(chars))])
+
+init_c = np.zeros((batch_size, num_hidden))  # all-zero arrays fed as 'l0_init_c' / 'l0_init_h' (presumably the LSTM's initial cell and hidden state)
+init_h = np.zeros((batch_size, num_hidden))
+
+x_data = reshape_input(input_str, context_size)  # one row of context_size vocabulary ids per character
+num_of_char = x_data.shape[0]
+
+Idx2Label = {0:'B', 1: 'M', 2: 'E', 3: 'S'}  # output class index -> tag (presumably Begin/Middle/End/Single word-segmentation tags -- confirm)
+
+print input_str
+input_data_dict = {'l0_init_c': init_c, 'l0_init_h': init_h, 'data': x_data}
+start = time.time()
+lstm_predict_handle.forward(**input_data_dict)
+output = lstm_predict_handle.get_output(0)
+print [ Idx2Label[x] for x in np.argmax(output, axis=1) ]  # highest-scoring tag for each output row
+print 'elapsed %.2fs' % (time.time() - start)
+
+# for i in range(num_of_char):
+#     input_data_dict['data'] = x_data[i]
+#     lstm_predict_handle.forward(**input_data_dict)
+#     output = lstm_predict_handle.get_output(0)
+#     print Idx2Label[np.argmax(output, axis=1)[0]]
diff --git a/example/lstm-word-segment/predict/mxnet_predict.py b/example/lstm-word-segment/predict/mxnet_predict.py
new file mode 100644
index 000000000000..e0dd766e3b56
--- /dev/null
+++ b/example/lstm-word-segment/predict/mxnet_predict.py
@@ -0,0 +1,210 @@
+# coding: utf-8
+# pylint: disable=invalid-name, too-many-arguments
+"""Lightweight API for mxnet prediction.
+
+This is for prediction only, use mxnet python package instead for most tasks.
+"""
+from __future__ import absolute_import
+
+import os
+import sys
+import ctypes
+import numpy as np
+
+__all__ = ["Predictor", "load_ndarray_file"]
+
+if sys.version_info[0] == 3:
+    py_str = lambda x: x.decode('utf-8')
+else:
+    py_str = lambda x: x
+
+def c_str(string):
+    """Convert a python string to a C string (UTF-8 encoded, wrapped in ctypes.c_char_p)."""
+    return ctypes.c_char_p(string.encode('utf-8'))
+
+def c_array(ctype, values):
+    """Create a ctypes array with element type ctype holding the items of the python sequence values."""
+    return (ctype * len(values))(*values)
+
+def _find_lib_path():
+    """Return the existing libmxnet / libmxnet_predict shared objects, or raise RuntimeError if none exist."""
+    here = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+    search_dirs = [here, os.path.join(here, '../../../lib/')]
+    # order is significant: every libmxnet.so location comes first, then every
+    # libmxnet_predict.so location, and _load_lib loads the first existing one
+    dll_path = [os.path.join(d, 'libmxnet.so') for d in search_dirs]
+    dll_path += [os.path.join(d, 'libmxnet_predict.so') for d in search_dirs]
+    found = [cand for cand in dll_path if os.path.isfile(cand)]
+    if not found:
+        raise RuntimeError('Cannot find the files.\n' + 'List of candidates:\n' + str('\n'.join(dll_path)))
+    return found
+
+
+def _load_lib():
+    """Load the first library found by _find_lib_path and return the ctypes handle."""
+    lib_path = _find_lib_path()
+    lib = ctypes.cdll.LoadLibrary(lib_path[0])
+    # ctypes assumes an int result by default; declare MXGetLastError as returning char*
+    lib.MXGetLastError.restype = ctypes.c_char_p
+    return lib
+
+
+def _check_call(ret):
+    """Raise RuntimeError carrying the library's last error message when ret is non-zero."""
+    if ret != 0:
+        raise RuntimeError(py_str(_LIB.MXGetLastError()))
+
+_LIB = _load_lib()
+# type definitions
+mx_uint = ctypes.c_uint
+mx_float = ctypes.c_float
+mx_float_p = ctypes.POINTER(mx_float)
+PredictorHandle = ctypes.c_void_p
+NDListHandle = ctypes.c_void_p
+
+devstr2type = {'cpu': 1, 'gpu': 2, 'cpu_pinned': 3}
+
+class Predictor(object):
+    """A predictor class that runs prediction.
+
+    Parameters
+    ----------
+    symbol_file : str
+        The symbol JSON text itself (e.g. the contents of a *-symbol.json file); it is handed to MXPredCreate as a string, not opened as a path.
+
+    param_raw_bytes : str, bytes
+        The raw parameter bytes.
+
+    input_shapes : dict of str to tuple
+        The shape of input data
+
+    dev_type : str, optional
+        The device type of the predictor.
+
+    dev_id : int, optional
+        The device id of the predictor.
+    """
+    def __init__(self, symbol_file,
+                 param_raw_bytes, input_shapes,
+                 dev_type="cpu", dev_id=0):
+        dev_type = devstr2type[dev_type]
+        indptr = [0]  # offsets into sdata: the shape of keys[i] is sdata[indptr[i]:indptr[i+1]]
+        sdata = []
+        keys = []
+        for k, v  in input_shapes.items():
+            if not isinstance(v, tuple):
+                raise ValueError("Expect input_shapes to be dict str->tuple")
+            keys.append(c_str(k))
+            sdata.extend(v)
+            indptr.append(len(sdata))
+        handle = PredictorHandle()
+        param_raw_bytes = bytearray(param_raw_bytes)  # from_buffer() below needs a writable buffer
+        ptr = (ctypes.c_char * len(param_raw_bytes)).from_buffer(param_raw_bytes)
+        _check_call(_LIB.MXPredCreate(
+            c_str(symbol_file),
+            ptr, len(param_raw_bytes),
+            ctypes.c_int(dev_type), ctypes.c_int(dev_id),
+            mx_uint(len(indptr) - 1),
+            c_array(ctypes.c_char_p, keys),
+            c_array(mx_uint, indptr),
+            c_array(mx_uint, sdata),
+            ctypes.byref(handle)))
+        self.handle = handle
+
+    def __del__(self):
+        _check_call(_LIB.MXPredFree(self.handle))  # NOTE(review): self.handle is unset if __init__ raised before assigning it
+
+    def forward(self, **kwargs):
+        """Perform forward to get the output.
+
+        Parameters
+        ----------
+        **kwargs
+            Keyword arguments of input variable name to data.
+
+        Examples
+        --------
+        >>> predictor.forward(data=mydata)
+        >>> out = predictor.get_output(0)
+        """
+        for k, v in kwargs.items():
+            if not isinstance(v, np.ndarray):
+                raise ValueError("Expect numpy ndarray as input")
+            v = np.ascontiguousarray(v, dtype=np.float32)  # C-contiguous float32 so the raw pointer can be handed to the library
+            _check_call(_LIB.MXPredSetInput(
+                self.handle, c_str(k),
+                v.ctypes.data_as(mx_float_p),
+                mx_uint(v.size)))
+        _check_call(_LIB.MXPredForward(self.handle))
+
+    def get_output(self, index):
+        """Get the index-th output.
+
+        Parameters
+        ----------
+        index : int
+            The index of output.
+
+        Returns
+        -------
+        out : numpy array.
+            The output array.
+        """
+        pdata = ctypes.POINTER(mx_uint)()
+        ndim = mx_uint()
+        _check_call(_LIB.MXPredGetOutputShape(
+            self.handle, index,
+            ctypes.byref(pdata),
+            ctypes.byref(ndim)))
+        shape = tuple(pdata[:ndim.value])
+        data = np.empty(shape, dtype=np.float32)  # float32 buffer of the reported shape, filled in place by MXPredGetOutput
+        _check_call(_LIB.MXPredGetOutput(
+            self.handle, mx_uint(index),
+            data.ctypes.data_as(mx_float_p),
+            mx_uint(data.size)))
+        return data
+
+
+def load_ndarray_file(nd_bytes):
+    """Load ndarray file and return as list of numpy array.
+
+    Parameters
+    ----------
+    nd_bytes : str or bytes
+        The internal ndarray bytes
+
+    Returns
+    -------
+    out : dict of str to numpy array or list of numpy array
+        The output list or dict, depending on whether the saved type is list or dict.
+    """
+    handle = NDListHandle()
+    olen = mx_uint()
+    nd_bytes = bytearray(nd_bytes)  # from_buffer() needs a writable buffer
+    ptr = (ctypes.c_char * len(nd_bytes)).from_buffer(nd_bytes)
+    _check_call(_LIB.MXNDListCreate(
+        ptr, len(nd_bytes),
+        ctypes.byref(handle), ctypes.byref(olen)))
+    keys = []
+    arrs = []
+    try:
+        for i in range(olen.value):
+            key = ctypes.c_char_p()
+            cptr = mx_float_p()
+            pdata = ctypes.POINTER(mx_uint)()
+            ndim = mx_uint()
+            _check_call(_LIB.MXNDListGet(
+                handle, mx_uint(i), ctypes.byref(key),
+                ctypes.byref(cptr), ctypes.byref(pdata), ctypes.byref(ndim)))
+            shape = tuple(pdata[:ndim.value])
+            dbuffer = (mx_float * int(np.prod(shape))).from_address(ctypes.addressof(cptr.contents))
+            # np.array() copies, so the result stays valid after the list is freed below
+            ret = np.array(np.frombuffer(dbuffer, dtype=np.float32).reshape(shape), dtype=np.float32)
+            keys.append(py_str(key.value))
+            arrs.append(ret)
+    finally:
+        # free the native list even if reading an entry raised, so the handle is not leaked
+        _check_call(_LIB.MXNDListFree(handle))
+    if len(keys) == 0 or len(keys[0]) == 0:
+        return arrs
+    return dict(zip(keys, arrs))
diff --git a/example/lstm-word-segment/train.py b/example/lstm-word-segment/train.py
new file mode 100755
index 000000000000..398481011c77
--- /dev/null
+++ b/example/lstm-word-segment/train.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+
+import sys
+logs = sys.stderr
+
+import mxnet as mx
+import data_helper
+import lstm
+import time
+
+step_size = 10
+context_size = 5
+batch_size = 64
+num_hidden = 150
+num_embed = 100
+
+print >> logs, 'context size = %d' % context_size
+print >> logs, 'batch size = %d' % batch_size
+print >> logs, 'step size = %d' % step_size
+
+train_path, dev_path = 'train.conll', 'test.conll'
+x, y, vocab = data_helper.load_data(train_path)
+x_dev, y_dev, _ = data_helper.load_data(dev_path, vocab, False)
+# save vocabulary: one "<key><value>" line per vocab entry, written to ./vocab_map
+vocab_file = open('vocab_map', 'w')
+for k, v in vocab.items():
+    print >> vocab_file, '%s%s' % (k, v)  # NOTE(review): no delimiter between key and value as written here -- confirm the readers of vocab_map can split the two
+vocab_file.close()
+
+print >> logs, 'vocabulary size=%d' % len(vocab)
+num_label = len(data_helper.LabelVocab)
+print >> logs, 'output labels = %d' % num_label
+
+
+X_data, y_data = data_helper.reshape_data(x, y, vocab, context_size, step_size)
+X_dev_data, y_dev_data = data_helper.reshape_data(x_dev, y_dev, vocab, context_size, step_size)
+print >> logs, 'training data shape %s' % str(X_data.shape)
+
+num_epoch = 100
+
+lstm_model = lstm.setup_lstm_model(ctx=mx.gpu(1), num_lstm_layer=1,
+                                   step_size=step_size,
+                                   context_size=context_size,
+                                   num_hidden=num_hidden,
+                                   num_embed=num_embed,
+                                   num_label=num_label,
+                                   batch_size=batch_size,
+                                   vocab_size=len(vocab),
+                                   initializer=mx.initializer.Uniform(0.1),
+                                   dropout=0.5)
+
+# default optimizer is RMSProp, you can choose SGD with learning_rate=0.1
+lstm.train_lstm(lstm_model, X_data, y_data, X_dev_data, y_dev_data, num_epoch=num_epoch,
+		optimizer='rmsprop', learning_rate=0.001)

From 9f2a5d31442bad0060b7717fec4084da9043a39e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=A8=E5=8D=9A=20=28Yang=20Bo=29?= <pop.atry@gmail.com>
Date: Sun, 12 Jun 2016 14:24:06 +0800
Subject: [PATCH 003/126] Fix a typo

---
 docs/zh/system/note_data_loading.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/zh/system/note_data_loading.md b/docs/zh/system/note_data_loading.md
index a48040df98d3..738a6e23eb44 100644
--- a/docs/zh/system/note_data_loading.md
+++ b/docs/zh/system/note_data_loading.md
@@ -113,7 +113,7 @@ InputSplit 需要下面的几个参数:
 
 ### Hide IO Cost Using Threadediter
 
-掩藏 IO 开销的一种方式是主线程在做 feed-forward 和 backward 的时候, 使用一个独立的现成做数据预取操作. 为了支持更加复杂的训练方案, MXNet 提供了基于 dmlc-core 的 threadediter 更加通用的 IO 处理流水线.
+掩藏 IO 开销的一种方式是主线程在做 feed-forward 和 backward 的时候, 使用一个独立的线程做数据预取操作. 为了支持更加复杂的训练方案, MXNet 提供了基于 dmlc-core 的 threadediter 更加通用的 IO 处理流水线.
 
 Threadediter 的重点是使用一个独立的线程作为数据提供者, 主线程作为数据消费者, 图示如下.
 

From a1c101fc0ae83a4908ba28afaff6dcf803dd7171 Mon Sep 17 00:00:00 2001
From: Siyuan LIU <weifengzi2009@gmail.com>
Date: Mon, 13 Jun 2016 00:28:05 +0800
Subject: [PATCH 004/126] automatically adjust num of threads used by make
 (#2320)

---
 docker/cpu/Dockerfile  | 2 +-
 docker/cuda/Dockerfile | 2 +-
 docs/how_to/build.md   | 6 +++---
 docs/how_to/cloud.md   | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docker/cpu/Dockerfile b/docker/cpu/Dockerfile
index 10311cb31bf8..1e5a956450dc 100644
--- a/docker/cpu/Dockerfile
+++ b/docker/cpu/Dockerfile
@@ -6,7 +6,7 @@ RUN apt-get update && apt-get install -y build-essential git libopenblas-dev lib
 RUN git clone --recursive https://github.com/dmlc/mxnet/ && cd mxnet && \
     cp make/config.mk . && \
     echo "USE_BLAS=openblas" >>config.mk && \
-    make -j8
+    make -j$(nproc)
 
 # python pakcage
 RUN apt-get install -y python-numpy wget unzip
diff --git a/docker/cuda/Dockerfile b/docker/cuda/Dockerfile
index 8796b70aa1c9..fff84352bf16 100644
--- a/docker/cuda/Dockerfile
+++ b/docker/cuda/Dockerfile
@@ -9,7 +9,7 @@ RUN git clone --recursive https://github.com/dmlc/mxnet/ && cd mxnet && \
     echo "USE_CUDA_PATH=/usr/local/cuda" >>config.mk && \
     echo "USE_CUDNN=1" >>config.mk && \
     echo "USE_BLAS=openblas" >>config.mk && \
-    make -j8 ADD_LDFLAGS=-L/usr/local/cuda/lib64/stubs
+    make -j$(nproc) ADD_LDFLAGS=-L/usr/local/cuda/lib64/stubs
 ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:$LD_LIBRARY_PATH 
 
 # python pakcage
diff --git a/docs/how_to/build.md b/docs/how_to/build.md
index 458493c70672..7e9c2e873bbc 100644
--- a/docs/how_to/build.md
+++ b/docs/how_to/build.md
@@ -59,7 +59,7 @@ sudo apt-get install -y build-essential git libatlas-base-dev libopencv-dev
 Then build mxnet
 ```bash
 git clone --recursive https://github.com/dmlc/mxnet
-cd mxnet; make -j4
+cd mxnet; make -j$(nproc)
 ```
 
 ### Building on OSX
@@ -77,7 +77,7 @@ Then build mxnet
 
 ```bash
 git clone --recursive https://github.com/dmlc/mxnet
-cd mxnet; cp make/osx.mk ./config.mk; make -j4
+cd mxnet; cp make/osx.mk ./config.mk; make -j$(sysctl -n hw.ncpu)
 ```
 
 Troubleshooting:
@@ -95,7 +95,7 @@ ln -s path1 /usr/local/lib/libgomp.dylib
 
 ```
 
-then run `make -j4` again.
+then run `make -j$(sysctl -n hw.ncpu)` again.
 
 
 ### Building on Windows
diff --git a/docs/how_to/cloud.md b/docs/how_to/cloud.md
index 1f8bfe9907be..fc00f3872d0e 100644
--- a/docs/how_to/cloud.md
+++ b/docs/how_to/cloud.md
@@ -72,7 +72,7 @@ echo "USE_CUDNN=1" >>config.mk
 echo "USE_BLAS=atlas" >> config.mk
 echo "USE_DIST_KVSTORE = 1" >>config.mk
 echo "USE_S3=1" >>config.mk
-make -j8
+make -j$(nproc)
 ```
 
 To test whether everything is installed properly, we train a Convolutional neural network on MNIST using a GPU:

From 8cd4cc255bfb307c176573445ce3bb94db34ba1a Mon Sep 17 00:00:00 2001
From: Yizhi Liu <javelinjs@gmail.com>
Date: Mon, 13 Jun 2016 01:41:44 +0800
Subject: [PATCH 005/126] update contributors (#2402)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index a2578ea469a0..a9f61ae69bf0 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -107,3 +107,4 @@ List of Contributors
 * [Yuqi Li](https://github.com/ziyeqinghan)
 * [Depeng Liang](https://github.com/Ldpe2G)
 * [Kiko Qiu](https://github.com/kikoqiu)
+* [Yang Bo](https://github.com/Atry)

From d7fdf4022f202be18b39dd3ec099807ddc2a7841 Mon Sep 17 00:00:00 2001
From: Minjie Wang <wmjlyjemaine@gmail.com>
Date: Sun, 12 Jun 2016 19:14:31 -0400
Subject: [PATCH 006/126] 1. temporarily fix gcc 6.1 and cuda problem; 2. add
 power to ndarray

---
 Makefile                |  4 ++--
 python/mxnet/ndarray.py | 44 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 1ef81159ad07..0b0fd325c8ac 100644
--- a/Makefile
+++ b/Makefile
@@ -37,9 +37,9 @@ endif
 CFLAGS += -I$(ROOTDIR)/mshadow/ -I$(ROOTDIR)/dmlc-core/include -fPIC -Iinclude $(MSHADOW_CFLAGS)
 LDFLAGS = -pthread $(MSHADOW_LDFLAGS) $(DMLC_LDFLAGS)
 ifeq ($(DEBUG), 1)
-	NVCCFLAGS = -D_FORCE_INLINES -g -G -O0 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
+	NVCCFLAGS = -Xcompiler -std=c++98 -D_FORCE_INLINES -g -G -O0 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
 else
-	NVCCFLAGS = -D_FORCE_INLINES -g -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
+	NVCCFLAGS = -Xcompiler -std=c++98 -D_FORCE_INLINES -g -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
 endif
 
 ifndef LINT_LANG
diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py
index 12ad735ffc72..9f296fd12677 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray.py
@@ -160,6 +160,12 @@ def __idiv__(self, other):
         else:
             raise TypeError('type %s not supported' % str(type(other)))
 
+    def __pow__(self, other):
+        return power(self, other)
+
+    def __rpow__(self, other):
+        return power(other, self)
+
     def __truediv__(self, other):
         return self.__div__(other)
 
@@ -683,6 +689,44 @@ def divide(lhs, rhs):
         raise TypeError('type %s not supported' % str(type(rhs)))
     # pylint: enable= no-member, protected-access
 
+def power(lhs, rhs):
+    """ Perform power operator
+
+    Parameters
+    ----------
+    lhs : Array or float value
+        left hand side operand
+
+    rhs : Array of float value
+        right hand side operand
+
+    Returns
+    -------
+    out: Array
+        result array
+    """
+    # pylint: disable= no-member, protected-access
+    if isinstance(lhs, numeric_types):
+        if isinstance(rhs, numeric_types):
+            return lhs ** rhs
+        elif isinstance(rhs, NDArray):
+            return NDArray._rpower_scalar(rhs, float(lhs))
+        else:
+            raise TypeError('type %s not supported' % str(type(rhs)))
+    elif isinstance(rhs, numeric_types):
+        return NDArray._power_scalar(lhs, float(rhs))
+    elif isinstance(rhs, NDArray):
+        lsize = functools.reduce(operator.mul, lhs.shape)
+        rsize = functools.reduce(operator.mul, rhs.shape)
+        if lsize < rsize:
+            lhs = lhs.broadcast_to(rhs.shape)
+        elif lsize > rsize:
+            rhs = rhs.broadcast_to(lhs.shape)
+        return NDArray._power(lhs, rhs)
+    else:
+        raise TypeError('type %s not supported' % str(type(rhs)))
+    # pylint: enable= no-member, protected-access
+
 def true_divide(lhs, rhs):
     """ Same as numpy's true_divide. It adjusts the output type to present the best answer,
     regardless of input types.

From 9a42e15ef1618315ca319dea79c3a2ba50ea06db Mon Sep 17 00:00:00 2001
From: Minjie Wang <wmjlyjemaine@gmail.com>
Date: Sun, 12 Jun 2016 22:48:55 -0400
Subject: [PATCH 007/126] add ufunc_helper that helps all broadcasting issues;
 fix div operator in python3

---
 python/mxnet/ndarray.py | 183 +++++++++++++++++++---------------------
 1 file changed, 85 insertions(+), 98 deletions(-)

diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py
index 9f296fd12677..7ab7b185271f 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray.py
@@ -2,6 +2,7 @@
 # pylint: disable= too-many-lines, redefined-builtin
 """NDArray API of mxnet."""
 from __future__ import absolute_import
+from __future__ import division
 
 import ctypes
 import warnings
@@ -144,13 +145,13 @@ def __imul__(self, other):
     def __rmul__(self, other):
         return self.__mul__(other)
 
-    def __div__(self, other):
+    def __truediv__(self, other):
         return divide(self, other)
 
-    def __rdiv__(self, other):
+    def __rtruediv__(self, other):
         return divide(other, self)
 
-    def __idiv__(self, other):
+    def __itruediv__(self, other):
         if not self.writable:
             raise ValueError('trying to divide from a readonly NDArray')
         if isinstance(other, NDArray):
@@ -166,9 +167,6 @@ def __pow__(self, other):
     def __rpow__(self, other):
         return power(other, self)
 
-    def __truediv__(self, other):
-        return self.__div__(other)
-
     def __getstate__(self):
         this = self.__dict__.copy()
         handle = this['handle']
@@ -541,42 +539,85 @@ def empty(shape, ctx=None, dtype=mx_real_t):
         ctx = Context.default_ctx
     return NDArray(handle=_new_alloc_handle(shape, ctx, False, dtype))
 
-def add(lhs, rhs):
-    """ Perform element-wise addition
+def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None):
+    """ Helper function for element-wise operation
+    The function will perform numpy-like broadcasting if needed and call different functions
 
     Parameters
     ----------
-    lhs : Array or float value
-        left hand side operand
+    lhs : NDArray or numeric value
+        left hande side operand
 
-    rhs : Array of float value
+    rhs : NDArray or numeric value
         right hand side operand
 
+    fn_array : function
+        function to be called if both lhs and rhs are of NDArray type
+
+    fn_scalar : function
+        function to be called if both lhs and rhs are numeric values
+
+    lfn_scalar : function
+        function to be called if lhs is NDArray while rhs is numeric value
+
+    rfn_scalar : function
+        function to be called if lhs is numeric value while rhs is NDArray;
+        if none is provided, then the function is commutative, so rfn_scalar is equal to lfn_scalar
+
     Returns
     -------
-    out: Array
+    out: NDArray
         result array
     """
     # pylint: disable= no-member, protected-access
     if isinstance(lhs, numeric_types):
         if isinstance(rhs, numeric_types):
-            return lhs + rhs
+            return fn_scalar(lhs, rhs)
         else:
-            return add(rhs, lhs)
+            if rfn_scalar is None:
+                # commutative function
+                return lfn_scalar(rhs, float(lhs))
+            else:
+                return rfn_scalar(rhs, float(lhs))
     elif isinstance(rhs, numeric_types):
-        return NDArray._plus_scalar(lhs, float(rhs))
+        return lfn_scalar(lhs, float(rhs))
     elif isinstance(rhs, NDArray):
+        # check whether broadcasting is needed
         lsize = functools.reduce(operator.mul, lhs.shape)
         rsize = functools.reduce(operator.mul, rhs.shape)
         if lsize < rsize:
             lhs = lhs.broadcast_to(rhs.shape)
         elif lsize > rsize:
             rhs = rhs.broadcast_to(lhs.shape)
-        return NDArray._plus(lhs, rhs)
+        return fn_array(lhs, rhs)
     else:
         raise TypeError('type %s not supported' % str(type(rhs)))
     # pylint: enable= no-member, protected-access
 
+def add(lhs, rhs):
+    """ Perform element-wise addition
+
+    Parameters
+    ----------
+    lhs : Array or float value
+        left hand side operand
+
+    rhs : Array of float value
+        right hand side operand
+
+    Returns
+    -------
+    out: Array
+        result array
+    """
+    return _ufunc_helper(
+            lhs,
+            rhs,
+            NDArray._plus,
+            operator.add,
+            NDArray._plus_scalar,
+            None)
+
 def subtract(lhs, rhs):
     """ Perform element-wise subtract
 
@@ -593,27 +634,13 @@ def subtract(lhs, rhs):
     out: Array
         result array
     """
-    # pylint: disable= no-member, protected-access
-    if isinstance(lhs, numeric_types):
-        if isinstance(rhs, numeric_types):
-            return lhs - rhs
-        elif isinstance(rhs, NDArray):
-            return NDArray._rminus_scalar(rhs, float(lhs))
-        else:
-            raise TypeError('type %s not supported' % str(type(rhs)))
-    elif isinstance(rhs, numeric_types):
-        return NDArray._minus_scalar(lhs, float(rhs))
-    elif isinstance(rhs, NDArray):
-        lsize = functools.reduce(operator.mul, lhs.shape)
-        rsize = functools.reduce(operator.mul, rhs.shape)
-        if lsize < rsize:
-            lhs = lhs.broadcast_to(rhs.shape)
-        elif lsize > rsize:
-            rhs = rhs.broadcast_to(lhs.shape)
-        return NDArray._minus(lhs, rhs)
-    else:
-        raise TypeError('type %s not supported' % str(type(rhs)))
-    # pylint: enable= no-member, protected-access
+    return _ufunc_helper(
+            lhs,
+            rhs,
+            NDArray._minus,
+            operator.sub,
+            NDArray._minus_scalar,
+            NDArray._rminus_scalar)
 
 def multiply(lhs, rhs):
     """ Perform element-wise multiplication
@@ -631,25 +658,13 @@ def multiply(lhs, rhs):
     out: Array
         result array
     """
-    # pylint: disable= no-member, protected-access
-    if isinstance(lhs, numeric_types):
-        if isinstance(rhs, numeric_types):
-            return lhs * rhs
-        else:
-            return multiply(rhs, lhs)
-    elif isinstance(rhs, numeric_types):
-        return NDArray._mul_scalar(lhs, float(rhs))
-    elif isinstance(rhs, NDArray):
-        lsize = functools.reduce(operator.mul, lhs.shape)
-        rsize = functools.reduce(operator.mul, rhs.shape)
-        if lsize < rsize:
-            lhs = lhs.broadcast_to(rhs.shape)
-        elif lsize > rsize:
-            rhs = rhs.broadcast_to(lhs.shape)
-        return NDArray._mul(lhs, rhs)
-    else:
-        raise TypeError('type %s not supported' % str(type(rhs)))
-    # pylint: enable= no-member, protected-access
+    return _ufunc_helper(
+            lhs,
+            rhs,
+            NDArray._mul,
+            operator.mul,
+            NDArray._mul_scalar,
+            None)
 
 def divide(lhs, rhs):
     """ Perform element-wise divide
@@ -667,27 +682,13 @@ def divide(lhs, rhs):
     out: Array
         result array
     """
-    # pylint: disable= no-member, protected-access
-    if isinstance(lhs, numeric_types):
-        if isinstance(rhs, numeric_types):
-            return lhs / rhs
-        elif isinstance(rhs, NDArray):
-            return NDArray._rdiv_scalar(rhs, float(lhs))
-        else:
-            raise TypeError('type %s not supported' % str(type(rhs)))
-    elif isinstance(rhs, numeric_types):
-        return NDArray._div_scalar(lhs, float(rhs))
-    elif isinstance(rhs, NDArray):
-        lsize = functools.reduce(operator.mul, lhs.shape)
-        rsize = functools.reduce(operator.mul, rhs.shape)
-        if lsize < rsize:
-            lhs = lhs.broadcast_to(rhs.shape)
-        elif lsize > rsize:
-            rhs = rhs.broadcast_to(lhs.shape)
-        return NDArray._div(lhs, rhs)
-    else:
-        raise TypeError('type %s not supported' % str(type(rhs)))
-    # pylint: enable= no-member, protected-access
+    return _ufunc_helper(
+            lhs,
+            rhs,
+            NDArray._div,
+            operator.truediv,
+            NDArray._div_scalar,
+            NDArray._rdiv_scalar)
 
 def power(lhs, rhs):
     """ Perform power operator
@@ -705,27 +706,13 @@ def power(lhs, rhs):
     out: Array
         result array
     """
-    # pylint: disable= no-member, protected-access
-    if isinstance(lhs, numeric_types):
-        if isinstance(rhs, numeric_types):
-            return lhs ** rhs
-        elif isinstance(rhs, NDArray):
-            return NDArray._rpower_scalar(rhs, float(lhs))
-        else:
-            raise TypeError('type %s not supported' % str(type(rhs)))
-    elif isinstance(rhs, numeric_types):
-        return NDArray._power_scalar(lhs, float(rhs))
-    elif isinstance(rhs, NDArray):
-        lsize = functools.reduce(operator.mul, lhs.shape)
-        rsize = functools.reduce(operator.mul, rhs.shape)
-        if lsize < rsize:
-            lhs = lhs.broadcast_to(rhs.shape)
-        elif lsize > rsize:
-            rhs = rhs.broadcast_to(lhs.shape)
-        return NDArray._power(lhs, rhs)
-    else:
-        raise TypeError('type %s not supported' % str(type(rhs)))
-    # pylint: enable= no-member, protected-access
+    return _ufunc_helper(
+            lhs,
+            rhs,
+            NDArray._power,
+            operator.pow,
+            NDArray._power_scalar,
+            NDArray._rpower_scalar)
 
 def true_divide(lhs, rhs):
     """ Same as numpy's true_divide. It adjusts the output type to present the best answer,

From eb8b3eca71918a6a2a66665df6625671b7504f0c Mon Sep 17 00:00:00 2001
From: Minjie Wang <wmjlyjemaine@gmail.com>
Date: Sun, 12 Jun 2016 22:55:05 -0400
Subject: [PATCH 008/126] add maximum operator

---
 python/mxnet/ndarray.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py
index 7ab7b185271f..d8166386a6f2 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray.py
@@ -714,6 +714,31 @@ def power(lhs, rhs):
             NDArray._power_scalar,
             NDArray._rpower_scalar)
 
+def maximum(lhs, rhs):
+    """ Perform maximum operator
+
+    Parameters
+    ----------
+    lhs : Array or float value
+        left hand side operand
+
+    rhs : Array of float value
+        right hand side operand
+
+    Returns
+    -------
+    out: Array
+        result array
+    """
+    return _ufunc_helper(
+            lhs,
+            rhs,
+            NDArray._maximum,
+            lambda x, y: x if x > y else y,
+            NDArray._maximum_scalar,
+            None)
+
+
 def true_divide(lhs, rhs):
     """ Same as numpy's true_divide. It adjusts the output type to present the best answer,
     regardless of input types.

From 1a0d91410e095f0226c328c2d6c292043847f56f Mon Sep 17 00:00:00 2001
From: Minjie Wang <wmjlyjemaine@gmail.com>
Date: Mon, 13 Jun 2016 01:19:44 -0400
Subject: [PATCH 009/126] fix lint and python2 division overload

---
 python/mxnet/ndarray.py | 126 +++++++++++++++++++++++-----------------
 1 file changed, 74 insertions(+), 52 deletions(-)

diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py
index d8166386a6f2..7a3f5a7f1f67 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray.py
@@ -17,6 +17,7 @@
 from .base import check_call, ctypes2docstring
 from .context import Context
 
+# pylint: disable= no-member
 _DTYPE_NP_TO_MX = {
     np.float32 : 0,
     np.float64 : 1,
@@ -32,6 +33,7 @@
     3 : np.uint8,
     4 : np.int32
 }
+# pylint: enable= no-member
 
 def _new_empty_handle():
     """Return a new empty handle.
@@ -145,13 +147,13 @@ def __imul__(self, other):
     def __rmul__(self, other):
         return self.__mul__(other)
 
-    def __truediv__(self, other):
+    def __div__(self, other):
         return divide(self, other)
 
-    def __rtruediv__(self, other):
+    def __rdiv__(self, other):
         return divide(other, self)
 
-    def __itruediv__(self, other):
+    def __idiv__(self, other):
         if not self.writable:
             raise ValueError('trying to divide from a readonly NDArray')
         if isinstance(other, NDArray):
@@ -161,6 +163,15 @@ def __itruediv__(self, other):
         else:
             raise TypeError('type %s not supported' % str(type(other)))
 
+    def __truediv__(self, other):
+        return divide(self, other)
+
+    def __rtruediv__(self, other):
+        return divide(other, self)
+
+    def __itruediv__(self, other):
+        return self.__idiv__(other)
+
     def __pow__(self, other):
         return power(self, other)
 
@@ -309,12 +320,12 @@ def broadcast_to(self, shape):
         if len(shape) < len(cur_shape):
             raise ValueError(err_str)
         cur_shape = (1,) * (len(shape) - len(cur_shape)) + cur_shape
-        cur_shape = np.array(cur_shape)
+        cur_shape_arr = np.array(cur_shape)
         shape = np.array(shape)
-        broadcasting_axes = np.nonzero(cur_shape != shape)
-        if (cur_shape[broadcasting_axes] != 1).any():
+        broadcasting_axes = np.nonzero(cur_shape_arr != shape)
+        if (cur_shape_arr[broadcasting_axes] != 1).any():
             raise ValueError(err_str)
-        ret = self.reshape(tuple(cur_shape))
+        ret = self.reshape(tuple(cur_shape_arr))
         for axis in broadcasting_axes[0]:
             ret = broadcast_axis(ret, axis=axis, size=shape[axis])
         return ret
@@ -539,6 +550,7 @@ def empty(shape, ctx=None, dtype=mx_real_t):
         ctx = Context.default_ctx
     return NDArray(handle=_new_alloc_handle(shape, ctx, False, dtype))
 
+#pylint: disable= too-many-arguments, no-member, protected-access
 def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None):
     """ Helper function for element-wise operation
     The function will perform numpy-like broadcasting if needed and call different functions
@@ -569,7 +581,6 @@ def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None):
     out: NDArray
         result array
     """
-    # pylint: disable= no-member, protected-access
     if isinstance(lhs, numeric_types):
         if isinstance(rhs, numeric_types):
             return fn_scalar(lhs, rhs)
@@ -592,7 +603,7 @@ def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None):
         return fn_array(lhs, rhs)
     else:
         raise TypeError('type %s not supported' % str(type(rhs)))
-    # pylint: enable= no-member, protected-access
+#pylint: enable= too-many-arguments, no-member, protected-access
 
 def add(lhs, rhs):
     """ Perform element-wise addition
@@ -610,13 +621,15 @@ def add(lhs, rhs):
     out: Array
         result array
     """
+    # pylint: disable= no-member, protected-access
     return _ufunc_helper(
-            lhs,
-            rhs,
-            NDArray._plus,
-            operator.add,
-            NDArray._plus_scalar,
-            None)
+        lhs,
+        rhs,
+        NDArray._plus,
+        operator.add,
+        NDArray._plus_scalar,
+        None)
+    # pylint: enable= no-member, protected-access
 
 def subtract(lhs, rhs):
     """ Perform element-wise subtract
@@ -634,13 +647,15 @@ def subtract(lhs, rhs):
     out: Array
         result array
     """
+    # pylint: disable= no-member, protected-access
     return _ufunc_helper(
-            lhs,
-            rhs,
-            NDArray._minus,
-            operator.sub,
-            NDArray._minus_scalar,
-            NDArray._rminus_scalar)
+        lhs,
+        rhs,
+        NDArray._minus,
+        operator.sub,
+        NDArray._minus_scalar,
+        NDArray._rminus_scalar)
+    # pylint: enable= no-member, protected-access
 
 def multiply(lhs, rhs):
     """ Perform element-wise multiplication
@@ -658,13 +673,15 @@ def multiply(lhs, rhs):
     out: Array
         result array
     """
+    # pylint: disable= no-member, protected-access
     return _ufunc_helper(
-            lhs,
-            rhs,
-            NDArray._mul,
-            operator.mul,
-            NDArray._mul_scalar,
-            None)
+        lhs,
+        rhs,
+        NDArray._mul,
+        operator.mul,
+        NDArray._mul_scalar,
+        None)
+    # pylint: enable= no-member, protected-access
 
 def divide(lhs, rhs):
     """ Perform element-wise divide
@@ -682,13 +699,15 @@ def divide(lhs, rhs):
     out: Array
         result array
     """
+    # pylint: disable= no-member, protected-access
     return _ufunc_helper(
-            lhs,
-            rhs,
-            NDArray._div,
-            operator.truediv,
-            NDArray._div_scalar,
-            NDArray._rdiv_scalar)
+        lhs,
+        rhs,
+        NDArray._div,
+        operator.truediv,
+        NDArray._div_scalar,
+        NDArray._rdiv_scalar)
+    # pylint: enable= no-member, protected-access
 
 def power(lhs, rhs):
     """ Perform power operator
@@ -706,13 +725,15 @@ def power(lhs, rhs):
     out: Array
         result array
     """
+    # pylint: disable= no-member, protected-access
     return _ufunc_helper(
-            lhs,
-            rhs,
-            NDArray._power,
-            operator.pow,
-            NDArray._power_scalar,
-            NDArray._rpower_scalar)
+        lhs,
+        rhs,
+        NDArray._power,
+        operator.pow,
+        NDArray._power_scalar,
+        NDArray._rpower_scalar)
+    # pylint: enable= no-member, protected-access
 
 def maximum(lhs, rhs):
     """ Perform maximum operator
@@ -730,14 +751,15 @@ def maximum(lhs, rhs):
     out: Array
         result array
     """
+    # pylint: disable= no-member, protected-access
     return _ufunc_helper(
-            lhs,
-            rhs,
-            NDArray._maximum,
-            lambda x, y: x if x > y else y,
-            NDArray._maximum_scalar,
-            None)
-
+        lhs,
+        rhs,
+        NDArray._maximum,
+        lambda x, y: x if x > y else y,
+        NDArray._maximum_scalar,
+        None)
+    # pylint: enable= no-member, protected-access
 
 def true_divide(lhs, rhs):
     """ Same as numpy's true_divide. It adjusts the output type to present the best answer,
@@ -806,11 +828,11 @@ def _reduce(arr, axis=None, keepdims=False, typ='sum'):
     out: Array
         The reduced NDArray.
     """
-    if 'sum' == typ:
+    if typ == 'sum':
         reduce_func = sum_axis
-    elif 'max' == typ:
+    elif typ == 'max':
         reduce_func = max_axis
-    elif 'min' == typ:
+    elif typ == 'min':
         reduce_func = min_axis
     else:
         raise TypeError('typ=\'%s\' is not supported.' % typ)
@@ -833,7 +855,7 @@ def _reduce(arr, axis=None, keepdims=False, typ='sum'):
     for i in axis:
         if not isinstance(i, int):
             raise TypeError('\'%s\' object cannot be interpreted as an integer' % type(i).__name__)
-    axis = sorted([x if 0 <= x else x + ndim for x in axis])
+    axis = sorted([x if x >= 0 else x + ndim for x in axis])
     for i in axis:
         if i < 0 or ndim <= i:
             raise ValueError('\'axis\' entry is out of bounds')
@@ -1179,7 +1201,7 @@ def binary_ndarray_function(lhs, rhs, out=None, **kwargs):
         """Internal binary function
         """
         if out:
-            if isinstance(out, NDArray) == False:
+            if not isinstance(out, NDArray):
                 raise TypeError('out must be NDArray')
             if not out.writable:
                 raise TypeError('out must be writable')
@@ -1200,7 +1222,7 @@ def binary_ndarray_function(lhs, rhs, out=None, **kwargs):
     def unary_ndarray_function(src, out=None, *args, **kwargs):
         """internal NDArray function"""
         if out:
-            if isinstance(out, NDArray) == False:
+            if not isinstance(out, NDArray):
                 raise TypeError('out must be NDArray')
             if not out.writable:
                 raise TypeError('out must be writable')

From 28b91881523c13e329b093277c4c64ecb9a28760 Mon Sep 17 00:00:00 2001
From: qiao hai-jun <qiaohaijun@users.noreply.github.com>
Date: Mon, 13 Jun 2016 18:22:52 +0800
Subject: [PATCH 010/126] fix clang unsupported option -fopenmp issue

clang: error: unsupported option '-fopenmp'
---
 Makefile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Makefile b/Makefile
index 0b0fd325c8ac..43ff9beec9cd 100644
--- a/Makefile
+++ b/Makefile
@@ -22,7 +22,6 @@ endif
 include $(config)
 include mshadow/make/mshadow.mk
 include $(DMLC_CORE)/make/dmlc.mk
-unexport NO_OPENMP
 
 # all tge possible warning tread
 WARNFLAGS= -Wall

From edb32aacb0dd45b4f0654365b614c1214fbeeaf8 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Thu, 9 Jun 2016 11:51:33 +0900
Subject: [PATCH 011/126] enable other DTypes in Concat

---
 src/operator/channel_op_common.h | 28 +++++++-------
 src/operator/concat-inl.h        | 63 ++++++++++++++++++++++++++------
 src/operator/concat.cc           | 17 +++++++--
 src/operator/concat.cu           |  8 +++-
 4 files changed, 85 insertions(+), 31 deletions(-)

diff --git a/src/operator/channel_op_common.h b/src/operator/channel_op_common.h
index 249b07b54632..9ae6a6602c2e 100644
--- a/src/operator/channel_op_common.h
+++ b/src/operator/channel_op_common.h
@@ -14,15 +14,15 @@
 namespace mxnet {
 namespace op {
 
-template<typename xpu, int dim, int cdim>
-inline void concatenate_helper(const std::vector<mshadow::Tensor<xpu, dim> > &input,
-                               mshadow::Tensor<xpu, dim> *output, const int dimension,
+template<typename xpu, int dim, int cdim, typename DType>
+inline void concatenate_helper(const std::vector<mshadow::Tensor<xpu, dim, DType> > &input,
+                               mshadow::Tensor<xpu, dim, DType> *output, const int dimension,
                                const OpReqType req) {
   using mshadow::expr::concat;
   using mshadow::expr::slice;
 
   if (dimension == cdim) {
-    mshadow::Tensor<xpu, dim> out = *output;
+    mshadow::Tensor<xpu, dim, DType> out = *output;
     size_t size = input.size();
     index_t begin = 0;
     for (index_t i = 0; i < size; ++i) {
@@ -35,9 +35,9 @@ inline void concatenate_helper(const std::vector<mshadow::Tensor<xpu, dim> > &in
   }
 }
 
-template<typename xpu, int dim>
-inline void Concatenate(const std::vector<mshadow::Tensor<xpu, dim> > &input,
-                        mshadow::Tensor<xpu, dim> *output, const int dimension,
+template<typename xpu, int dim, typename DType>
+inline void Concatenate(const std::vector<mshadow::Tensor<xpu, dim, DType> > &input,
+                        mshadow::Tensor<xpu, dim, DType> *output, const int dimension,
                         const OpReqType req) {
   if (dimension < 0) {
     LOG(FATAL) << "dimension (" << dimension << ") must be greater than 0";
@@ -49,15 +49,15 @@ inline void Concatenate(const std::vector<mshadow::Tensor<xpu, dim> > &input,
 }
 
 
-template<typename xpu, int dim, int cdim>
-void split_helper(const mshadow::Tensor<xpu, dim> &input,
-           std::vector<mshadow::Tensor<xpu, dim> > *output,
+template<typename xpu, int dim, int cdim, typename DType>
+void split_helper(const mshadow::Tensor<xpu, dim, DType> &input,
+           std::vector<mshadow::Tensor<xpu, dim, DType> > *output,
            const int dimension, const std::vector<OpReqType> &req) {
   using mshadow::expr::concat;
   using mshadow::expr::slice;
 
   if (dimension == cdim) {
-    std::vector<mshadow::Tensor<xpu, dim> > out = *output;
+    std::vector<mshadow::Tensor<xpu, dim, DType> > out = *output;
     size_t size = out.size();
     index_t begin = 0;
     for (index_t i = 0; i < size; ++i) {
@@ -70,9 +70,9 @@ void split_helper(const mshadow::Tensor<xpu, dim> &input,
   }
 }
 
-template<typename xpu, int dim>
-void Split(const mshadow::Tensor<xpu, dim> &input,
-           std::vector<mshadow::Tensor<xpu, dim> > *output,
+template<typename xpu, int dim, typename DType>
+void Split(const mshadow::Tensor<xpu, dim, DType> &input,
+           std::vector<mshadow::Tensor<xpu, dim, DType> > *output,
            const int dimension, const std::vector<OpReqType> &req) {
   if (dimension < 0) {
     LOG(FATAL) << "dimension (" << dimension << ") must be greater than 0";
diff --git a/src/operator/concat-inl.h b/src/operator/concat-inl.h
index 5ece2bbfe9df..f8de862ea3cb 100644
--- a/src/operator/concat-inl.h
+++ b/src/operator/concat-inl.h
@@ -36,7 +36,7 @@ struct ConcatParam : public dmlc::Parameter<ConcatParam> {
   }
 };  // struct ConcatParam
 
-template<typename xpu>
+template<typename xpu, typename DType>
 class ConcatOp : public Operator {
  public:
   explicit ConcatOp(ConcatParam param)
@@ -53,8 +53,8 @@ class ConcatOp : public Operator {
     CHECK_EQ(out_data.size(), 1);
     CHECK_LT(dimension_, in_data[concat_enum::kData0].ndim());
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    std::vector<Tensor<xpu, 3> > data(size_);
-    Tensor<xpu, 3> out;
+    std::vector<Tensor<xpu, 3, DType> > data(size_);
+    Tensor<xpu, 3, DType> out;
     size_t leading = 1, trailing = 1;
     for (int i = 0; i < dimension_; ++i) {
       leading *= out_data[concat_enum::kOut].shape_[i];
@@ -64,11 +64,11 @@ class ConcatOp : public Operator {
     }
     size_t mid = out_data[concat_enum::kOut].shape_[dimension_];
     Shape<3> oshape = Shape3(leading, mid, trailing);
-    out = out_data[concat_enum::kOut].get_with_shape<xpu, 3, real_t>(oshape, s);
+    out = out_data[concat_enum::kOut].get_with_shape<xpu, 3, DType>(oshape, s);
 
     for (int i = 0; i < size_; ++i) {
       Shape<3> dshape = Shape3(leading, in_data[i].shape_[dimension_], trailing);
-      data[i] = in_data[i].get_with_shape<xpu, 3, real_t>(dshape, s);
+      data[i] = in_data[i].get_with_shape<xpu, 3, DType>(dshape, s);
     }
     Concatenate(data, &out, 1, req[concat_enum::kOut]);
   }
@@ -85,8 +85,8 @@ class ConcatOp : public Operator {
     CHECK_EQ(out_grad.size(), 1);
     CHECK_EQ(in_grad.size(), static_cast<size_t>(size_));
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    std::vector<Tensor<xpu, 3> > grad_in(size_);
-    Tensor<xpu, 3> grad;
+    std::vector<Tensor<xpu, 3, DType> > grad_in(size_);
+    Tensor<xpu, 3, DType> grad;
     size_t leading = 1, trailing = 1;
     for (int i = 0; i < dimension_; ++i) {
       leading *= out_grad[concat_enum::kOut].shape_[i];
@@ -96,11 +96,11 @@ class ConcatOp : public Operator {
     }
     size_t mid = out_grad[concat_enum::kOut].shape_[dimension_];
     Shape<3> oshape = Shape3(leading, mid, trailing);
-    grad = out_grad[concat_enum::kOut].get_with_shape<xpu, 3, real_t>(oshape, s);
+    grad = out_grad[concat_enum::kOut].get_with_shape<xpu, 3, DType>(oshape, s);
 
     for (int i = 0; i < size_; ++i) {
       Shape<3> dshape = Shape3(leading, in_grad[i].shape_[dimension_], trailing);
-      grad_in[i] = in_grad[i].get_with_shape<xpu, 3, real_t>(dshape, s);
+      grad_in[i] = in_grad[i].get_with_shape<xpu, 3, DType>(dshape, s);
     }
     Split(grad, &grad_in, 1, req);
   }
@@ -111,7 +111,7 @@ class ConcatOp : public Operator {
 };  // class ConcatOp
 
 template<typename xpu>
-Operator *CreateOp(ConcatParam param);
+Operator *CreateOp(ConcatParam param, int dtype);
 
 #if DMLC_USE_CXX11
 class ConcatProp : public OperatorProperty {
@@ -162,6 +162,41 @@ class ConcatProp : public OperatorProperty {
     return true;
   }
 
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {  // one dtype for all in/aux/out
+    int dtype = -1;  // -1: no specified input type seen yet
+
+    for (size_t i = 0; i < in_type->size(); ++i) {
+      if (dtype == -1) {
+        dtype = in_type->at(i);  // keeps scanning until an entry other than -1 is found
+      } else {  // once a type is chosen, later entries must equal it or still be -1
+        CHECK(in_type->at(i) == dtype ||
+              in_type->at(i) == -1) <<
+              "Non-uniform data type in Concat";
+      }
+    }
+
+    if (dtype == -1) {  // every entry was -1 (or the list was empty)
+      LOG(FATAL) << "Not enough information to infer type in Concat.";
+      return false;
+    }
+
+    size_t nin = this->ListArguments().size();
+    in_type->clear();  // rebuilt below so that every argument carries dtype
+    for (size_t i = 0; i < nin; ++i) in_type->push_back(dtype);
+
+    size_t naux = this->ListAuxiliaryStates().size();
+    aux_type->clear();  // auxiliary states get the same dtype
+    for (size_t i = 0; i < naux; ++i) aux_type->push_back(dtype);
+
+    size_t nout = this->ListOutputs().size();
+    out_type->clear();  // outputs get the same dtype
+    for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype);
+
+    return true;
+  }
+
   OperatorProperty* Copy() const override {
     auto ptr = new ConcatProp();
     ptr->param_ = param_;
@@ -179,7 +214,13 @@ class ConcatProp : public OperatorProperty {
     return out_grad;
   }
 
-  Operator* CreateOperator(Context ctx) const override;
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not implemented";  // creation without type information is unsupported
+    return NULL;  // the dtype-aware CreateOperatorEx is the implemented entry point
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
 
  private:
   ConcatParam param_;
diff --git a/src/operator/concat.cc b/src/operator/concat.cc
index 6ea9bc974c5e..579443e3bd13 100644
--- a/src/operator/concat.cc
+++ b/src/operator/concat.cc
@@ -10,12 +10,21 @@
 namespace mxnet {
 namespace op {
 template<>
-Operator* CreateOp<cpu>(ConcatParam param) {
-  return new ConcatOp<cpu>(param);
+Operator* CreateOp<cpu>(ConcatParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new ConcatOp<cpu, DType>(param);
+  });
+  return op;
 }
 
-Operator* ConcatProp::CreateOperator(Context ctx) const {
-  DO_BIND_DISPATCH(CreateOp, param_);
+Operator* ConcatProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                       std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  CHECK(InferType(in_type, &out_type, &aux_type));
+  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
 }
 
 DMLC_REGISTER_PARAMETER(ConcatParam);
diff --git a/src/operator/concat.cu b/src/operator/concat.cu
index 4e24b45cc676..fb3cf3862f3e 100644
--- a/src/operator/concat.cu
+++ b/src/operator/concat.cu
@@ -10,8 +10,12 @@
 namespace mxnet {
 namespace op {
 template<>
-Operator* CreateOp<gpu>(ConcatParam param) {
-  return new ConcatOp<gpu>(param);
+Operator* CreateOp<gpu>(ConcatParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new ConcatOp<gpu, DType>(param);
+  });
+  return op;
 }
 
 }  // namespace op

From f5bd5bb2fff8038f9b1d7c9e5864b108d150c7f3 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Thu, 9 Jun 2016 13:25:50 +0900
Subject: [PATCH 012/126] enable other DTypes in UpSampling

---
 src/operator/upsampling-inl.h | 45 ++++++++++++++++++++------
 src/operator/upsampling.cc    | 60 ++++++++++++++++++++---------------
 src/operator/upsampling.cu    | 51 +++++++++++++++--------------
 3 files changed, 97 insertions(+), 59 deletions(-)

diff --git a/src/operator/upsampling-inl.h b/src/operator/upsampling-inl.h
index 59a0a5a8a406..743427b7a942 100644
--- a/src/operator/upsampling-inl.h
+++ b/src/operator/upsampling-inl.h
@@ -62,7 +62,7 @@ struct UpSamplingParam : public dmlc::Parameter<UpSamplingParam> {
   }
 };  // struct UpSamplingParam
 
-template<typename xpu>
+template<typename xpu, typename DType>
 class UpSamplingNearestOp : public Operator {
  public:
   explicit UpSamplingNearestOp(UpSamplingParam p) {
@@ -82,11 +82,11 @@ class UpSamplingNearestOp : public Operator {
       return;
     }
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4> out = out_data[up_enum::kOut].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4, DType> out = out_data[up_enum::kOut].get<xpu, 4, DType>(s);
     if (param_.num_args > 1) {
       int begin = 0;
       for (int i = 0; i < param_.num_args; ++i) {
-        Tensor<xpu, 4> data = in_data[i].get<xpu, 4, real_t>(s);
+        Tensor<xpu, 4, DType> data = in_data[i].get<xpu, 4, DType>(s);
         int end = begin + data.size(1);
         int scale = out_data[up_enum::kOut].size(2)/in_data[i].size(2);
         if (param_.multi_input_mode == up_enum::kSum) {
@@ -101,7 +101,7 @@ class UpSamplingNearestOp : public Operator {
         begin = end;
       }
     } else {
-      Tensor<xpu, 4> data = in_data[up_enum::kData].get<xpu, 4, real_t>(s);
+      Tensor<xpu, 4, DType> data = in_data[up_enum::kData].get<xpu, 4, DType>(s);
       Assign(out, req[up_enum::kOut], upsampling_nearest(data, param_.scale));
     }
   }
@@ -118,11 +118,11 @@ class UpSamplingNearestOp : public Operator {
     CHECK_EQ(out_grad.size(), 1);
     CHECK_EQ(in_grad.size(), param_.num_args);
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4> grad = out_grad[up_enum::kOut].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4, DType> grad = out_grad[up_enum::kOut].get<xpu, 4, DType>(s);
     if (param_.num_args > 1) {
       int begin = 0;
       for (int i = 0; i < param_.num_args; ++i) {
-        Tensor<xpu, 4> input_grad = in_grad[i].get<xpu, 4, real_t>(s);
+        Tensor<xpu, 4, DType> input_grad = in_grad[i].get<xpu, 4, DType>(s);
         mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]);
         int end = begin + input_grad.size(1);
         int scale = grad.size(2)/in_shape[0];
@@ -146,7 +146,7 @@ class UpSamplingNearestOp : public Operator {
         begin = end;
       }
     } else {
-      Tensor<xpu, 4> input_grad = in_grad[up_enum::kData].get<xpu, 4, real_t>(s);
+      Tensor<xpu, 4, DType> input_grad = in_grad[up_enum::kData].get<xpu, 4, DType>(s);
       mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]);
       Assign(input_grad, req[up_enum::kData],
              pool<mshadow::red::sum>(grad,
@@ -163,7 +163,7 @@ class UpSamplingNearestOp : public Operator {
 };  // class UpSamplingNearestOp
 
 template<typename xpu>
-Operator *CreateOp(UpSamplingParam param);
+Operator *CreateOp(UpSamplingParam param, int dtype);
 
 
 #if DMLC_USE_CXX11
@@ -232,6 +232,26 @@ class UpSamplingProp : public OperatorProperty {
     return true;
   }
 
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {  // inputs/output share a dtype
+    CHECK_GE(in_type->size(), 1);
+    int dtype = (*in_type)[0];  // the first input's type is the reference type
+    CHECK_NE(dtype, -1) << "First input must have specified type";
+    for (index_t i = 0; i < in_type->size(); ++i) {
+      if ((*in_type)[i] == -1) {
+        (*in_type)[i] = dtype;  // fill unspecified inputs with the reference type
+      } else {  // an already specified input must agree with the reference type
+        CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. "
+                                       << "Expected " << dtype << " v.s. given "
+                                       << (*in_type)[i] << " at " << ListArguments()[i];
+      }
+    }
+    out_type->clear();
+    out_type->push_back(dtype);  // exactly one output type is reported
+    return true;  // aux_type is left exactly as it was passed in
+  }
+
   OperatorProperty* Copy() const override {
     auto ptr = new UpSamplingProp();
     ptr->param_ = this->param_;
@@ -279,7 +299,14 @@ class UpSamplingProp : public OperatorProperty {
     }
   }
 
-  Operator* CreateOperator(Context ctx) const override;
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented";  // creation without type information is unsupported
+    return NULL;  // the dtype-aware CreateOperatorEx is the implemented entry point
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
+
 
  private:
   UpSamplingParam param_;
diff --git a/src/operator/upsampling.cc b/src/operator/upsampling.cc
index d69e7e99c040..77373a820219 100644
--- a/src/operator/upsampling.cc
+++ b/src/operator/upsampling.cc
@@ -11,34 +11,42 @@
 namespace mxnet {
 namespace op {
 template<>
-Operator *CreateOp<cpu>(UpSamplingParam param) {
-  if (param.sample_type == up_enum::kNearest) {
-    return new UpSamplingNearestOp<cpu>(param);
-  } else if (param.sample_type == up_enum::kBilinear) {
-    DeconvolutionParam p = DeconvolutionParam();
-    int kernel = 2 * param.scale - param.scale % 2;
-    int stride = param.scale;
-    int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
-    p.workspace = param.workspace;
-    p.num_group = param.num_filter;
-    p.num_filter = param.num_filter;
-    p.no_bias =  true;
-    int shape[] = {1, 1};
-    shape[0] = shape[1] = kernel;
-    p.kernel = TShape(shape, shape + 2);
-    shape[0] = shape[1] = stride;
-    p.stride = TShape(shape, shape + 2);
-    shape[0] = shape[1] = pad;
-    p.pad = TShape(shape, shape + 2);
-    return new DeconvolutionOp<cpu, real_t>(p);
-  } else {
-    LOG(FATAL) << "Unknown sample type";
-    return NULL;
-  }
+Operator *CreateOp<cpu>(UpSamplingParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    if (param.sample_type == up_enum::kNearest) {
+      op = new UpSamplingNearestOp<cpu, DType>(param);
+    } else if (param.sample_type == up_enum::kBilinear) {
+      DeconvolutionParam p = DeconvolutionParam();
+      int kernel = 2 * param.scale - param.scale % 2;
+      int stride = param.scale;
+      int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
+      p.workspace = param.workspace;
+      p.num_group = param.num_filter;
+      p.num_filter = param.num_filter;
+      p.no_bias =  true;
+      int shape[] = {1, 1};
+      shape[0] = shape[1] = kernel;
+      p.kernel = TShape(shape, shape + 2);
+      shape[0] = shape[1] = stride;
+      p.stride = TShape(shape, shape + 2);
+      shape[0] = shape[1] = pad;
+      p.pad = TShape(shape, shape + 2);
+      op = new DeconvolutionOp<cpu, DType>(p);
+    } else {
+      LOG(FATAL) << "Unknown sample type";
+    }
+  });
+  return op;
 }
 
-Operator* UpSamplingProp::CreateOperator(Context ctx) const {
-  DO_BIND_DISPATCH(CreateOp, param_);
+Operator* UpSamplingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                           std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferType(in_type, &out_type, &aux_type));
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
 }
 
 DMLC_REGISTER_PARAMETER(UpSamplingParam);
diff --git a/src/operator/upsampling.cu b/src/operator/upsampling.cu
index 526f3a91de84..95864e430010 100644
--- a/src/operator/upsampling.cu
+++ b/src/operator/upsampling.cu
@@ -11,30 +11,33 @@
 namespace mxnet {
 namespace op {
 template<>
-Operator *CreateOp<gpu>(UpSamplingParam param) {
-  if (param.sample_type == up_enum::kNearest) {
-    return new UpSamplingNearestOp<gpu>(param);
-  } else if (param.sample_type == up_enum::kBilinear) {
-    DeconvolutionParam p = DeconvolutionParam();
-    int kernel = 2 * param.scale - param.scale % 2;
-    int stride = param.scale;
-    int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
-    p.workspace = param.workspace;
-    p.num_group = param.num_filter;
-    p.num_filter = param.num_filter;
-    p.no_bias =  true;
-    int shape[] = {1, 1};
-    shape[0] = shape[1] = kernel;
-    p.kernel = TShape(shape, shape + 2);
-    shape[0] = shape[1] = stride;
-    p.stride = TShape(shape, shape + 2);
-    shape[0] = shape[1] = pad;
-    p.pad = TShape(shape, shape + 2);
-    return new DeconvolutionOp<gpu, real_t>(p);
-  } else {
-    LOG(FATAL) << "Unknown sample type";
-    return NULL;
-  }
+Operator *CreateOp<gpu>(UpSamplingParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    if (param.sample_type == up_enum::kNearest) {
+      op = new UpSamplingNearestOp<gpu, DType>(param);
+    } else if (param.sample_type == up_enum::kBilinear) {
+      DeconvolutionParam p = DeconvolutionParam();
+      int kernel = 2 * param.scale - param.scale % 2;
+      int stride = param.scale;
+      int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
+      p.workspace = param.workspace;
+      p.num_group = param.num_filter;
+      p.num_filter = param.num_filter;
+      p.no_bias =  true;
+      int shape[] = {1, 1};
+      shape[0] = shape[1] = kernel;
+      p.kernel = TShape(shape, shape + 2);
+      shape[0] = shape[1] = stride;
+      p.stride = TShape(shape, shape + 2);
+      shape[0] = shape[1] = pad;
+      p.pad = TShape(shape, shape + 2);
+      op = new DeconvolutionOp<gpu, DType>(p);
+    } else {
+      LOG(FATAL) << "Unknown sample type";
+    }
+  });
+  return op;
 }
 
 }  // namespace op

From 0c56be2d3ba27742b54c690055c55286970ef8a1 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Fri, 10 Jun 2016 11:31:17 +0900
Subject: [PATCH 013/126] add tests for multiple dtypes for Concat and
 UpSampling

---
 tests/python/gpu/test_operator_gpu.py | 47 ++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 7bf532d8468c..ad7de89924b1 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -64,9 +64,6 @@ def check_speed(sym, ctx, scale=1.0, N=100):
         exe.outputs[0].wait_to_read()
     return (time.time() - tic)*1.0/N
 
-
-
-
 def test_convolution_with_type():
     sym = mx.sym.Convolution(num_filter=3, kernel=(3,3), name='conv')
     ctx_list = [{'ctx': mx.gpu(0), 'conv_data': (2, 2, 10, 10), 'type_dict': {'conv_data': np.float64}},
@@ -77,13 +74,36 @@ def test_convolution_with_type():
     check_consistency(sym, ctx_list)
 
 def test_deconvolution_with_type():
-    sym = mx.sym.Deconvolution(num_filter=2, kernel=(3,3), name='conv')
-    ctx_list = [{'ctx': mx.gpu(0), 'conv_data': (2, 2, 10, 10), 'type_dict': {'conv_data': np.float64}},
-                {'ctx': mx.gpu(0), 'conv_data': (2, 2, 10, 10), 'type_dict': {'conv_data': np.float32}},
-                {'ctx': mx.gpu(0), 'conv_data': (2, 2, 10, 10), 'type_dict': {'conv_data': np.float16}},
-                {'ctx': mx.cpu(0), 'conv_data': (2, 2, 10, 10), 'type_dict': {'conv_data': np.float64}},
-                {'ctx': mx.cpu(0), 'conv_data': (2, 2, 10, 10), 'type_dict': {'conv_data': np.float32}}]
-    check_type_consistency(sym, ctx_list)
+    sym = mx.sym.Deconvolution(num_filter=2, kernel=(3,3), name='deconv')
+    ctx_list = [{'ctx': mx.gpu(0), 'deconv_data': (2, 2, 10, 10), 'type_dict': {'deconv_data': np.float64}},
+                {'ctx': mx.gpu(0), 'deconv_data': (2, 2, 10, 10), 'type_dict': {'deconv_data': np.float32}},
+                {'ctx': mx.gpu(0), 'deconv_data': (2, 2, 10, 10), 'type_dict': {'deconv_data': np.float16}},
+                {'ctx': mx.cpu(0), 'deconv_data': (2, 2, 10, 10), 'type_dict': {'deconv_data': np.float64}},
+                {'ctx': mx.cpu(0), 'deconv_data': (2, 2, 10, 10), 'type_dict': {'deconv_data': np.float32}}]
+    check_consistency(sym, ctx_list)
+
+def test_upsampling_with_type():  # nearest-mode UpSampling: GPU f64/f32/f16, CPU f64/f32
+    sym = mx.sym.UpSampling(scale=2, num_filter=2, name='up', sample_type = 'nearest', num_args=1)
+    ctx_list = [{'ctx': mx.gpu(0), 'up_arg0': (2, 2, 2, 10), 'type_dict': {'up_arg0': np.float64}},
+                {'ctx': mx.gpu(0), 'up_arg0': (2, 2, 2, 10), 'type_dict': {'up_arg0': np.float32}},
+                {'ctx': mx.gpu(0), 'up_arg0': (2, 2, 2, 10), 'type_dict': {'up_arg0': np.float16}},
+                {'ctx': mx.cpu(0), 'up_arg0': (2, 2, 2, 10), 'type_dict': {'up_arg0': np.float64}},
+                {'ctx': mx.cpu(0), 'up_arg0': (2, 2, 2, 10), 'type_dict': {'up_arg0': np.float32}}]
+    check_consistency(sym, ctx_list)  # helper not shown here; presumably cross-checks the configs
+
+def test_concat_with_type():  # two-input Concat: GPU f64/f32/f16, CPU f64/f32
+    sym = mx.sym.Concat(name='concat', num_args=2)  # inputs are concat_arg0 and concat_arg1
+    ctx_list = [{'ctx': mx.gpu(0), 'concat_arg1': (2, 10), 'concat_arg0': (2, 10),
+                 'type_dict': {'concat_arg0': np.float64, 'concat_arg1': np.float64}},
+                {'ctx': mx.gpu(0), 'concat_arg1': (2, 10), 'concat_arg0': (2, 10),
+                 'type_dict': {'concat_arg0': np.float32, 'concat_arg1': np.float32}},
+                {'ctx': mx.gpu(0), 'concat_arg1': (2, 10), 'concat_arg0': (2, 10),
+                 'type_dict': {'concat_arg0': np.float16, 'concat_arg1': np.float16}},
+                {'ctx': mx.cpu(0), 'concat_arg1': (2, 10), 'concat_arg0': (2, 10),
+                 'type_dict': {'concat_arg0': np.float64, 'concat_arg1': np.float64}},
+                {'ctx': mx.cpu(0), 'concat_arg1': (2, 10), 'concat_arg0': (2, 10),
+                 'type_dict': {'concat_arg0': np.float32, 'concat_arg1': np.float32}}]
+    check_consistency(sym, ctx_list)  # helper not shown here; presumably cross-checks the configs
 
 def test_fullyconnected_with_type():
     sym = mx.sym.FullyConnected(num_hidden=3, name='inner')
@@ -107,7 +127,10 @@ def test_activation_with_type():
 if __name__ == '__main__':
     test_convolution_with_type()
     test_deconvolution_with_type()
+    test_upsampling_with_type()
+    test_concat_with_type()
     test_fullyconnected_with_type()
     test_activation_with_type()
-	#test_softmax_with_shape((3,4), mx.gpu())
-    #test_multi_softmax_with_shape((3,4,5), mx.gpu())
\ No newline at end of file
+    #test_softmax_with_shape((3,4), mx.gpu())
+    #test_multi_softmax_with_shape((3,4,5), mx.gpu())
+

From a825c57856a34f7a9347e911c1625d30f7681088 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Fri, 10 Jun 2016 15:58:43 +0900
Subject: [PATCH 014/126] enable other DTypes in Reshape

---
 src/operator/reshape-inl.h            | 32 +++++++++++++++++++++------
 src/operator/reshape.cc               | 17 ++++++++++----
 src/operator/reshape.cu               |  8 +++++--
 tests/python/gpu/test_operator_gpu.py | 10 +++++++++
 4 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/src/operator/reshape-inl.h b/src/operator/reshape-inl.h
index 4ddc310e46a0..b61224f7ec3b 100644
--- a/src/operator/reshape-inl.h
+++ b/src/operator/reshape-inl.h
@@ -128,7 +128,7 @@ struct ReshapeParam : public dmlc::Parameter<ReshapeParam> {
   }
 };
 
-template<typename xpu>
+template<typename xpu, typename DType>
 class ReshapeOp : public Operator {
  public:
   explicit ReshapeOp(ReshapeParam param) {}  // Do nothing
@@ -145,8 +145,8 @@ class ReshapeOp : public Operator {
     CHECK_EQ(out_data.size(), 1);
     if (req[reshape_enum::kOut] == kNullOp) return;
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 2> data = in_data[reshape_enum::kData].FlatTo2D<xpu, real_t>(s);
-    Tensor<xpu, 2> out = out_data[reshape_enum::kOut].FlatTo2D<xpu, real_t>(s);
+    Tensor<xpu, 2, DType> data = in_data[reshape_enum::kData].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> out = out_data[reshape_enum::kOut].FlatTo2D<xpu, DType>(s);
     CHECK_EQ(data.CheckContiguous(), true);
     CHECK_EQ(out.CheckContiguous(), true);
     if (data.dptr_ == out.dptr_) return;
@@ -168,8 +168,8 @@ class ReshapeOp : public Operator {
     CHECK_EQ(out_grad.size(), 1);
     CHECK_EQ(in_grad.size(), 1);
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 2> grad_in = in_grad[reshape_enum::kOut].FlatTo2D<xpu, real_t>(s);
-    Tensor<xpu, 2> grad_out = out_grad[reshape_enum::kData].FlatTo2D<xpu, real_t>(s);
+    Tensor<xpu, 2, DType> grad_in = in_grad[reshape_enum::kOut].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> grad_out = out_grad[reshape_enum::kData].FlatTo2D<xpu, DType>(s);
     CHECK_EQ(grad_out.CheckContiguous(), true);
     CHECK_EQ(grad_in.CheckContiguous(), true);
     if (grad_out.dptr_ == grad_in.dptr_) return;
@@ -179,7 +179,7 @@ class ReshapeOp : public Operator {
 };  // class ReshapeOp
 
 template<typename xpu>
-Operator* CreateOp(ReshapeParam);
+Operator* CreateOp(ReshapeParam, int dtype);
 
 #if DMLC_USE_CXX11
 class ReshapeProp : public OperatorProperty {
@@ -275,6 +275,18 @@ class ReshapeProp : public OperatorProperty {
     return true;
   }
 
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_EQ(in_type->size(), 1);  // exactly one input type entry is expected
+    int dtype = (*in_type)[0];
+    CHECK_NE(dtype, -1) << "First input must have specified type";  // -1: type not specified
+    // The single output type is a copy of the input type; aux_type is not touched.
+    out_type->clear();
+    out_type->push_back(dtype);
+    return true;
+  }
+
   OperatorProperty* Copy() const override {
     auto ptr = new ReshapeProp();
     ptr->param_ = param_;
@@ -306,7 +318,13 @@ class ReshapeProp : public OperatorProperty {
     return {{out_grad[reshape_enum::kOut], in_grad[reshape_enum::kData]}};
   }
 
-  Operator* CreateOperator(Context ctx) const override;
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not implemented";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
 
  protected:
   ReshapeParam param_;
diff --git a/src/operator/reshape.cc b/src/operator/reshape.cc
index beee35bb0cf9..ff5579b9b286 100644
--- a/src/operator/reshape.cc
+++ b/src/operator/reshape.cc
@@ -11,12 +11,21 @@
 namespace mxnet {
 namespace op {
 template<>
-Operator *CreateOp<cpu>(ReshapeParam param) {
-  return new ReshapeOp<cpu>(param);
+Operator *CreateOp<cpu>(ReshapeParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new ReshapeOp<cpu, DType>(param);
+  });
+  return op;
 }
 
-Operator* ReshapeProp::CreateOperator(Context ctx) const {
-  DO_BIND_DISPATCH(CreateOp, param_);
+Operator* ReshapeProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                        std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  CHECK(InferType(in_type, &out_type, &aux_type));
+  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
 }
 
 DMLC_REGISTER_PARAMETER(ReshapeParam);
diff --git a/src/operator/reshape.cu b/src/operator/reshape.cu
index 06bbaec1fdfd..ab911749e684 100644
--- a/src/operator/reshape.cu
+++ b/src/operator/reshape.cu
@@ -11,8 +11,12 @@
 namespace mxnet {
 namespace op {
 template<>
-Operator *CreateOp<gpu>(ReshapeParam param) {
-  return new ReshapeOp<gpu>(param);
+Operator *CreateOp<gpu>(ReshapeParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new ReshapeOp<gpu, DType>(param);
+  });
+  return op;
 }
 
 }  // namespace op
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index ad7de89924b1..822f39743ee3 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -105,6 +105,15 @@ def test_concat_with_type():
                  'type_dict': {'concat_arg0': np.float32, 'concat_arg1': np.float32}}]
     check_consistency(sym, ctx_list)
 
+def test_reshape_with_type():
+    sym = mx.sym.Reshape(name='reshape', shape=(-1,1,1,0))
+    ctx_list = [{'ctx': mx.gpu(0), 'reshape_data': (2, 2, 2, 10), 'type_dict': {'reshape_data': np.float64}},
+                {'ctx': mx.gpu(0), 'reshape_data': (2, 2, 2, 10), 'type_dict': {'reshape_data': np.float32}},
+                {'ctx': mx.gpu(0), 'reshape_data': (2, 2, 2, 10), 'type_dict': {'reshape_data': np.float16}},
+                {'ctx': mx.cpu(0), 'reshape_data': (2, 2, 2, 10), 'type_dict': {'reshape_data': np.float64}},
+                {'ctx': mx.cpu(0), 'reshape_data': (2, 2, 2, 10), 'type_dict': {'reshape_data': np.float32}}]
+    check_consistency(sym, ctx_list)
+
 def test_fullyconnected_with_type():
     sym = mx.sym.FullyConnected(num_hidden=3, name='inner')
     ctx_list = [{'ctx': mx.gpu(0), 'inner_data': (2, 10), 'type_dict': {'inner_data': np.float64}},
@@ -129,6 +138,7 @@ def test_activation_with_type():
     test_deconvolution_with_type()
     test_upsampling_with_type()
     test_concat_with_type()
+    test_reshape_with_type()
     test_fullyconnected_with_type()
     test_activation_with_type()
     #test_softmax_with_shape((3,4), mx.gpu())

From 22ca4942c5802ef8f9e403f49badb57bd7a34937 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Fri, 10 Jun 2016 16:13:47 +0900
Subject: [PATCH 015/126] enable other DTypes in BlockGrad

---
 src/operator/block_grad-inl.h         | 29 +++++++++++++++++++++------
 src/operator/block_grad.cc            | 17 ++++++++++++----
 src/operator/block_grad.cu            |  8 ++++++--
 tests/python/gpu/test_operator_gpu.py | 10 +++++++++
 4 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/src/operator/block_grad-inl.h b/src/operator/block_grad-inl.h
index ff5262d4e04a..eaf39ce59ac4 100644
--- a/src/operator/block_grad-inl.h
+++ b/src/operator/block_grad-inl.h
@@ -24,7 +24,7 @@ enum BlockGradientOpInputs {kData};
 enum BlockGradientOpOutputs {kOut};
 }  // namespace blockgrad
 
-template<typename xpu>
+template<typename xpu, typename DType>
 class BlockGradientOp : public Operator {
  public:
   virtual void Forward(const OpContext &ctx,
@@ -37,8 +37,8 @@ class BlockGradientOp : public Operator {
     CHECK_EQ(in_data.size(), 1);
     CHECK_EQ(out_data.size(), 1);
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 2> data = in_data[blockgrad::kData].FlatTo2D<xpu, real_t>(s);
-    Tensor<xpu, 2> out = out_data[blockgrad::kOut].FlatTo2D<xpu, real_t>(s);
+    Tensor<xpu, 2, DType> data = in_data[blockgrad::kData].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> out = out_data[blockgrad::kOut].FlatTo2D<xpu, DType>(s);
     out = F<mshadow_op::identity>(data);
   }
 
@@ -52,13 +52,13 @@ class BlockGradientOp : public Operator {
     using namespace mshadow;
     using namespace mshadow::expr;
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 2> grad = in_grad[blockgrad::kData].FlatTo2D<xpu, real_t>(s);
+    Tensor<xpu, 2, DType> grad = in_grad[blockgrad::kData].FlatTo2D<xpu, DType>(s);
     grad = 0.f;
   }
 };  // class BlockGradientOp
 
 template<typename xpu>
-Operator *CreateOp();
+Operator *CreateOp(int dtype);
 
 #if DMLC_USE_CXX11
 class BlockGradientProp : public OperatorProperty {
@@ -81,6 +81,17 @@ class BlockGradientProp : public OperatorProperty {
     return true;
   }
 
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_EQ(in_type->size(), 1);  // exactly one input type entry is expected
+    int dtype = (*in_type)[0];
+    CHECK_NE(dtype, -1) << "Input must have specified type";  // -1: type not specified
+    out_type->clear();
+    out_type->push_back(dtype);  // the single output type is a copy of the input type
+    return true;
+  }
+
   OperatorProperty* Copy() const override {
     return new BlockGradientProp();
   }
@@ -102,7 +113,13 @@ class BlockGradientProp : public OperatorProperty {
     return {{in_data[blockgrad::kData], out_data[blockgrad::kOut]}};
   }
 
-  Operator* CreateOperator(Context ctx) const override;
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
 };  // class BlockGradientProperty
 
 #endif  // DMLC_USE_CXX11
diff --git a/src/operator/block_grad.cc b/src/operator/block_grad.cc
index 67256f79f268..764618f51622 100644
--- a/src/operator/block_grad.cc
+++ b/src/operator/block_grad.cc
@@ -9,12 +9,21 @@
 namespace mxnet {
 namespace op {
 template<>
-Operator *CreateOp<cpu>() {
-  return new BlockGradientOp<cpu>();
+Operator *CreateOp<cpu>(int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new BlockGradientOp<cpu, DType>();
+  });
+  return op;
 }
 
-Operator *BlockGradientProp::CreateOperator(Context ctx) const {
-  DO_BIND_DISPATCH(CreateOp);
+Operator *BlockGradientProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                              std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferType(in_type, &out_type, &aux_type));
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  DO_BIND_DISPATCH(CreateOp, in_type->at(0));
 }
 
 MXNET_REGISTER_OP_PROPERTY(BlockGrad, BlockGradientProp)
diff --git a/src/operator/block_grad.cu b/src/operator/block_grad.cu
index 22707e940b7e..af5fc1660797 100644
--- a/src/operator/block_grad.cu
+++ b/src/operator/block_grad.cu
@@ -9,8 +9,12 @@
 namespace mxnet {
 namespace op {
 template<>
-Operator *CreateOp<gpu>() {
-  return new BlockGradientOp<gpu>();
+Operator *CreateOp<gpu>(int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new BlockGradientOp<gpu, DType>();
+  });
+  return op;
 }
 
 }  // namespace op
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 822f39743ee3..44dbf0231233 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -114,6 +114,15 @@ def test_reshape_with_type():
                 {'ctx': mx.cpu(0), 'reshape_data': (2, 2, 2, 10), 'type_dict': {'reshape_data': np.float32}}]
     check_consistency(sym, ctx_list)
 
+def test_blockgrad_with_type():
+    sym = mx.sym.BlockGrad(name='bg')
+    ctx_list = [{'ctx': mx.gpu(0), 'bg_data': (2, 2, 2, 10), 'type_dict': {'bg_data': np.float64}},
+                {'ctx': mx.gpu(0), 'bg_data': (2, 2, 2, 10), 'type_dict': {'bg_data': np.float32}},
+                {'ctx': mx.gpu(0), 'bg_data': (2, 2, 2, 10), 'type_dict': {'bg_data': np.float16}},
+                {'ctx': mx.cpu(0), 'bg_data': (2, 2, 2, 10), 'type_dict': {'bg_data': np.float64}},
+                {'ctx': mx.cpu(0), 'bg_data': (2, 2, 2, 10), 'type_dict': {'bg_data': np.float32}}]
+    check_consistency(sym, ctx_list)
+
 def test_fullyconnected_with_type():
     sym = mx.sym.FullyConnected(num_hidden=3, name='inner')
     ctx_list = [{'ctx': mx.gpu(0), 'inner_data': (2, 10), 'type_dict': {'inner_data': np.float64}},
@@ -139,6 +148,7 @@ def test_activation_with_type():
     test_upsampling_with_type()
     test_concat_with_type()
     test_reshape_with_type()
+    test_blockgrad_with_type()
     test_fullyconnected_with_type()
     test_activation_with_type()
     #test_softmax_with_shape((3,4), mx.gpu())

From 4605518c3cb129ff51ffa55493c31e61cc14b402 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Sat, 11 Jun 2016 10:46:25 +0900
Subject: [PATCH 016/126] enable other DTypes in SwapAxis

---
 src/operator/swapaxis-inl.h           | 27 ++++++++++++++++++++++-----
 src/operator/swapaxis.cc              | 17 +++++++++++++----
 src/operator/swapaxis.cu              |  8 ++++++--
 tests/python/gpu/test_operator_gpu.py | 10 ++++++++++
 4 files changed, 51 insertions(+), 11 deletions(-)

diff --git a/src/operator/swapaxis-inl.h b/src/operator/swapaxis-inl.h
index 6b3e0be2765b..fe301d1d186e 100644
--- a/src/operator/swapaxis-inl.h
+++ b/src/operator/swapaxis-inl.h
@@ -40,7 +40,7 @@ struct SwapAxisParam : public dmlc::Parameter<SwapAxisParam> {
 };
 
 
-template<typename xpu>
+template<typename xpu, typename DType>
 class SwapAxisOp : public Operator {
  public:
   explicit SwapAxisOp(SwapAxisParam p) {
@@ -99,12 +99,12 @@ class SwapAxisOp : public Operator {
 
     Reshape2Five(&inter_shape, shape_in, dim1, dim2);
 
-    Tensor<xpu, 5> inter_data_in = data_in.get_with_shape<xpu, 5, real_t>(inter_shape, s);
+    Tensor<xpu, 5, DType> inter_data_in = data_in.get_with_shape<xpu, 5, DType>(inter_shape, s);
 
     Shape<5> inter_shape2 = inter_shape;
     std::swap(inter_shape2[1], inter_shape2[3]);
 
-    Tensor<xpu, 5> inter_data_out = data_out.get_with_shape<xpu, 5, real_t>(inter_shape2, s);
+    Tensor<xpu, 5, DType> inter_data_out = data_out.get_with_shape<xpu, 5, DType>(inter_shape2, s);
 
     inter_data_out = swapaxis<3, 1>(inter_data_in);
   }
@@ -138,7 +138,7 @@ class SwapAxisOp : public Operator {
 
 
 template<typename xpu>
-Operator* CreateOp(SwapAxisParam param);
+Operator* CreateOp(SwapAxisParam param, int dtype);
 
 
 #if DMLC_USE_CXX11
@@ -171,6 +171,17 @@ class SwapAxisProp : public OperatorProperty {
     return true;
   }
 
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_EQ(in_type->size(), 1);  // exactly one input type entry is expected
+    int dtype = (*in_type)[0];
+    CHECK_NE(dtype, -1) << "Input must have specified type";  // -1: type not specified
+    out_type->clear();
+    out_type->push_back(dtype);  // the single output type is a copy of the input type
+    return true;
+  }
+
   OperatorProperty* Copy() const override {
     auto ptr = new SwapAxisProp();
     ptr->param_ = param_;
@@ -188,7 +199,13 @@ class SwapAxisProp : public OperatorProperty {
     return {out_grad[swapaxisenum::kOut]};
   };
 
-  Operator* CreateOperator(Context ctx) const override;
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
 
  private:
   SwapAxisParam param_;
diff --git a/src/operator/swapaxis.cc b/src/operator/swapaxis.cc
index 427e83e3619a..d2570da6a400 100644
--- a/src/operator/swapaxis.cc
+++ b/src/operator/swapaxis.cc
@@ -11,12 +11,21 @@ namespace mxnet {
 namespace op {
 
 template<>
-Operator* CreateOp<cpu>(SwapAxisParam param) {
-  return new SwapAxisOp<cpu>(param);
+Operator* CreateOp<cpu>(SwapAxisParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new SwapAxisOp<cpu, DType>(param);
+  });
+  return op;
 }
 
-Operator* SwapAxisProp::CreateOperator(Context ctx) const {
-  DO_BIND_DISPATCH(CreateOp, param_);
+Operator* SwapAxisProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                         std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  CHECK(InferType(in_type, &out_type, &aux_type));
+  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
 }
 
 
diff --git a/src/operator/swapaxis.cu b/src/operator/swapaxis.cu
index c27d3d2f7a1b..93f78c2e733d 100644
--- a/src/operator/swapaxis.cu
+++ b/src/operator/swapaxis.cu
@@ -11,8 +11,12 @@ namespace mxnet {
 namespace op {
 
 template<>
-Operator *CreateOp<gpu>(SwapAxisParam param) {
-  return new SwapAxisOp<gpu>(param);
+Operator *CreateOp<gpu>(SwapAxisParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op =  new SwapAxisOp<gpu, DType>(param);
+  });
+  return op;
 }
 
 }  // namespace op
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 44dbf0231233..09fc928e93b5 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -123,6 +123,15 @@ def test_blockgrad_with_type():
                 {'ctx': mx.cpu(0), 'bg_data': (2, 2, 2, 10), 'type_dict': {'bg_data': np.float32}}]
     check_consistency(sym, ctx_list)
 
+def test_swapaxis_with_type():
+    sym = mx.sym.SwapAxis(name='swap', dim1=1)
+    ctx_list = [{'ctx': mx.gpu(0), 'swap_data': (2, 2, 2, 10), 'type_dict': {'swap_data': np.float64}},
+                {'ctx': mx.gpu(0), 'swap_data': (2, 2, 2, 10), 'type_dict': {'swap_data': np.float32}},
+                {'ctx': mx.gpu(0), 'swap_data': (2, 2, 2, 10), 'type_dict': {'swap_data': np.float16}},
+                {'ctx': mx.cpu(0), 'swap_data': (2, 2, 2, 10), 'type_dict': {'swap_data': np.float64}},
+                {'ctx': mx.cpu(0), 'swap_data': (2, 2, 2, 10), 'type_dict': {'swap_data': np.float32}}]
+    check_consistency(sym, ctx_list)
+
 def test_fullyconnected_with_type():
     sym = mx.sym.FullyConnected(num_hidden=3, name='inner')
     ctx_list = [{'ctx': mx.gpu(0), 'inner_data': (2, 10), 'type_dict': {'inner_data': np.float64}},
@@ -149,6 +158,7 @@ def test_activation_with_type():
     test_concat_with_type()
     test_reshape_with_type()
     test_blockgrad_with_type()
+    test_swapaxis_with_type()
     test_fullyconnected_with_type()
     test_activation_with_type()
     #test_softmax_with_shape((3,4), mx.gpu())

From 7b847e5790cc35a091b9398d3f2b4d9612cf9dcd Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Sun, 12 Jun 2016 12:26:12 +0900
Subject: [PATCH 017/126] enable other DTypes in ElementWiseSum

---
 src/operator/elementwise_sum-inl.h    | 70 ++++++++++++++++++++-------
 src/operator/elementwise_sum.cc       | 17 +++++--
 src/operator/elementwise_sum.cu       |  8 ++-
 tests/python/gpu/test_operator_gpu.py | 16 ++++++
 4 files changed, 88 insertions(+), 23 deletions(-)

diff --git a/src/operator/elementwise_sum-inl.h b/src/operator/elementwise_sum-inl.h
index b7755485bea9..ebf33f90cc1c 100644
--- a/src/operator/elementwise_sum-inl.h
+++ b/src/operator/elementwise_sum-inl.h
@@ -34,7 +34,7 @@ struct ElementWiseSumParam : public dmlc::Parameter<ElementWiseSumParam> {
   }
 };
 
-template<typename xpu>
+template<typename xpu, typename DType>
 class ElementWiseSumOp : public Operator {
  public:
   explicit ElementWiseSumOp(ElementWiseSumParam param)
@@ -52,34 +52,34 @@ class ElementWiseSumOp : public Operator {
     if (req[elemsum::kOut] == kNullOp) return;
 
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 2> out = out_data[elemsum::kOut].FlatTo2D<xpu, real_t>(s);
+    Tensor<xpu, 2, DType> out = out_data[elemsum::kOut].FlatTo2D<xpu, DType>(s);
     switch (size_) {
       case 2: {
-        Tensor<xpu, 2> in_0 = in_data[elemsum::kData0].FlatTo2D<xpu, real_t>(s);
-        Tensor<xpu, 2> in_1 = in_data[elemsum::kData1].FlatTo2D<xpu, real_t>(s);
+        Tensor<xpu, 2, DType> in_0 = in_data[elemsum::kData0].FlatTo2D<xpu, DType>(s);
+        Tensor<xpu, 2, DType> in_1 = in_data[elemsum::kData1].FlatTo2D<xpu, DType>(s);
         Assign(out, req[elemsum::kOut], in_0 + in_1);
         break;
       }
       case 3: {
-        Tensor<xpu, 2> in_0 = in_data[elemsum::kData0].FlatTo2D<xpu, real_t>(s);
-        Tensor<xpu, 2> in_1 = in_data[elemsum::kData1].FlatTo2D<xpu, real_t>(s);
-        Tensor<xpu, 2> in_2 = in_data[elemsum::kData2].FlatTo2D<xpu, real_t>(s);
+        Tensor<xpu, 2, DType> in_0 = in_data[elemsum::kData0].FlatTo2D<xpu, DType>(s);
+        Tensor<xpu, 2, DType> in_1 = in_data[elemsum::kData1].FlatTo2D<xpu, DType>(s);
+        Tensor<xpu, 2, DType> in_2 = in_data[elemsum::kData2].FlatTo2D<xpu, DType>(s);
         Assign(out, req[elemsum::kOut], in_0 + in_1 + in_2);
         break;
       }
       case 4: {
-        Tensor<xpu, 2> in_0 = in_data[elemsum::kData0].FlatTo2D<xpu, real_t>(s);
-        Tensor<xpu, 2> in_1 = in_data[elemsum::kData1].FlatTo2D<xpu, real_t>(s);
-        Tensor<xpu, 2> in_2 = in_data[elemsum::kData2].FlatTo2D<xpu, real_t>(s);
-        Tensor<xpu, 2> in_3 = in_data[elemsum::kData3].FlatTo2D<xpu, real_t>(s);
+        Tensor<xpu, 2, DType> in_0 = in_data[elemsum::kData0].FlatTo2D<xpu, DType>(s);
+        Tensor<xpu, 2, DType> in_1 = in_data[elemsum::kData1].FlatTo2D<xpu, DType>(s);
+        Tensor<xpu, 2, DType> in_2 = in_data[elemsum::kData2].FlatTo2D<xpu, DType>(s);
+        Tensor<xpu, 2, DType> in_3 = in_data[elemsum::kData3].FlatTo2D<xpu, DType>(s);
         Assign(out, req[elemsum::kOut], in_0 + in_1 + in_2 + in_3);
         break;
       }
       default: {
-        Tensor<xpu, 2> in_0 = in_data[elemsum::kData0].FlatTo2D<xpu, real_t>(s);
+        Tensor<xpu, 2, DType> in_0 = in_data[elemsum::kData0].FlatTo2D<xpu, DType>(s);
         Assign(out, req[elemsum::kOut], F<mshadow_op::identity>(in_0));
         for (int i = 1; i < size_; ++i) {
-          out += in_data[i].FlatTo2D<xpu, real_t>(s);
+          out += in_data[i].FlatTo2D<xpu, DType>(s);
         }
         break;
       }
@@ -97,10 +97,10 @@ class ElementWiseSumOp : public Operator {
     using namespace mshadow::expr;
     CHECK_EQ(in_grad.size(), static_cast<size_t>(size_));
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 2> ograd = out_grad[elemsum::kOut].FlatTo2D<xpu, real_t>(s);
+    Tensor<xpu, 2, DType> ograd = out_grad[elemsum::kOut].FlatTo2D<xpu, DType>(s);
     for (int i = 0; i < size_; ++i) {
       if (req[i] == kNullOp || req[i] == kWriteInplace) continue;
-      Tensor<xpu, 2> igrad = in_grad[i].FlatTo2D<xpu, real_t>(s);
+      Tensor<xpu, 2, DType> igrad = in_grad[i].FlatTo2D<xpu, DType>(s);
       Assign(igrad, req[i], F<mshadow_op::identity>(ograd));
     }
   }
@@ -120,7 +120,7 @@ class ElementWiseSumOp : public Operator {
 };  // class ElementWiseSumOp
 
 template<typename xpu>
-Operator* CreateOp(ElementWiseSumParam param);
+Operator* CreateOp(ElementWiseSumParam param, int dtype);
 
 #if DMLC_USE_CXX11
 class ElementWiseSumProp : public OperatorProperty {
@@ -155,6 +155,36 @@ class ElementWiseSumProp : public OperatorProperty {
     return true;
   }
 
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    size_t nin = in_type->size();
+    CHECK_EQ(nin, static_cast<size_t>(param_.num_args));
+    // Take the first specified (non -1) input type as the common dtype.
+    int dtype = -1;
+    for (size_t i = 0; i < nin; ++i) {
+      if (dtype == -1) {
+        dtype = in_type->at(i);
+      } else {
+        CHECK(in_type->at(i) == dtype ||
+              in_type->at(i) == -1) <<
+              "This operator requires uniform type";
+      }
+    }
+
+    if (dtype == -1) {
+      LOG(FATAL) << "At least one input type needs to be known";
+      return false;
+    }
+    // Write the common dtype back to every input, filling entries that were -1.
+    in_type->clear();
+    for (size_t i = 0; i < nin; ++i) in_type->push_back(dtype);
+    // The single output uses the same dtype; aux_type is not touched.
+    out_type->clear();
+    out_type->push_back(dtype);
+    return true;
+  }
+
   std::vector<std::string> ListArguments() const override {
     std::vector<std::string> ret;
     for (int i = 0; i < param_.num_args; ++i) {
@@ -194,7 +224,13 @@ class ElementWiseSumProp : public OperatorProperty {
     return {{in_data[0], out_data[0]}};
   }
 
-  Operator* CreateOperator(Context ctx) const override;
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
 
  private:
   ElementWiseSumParam param_;
diff --git a/src/operator/elementwise_sum.cc b/src/operator/elementwise_sum.cc
index d8546148f76c..fe58cbc0e452 100644
--- a/src/operator/elementwise_sum.cc
+++ b/src/operator/elementwise_sum.cc
@@ -7,13 +7,22 @@
 namespace mxnet {
 namespace op {
 template<>
-Operator* CreateOp<cpu>(ElementWiseSumParam param) {
-  return new ElementWiseSumOp<cpu>(param);
+Operator* CreateOp<cpu>(ElementWiseSumParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new ElementWiseSumOp<cpu, DType>(param);
+  });
+  return op;
 }
 
 // DO_BIND_DISPATCH comes from static_operator_common.h
-Operator* ElementWiseSumProp::CreateOperator(Context ctx) const {
-  DO_BIND_DISPATCH(CreateOp, param_);
+Operator* ElementWiseSumProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                               std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  CHECK(InferType(in_type, &out_type, &aux_type));
+  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
 }
 
 DMLC_REGISTER_PARAMETER(ElementWiseSumParam);
diff --git a/src/operator/elementwise_sum.cu b/src/operator/elementwise_sum.cu
index 7a9b443dad82..ae373916b7d4 100644
--- a/src/operator/elementwise_sum.cu
+++ b/src/operator/elementwise_sum.cu
@@ -7,8 +7,12 @@
 namespace mxnet {
 namespace op {
 template<>
-Operator* CreateOp<gpu>(ElementWiseSumParam param) {
-  return new ElementWiseSumOp<gpu>(param);
+Operator* CreateOp<gpu>(ElementWiseSumParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new ElementWiseSumOp<gpu, DType>(param);
+  });
+  return op;
 }
 }  // namespace op
 }  // namespace mxnet
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 09fc928e93b5..afa1f0f8f3b3 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -105,6 +105,21 @@ def test_concat_with_type():
                  'type_dict': {'concat_arg0': np.float32, 'concat_arg1': np.float32}}]
     check_consistency(sym, ctx_list)
 
+def test_elementwisesum_with_type():
+    sym = mx.sym.ElementWiseSum(name='ews', num_args=2)
+    ctx_list = [{'ctx': mx.gpu(0), 'ews_arg1': (2, 10), 'ews_arg0': (2, 10),
+                 'type_dict': {'ews_arg0': np.float64, 'ews_arg1': np.float64}},
+                {'ctx': mx.gpu(0), 'ews_arg1': (2, 10), 'ews_arg0': (2, 10),
+                 'type_dict': {'ews_arg0': np.float32, 'ews_arg1': np.float32}},
+                {'ctx': mx.gpu(0), 'ews_arg1': (2, 10), 'ews_arg0': (2, 10),
+                 'type_dict': {'ews_arg0': np.float16, 'ews_arg1': np.float16}},
+                {'ctx': mx.cpu(0), 'ews_arg1': (2, 10), 'ews_arg0': (2, 10),
+                 'type_dict': {'ews_arg0': np.float64, 'ews_arg1': np.float64}},
+                {'ctx': mx.cpu(0), 'ews_arg1': (2, 10), 'ews_arg0': (2, 10),
+                 'type_dict': {'ews_arg0': np.float32, 'ews_arg1': np.float32}}]
+    check_consistency(sym, ctx_list)
+
+
 def test_reshape_with_type():
     sym = mx.sym.Reshape(name='reshape', shape=(-1,1,1,0))
     ctx_list = [{'ctx': mx.gpu(0), 'reshape_data': (2, 2, 2, 10), 'type_dict': {'reshape_data': np.float64}},
@@ -156,6 +171,7 @@ def test_activation_with_type():
     test_deconvolution_with_type()
     test_upsampling_with_type()
     test_concat_with_type()
+    test_elementwisesum_with_type()
     test_reshape_with_type()
     test_blockgrad_with_type()
     test_swapaxis_with_type()

From be8266d3171caa2b87c3e0698f1230a99d4dd3ed Mon Sep 17 00:00:00 2001
From: sxjscience <xshiab@ust.hk>
Date: Tue, 14 Jun 2016 18:02:15 +0800
Subject: [PATCH 018/126] Update submodules

---
 dmlc-core | 2 +-
 mshadow   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dmlc-core b/dmlc-core
index 9fd3b48462a7..c39001019e44 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit 9fd3b48462a7a651e12a197679f71e043dcb25a2
+Subproject commit c39001019e443c7a061789bd1180f58ce85fc3e6
diff --git a/mshadow b/mshadow
index 310e015e5c65..489748eee934 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit 310e015e5c65bd5314e57fc79ceb06b162547325
+Subproject commit 489748eee93435cf63ea657bb0933808109953c7

From 967e07d55fe32d4172c8b133794b6cd297455af4 Mon Sep 17 00:00:00 2001
From: Lin Xuan <515364970@qq.com>
Date: Thu, 16 Jun 2016 00:59:27 +0800
Subject: [PATCH 019/126] Revert "Enable training fast rcnn with multiple GPUs
 (#2358)" (#2425)

This reverts commit 004c2383a623496b319753c735bf04fd94f15874.
---
 example/rcnn/README.md                        | 18 +---
 .../helper/processing/image_processing.py     |  2 +-
 example/rcnn/rcnn/callback.py                 |  4 +-
 example/rcnn/rcnn/data_iter.py                | 46 ++++------
 example/rcnn/rcnn/metric.py                   | 16 +---
 example/rcnn/rcnn/minibatch.py                | 20 ++---
 example/rcnn/rcnn/solver.py                   | 88 ++++++++++++-------
 example/rcnn/tools/test_net.py                |  2 +-
 example/rcnn/tools/train_net.py               | 24 ++---
 example/rcnn/train.py                         | 14 ++-
 python/mxnet/executor_manager.py              | 54 ++----------
 python/mxnet/model.py                         | 14 +--
 12 files changed, 114 insertions(+), 188 deletions(-)

diff --git a/example/rcnn/README.md b/example/rcnn/README.md
index e35d09cb92e7..240234870dc8 100644
--- a/example/rcnn/README.md
+++ b/example/rcnn/README.md
@@ -45,12 +45,11 @@ https://github.com/rbgirshick/fast-rcnn/tree/master/data/demo
 * Start training by run `python train.py`. Variable args can be found by run
 `python train.py --help`.
 * Training can be done in cpu, modify `train.py` accordingly.
-* Training can be done in multiple gpus.
 ```
 usage: train.py [-h] [--image_set IMAGE_SET] [--year YEAR]
                 [--root_path ROOT_PATH] [--devkit_path DEVKIT_PATH]
                 [--pretrained PRETRAINED] [--epoch EPOCH] [--prefix PREFIX]
-                [--gpus GPU_ID] [--begin_epoch BEGIN_EPOCH]
+                [--gpu GPU_ID] [--begin_epoch BEGIN_EPOCH]
                 [--end_epoch END_EPOCH] [--frequent FREQUENT]
 
 Train a Fast R-CNN network
@@ -68,24 +67,13 @@ optional arguments:
                         pretrained model prefix
   --epoch EPOCH         epoch of pretrained model
   --prefix PREFIX       new model prefix
-  --gpus GPU_ID         GPU devices to train with
+  --gpu GPU_ID          GPU device to train with
   --begin_epoch BEGIN_EPOCH
                         begin epoch of training
   --end_epoch END_EPOCH
                         end epoch of training
   --frequent FREQUENT   frequency of logging
-  --kv_store KV_STORE   kv_store type used in multi-device training
-  --work_load_list WORK_LOAD_LIST
-                        list of work load for different devices
 ```
-- Performance in terms of training speed
-
- | GPUs | batch size | samples per second |
- | --- | --- | --- |
- | 1 | 2 | 3.02 |
- | 2 | 4 | 3.80 |
- | 4 | 8 | 5.96 |
-
 
 ## Testing
 * Start testing by run `python test.py`. Variable args can be found by run
@@ -141,4 +129,4 @@ This repository used code from [MXNet](https://github.com/dmlc/mxnet),
 [caffe](https://github.com/BVLC/caffe). Training data are from
 [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/),
 [ImageNet](http://image-net.org/). Model comes from
-[VGG16](http://www.robots.ox.ac.uk/~vgg/research/very_deep/).
+[VGG16](http://www.robots.ox.ac.uk/~vgg/research/very_deep/).
\ No newline at end of file
diff --git a/example/rcnn/helper/processing/image_processing.py b/example/rcnn/helper/processing/image_processing.py
index 8fbabd304e98..833c2c74caa5 100644
--- a/example/rcnn/helper/processing/image_processing.py
+++ b/example/rcnn/helper/processing/image_processing.py
@@ -12,7 +12,7 @@ def resize(im, target_size, max_size):
     """
     im_shape = im.shape
     im_size_min = np.min(im_shape[0:2])
-    im_size_max = np.max(im_shape[0:2])
+    im_size_max = np.min(im_shape[0:2])
     im_scale = float(target_size) / float(im_size_min)
     # prevent bigger axis from being more than max_size:
     if np.round(im_scale * im_size_max) > max_size:
diff --git a/example/rcnn/rcnn/callback.py b/example/rcnn/rcnn/callback.py
index 7b05628829ad..bf5a8e72d24d 100644
--- a/example/rcnn/rcnn/callback.py
+++ b/example/rcnn/rcnn/callback.py
@@ -22,8 +22,10 @@ def __call__(self, param):
                 speed = self.frequent * self.batch_size / (time.time() - self.tic)
                 if param.eval_metric is not None:
                     name, value = param.eval_metric.get()
+                    cls, cls_value = param.cls_metric.get()
+                    bbox, bbox_value = param.bbox_metric.get()
                     logging.info("Epoch[%d] Batch [%d]\tSpeed: %.2f samples/sec\tTrain-%s=%f,\t%s=%f,\t%s=%f",
-                                 param.epoch, count, speed, name[0], value[0], name[1], value[1], name[2], value[2])
+                                 param.epoch, count, speed, name, value, cls, cls_value, bbox, bbox_value)
                 else:
                     logging.info("Iter[%d] Batch [%d]\tSpeed: %.2f samples/sec",
                                  param.epoch, count, speed)
diff --git a/example/rcnn/rcnn/data_iter.py b/example/rcnn/rcnn/data_iter.py
index a1f022c6f5a0..765334b2090f 100644
--- a/example/rcnn/rcnn/data_iter.py
+++ b/example/rcnn/rcnn/data_iter.py
@@ -4,7 +4,7 @@
 
 
 class ROIIter(mx.io.DataIter):
-    def __init__(self, roidb, ctx, batch_size=2, shuffle=False, mode='train', work_load_list=None):
+    def __init__(self, roidb, batch_size=2, shuffle=False, mode='train'):
         """
         This Iter will provide roi data to Fast R-CNN network
         :param roidb: must be preprocessed
@@ -15,11 +15,9 @@ def __init__(self, roidb, ctx, batch_size=2, shuffle=False, mode='train', work_l
         super(ROIIter, self).__init__()
 
         self.roidb = roidb
-        self.ctx = ctx
         self.batch_size = batch_size
         self.shuffle = shuffle
         self.mode = mode
-        self.work_load_list = work_load_list
         if self.mode != 'train':
             assert self.batch_size == 1
 
@@ -32,17 +30,16 @@ def __init__(self, roidb, ctx, batch_size=2, shuffle=False, mode='train', work_l
         self.data = None
         self.label = None
         self.get_batch()
+        self.data_name = self.data.keys()
+        self.label_name = self.label.keys()
 
     @property
     def provide_data(self):
-        return [('data', self.data[0].shape), ('rois', self.data[1].shape)]
+        return [(k, v.shape) for k, v in self.data.items()]
 
     @property
     def provide_label(self):
-        return [('cls_prob_label', self.label[0].shape),
-                ('bbox_loss_target', self.label[1].shape),
-                ('bbox_loss_inside_weight', self.label[2].shape),
-                ('bbox_loss_outside_weight', self.label[3].shape)]
+        return [(k, v.shape) for k, v in self.label.items()]
 
     def reset(self):
         self.cur = 0
@@ -56,13 +53,8 @@ def next(self):
         if self.iter_next():
             self.get_batch()
             self.cur += self.batch_size
-            if self.mode == 'train':
-                return mx.io.DataBatch(data=self.data, label=self.label,
-                                       pad=self.getpad(), index=self.getindex(),
-                                       provide_data=self.provide_data, provide_label=self.provide_label)
-            else:
-                return mx.io.DataBatch(data=self.data, label=self.label,
-                                       pad=self.getpad(), index=self.getindex())
+            return mx.io.DataBatch(data=self.data, label=self.label,
+                                   pad=self.getpad(), index=self.getindex())
         else:
             raise StopIteration
 
@@ -70,17 +62,17 @@ def getindex(self):
         return self.cur / self.batch_size
 
     def getpad(self):
-        if self.cur + self.batch_size > self.size:
-            return self.cur + self.batch_size - self.size
-        else:
-            return 0
+        return self.batch_size - self.size % self.batch_size
 
     def get_batch(self):
         if self.mode == 'train':
             self.batch = self._get_train_batch()
-            self.data = [mx.nd.array(self.batch['data']), mx.nd.array(self.batch['rois'])]
-            self.label = [mx.nd.array(self.batch['labels']), mx.nd.array(self.batch['bbox_targets']),
-                mx.nd.array(self.batch['bbox_inside_weights']), mx.nd.array(self.batch['bbox_outside_weights'])]
+            self.data = {'data': self.batch['data'],
+                         'rois': self.batch['rois']}
+            self.label = {'cls_prob_label': self.batch['labels'],
+                          'bbox_loss_target': self.batch['bbox_targets'],
+                          'bbox_loss_inside_weight': self.batch['bbox_inside_weights'],
+                          'bbox_loss_outside_weight': self.batch['bbox_outside_weights']}
         else:
             self.batch = self._get_test_batch()
             self.data = {'data': self.batch['data'],
@@ -93,13 +85,9 @@ def _get_train_batch(self):
         :return: training batch (e.g. 128 samples)
         """
         cur_from = self.cur
-        cur_to = cur_from + self.batch_size
-        if cur_to <= self.size:
-            roidb = [self.roidb[i] for i in range(cur_from, cur_to)]
-        else:
-            pad = cur_to - self.size
-            roidb = self.roidb[cur_from:] + self.roidb[:pad]
-        batch = minibatch.get_minibatch(roidb, self.num_classes, self.ctx, self.work_load_list)
+        cur_to = min(cur_from + self.batch_size, self.size)
+        roidb = [self.roidb[i] for i in range(cur_from, cur_to)]
+        batch = minibatch.get_minibatch(roidb, self.num_classes)
         return batch
 
     def _get_test_batch(self):
diff --git a/example/rcnn/rcnn/metric.py b/example/rcnn/rcnn/metric.py
index c31e5533c04b..8bf5119dffc3 100644
--- a/example/rcnn/rcnn/metric.py
+++ b/example/rcnn/rcnn/metric.py
@@ -24,20 +24,8 @@ def __init__(self):
         super(SmoothL1LossMetric, self).__init__('SmoothL1Loss')
 
     def update(self, labels, preds):
-        bbox_loss = preds[1].asnumpy()
-        label = labels[1].asnumpy()
+        bbox_loss = preds[0].asnumpy()
+        label = labels[0].asnumpy()
         bbox_loss = np.sum(bbox_loss)
         self.sum_metric += bbox_loss
         self.num_inst += label.shape[0]
-
-
-class Accuracy(mx.metric.EvalMetric):
-    def __init__(self):
-        super(Accuracy, self).__init__('accuracy')
-
-    def update(self, labels, preds):
-        pred_label = mx.ndarray.argmax_channel(preds[0]).asnumpy().astype('int32')
-        label = labels[0].asnumpy().astype('int32')
-
-        self.sum_metric += (pred_label.flat == label.flat).sum()
-        self.num_inst += len(pred_label.flat)
diff --git a/example/rcnn/rcnn/minibatch.py b/example/rcnn/rcnn/minibatch.py
index fe9990e781ce..b47ec0a7cf07 100644
--- a/example/rcnn/rcnn/minibatch.py
+++ b/example/rcnn/rcnn/minibatch.py
@@ -22,9 +22,9 @@
 from helper.processing import image_processing
 from helper.processing.bbox_regression import expand_bbox_regression_targets
 from rcnn.config import config
-from mxnet.executor_manager import _split_input_slice
 
-def get_minibatch(roidb, num_classes, ctx, work_load_list=None):
+
+def get_minibatch(roidb, num_classes):
     """
     return minibatch of images in roidb
     :param roidb: subset of main database
@@ -40,30 +40,20 @@ def get_minibatch(roidb, num_classes, ctx, work_load_list=None):
 
     # im_array: [num_images, c, h, w]
     im_array, im_scales = get_image_array(roidb, config.TRAIN.SCALES, random_scale_indexes)
+
     rois_array = list()
     labels_array = list()
     bbox_targets_array = list()
     bbox_inside_array = list()
 
-    if work_load_list is None:
-        work_load_list = [1] * len(ctx)
-    assert isinstance(work_load_list, list) and len(work_load_list) == len(ctx), \
-        "Invalid settings for work load. "
-    slices = _split_input_slice(num_images, work_load_list)
-
-    idx_in_slice = []
-    for islice in slices:
-        num_im = islice.stop - islice.start
-        for i in range(num_im):
-            idx_in_slice.append(i)
-    for im_i, idx in enumerate(idx_in_slice):
+    for im_i in range(num_images):
         im_rois, labels, bbox_targets, bbox_inside_weights, overlaps = \
             sample_rois(roidb[im_i], fg_rois_per_image, rois_per_image, num_classes)
 
         # project im_rois
         # do not round roi
         rois = im_rois * im_scales[im_i]
-        batch_index = idx * np.ones((rois.shape[0], 1))
+        batch_index = im_i * np.ones((rois.shape[0], 1))
         rois_array_this_image = np.hstack((batch_index, rois))
         rois_array.append(rois_array_this_image)
 
diff --git a/example/rcnn/rcnn/solver.py b/example/rcnn/rcnn/solver.py
index f59e9422b1c6..8f5d6efbb5f8 100644
--- a/example/rcnn/rcnn/solver.py
+++ b/example/rcnn/rcnn/solver.py
@@ -6,14 +6,13 @@
 from callback import Speedometer
 from config import config
 
+
 class Solver(object):
     def __init__(self, prefix,
                  symbol, ctx=None,
                  begin_epoch=0, num_epoch=None,
-                 kv_store='local',
                  arg_params=None, aux_params=None,
-                 optimizer='sgd',
-                 max_data_shape=None, **kwargs):
+                 optimizer='sgd', **kwargs):
         self.prefix = prefix
         self.symbol = symbol
         self.ctx = ctx
@@ -21,67 +20,88 @@ def __init__(self, prefix,
             self.ctx = mx.cpu()
         self.begin_epoch = begin_epoch
         self.num_epoch = num_epoch
-        self.kv_store = kv_store
         self.arg_params = arg_params
         self.aux_params = aux_params
+        self.grad_params = None
+        self.executor = None
         self.optimizer = optimizer
         self.updater = None
-        self.max_data_shape = max_data_shape
         self.kwargs = kwargs.copy()
 
-        self.arg_names = None
-        self.param_names = None
-        self.aux_names = None
-
     def get_params(self, grad_req):
         arg_names = self.symbol.list_arguments()
-        self.arg_names = arg_names
         arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(data=(1, 3, 224, 224), rois=(1, 5))
         if grad_req != 'null':
-            param_names = []
+            self.grad_params = {}
             for name, shape in zip(arg_names, arg_shapes):
                 if not (name.endswith('data') or name.endswith('rois') or
                         name.endswith('inside_weight') or name.endswith('outside_weight') or
                         name.endswith('label') or name.endswith('target') or
                         name.startswith('conv1') or name.startswith('conv2')):
-                    param_names.append(name)
-            self.param_names = list(param_names)
+                    self.grad_params[name] = mx.nd.zeros(shape, self.ctx)
         aux_names = self.symbol.list_auxiliary_states()
-        self.aux_names = aux_names
         self.aux_params = {k: mx.nd.zeros(s, self.ctx) for k, s in zip(aux_names, aux_shapes)}
 
     def fit(self, train_data,
             grad_req='write',
             frequent=20,
             logger=None):
-        (kvstore, update_on_kvstore) = mx.model._create_kvstore(
-            self.kv_store, len(self.ctx), self.arg_params)
         if logger is None:
             logger = logging
         logger.info('Start training with %s', str(self.ctx))
-
+        speedometer_param = namedtuple('BatchEndParams',
+                                       ['epoch', 'nbatch', 'eval_metric', 'cls_metric', 'bbox_metric'])
         batch_end_callback = Speedometer(train_data.batch_size, frequent=frequent)
         epoch_end_callback = mx.callback.do_checkpoint(self.prefix)
 
         self.get_params(grad_req)
+        self.optimizer = mx.optimizer.create(self.optimizer, rescale_grad=(1.0 / config.TRAIN.BATCH_SIZE), **self.kwargs)
+        self.updater = mx.optimizer.get_updater(self.optimizer)
 
-        eval_metric = metric.Accuracy()
+        eval_metric = mx.metric.create("accuracy")
         cls_metric = metric.LogLossMetric()
         bbox_metric = metric.SmoothL1LossMetric()
-        eval_metrics = mx.metric.CompositeEvalMetric()
-        for child_metric in [eval_metric, cls_metric, bbox_metric]:
-            eval_metrics.add(child_metric)
-        max_data_shape = self.max_data_shape
 
-        self.optimizer = mx.optimizer.create(self.optimizer, rescale_grad=(1.0 / config.TRAIN.BATCH_SIZE), **self.kwargs)
-        mx.model._train_multi_device(self.symbol, self.ctx, self.arg_names, self.param_names,
-                                     self.aux_names, self.arg_params, self.aux_params,
-                                     begin_epoch=self.begin_epoch, end_epoch=self.num_epoch,
-                                     epoch_size=None, optimizer=self.optimizer,
-                                     train_data=train_data, eval_data=None,
-                                     eval_metric=eval_metrics,
-                                     epoch_end_callback=epoch_end_callback,
-                                     batch_end_callback=batch_end_callback,
-                                     kvstore=kvstore, update_on_kvstore=update_on_kvstore,
-                                     logger=logger, work_load_list=None, monitor=None,
-                                     mutable_data_shape=True, max_data_shape=self.max_data_shape)
+        # begin training
+        for epoch in range(self.begin_epoch, self.num_epoch):
+            nbatch = 0
+            train_data.reset()
+            eval_metric.reset()
+            cls_metric.reset()
+            bbox_metric.reset()
+            for databatch in train_data:
+                nbatch += 1
+                for k, v in databatch.data.items():
+                    self.arg_params[k] = mx.nd.array(v, self.ctx)
+                for k, v in databatch.label.items():
+                    self.arg_params[k] = mx.nd.array(v, self.ctx)
+                self.executor = self.symbol.bind(self.ctx, self.arg_params, args_grad=self.grad_params,
+                                                 grad_req=grad_req, aux_states=self.aux_params)
+                assert len(self.symbol.list_arguments()) == len(self.executor.grad_arrays)
+                update_dict = {name: nd for name, nd
+                               in zip(self.symbol.list_arguments(), self.executor.grad_arrays) if nd}
+                output_dict = {name: nd for name, nd
+                               in zip(self.symbol.list_outputs(), self.executor.outputs)}
+                self.executor.forward(is_train=True)
+                self.executor.backward()
+
+                for key, arr in update_dict.items():
+                    self.updater(key, arr, self.arg_params[key])
+
+                label = self.arg_params['cls_prob_label']
+                pred = output_dict['cls_prob_output']
+                bb_target = self.arg_params['bbox_loss_target']
+                bb_loss = output_dict['bbox_loss_output']
+                eval_metric.update([label], [pred])
+                cls_metric.update([label], [pred])
+                bbox_metric.update([bb_target], [bb_loss])
+
+                # print speed and accuracy metric
+                batch_end_params = speedometer_param(epoch=epoch, nbatch=nbatch, eval_metric=eval_metric,
+                                                     cls_metric=cls_metric, bbox_metric=bbox_metric)
+                batch_end_callback(batch_end_params)
+
+            if epoch_end_callback:
+                epoch_end_callback(epoch, self.symbol, self.arg_params, self.aux_params)
+            name, value = eval_metric.get()
+            logger.info("                     --->Epoch[%d] Train-%s=%f", epoch, name, value)
diff --git a/example/rcnn/tools/test_net.py b/example/rcnn/tools/test_net.py
index 1c0a763d24d3..fd7ceafd9571 100644
--- a/example/rcnn/tools/test_net.py
+++ b/example/rcnn/tools/test_net.py
@@ -25,7 +25,7 @@ def test_net(imageset, year, root_path, devkit_path, prefix, epoch, ctx):
 
     # load testing data
     voc, roidb = load_test_roidb(imageset, year, root_path, devkit_path)
-    test_data = ROIIter(roidb, ctx=ctx, batch_size=1, shuffle=False, mode='test')
+    test_data = ROIIter(roidb, batch_size=1, shuffle=False, mode='test')
 
     # load model
     args, auxs = load_param(prefix, epoch, convert=True, ctx=ctx)
diff --git a/example/rcnn/tools/train_net.py b/example/rcnn/tools/train_net.py
index 02fe598f0b83..4078f833b1fa 100644
--- a/example/rcnn/tools/train_net.py
+++ b/example/rcnn/tools/train_net.py
@@ -10,7 +10,7 @@
 
 
 def train_net(image_set, year, root_path, devkit_path, pretrained, epoch,
-              prefix, ctx, begin_epoch, end_epoch, frequent, kv_store, work_load_list=None):
+              prefix, ctx, begin_epoch, end_epoch, frequent):
     """
     wrapper for solver
     :param image_set: image set to train on
@@ -32,10 +32,10 @@ def train_net(image_set, year, root_path, devkit_path, pretrained, epoch,
 
     # load training data
     voc, roidb, means, stds = load_train_roidb(image_set, year, root_path, devkit_path, flip=True)
-    train_data = ROIIter(roidb, ctx=ctx,  batch_size=config.TRAIN.BATCH_IMAGES, shuffle=True, mode='train', work_load_list=work_load_list)
+    train_data = ROIIter(roidb, batch_size=config.TRAIN.BATCH_IMAGES, shuffle=True, mode='train')
 
     # load pretrained
-    args, auxs = load_param(pretrained, epoch, convert=True, ctx=ctx[0])
+    args, auxs = load_param(pretrained, epoch, convert=True, ctx=ctx)
     del args['fc8_bias']
     del args['fc8_weight']
 
@@ -45,20 +45,20 @@ def train_net(image_set, year, root_path, devkit_path, pretrained, epoch,
     # initialize params
     arg_shape, _, _ = sym.infer_shape(data=(1, 3, 224, 224), rois=(1, 5))
     arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
-    args['cls_score_weight'] = mx.random.normal(mean=0, stdvar=0.01, shape=arg_shape_dict['cls_score_weight'], ctx=ctx[0])
-    args['cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['cls_score_bias'], ctx=ctx[0])
-    args['bbox_pred_weight'] = mx.random.normal(mean=0, stdvar=0.001, shape=arg_shape_dict['bbox_pred_weight'], ctx=ctx[0])
-    args['bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['bbox_pred_bias'], ctx=ctx[0])
+    args['cls_score_weight'] = mx.random.normal(mean=0, stdvar=0.01, shape=arg_shape_dict['cls_score_weight'], ctx=ctx)
+    args['cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['cls_score_bias'], ctx=ctx)
+    args['bbox_pred_weight'] = mx.random.normal(mean=0, stdvar=0.001, shape=arg_shape_dict['bbox_pred_weight'], ctx=ctx)
+    args['bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['bbox_pred_bias'], ctx=ctx)
 
     # train
-    solver = Solver(prefix, sym, ctx, begin_epoch, end_epoch, kv_store, args, auxs, momentum=0.9, wd=0.0005,
-                    learning_rate=0.001, lr_scheduler=mx.lr_scheduler.FactorScheduler(30000, 0.1), max_data_shape=[3, 1000, 1000])
+    solver = Solver(prefix, sym, ctx, begin_epoch, end_epoch, args, auxs, momentum=0.9, wd=0.0005,
+                    learning_rate=0.001, lr_scheduler=mx.lr_scheduler.FactorScheduler(30000, 0.1))
     solver.fit(train_data, frequent=frequent)
 
     # edit params and save
     for epoch in range(begin_epoch + 1, end_epoch + 1):
         arg_params, aux_params = load_checkpoint(prefix, epoch)
-        arg_params['bbox_pred_weight'] = (arg_params['bbox_pred_weight'].T * mx.nd.array(stds, ctx=ctx[0])).T
-        arg_params['bbox_pred_bias'] = arg_params['bbox_pred_bias'] * mx.nd.array(stds, ctx=ctx[0]) + \
-                                       mx.nd.array(means, ctx=ctx[0])
+        arg_params['bbox_pred_weight'] = (arg_params['bbox_pred_weight'].T * mx.nd.array(stds, ctx=ctx)).T
+        arg_params['bbox_pred_bias'] = arg_params['bbox_pred_bias'] * mx.nd.array(stds, ctx=ctx) + \
+                                       mx.nd.array(means, ctx=ctx)
         save_checkpoint(prefix, epoch, arg_params, aux_params)
diff --git a/example/rcnn/train.py b/example/rcnn/train.py
index ad61855ae50f..bca8585efd80 100644
--- a/example/rcnn/train.py
+++ b/example/rcnn/train.py
@@ -7,7 +7,7 @@
 def parse_args():
     parser = argparse.ArgumentParser(description='Train a Fast R-CNN network')
     parser.add_argument('--image_set', dest='image_set', help='can be trainval or train',
-                        default='trainval', type=str)
+                        default='train', type=str)
     parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012',
                         default='2007', type=str)
     parser.add_argument('--root_path', dest='root_path', help='output data folder',
@@ -20,23 +20,19 @@ def parse_args():
                         default=1, type=int)
     parser.add_argument('--prefix', dest='prefix', help='new model prefix',
                         default=os.path.join(os.getcwd(), 'model', 'frcnn'), type=str)
-    parser.add_argument('--gpus', dest='gpu_ids', help='GPU device to train with',
-                        default='0', type=str)
+    parser.add_argument('--gpu', dest='gpu_id', help='GPU device to train with',
+                        default=0, type=int)
     parser.add_argument('--begin_epoch', dest='begin_epoch', help='begin epoch of training',
                         default=0, type=int)
     parser.add_argument('--end_epoch', dest='end_epoch', help='end epoch of training',
                         default=8, type=int)
     parser.add_argument('--frequent', dest='frequent', help='frequency of logging',
                         default=20, type=int)
-    parser.add_argument('--kv_store', dest='kv_store', help='the kv-store type',
-                        default='local', type=str)
-    parser.add_argument('--work_load_list', dest='work_load_list', help='work load for different devices',
-                        default=None, type=list)
     args = parser.parse_args()
     return args
 
 if __name__ == '__main__':
     args = parse_args()
-    ctx = [mx.gpu(int(i)) for i in args.gpu_ids.split(',')]
+    ctx = mx.gpu(args.gpu_id)
     train_net(args.image_set, args.year, args.root_path, args.devkit_path, args.pretrained, args.epoch,
-              args.prefix, ctx, args.begin_epoch, args.end_epoch, args.frequent, args.kv_store, args.work_load_list)
+              args.prefix, ctx, args.begin_epoch, args.end_epoch, args.frequent)
diff --git a/python/mxnet/executor_manager.py b/python/mxnet/executor_manager.py
index 6afda1e3b956..4ff9298092e3 100644
--- a/python/mxnet/executor_manager.py
+++ b/python/mxnet/executor_manager.py
@@ -80,12 +80,7 @@ def _load_general(data, targets):
             d_src.copyto(d_targets)
         else:
             for slice_idx, d_dst in d_targets:
-                if d_src[slice_idx].shape != d_dst.shape:
-                    n = d_dst.shape[0] / (slice_idx.stop - slice_idx.start)
-                    new_slice = slice(slice_idx.start * n, slice_idx.stop * n)
-                    d_src[new_slice].copyto(d_dst)
-                else:
-                    d_src[slice_idx].copyto(d_dst)
+                d_src[slice_idx].copyto(d_dst)
 
 def _load_data(batch, targets):
     """Load data into sliced arrays"""
@@ -201,14 +196,10 @@ class DataParallelExecutorGroup(object):
         The dataset for training. It could be any object with `provide_data` and
         `provide_label` properties. Loading of actual data is not necessarily needed
         at this stage.
-    max_data_shape: list of float or int
-        Maximum shape of input data
     shared_grop: DataParallelExecutorGroup
         An existing executor group, if to share parameters with it.
     """
-    def __init__(self, sym, arg_names, param_names,
-                 ctx, slices, train_data,
-                 max_data_shape=None, shared_group=None):
+    def __init__(self, sym, arg_names, param_names, ctx, slices, train_data, shared_group=None):
         # make sure the architecture is valid
         _check_arguments(sym)
 
@@ -225,23 +216,8 @@ def __init__(self, sym, arg_names, param_names,
 
         self.train_execs = []
         for i in range(len(ctx)):
-            data_shapes = {}
-            batch_size = 0
-            for k, v in train_data.provide_data:
-                if k == 'data':
-                    batch_size = v[0]
-            for k, v in train_data.provide_data + train_data.provide_label:
-                if k == 'data':
-                    if shared_group is None and max_data_shape is not None:
-                        # init first executor group
-                        # data size is set to max possible size of input data
-                        data_shapes[k] = tuple([slices[i].stop - slices[i].start] + max_data_shape)
-                    else:
-                        data_shapes[k] = tuple([slices[i].stop - slices[i].start] + list(v[1:]))
-                else:
-                    data_shapes[k] = tuple([int((slices[i].stop - slices[i].start) * v[0] \
-                                           / batch_size)] + list(v[1:]))
-
+            data_shapes = {k: tuple([slices[i].stop-slices[i].start] + list(v[1:]))
+                           for k, v in train_data.provide_data + train_data.provide_label}
             shared_exec = None if shared_group is None else shared_group.train_execs[i]
             train_exec = _bind_exec(sym, ctx[i], data_shapes, self.param_names,
                                     need_grad=True, base_exec=shared_exec,
@@ -282,9 +258,7 @@ def backward(self):
     def update_metric(self, metric, labels):
         """ Update evaluation metric with label and current outputs """
         for texec, islice in zip(self.train_execs, self.slices):
-            n = int(texec.outputs[0].shape[0] / (islice.stop - islice.start))
-            new_slice = slice(islice.start * n, islice.stop * n)
-            labels_slice = [label[new_slice] for label in labels]
+            labels_slice = [label[islice] for label in labels]
             metric.update(labels_slice, texec.outputs)
 
 class DataParallelExecutorManager(object):
@@ -310,15 +284,10 @@ class DataParallelExecutorManager(object):
         When not specified, default logger will be used.
     sym_gen : a function that generate new Symbols depending on different
         input shapes. Used only for bucketing.
-    mutable_data_shape: bool
-        Whether input data have different shapes or not.
-    max_data_shape: list of float or int
-        The maximum shape of input data
     """
     def __init__(self, symbol, ctx, train_data,
                  arg_names, param_names, aux_names,
-                 work_load_list=None, logger=None, sym_gen=None,
-                 mutable_data_shape=False, max_data_shape=None):
+                 work_load_list=None, logger=None, sym_gen=None):
         if logger is None:
             logger = logging
         # preparation
@@ -337,10 +306,9 @@ def __init__(self, symbol, ctx, train_data,
         self.param_names = param_names
         self.aux_names = aux_names
         self.ctx = ctx
-        self.mutable_data_shape = mutable_data_shape
 
         self.execgrp = DataParallelExecutorGroup(symbol, self.arg_names, self.param_names, self.ctx,
-                                                 self.slices, train_data, max_data_shape)
+                                                 self.slices, train_data)
         self.symbol = symbol
 
         self.sym_gen = sym_gen
@@ -420,15 +388,9 @@ def load_data_batch(self, data_batch):
                 self.execgrp_bucket[key] = execgrp
 
             self.curr_execgrp = self.execgrp_bucket[key]
-        elif self.mutable_data_shape is True:
-            # for each data batch, generate new execgrp and share params with the initial one
-            execgrp = DataParallelExecutorGroup(self.symbol, self.arg_names,
-                                                self.param_names, self.ctx,
-                                                self.slices, data_batch,
-                                                shared_group=self.execgrp)
-            self.curr_execgrp = execgrp
         else:
             self.curr_execgrp = self.execgrp
+
         self.curr_execgrp.load_data_batch(data_batch)
 
     def forward(self, is_train=False):
diff --git a/python/mxnet/model.py b/python/mxnet/model.py
index a84aa754ef74..41e5c032311a 100644
--- a/python/mxnet/model.py
+++ b/python/mxnet/model.py
@@ -123,8 +123,7 @@ def _train_multi_device(symbol, ctx, arg_names, param_names, aux_names,
                         train_data, eval_data=None, eval_metric=None,
                         epoch_end_callback=None, batch_end_callback=None,
                         logger=None, work_load_list=None, monitor=None,
-                        eval_batch_end_callback=None, sym_gen=None,
-                        mutable_data_shape=False, max_data_shape=None):
+                        eval_batch_end_callback=None, sym_gen=None):
     """Internal training function on multiple devices.
     This function will also work for single device as well.
     Parameters
@@ -176,11 +175,6 @@ def _train_multi_device(symbol, ctx, arg_names, param_names, aux_names,
     monitor : Monitor, optional
         Monitor installed to executor,
         for monitoring outputs, weights, and gradients for debugging.
-    mutable_data_shape: bool, optional
-        Whether input data have different shapes or not.
-        It is set to False in default.
-    max_data_shape: list of float or int, optional
-        The maximum shape of input data
     Notes
     -----
     - This function will inplace update the NDArrays in arg_params and aux_states.
@@ -195,9 +189,7 @@ def _train_multi_device(symbol, ctx, arg_names, param_names, aux_names,
                                                    arg_names=arg_names,
                                                    aux_names=aux_names,
                                                    work_load_list=work_load_list,
-                                                   logger=logger,
-                                                   mutable_data_shape=mutable_data_shape,
-                                                   max_data_shape=max_data_shape)
+                                                   logger=logger)
     if monitor:
         executor_manager.install_monitor(monitor)
 
@@ -270,7 +262,7 @@ def _train_multi_device(symbol, ctx, arg_names, param_names, aux_names,
                     do_reset = False
                     break
 
-            if do_reset is True:
+            if do_reset == True:
                 logger.info('Epoch[%d] Resetting Data Iterator', epoch)
                 train_data.reset()
 

From 8fd4d1692048944a00ce5b424b436f40aecb79c7 Mon Sep 17 00:00:00 2001
From: Liang Xiang <xlvector@gmail.com>
Date: Wed, 18 May 2016 19:55:28 +0800
Subject: [PATCH 020/126] Intergrate with baidu warpctc

local config

warpctc init code

warpctc

cpu can run

warpctc gpu

warpctc: use cpu

success ocr example

add warpctc path to config

add readme

Update README.md

fix code style

fix code style

fix code style

add cannot find -lwarpctc to README

add library path to warpctc

label size is diff from output size in ctc

remove change in gitignore

remove debug code

remove debug code

free cuda memory and fix test fail

dmlc-core mshadow to current version
---
 example/warpctc/README.md    |  86 ++++++++++++
 example/warpctc/lstm.py      |  79 +++++++++++
 example/warpctc/lstm_ocr.py  | 166 ++++++++++++++++++++++
 example/warpctc/toy_ctc.py   | 163 ++++++++++++++++++++++
 mshadow                      |   2 +-
 plugin/warpctc/warpctc-inl.h | 258 +++++++++++++++++++++++++++++++++++
 plugin/warpctc/warpctc.cc    |  29 ++++
 plugin/warpctc/warpctc.cu    |  19 +++
 plugin/warpctc/warpctc.mk    |   7 +
 9 files changed, 808 insertions(+), 1 deletion(-)
 create mode 100644 example/warpctc/README.md
 create mode 100644 example/warpctc/lstm.py
 create mode 100644 example/warpctc/lstm_ocr.py
 create mode 100644 example/warpctc/toy_ctc.py
 create mode 100644 plugin/warpctc/warpctc-inl.h
 create mode 100644 plugin/warpctc/warpctc.cc
 create mode 100644 plugin/warpctc/warpctc.cu
 create mode 100644 plugin/warpctc/warpctc.mk

diff --git a/example/warpctc/README.md b/example/warpctc/README.md
new file mode 100644
index 000000000000..c34c2b4f55af
--- /dev/null
+++ b/example/warpctc/README.md
@@ -0,0 +1,86 @@
+# Baidu Warp CTC with Mxnet
+
+Baidu-warpctc is a CTC implementation by Baidu which supports GPU. CTC can be used with LSTM to solve label alignment problems in many areas, such as OCR and speech recognition.
+
+## Install baidu warpctc
+
+```
+  cd ~/
+  git clone https://github.com/baidu-research/warp-ctc
+  cd warp-ctc
+  mkdir build
+  cd build
+  cmake ..
+  make
+  sudo make install
+```
+
+## Enable warpctc in mxnet
+
+```
+  uncomment the following lines in make/config.mk
+  WARPCTC_PATH = $(HOME)/warp-ctc
+  MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+  
+  rebuild mxnet by
+  make clean && make -j4
+```
+
+## Run examples
+
+I implemented two examples. The first is just a toy example that can be used to verify that the CTC integration is correct. The second is an OCR example with LSTM+CTC. You can run it by:
+
+```
+  cd example/warpctc
+  python lstm_ocr.py
+```
+
+The OCR example is constructed as follows:
+  
+1. An 80x30 image of a 4-digit captcha is generated with a Python captcha library
+2. The 80x30 image is fed to the LSTM as 80 inputs; each input is one column of the image (a 30-dim vector)
+3. The output layer uses the CTC loss
+
+The following code shows the detailed construction of the net:
+
+```
+  def lstm_unroll(num_lstm_layer, seq_len,
+                  num_hidden, num_label):
+    param_cells = []
+    last_states = []
+    for i in range(num_lstm_layer):
+        param_cells.append(LSTMParam(i2h_weight=mx.sym.Variable("l%d_i2h_weight" % i),
+                                     i2h_bias=mx.sym.Variable("l%d_i2h_bias" % i),
+                                     h2h_weight=mx.sym.Variable("l%d_h2h_weight" % i),
+                                     h2h_bias=mx.sym.Variable("l%d_h2h_bias" % i)))
+        state = LSTMState(c=mx.sym.Variable("l%d_init_c" % i),
+                          h=mx.sym.Variable("l%d_init_h" % i))
+        last_states.append(state)
+    assert(len(last_states) == num_lstm_layer)
+    data = mx.sym.Variable('data')
+    label = mx.sym.Variable('label')
+    
+    #every column of image is an input, there are seq_len inputs
+    wordvec = mx.sym.SliceChannel(data=data, num_outputs=seq_len, squeeze_axis=1)
+    hidden_all = []
+    for seqidx in range(seq_len):
+        hidden = wordvec[seqidx]
+        for i in range(num_lstm_layer):
+            next_state = lstm(num_hidden, indata=hidden,
+                              prev_state=last_states[i],
+                              param=param_cells[i],
+                              seqidx=seqidx, layeridx=i)
+            hidden = next_state.h
+            last_states[i] = next_state
+        hidden_all.append(hidden)
+    hidden_concat = mx.sym.Concat(*hidden_all, dim=0)
+    pred = mx.sym.FullyConnected(data=hidden_concat, num_hidden=11)
+    
+    # here we do NOT need to transpose label as other lstm examples do
+    label = mx.sym.Reshape(data=label, target_shape=(0,))
+    #label should be int type, so use cast
+    label = mx.sym.Cast(data = label, dtype = 'int32')
+    sm = mx.sym.WarpCTC(data=pred, label=label, label_length = num_label, input_length = seq_len)
+    return sm
+```
+  
diff --git a/example/warpctc/lstm.py b/example/warpctc/lstm.py
new file mode 100644
index 000000000000..97fda6b9c9d4
--- /dev/null
+++ b/example/warpctc/lstm.py
@@ -0,0 +1,79 @@
+# pylint:skip-file
+import sys
+sys.path.insert(0, "../../python")
+import mxnet as mx
+import numpy as np
+from collections import namedtuple
+import time
+import math
+LSTMState = namedtuple("LSTMState", ["c", "h"])
+LSTMParam = namedtuple("LSTMParam", ["i2h_weight", "i2h_bias",
+                                     "h2h_weight", "h2h_bias"])
+LSTMModel = namedtuple("LSTMModel", ["rnn_exec", "symbol",
+                                     "init_states", "last_states",
+                                     "seq_data", "seq_labels", "seq_outputs",
+                                     "param_blocks"])
+
+def lstm(num_hidden, indata, prev_state, param, seqidx, layeridx):
+    """LSTM Cell symbol"""
+    i2h = mx.sym.FullyConnected(data=indata,
+                                weight=param.i2h_weight,
+                                bias=param.i2h_bias,
+                                num_hidden=num_hidden * 4,
+                                name="t%d_l%d_i2h" % (seqidx, layeridx))
+    h2h = mx.sym.FullyConnected(data=prev_state.h,
+                                weight=param.h2h_weight,
+                                bias=param.h2h_bias,
+                                num_hidden=num_hidden * 4,
+                                name="t%d_l%d_h2h" % (seqidx, layeridx))
+    gates = i2h + h2h
+    slice_gates = mx.sym.SliceChannel(gates, num_outputs=4,
+                                      name="t%d_l%d_slice" % (seqidx, layeridx))
+    in_gate = mx.sym.Activation(slice_gates[0], act_type="sigmoid")
+    in_transform = mx.sym.Activation(slice_gates[1], act_type="tanh")
+    forget_gate = mx.sym.Activation(slice_gates[2], act_type="sigmoid")
+    out_gate = mx.sym.Activation(slice_gates[3], act_type="sigmoid")
+    next_c = (forget_gate * prev_state.c) + (in_gate * in_transform)
+    next_h = out_gate * mx.sym.Activation(next_c, act_type="tanh")
+    return LSTMState(c=next_c, h=next_h)
+
+
+def lstm_unroll(num_lstm_layer, seq_len,
+                num_hidden, num_label):  # builds the unrolled LSTM + WarpCTC symbol
+    param_cells = []
+    last_states = []
+    for i in range(num_lstm_layer):
+        param_cells.append(LSTMParam(i2h_weight=mx.sym.Variable("l%d_i2h_weight" % i),
+                                     i2h_bias=mx.sym.Variable("l%d_i2h_bias" % i),
+                                     h2h_weight=mx.sym.Variable("l%d_h2h_weight" % i),
+                                     h2h_bias=mx.sym.Variable("l%d_h2h_bias" % i)))
+        state = LSTMState(c=mx.sym.Variable("l%d_init_c" % i),
+                          h=mx.sym.Variable("l%d_init_h" % i))
+        last_states.append(state)
+    assert(len(last_states) == num_lstm_layer)
+
+    # split `data` into seq_len per-step inputs (no embedding layer is used)
+    data = mx.sym.Variable('data')
+    label = mx.sym.Variable('label')
+    wordvec = mx.sym.SliceChannel(data=data, num_outputs=seq_len, squeeze_axis=1)
+
+    hidden_all = []
+    for seqidx in range(seq_len):
+        hidden = wordvec[seqidx]
+        for i in range(num_lstm_layer):
+            next_state = lstm(num_hidden, indata=hidden,
+                              prev_state=last_states[i],
+                              param=param_cells[i],
+                              seqidx=seqidx, layeridx=i)
+            hidden = next_state.h
+            last_states[i] = next_state
+        hidden_all.append(hidden)
+    # concatenate the last layer's per-step hidden outputs along axis 0
+    hidden_concat = mx.sym.Concat(*hidden_all, dim=0)
+    pred = mx.sym.FullyConnected(data=hidden_concat, num_hidden=11)  # 11 classes: presumably digits 0-9 plus the CTC blank - TODO confirm
+    # reshape the label (target_shape=(0,), presumably 1-D with inferred size) and cast it to int32 for WarpCTC
+    label = mx.sym.Reshape(data=label, target_shape=(0,))
+    label = mx.sym.Cast(data = label, dtype = 'int32')
+    sm = mx.sym.WarpCTC(data=pred, label=label, label_length = num_label, input_length = seq_len)
+    return sm
+
diff --git a/example/warpctc/lstm_ocr.py b/example/warpctc/lstm_ocr.py
new file mode 100644
index 000000000000..22247e85d8d7
--- /dev/null
+++ b/example/warpctc/lstm_ocr.py
@@ -0,0 +1,166 @@
+# pylint: disable=C0111,too-many-arguments,too-many-instance-attributes,too-many-locals,redefined-outer-name,fixme
+# pylint: disable=superfluous-parens, no-member, invalid-name
+import sys, random
+sys.path.insert(0, "../../python")
+import numpy as np
+import mxnet as mx
+
+from lstm import lstm_unroll
+
+from io import BytesIO
+from captcha.image import ImageCaptcha
+import cv2, random
+
+class SimpleBatch(object):
+    def __init__(self, data_names, data, label_names, label):
+        self.data = data
+        self.label = label
+        self.data_names = data_names
+        self.label_names = label_names
+
+        self.pad = 0
+        self.index = None # TODO: what is index?
+
+    @property
+    def provide_data(self):
+        return [(n, x.shape) for n, x in zip(self.data_names, self.data)]
+
+    @property
+    def provide_label(self):
+        return [(n, x.shape) for n, x in zip(self.label_names, self.label)]
+
+def gen_rand():
+    """Return a random 4-character decimal string, zero-padded on the left."""
+    value = random.randint(0, 9999)
+    # "%04d" pads with leading zeros, exactly like prepending "0" until
+    # the string is four characters long
+    return "%04d" % value
+
+def get_label(buf):
+    """Map the first 4 digits of `buf` to a float array holding digit + 1."""
+    # the +1 shift keeps 0 out of the label set (ctc_label skips 0)
+    shifted = [int(buf[pos]) + 1 for pos in range(4)]
+    return np.array(shifted, dtype=float)
+
+class OCRIter(mx.io.DataIter):
+    """Yields `count` batches of freshly generated captcha images per pass."""
+    def __init__(self, count, batch_size, num_label, init_states):
+        super(OCRIter, self).__init__()
+        self.captcha = ImageCaptcha(fonts=['./data/Xerox.ttf'])
+        self.batch_size = batch_size
+        self.count = count
+        self.num_label = num_label
+        self.init_states = init_states
+        self.init_state_arrays = [mx.nd.zeros(x[1]) for x in init_states]
+        self.provide_data = [('data', (batch_size, 2400))] + init_states
+        self.provide_label = [('label', (self.batch_size, 4))]
+
+    def __iter__(self):
+        """Generate batches of (flattened images + zero init states, labels)."""
+        init_state_names = [x[0] for x in self.init_states]
+        for k in range(self.count):
+            data = []
+            label = []
+            for i in range(self.batch_size):
+                num = gen_rand()
+                img = self.captcha.generate(num)
+                img = np.frombuffer(img.getvalue(), dtype='uint8')
+                img = cv2.imdecode(img, cv2.IMREAD_GRAYSCALE)
+                img = cv2.resize(img, (80, 30))
+                # transpose so the flat vector is column by column (80 columns x 30 rows = 2400)
+                img = img.transpose(1, 0)
+                img = img.reshape((80 * 30))
+                img = np.multiply(img, 1/255.0)
+                data.append(img)
+                label.append(get_label(num))
+
+            data_all = [mx.nd.array(data)] + self.init_state_arrays
+            label_all = [mx.nd.array(label)]
+            data_names = ['data'] + init_state_names
+            label_names = ['label']
+            data_batch = SimpleBatch(data_names, data_all, label_names, label_all)
+            yield data_batch
+
+    def reset(self):
+        pass
+
+BATCH_SIZE = 32
+SEQ_LENGTH = 80
+
+def ctc_label(p):
+    """Collapse a per-step prediction list: drop 0s and repeated symbols."""
+    decoded = []
+    previous = 0
+    for current in p:
+        # keep a symbol only when it is non-zero and differs from its predecessor
+        if current != 0 and current != previous:
+            decoded.append(current)
+        previous = current
+    return decoded
+
+def Accuracy(label, pred):
+    """Fraction of samples whose decoded prediction equals the label exactly.
+
+    Row k * BATCH_SIZE + i of `pred` is read as the class scores of sample i
+    at time step k; BATCH_SIZE and SEQ_LENGTH are the module-level constants.
+    """
+    hit = 0.
+    total = 0.
+    for i in range(BATCH_SIZE):
+        expected = label[i]
+        # greedy decoding: best class per step, then collapse with ctc_label
+        steps = [np.argmax(pred[k * BATCH_SIZE + i]) for k in range(SEQ_LENGTH)]
+        decoded = ctc_label(steps)
+        # a hit needs equal length and equal symbols at every position
+        same_length = len(decoded) == len(expected)
+        if same_length and all(decoded[k] == int(expected[k])
+                               for k in range(len(decoded))):
+            hit += 1.0
+        # count every sample, matched or not
+        total += 1.0
+    return hit / total
+
+if __name__ == '__main__':
+    num_hidden = 100
+    num_lstm_layer = 2
+
+    num_epoch = 10
+    learning_rate = 0.001
+    momentum = 0.9
+    num_label = 4
+
+    contexts = [mx.context.gpu(1)]
+
+    def sym_gen(seq_len):
+        return lstm_unroll(num_lstm_layer, seq_len,
+                           num_hidden=num_hidden,
+                           num_label = num_label)
+
+    init_c = [('l%d_init_c'%l, (BATCH_SIZE, num_hidden)) for l in range(num_lstm_layer)]
+    init_h = [('l%d_init_h'%l, (BATCH_SIZE, num_hidden)) for l in range(num_lstm_layer)]
+    init_states = init_c + init_h
+
+    data_train = OCRIter(10000, BATCH_SIZE, num_label, init_states)
+    data_val = OCRIter(1000, BATCH_SIZE, num_label, init_states)
+
+    symbol = sym_gen(SEQ_LENGTH)
+
+    model = mx.model.FeedForward(ctx=contexts,
+                                 symbol=symbol,
+                                 num_epoch=num_epoch,
+                                 learning_rate=learning_rate,
+                                 momentum=momentum,
+                                 wd=0.00001,
+                                 initializer=mx.init.Xavier(factor_type="in", magnitude=2.34))
+
+    import logging
+    head = '%(asctime)-15s %(message)s'
+    logging.basicConfig(level=logging.DEBUG, format=head)
+    
+    print 'begin fit'
+
+    model.fit(X=data_train, eval_data=data_val,
+              eval_metric = mx.metric.np(Accuracy),
+              batch_end_callback=mx.callback.Speedometer(BATCH_SIZE, 50),)
+
+    model.save("ocr")
diff --git a/example/warpctc/toy_ctc.py b/example/warpctc/toy_ctc.py
new file mode 100644
index 000000000000..1000e09dbd85
--- /dev/null
+++ b/example/warpctc/toy_ctc.py
@@ -0,0 +1,163 @@
+# pylint: disable=C0111,too-many-arguments,too-many-instance-attributes,too-many-locals,redefined-outer-name,fixme
+# pylint: disable=superfluous-parens, no-member, invalid-name
+import sys
+sys.path.insert(0, "../../python")
+import numpy as np
+import mxnet as mx
+import random
+from lstm import lstm_unroll
+
+class SimpleBatch(object):
+    def __init__(self, data_names, data, label_names, label):
+        self.data = data
+        self.label = label
+        self.data_names = data_names
+        self.label_names = label_names
+
+        self.pad = 0
+        self.index = None # TODO: what is index?
+
+    @property
+    def provide_data(self):
+        return [(n, x.shape) for n, x in zip(self.data_names, self.data)]
+
+    @property
+    def provide_label(self):
+        return [(n, x.shape) for n, x in zip(self.label_names, self.label)]
+
+def gen_feature(n):
+    ret = np.zeros(10)
+    ret[n] = 1
+    return ret
+
+def gen_rand():
+    """Return (4-digit string, flat one-hot features: 80 steps x 10 dims)."""
+    buf = "%04d" % random.randint(0, 9999)  # zero-padded to four digits
+    ret = np.array([])
+    for i in range(80):
+        # each digit covers 20 consecutive steps; `//` keeps the index an
+        # int even when `/` is true division (Python 3 / __future__ division)
+        c = int(buf[i // 20])
+        ret = np.concatenate([ret, gen_feature(c)])
+    return buf, ret
+
+def get_label(buf):
+    """Map the first 4 digits of `buf` to a float array holding digit + 1."""
+    # the +1 shift keeps 0 out of the label set (ctc_label skips 0)
+    shifted = [int(buf[pos]) + 1 for pos in range(4)]
+    return np.array(shifted, dtype=float)
+
+class DataIter(mx.io.DataIter):
+    def __init__(self, count, batch_size, num_label, init_states):
+        super(DataIter, self).__init__()
+        self.batch_size = batch_size
+        self.count = count
+        self.num_label = num_label
+        self.init_states = init_states
+        self.init_state_arrays = [mx.nd.zeros(x[1]) for x in init_states]
+        self.provide_data = [('data', (batch_size, 10 * 80))] + init_states
+        self.provide_label = [('label', (self.batch_size, 4))]
+
+    def __iter__(self):
+        init_state_names = [x[0] for x in self.init_states]
+        for k in range(self.count):
+            data = []
+            label = []
+            for i in range(self.batch_size):
+                num, img = gen_rand()
+                data.append(img)
+                label.append(get_label(num))
+
+            data_all = [mx.nd.array(data)] + self.init_state_arrays
+            label_all = [mx.nd.array(label)]
+            data_names = ['data'] + init_state_names
+            label_names = ['label']
+            
+            
+            data_batch = SimpleBatch(data_names, data_all, label_names, label_all)
+            yield data_batch
+
+    def reset(self):
+        pass
+
+BATCH_SIZE = 32
+SEQ_LENGTH = 80
+
+def ctc_label(p):
+    """Collapse a per-step prediction list: drop 0s and repeated symbols."""
+    decoded = []
+    previous = 0
+    for current in p:
+        # keep a symbol only when it is non-zero and differs from its predecessor
+        if current != 0 and current != previous:
+            decoded.append(current)
+        previous = current
+    return decoded
+        
+
+def Accuracy(label, pred):
+    """Fraction of samples whose decoded prediction equals the label exactly.
+
+    Row k * BATCH_SIZE + i of `pred` is read as the class scores of sample i
+    at time step k; BATCH_SIZE and SEQ_LENGTH are the module-level constants.
+    """
+    hit = 0.
+    total = 0.
+    for i in range(BATCH_SIZE):
+        expected = label[i]
+        # greedy decoding: best class per step, then collapse with ctc_label
+        steps = [np.argmax(pred[k * BATCH_SIZE + i]) for k in range(SEQ_LENGTH)]
+        decoded = ctc_label(steps)
+        # a hit needs equal length and equal symbols at every position
+        same_length = len(decoded) == len(expected)
+        if same_length and all(decoded[k] == int(expected[k])
+                               for k in range(len(decoded))):
+            hit += 1.0
+        # count every sample, matched or not
+        total += 1.0
+    return hit / total
+
+if __name__ == '__main__':
+    num_hidden = 100
+    num_lstm_layer = 1
+
+    num_epoch = 10
+    learning_rate = 0.001
+    momentum = 0.9
+    num_label = 4
+
+    contexts = [mx.context.gpu(0)]
+
+    def sym_gen(seq_len):
+        return lstm_unroll(num_lstm_layer, seq_len,
+                           num_hidden=num_hidden,
+                           num_label = num_label)
+
+    init_c = [('l%d_init_c'%l, (BATCH_SIZE, num_hidden)) for l in range(num_lstm_layer)]
+    init_h = [('l%d_init_h'%l, (BATCH_SIZE, num_hidden)) for l in range(num_lstm_layer)]
+    init_states = init_c + init_h
+
+    data_train = DataIter(100000, BATCH_SIZE, num_label, init_states)
+    data_val = DataIter(1000, BATCH_SIZE, num_label, init_states)
+
+    symbol = sym_gen(SEQ_LENGTH)
+
+    model = mx.model.FeedForward(ctx=contexts,
+                                 symbol=symbol,
+                                 num_epoch=num_epoch,
+                                 learning_rate=learning_rate,
+                                 momentum=momentum,
+                                 wd=0.00001,
+                                 initializer=mx.init.Xavier(factor_type="in", magnitude=2.34))
+
+    import logging
+    head = '%(asctime)-15s %(message)s'
+    logging.basicConfig(level=logging.DEBUG, format=head)
+    
+    print 'begin fit'
+
+    model.fit(X=data_train, eval_data=data_val,
+              eval_metric = mx.metric.np(Accuracy),
+              batch_end_callback=mx.callback.Speedometer(BATCH_SIZE, 50),)
+
+    model.save("ocr")
diff --git a/mshadow b/mshadow
index 489748eee934..65da7de8b59f 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit 489748eee93435cf63ea657bb0933808109953c7
+Subproject commit 65da7de8b59fd1736e0c2d71508937ef25b91686
diff --git a/plugin/warpctc/warpctc-inl.h b/plugin/warpctc/warpctc-inl.h
new file mode 100644
index 000000000000..8ed155e53480
--- /dev/null
+++ b/plugin/warpctc/warpctc-inl.h
@@ -0,0 +1,258 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file warpctc-inl.h
+ * \brief warpctc operator
+ * \author Liang Xiang
+*/
+#ifndef PLUGIN_WARPCTC_WARPCTC_INL_H_
+#define PLUGIN_WARPCTC_WARPCTC_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <stdio.h>
+#include <ctc.h>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+#include <utility>
+#include <iostream>
+#include "../../src/operator/operator_common.h"
+
+namespace mxnet {
+namespace op {
+
+namespace warpctc_enum {
+  enum CTCOpInputs {kData, kLabel};
+  enum CTCOpOutputs {kOut};
+}  // namespace warpctc_enum
+
+struct WarpCTCParam : public dmlc::Parameter<WarpCTCParam> {
+  int label_length;
+  int input_length;
+  DMLC_DECLARE_PARAMETER(WarpCTCParam) {
+    DMLC_DECLARE_FIELD(label_length)
+        .set_default(0)
+        .describe("Real label length");
+    DMLC_DECLARE_FIELD(input_length)
+        .set_default(0)
+        .describe("Input length");
+  }
+};
+
+template<typename xpu>
+class WarpCTCOp : public Operator {
+ private:
+  WarpCTCParam param_;
+
+ public:
+  explicit WarpCTCOp(WarpCTCParam p) {
+    this->param_ = p;
+  }
+
+  ~WarpCTCOp() {}
+
+  inline void throw_on_error(ctcStatus_t status, const char* message) {
+    if (status != CTC_STATUS_SUCCESS) {
+      throw std::runtime_error(message
+                               + (", stat = "
+                                  + std::string(ctcGetStatusString(status))));
+    }
+  }
+
+  // Forward only applies a softmax to the 2D-flattened input; warp-ctc is called in Backward.
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 2) << "CTCOutput Input: [data, label]";
+    CHECK_EQ(out_data.size(), 1) << "CTCOutput Output: [output]";
+
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    TBlob data = in_data[warpctc_enum::kData];
+    TBlob out = out_data[warpctc_enum::kOut];
+    Tensor<xpu, 2, float> data_tensor = data.FlatTo2D<xpu, float>(s);
+    Tensor<xpu, 2, float> out_tensor = out.FlatTo2D<xpu, float>(s);
+    Softmax(out_tensor, data_tensor);
+  }
+
+  // Computes the CTC gradient w.r.t. the data input with warp-ctc; every
+  // sequence is given input_length steps and label_length labels.
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    TBlob data = in_data[warpctc_enum::kData];
+    TBlob label = in_data[warpctc_enum::kLabel];
+    CHECK_EQ(data.shape_.ndim(), 2) << "input data shape should be 2 (t*n, p)";
+    ctcComputeInfo info;
+    if (data.dev_mask_ == cpu::kDevMask) {
+      info.loc = CTC_CPU;
+      info.num_threads = 1;
+    } else if (data.dev_mask_ == gpu::kDevMask) {
+#if MXNET_USE_CUDA
+      info.loc = CTC_GPU;
+      info.stream = ctx.get_stream<gpu>()->stream_;
+#endif
+    } else {
+      LOG(FATAL) << "Unknown device type " << data.dev_mask_;
+    }
+
+    int T = param_.input_length;
+    CHECK_GT(T, 0) << "input_length must be positive";
+    int minibatch = data.shape_[0] / T;
+    int alphabet_size = data.shape_[1];
+    // Every sequence in the batch shares the same input and label length.
+    std::vector<int> input_lengths(minibatch, T);
+    std::vector<int> label_lengths(minibatch, param_.label_length);
+
+    size_t alloc_bytes;
+    throw_on_error(get_workspace_size(label_lengths.data(),
+                                      input_lengths.data(),
+                                      alphabet_size,
+                                      input_lengths.size(), info,
+                                      &alloc_bytes),
+                   "Error: get_workspace_size in inf_test");
+    void* ctc_workspace = NULL;
+
+    float* activations = static_cast<float*>(data.dptr_);
+    int* flat_labels = static_cast<int*>(label.dptr_);
+    int* cpu_labels = flat_labels;
+    float* grads = static_cast<float*>(in_grad[warpctc_enum::kData].dptr_);
+
+    if (data.dev_mask_ == cpu::kDevMask) {
+      ctc_workspace = malloc(alloc_bytes);
+    } else if (data.dev_mask_ == gpu::kDevMask) {
+#if MXNET_USE_CUDA
+      // The labels are staged in a host buffer that is handed to
+      // compute_ctc_loss; it is released after the call below.
+      cpu_labels = reinterpret_cast<int*>(malloc(sizeof(int) * label.Size()));
+      cudaError_t cuda_status = cudaMemcpyAsync(cpu_labels, flat_labels,
+                                                label.Size()*sizeof(int),
+                                                cudaMemcpyDeviceToHost,
+                                                ctx.get_stream<gpu>()->stream_);
+      CHECK_EQ(cuda_status, cudaSuccess) << "cuda memcpy label error";
+
+      cuda_status = cudaMalloc(&ctc_workspace, alloc_bytes);
+      CHECK_EQ(cuda_status, cudaSuccess) << "cuda malloc worksapce fail";
+#endif
+    } else {
+      LOG(FATAL) << "Unknown device type " << data.dev_mask_;
+    }
+
+    std::vector<float> costs(minibatch);
+    throw_on_error(compute_ctc_loss(activations,
+                                    grads,
+                                    cpu_labels,
+                                    label_lengths.data(),
+                                    input_lengths.data(),
+                                    alphabet_size,
+                                    minibatch,
+                                    costs.data(),
+                                    ctc_workspace,
+                                    info),
+                   "Error: compute_ctc_loss");
+    if (data.dev_mask_ == cpu::kDevMask) {
+      free(ctc_workspace);
+    } else if (data.dev_mask_ == gpu::kDevMask) {
+#if MXNET_USE_CUDA
+      cudaError_t cuda_status = cudaFree(ctc_workspace);
+      CHECK_EQ(cuda_status, cudaSuccess) << "cuda free workspace fail";
+      free(cpu_labels);  // host staging copy of the labels (previously leaked)
+#endif
+    }
+  }
+};
+
+template<typename xpu>
+Operator* CreateOp(WarpCTCParam type);
+
+
+#if DMLC_USE_CXX11
+class WarpCTCProp : public OperatorProperty {
+ public:
+  std::vector<std::string> ListArguments() const override {
+    return {"data", "label"};
+  }
+
+  virtual std::vector<std::string> ListOutputs() const {
+    return {"output"};
+  }
+
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs)
+      override {
+    param_.Init(kwargs);
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 2) << "Input:[data, label]";
+    const TShape &dshape = in_shape->at(0);
+    if (dshape.ndim() == 0) return false;
+    TShape label_shape(dshape.ndim() - 1);  // one dimension fewer than data
+    CHECK_GT(param_.input_length, 0) << "input_length must be positive";
+    label_shape[0] = param_.label_length * (dshape[0] / param_.input_length);
+    SHAPE_ASSIGN_CHECK(*in_shape, warpctc_enum::kLabel, label_shape);
+    // The output has the same shape as the data input.
+    out_shape->clear();
+    out_shape->push_back(dshape);
+    return true;
+  }
+
+  virtual bool InferType(std::vector<int> *in_type,
+                         std::vector<int> *out_type,
+                         std::vector<int> *aux_type) const {
+    CHECK_LE(in_type->size(), this->ListArguments().size());
+    in_type->clear();
+    in_type->push_back(mshadow::kFloat32);
+    in_type->push_back(mshadow::kInt32);
+    out_type->clear();
+    out_type->push_back(mshadow::kFloat32);
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new WarpCTCProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "WarpCTC";
+  }
+
+  // Backward is declared to depend on the data, the label and the forward output.
+  std::vector<int> DeclareBackwardDependency(const std::vector<int> &out_grad,
+                                             const std::vector<int> &in_data,
+                                             const std::vector<int> &out_data)
+      const override {
+    return {in_data[warpctc_enum::kData],
+          in_data[warpctc_enum::kLabel],
+          out_data[warpctc_enum::kOut]};
+  }
+
+  Operator* CreateOperator(Context ctx) const override;
+
+ private:
+  WarpCTCParam param_;
+};
+#endif  // DMLC_USE_CXX11
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // PLUGIN_WARPCTC_WARPCTC_INL_H_
diff --git a/plugin/warpctc/warpctc.cc b/plugin/warpctc/warpctc.cc
new file mode 100644
index 000000000000..db88a3316c7e
--- /dev/null
+++ b/plugin/warpctc/warpctc.cc
@@ -0,0 +1,29 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file warpctc.cc
+ * \brief warpctc op
+ * \author Liang Xiang
+*/
+
+#include "./warpctc-inl.h"
+#include "../../src/operator/mshadow_op.h"
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<cpu>(WarpCTCParam param) {
+  return new WarpCTCOp<cpu>(param);
+}
+
+Operator *WarpCTCProp::CreateOperator(Context ctx) const {
+  DO_BIND_DISPATCH(CreateOp, param_);
+}
+
+DMLC_REGISTER_PARAMETER(WarpCTCParam);
+
+MXNET_REGISTER_OP_PROPERTY(WarpCTC, WarpCTCProp)
+.describe("warp ctc.")
+.add_arguments(WarpCTCParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/plugin/warpctc/warpctc.cu b/plugin/warpctc/warpctc.cu
new file mode 100644
index 000000000000..186c4d0c18f4
--- /dev/null
+++ b/plugin/warpctc/warpctc.cu
@@ -0,0 +1,19 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file warpctc.cu
+ * \brief warpctc op
+ * \author Liang Xiang
+*/
+#include "./warpctc-inl.h"
+#include <stdio.h>
+#include "../../src/operator/mshadow_op.h"
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<gpu>(WarpCTCParam param) {
+  return new WarpCTCOp<gpu>(param);
+}
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/plugin/warpctc/warpctc.mk b/plugin/warpctc/warpctc.mk
new file mode 100644
index 000000000000..2223879ee3e3
--- /dev/null
+++ b/plugin/warpctc/warpctc.mk
@@ -0,0 +1,7 @@
+CFLAGS += -I$(WARPCTC_PATH)/include
+LDFLAGS += -L$(WARPCTC_PATH)/build -lwarpctc
+
+WARPCTC_SRC = $(wildcard plugin/warpctc/*.cc)
+PLUGIN_OBJ += $(patsubst %.cc, build/%.o, $(WARPCTC_SRC))
+WARPCTC_CUSRC = $(wildcard plugin/warpctc/*.cu)
+PLUGIN_CUOBJ += $(patsubst %.cu, build/%_gpu.o, $(WARPCTC_CUSRC))

From 498a0e92fe20e3de2d83627cf59c75ad9bff262c Mon Sep 17 00:00:00 2001
From: qiao hai-jun <qiaohaijun@users.noreply.github.com>
Date: Fri, 17 Jun 2016 00:57:56 +0800
Subject: [PATCH 021/126] Update index.md (#2433)

---
 docs/packages/python/index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/packages/python/index.md b/docs/packages/python/index.md
index a9f3a0f2bac5..aa22ebcd2dce 100644
--- a/docs/packages/python/index.md
+++ b/docs/packages/python/index.md
@@ -1,7 +1,7 @@
 MXNet Python Package
 ====================
 This page contains links to all the python related documents on python package.
-To install the package package, checkout [Build and Installation Instruction](../../how_to/build.md).
+To install the python package, checkout [Build and Installation Instruction](../../how_to/build.md).
 There are three types of documents you can find about mxnet.
 
 * [Tutorials](#tutorials) are self contained materials that introduces a certain use-cases of mxnet.

From 6e85eb2fcc4818cc71615ad056dc80625bcb16e8 Mon Sep 17 00:00:00 2001
From: tmatas <matassini.tommaso@gmail.com>
Date: Thu, 16 Jun 2016 20:58:32 +0200
Subject: [PATCH 022/126] Saving folder, log, removed some import, fix typos
 (#2406)

---
 tools/im2rec.py | 146 +++++++++++++++++++++++++-----------------------
 1 file changed, 75 insertions(+), 71 deletions(-)

diff --git a/tools/im2rec.py b/tools/im2rec.py
index 8f2ca9248930..81e388e0f363 100644
--- a/tools/im2rec.py
+++ b/tools/im2rec.py
@@ -1,20 +1,21 @@
 import os
 import sys
+
 curr_path = os.path.abspath(os.path.dirname(__file__))
 sys.path.append(os.path.join(curr_path, "../python"))
 import mxnet as mx
 import random
-import numpy as np
 import argparse
-import threading
-import cv, cv2
+import cv2
 import time
+
+
 def list_image(root, recursive, exts):
     image_list = []
     if recursive:
         cat = {}
         for path, subdirs, files in os.walk(root, followlinks=True):
-	    subdirs.sort()
+            subdirs.sort()
             print(len(cat), path)
             for fname in files:
                 fpath = os.path.join(path, fname)
@@ -31,50 +32,56 @@ def list_image(root, recursive, exts):
                 image_list.append((len(image_list), os.path.relpath(fpath, root), 0))
     return image_list
 
+
 def write_list(path_out, image_list):
     with open(path_out, 'w') as fout:
-        for i in xrange(len(image_list)):
-            line = '%d\t'%image_list[i][0]
+        n_images = xrange(len(image_list))
+        for i in n_images:
+            line = '%d\t' % image_list[i][0]
             for j in image_list[i][2:]:
-                line += '%f\t'%j
-            line += '%s\n'%image_list[i][1]
+                line += '%d\t' % j
+            line += '%s\n' % image_list[i][1]
             fout.write(line)
 
-def make_list(prefix_out, root, recursive, exts, num_chunks, train_ratio, test_ratio):
-    image_list = list_image(root, recursive, exts)
-    random.seed(100)
-    random.shuffle(image_list)
+
+def make_list(args):
+    image_list = list_image(args.root, args.recursive, args.exts)
+    if args.shuffle is True:
+        random.seed(100)
+        random.shuffle(image_list)
     N = len(image_list)
-    chunk_size = (N+num_chunks-1)/num_chunks
-    for i in xrange(num_chunks):
-        chunk = image_list[i*chunk_size:(i+1)*chunk_size]
-        if num_chunks > 1:
-            str_chunk = '_%d'%i
+    chunk_size = (N + args.chunks - 1) / args.chunks
+    for i in xrange(args.chunks):
+        chunk = image_list[i * chunk_size:(i + 1) * chunk_size]
+        if args.chunks > 1:
+            str_chunk = '_%d' % i
         else:
             str_chunk = ''
-        sep = int(chunk_size*train_ratio)
-	sep_test=int(chunk_size*test_ratio)
-        write_list(prefix_out+str_chunk+'_test.lst', chunk[:sep_test])
-        write_list(prefix_out+str_chunk+'_train.lst', chunk[sep_test:sep_test+sep])
-        write_list(prefix_out+str_chunk+'_val.lst', chunk[sep_test+sep:])
+        sep = int(chunk_size * args.train_ratio)
+        sep_test = int(chunk_size * args.test_ratio)
+        write_list(args.prefix + str_chunk + '_test.lst', chunk[:sep_test])
+        write_list(args.prefix + str_chunk + '_train.lst', chunk[sep_test:sep_test + sep])
+        write_list(args.prefix + str_chunk + '_val.lst', chunk[sep_test + sep:])
+
 
 def read_list(path_in):
     image_list = []
     with open(path_in) as fin:
         for line in fin.readlines():
             line = [i.strip() for i in line.strip().split('\t')]
-            item = [int(line[0])] + [line[-1]] + [float(i) for i in line[1:-1]]
+            item = [int(line[0])] + [line[-1]] + [int(i) for i in line[1:-1]]
             image_list.append(item)
     return image_list
 
-def write_record(args, image_list):
+
+def write_record(args, image_list, fname):
     source = image_list
     tic = [time.time()]
     color_modes = {-1: cv2.IMREAD_UNCHANGED,
-                    0: cv2.IMREAD_GRAYSCALE,
-                    1: cv2.IMREAD_COLOR}
+                   0: cv2.IMREAD_GRAYSCALE,
+                   1: cv2.IMREAD_COLOR}
     total = len(source)
-    
+
     def image_encode(item, q_out):
         try:
             img = cv2.imread(os.path.join(args.root, item[1]), color_modes[args.color])
@@ -86,16 +93,16 @@ def image_encode(item, q_out):
             return
         if args.center_crop:
             if img.shape[0] > img.shape[1]:
-                margin = (img.shape[0] - img.shape[1])/2;
-                img = img[margin:margin+img.shape[1], :]
+                margin = (img.shape[0] - img.shape[1]) / 2;
+                img = img[margin:margin + img.shape[1], :]
             else:
-                margin = (img.shape[1] - img.shape[0])/2;
-                img = img[:, margin:margin+img.shape[0]]
+                margin = (img.shape[1] - img.shape[0]) / 2;
+                img = img[:, margin:margin + img.shape[0]]
         if args.resize:
             if img.shape[0] > img.shape[1]:
-                newsize = (img.shape[0]*args.resize/img.shape[1], args.resize)
+                newsize = (img.shape[0] * args.resize / img.shape[1], args.resize)
             else:
-                newsize = (args.resize, img.shape[1]*args.resize/img.shape[0])
+                newsize = (args.resize, img.shape[1] * args.resize / img.shape[0])
             img = cv2.resize(img, newsize)
         header = mx.recordio.IRHeader(0, item[2], item[0], 0)
 
@@ -103,7 +110,7 @@ def image_encode(item, q_out):
             s = mx.recordio.pack_img(header, img, quality=args.quality, img_fmt=args.encoding)
             q_out.put(('data', s, item))
         except:
-            print 'pack_img error:',item[1]
+            print 'pack_img error:', item[1]
             return
 
     def read_worker(q_in, q_out):
@@ -111,14 +118,16 @@ def read_worker(q_in, q_out):
             item = q_in.get()
             image_encode(item, q_out)
 
-    def write_worker(q_out, prefix):
+    def write_worker(q_out, fname, saving_folder):
         pre_time = time.time()
         sink = []
-	record = mx.recordio.MXRecordIO(prefix+'.rec', 'w')
+        os.chdir(saving_folder)
+        fname_rec = fname[:fname.rfind('.')]
+        record = mx.recordio.MXRecordIO(fname_rec + '.rec', 'w')
         while True:
             stat, s, item = q_out.get()
             if stat == 'finish':
-                write_list(prefix+'.lst', sink)
+                write_list(fname_rec + '.lst', sink)
                 break
             record.write(s)
             sink.append(item)
@@ -134,10 +143,10 @@ def write_worker(q_out, prefix):
         for i in range(len(image_list)):
             q_in[i % len(q_in)].put(image_list[i])
         read_process = [multiprocessing.Process(target=read_worker, args=(q_in[i], q_out)) \
-                for i in range(args.num_thread)]
+                        for i in range(args.num_thread)]
         for p in read_process:
             p.start()
-        write_process = multiprocessing.Process(target=write_worker, args=(q_out,args.prefix))
+        write_process = multiprocessing.Process(target=write_worker, args=(q_out, fname, args.saving_folder))
         write_process.start()
         for p in read_process:
             p.join()
@@ -147,7 +156,9 @@ def write_worker(q_out, prefix):
         print('multiprocessing not available, fall back to single threaded encoding')
         import Queue
         q_out = Queue.Queue()
-	record = mx.recordio.MXRecordIO(args.prefix+'.rec', 'w')
+        os.chdir(args.saving_folder)
+        fname_rec = fname[:fname.rfind('.')]
+        record = mx.recordio.MXRecordIO(fname_rec + '.rec', 'w')
         cnt = 0
         pre_time = time.time()
         for item in image_list:
@@ -162,73 +173,66 @@ def write_worker(q_out, prefix):
                 print 'time:', cur_time - pre_time, ' count:', cnt
                 pre_time = cur_time
 
+
 def main():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
         description='Create an image list or \
-    	make a record database by reading from an image list')
+        make a record database by reading from an image list')
     parser.add_argument('prefix', help='prefix of input/output files.')
     parser.add_argument('root', help='path to folder containing images.')
 
     cgroup = parser.add_argument_group('Options for creating image lists')
     cgroup.add_argument('--list', type=bool, default=False,
-        help='If this is set im2rec will create image list(s) by traversing root folder\
+                        help='If this is set im2rec will create image list(s) by traversing root folder\
         and output to <prefix>.lst.\
         Otherwise im2rec will read <prefix>.lst and create a database at <prefix>.rec')
-    cgroup.add_argument('--exts', type=list, default=['.jpeg','.jpg'],
-        help='list of acceptable image extensions.')
+    cgroup.add_argument('--exts', type=list, default=['.jpeg', '.jpg'],
+                        help='list of acceptable image extensions.')
     cgroup.add_argument('--chunks', type=int, default=1, help='number of chunks.')
     cgroup.add_argument('--train_ratio', type=float, default=1.0,
-        help='Ratio of images to use for training.')
+                        help='Ratio of images to use for training.')
     cgroup.add_argument('--test_ratio', type=float, default=0,
-	help='Ratio of images to use for testing.')
+                        help='Ratio of images to use for testing.')
     cgroup.add_argument('--recursive', type=bool, default=False,
-        help='If true recursively walk through subdirs and assign an unique label\
+                        help='If true recursively walk through subdirs and assign an unique label\
         to images in each folder. Otherwise only include images in the root folder\
         and give them label 0.')
 
     rgroup = parser.add_argument_group('Options for creating database')
     rgroup.add_argument('--resize', type=int, default=0,
-        help='resize the shorter edge of image to the newsize, original images will\
+                        help='resize the shorter edge of image to the newsize, original images will\
         be packed by default.')
     rgroup.add_argument('--center_crop', type=bool, default=False,
-        help='specify whether to crop the center image to make it rectangular.')
+                        help='specify whether to crop the center image to make it rectangular.')
     rgroup.add_argument('--quality', type=int, default=80,
-        help='JPEG quality for encoding, 1-100; or PNG compression for encoding, 1-9')
+                        help='JPEG quality for encoding, 1-100; or PNG compression for encoding, 1-9')
     rgroup.add_argument('--num_thread', type=int, default=1,
-        help='number of thread to use for encoding. order of images will be different\
+                        help='number of thread to use for encoding. order of images will be different\
         from the input list if >1. the input list will be modified to match the\
         resulting order.')
     rgroup.add_argument('--color', type=int, default=1, choices=[-1, 0, 1],
-        help='specify the color mode of the loaded image.\
+                        help='specify the color mode of the loaded image.\
         1: Loads a color image. Any transparency of image will be neglected. It is the default flag.\
         0: Loads image in grayscale mode.\
         -1:Loads image as such including alpha channel.')
     rgroup.add_argument('--encoding', type=str, default='.jpg', choices=['.jpg', '.png'],
-        help='specify the encoding of the images.')
-    rgroup.add_argument('--shuffle', action='store_true',
-        help='If this is set and --list is not, im2rec will randomize the image order\
-        in <prefix>.lst and <prefix>.rec.')
-
+                        help='specify the encoding of the images.')
+    rgroup.add_argument('--saving_folder', type=str, default='.',
+                        help='folder in which .rec files will be saved.')
+    rgroup.add_argument('--shuffle', default=True, help='If this is set as True, \
+        im2rec will randomize the image order in <prefix>.lst')
     args = parser.parse_args()
-    
     if args.list:
-        make_list(args.prefix, args.root, args.recursive,
-                  args.exts, args.chunks, args.train_ratio, args.test_ratio)
+        make_list(args)
     else:
         files = [f for f in os.listdir('.') if os.path.isfile(f)]
         for f in files:
-        # do something
-            #print 'path: ', path
-            #print 'subdirs: ', subdirs
-            print 'current file: ', f
-            if f.startswith(args.prefix) is True:
-                print 'OK'
+            if f.startswith(args.prefix) is True and f.endswith('.lst') is True:
+                print 'Creating .rec file from', f, 'in', args.saving_folder
                 image_list = read_list(f)
-                if args.shuffle:
-                    random.shuffle(image_list)
-                write_record(args, image_list)
-            else:
-                print 'not OK'
+                write_record(args, image_list, f)
+
+
 if __name__ == '__main__':
     main()

From 887491d212bda17a8552227870728c246c1b57b1 Mon Sep 17 00:00:00 2001
From: qiao hai-jun <qiaohaijun@users.noreply.github.com>
Date: Fri, 17 Jun 2016 06:50:02 +0800
Subject: [PATCH 023/126] create mxnet/docs/zh/packages/python/index.md (#2434)

---
 docs/zh/packages/python/index.md | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 docs/zh/packages/python/index.md

diff --git a/docs/zh/packages/python/index.md b/docs/zh/packages/python/index.md
new file mode 100644
index 000000000000..0a539eb6bb36
--- /dev/null
+++ b/docs/zh/packages/python/index.md
@@ -0,0 +1,26 @@
+MXNet Python Package
+====================
+
+这个页面包含 python 程序包中所有相关的文档.
+为了安装 python 程序包, 请 checkout [Build and Installation Instruction](../../how_to/build.md).
+
+这里有关于 mxnet 的三种文档.
+
+* [Tutorials](#tutorials)  介绍一个特定的关于 mxnet 的用例.
+* [Code Examples](../../../example) 示例代码.
+* [Python API Documents](#python-api-documents) 关于指定模块的文档, 同时也包含所有 API 的参考文档.
+
+Tutorials
+---------
+* [Python Overview Tutorial](tutorial.md)
+* [Symbolic Configuration and Execution in Pictures](symbol_in_pictures.md)
+* [How to Create New Operations (Layers)](../../how_to/new_op.md)
+
+Python API Documents
+--------------------
+* [High Level Model Training Related API](model.md)
+* [The Module API](module.md)
+* [NDArray API](ndarray.md)
+* [Symbolic API](symbol.md)
+* [KVStore API](kvstore.md)
+* [Data Loading API](io.md)

From 4b88c19912f97304d137b7b45619949bc5e7c792 Mon Sep 17 00:00:00 2001
From: Xingjian Shi <xshiab@ust.hk>
Date: Fri, 17 Jun 2016 09:37:34 +0800
Subject: [PATCH 024/126] Support ndim up to 7 for binary broadcasting
 operators + Accelerate reducing OPs by calling reduce_except_dim if possible.
 + Add `/bigobj` to CMakeList (#2418)

Reshape the lhs and rhs to ndim=3 if possible otherwise reshape them into
ndim=7.
---
 CMakeLists.txt                                |   2 +-
 include/mxnet/operator_util.h                 |  46 +-
 src/operator/broadcast_reduce_op-inl.h        | 116 ++-
 src/operator/broadcast_reduce_op_common.h     | 165 +++++
 .../elementwise_binary_broadcast_op-inl.h     | 678 +++++++-----------
 tests/python/unittest/test_ndarray.py         |   2 +-
 tests/python/unittest/test_operator.py        |  47 +-
 7 files changed, 532 insertions(+), 524 deletions(-)
 create mode 100644 src/operator/broadcast_reduce_op_common.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7613fe00375b..de8d1e85360d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,7 +29,7 @@ if(MSVC)
   add_definitions(-D_CRT_SECURE_NO_WARNINGS)
   add_definitions(-DMXNET_EXPORTS)
   set(CMAKE_C_FLAGS "/MP")
-  set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS}")
+  set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} /bigobj")
 else(MSVC)
   include(CheckCXXCompilerFlag)
   check_cxx_compiler_flag("-std=c++11"   SUPPORT_CXX11)
diff --git a/include/mxnet/operator_util.h b/include/mxnet/operator_util.h
index f96b85108b47..94eb994d07e1 100644
--- a/include/mxnet/operator_util.h
+++ b/include/mxnet/operator_util.h
@@ -11,6 +11,10 @@
 #ifndef MXNET_OPERATOR_UTIL_H_
 #define MXNET_OPERATOR_UTIL_H_
 
+#ifdef _MSC_VER
+#pragma warning(disable:4503)  // disable warning: decorated name length exceeded.
+#endif
+
 #include <dmlc/registry.h>
 #include <dmlc/parameter.h>
 #include <map>
@@ -412,47 +416,9 @@ class SimpleOpRegistry {
   }
 
 /*!
-* \brief cast dynamic range variable into static variable
-* \param var the source value, constrained to be between 1 and 5
-* \param NDIM the const NDIM that can be used in the template
+* \brief Maximum ndim supported for special operators like broadcasting with non contiguous lhs/rhs
 */
-#define MXNET_RANGE_SWITCH(var, NDIM, ...)         \
-  {                                                \
-    switch (var) {                                 \
-      case 1:                                      \
-        {                                          \
-          static const int NDIM = 1;               \
-          {__VA_ARGS__}                            \
-        }                                          \
-        break;                                     \
-      case 2:                                      \
-        {                                          \
-          static const int NDIM = 2;               \
-          {__VA_ARGS__}                            \
-        }                                          \
-        break;                                     \
-      case 3:                                      \
-        {                                          \
-          static const int NDIM = 3;               \
-          {__VA_ARGS__}                            \
-        }                                          \
-        break;                                     \
-      case 4:                                      \
-        {                                          \
-          static const int NDIM = 4;               \
-          {__VA_ARGS__}                            \
-        }                                          \
-        break;                                     \
-      case 5:                                      \
-        {                                          \
-          static const int NDIM = 5;               \
-          {__VA_ARGS__}                            \
-        }                                          \
-        break;                                     \
-      default:                                     \
-        LOG(FATAL) << "Only support ndim=1 to 5."; \
-    }                                              \
-  }
+#define MXNET_SPECIAL_MAX_NDIM 7
 
 
 //--------------------------------------------------------------
diff --git a/src/operator/broadcast_reduce_op-inl.h b/src/operator/broadcast_reduce_op-inl.h
index f43bafbc16da..fa6b7fbf106a 100644
--- a/src/operator/broadcast_reduce_op-inl.h
+++ b/src/operator/broadcast_reduce_op-inl.h
@@ -103,11 +103,14 @@ void L2Norm(const TBlob &src,
             OpReqType req,
             RunContext ctx) {
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  mshadow::Tensor<xpu, 1> out = ret->get<xpu, 1, real_t>(s);
-  mshadow::Tensor<xpu, 1> in =
-      src.get_with_shape<xpu, 1, real_t>(mshadow::Shape1(src.shape_.Size()), s);
-  mshadow::VectorDot(out, in, in);
-  out = mshadow::expr::F<mxnet::op::mshadow_op::square_root>(out);
+  CHECK_EQ(src.type_flag_, ret->type_flag_);
+  MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, {
+    mshadow::Tensor<xpu, 1, DType> out = ret->get<xpu, 1, DType>(s);
+    mshadow::Tensor<xpu, 1, DType> in =
+      src.get_with_shape<xpu, 1, DType>(mshadow::Shape1(src.shape_.Size()), s);
+    mshadow::VectorDot(out, in, in);
+    ASSIGN_DISPATCH(out, req, mshadow::expr::F<mxnet::op::mshadow_op::square_root>(out));
+  });
 }
 
 template<typename xpu, typename Reducer>
@@ -117,10 +120,13 @@ void Reduce(const TBlob &src,
             OpReqType req,
             RunContext ctx) {
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  mshadow::Tensor<xpu, 1> out = ret->get<xpu, 1, real_t>(s);
-  mshadow::Tensor<xpu, 2> in =
-      src.get_with_shape<xpu, 2, real_t>(mshadow::Shape2(1, src.shape_.Size()), s);
-  out = mshadow::expr::reduce_except_dim<0, Reducer>(in);
+  CHECK_EQ(src.type_flag_, ret->type_flag_);
+  MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, {
+    mshadow::Tensor<xpu, 1, DType> out = ret->get<xpu, 1, DType>(s);
+    mshadow::Tensor<xpu, 2, DType> in =
+      src.get_with_shape<xpu, 2, DType>(mshadow::Shape2(1, src.shape_.Size()), s);
+    ASSIGN_DISPATCH(out, req, (mshadow::expr::reduce_except_dim<0, Reducer>(in)));
+  });
 }
 
 // backward function that takes input value of the op
@@ -135,7 +141,7 @@ void SumBackward_(const OutputGrad& scale,
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
   CHECK_EQ(in_grad->type_flag_, scale.data.type_flag_)
     << "Unary function only support input/output with the same type";
-  MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, {
+  MSHADOW_REAL_TYPE_SWITCH(in_grad->type_flag_, DType, {
       mshadow::Tensor<xpu, 1, DType> mscale = scale.data.get<xpu, 1, DType>(s);
       mshadow::Tensor<xpu, 2, DType> igrad = in_grad->FlatTo2D<xpu, DType>(s);
       ASSIGN_DISPATCH(igrad, req,
@@ -143,7 +149,7 @@ void SumBackward_(const OutputGrad& scale,
   });
 }
 
-template<typename xpu, typename Reducer, bool get_mask>
+template<typename xpu, typename Reducer>
 void ReduceChannel(const TBlob &src,
                    const EnvArguments& env,
                    TBlob *ret,
@@ -153,13 +159,17 @@ void ReduceChannel(const TBlob &src,
   using namespace mshadow;
   using namespace mshadow::expr;
   Stream<xpu> *s = ctx.get_stream<xpu>();
-  Tensor<xpu, 2> out = ret->get_with_shape<xpu, 2, real_t>(
-    Shape2(src.shape_[0], src.Size()/src.shape_[0]/src.shape_[1]),
-    s);
-  Tensor<xpu, 3> in = src.get_with_shape<xpu, 3, real_t>(
-    Shape3(src.shape_[0], src.shape_[1], src.Size()/src.shape_[0]/src.shape_[1]),
+  CHECK_EQ(src.type_flag_, ret->type_flag_);
+  MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, {
+    Tensor<xpu, 2, DType> out = ret->get_with_shape<xpu, 2, DType>(
+    Shape2(src.shape_[0], src.Size() / src.shape_[0] / src.shape_[1]),
     s);
-  out = reduce_with_axis<Reducer, get_mask>(in, 1);
+    Tensor<xpu, 3, DType> in = src.get_with_shape<xpu, 3, DType>(
+      Shape3(src.shape_[0], src.shape_[1], src.Size() / src.shape_[0] / src.shape_[1]),
+      s);
+    CHECK(req != kAddTo) << "AddTo is not supported";
+    ASSIGN_DISPATCH(out, req, (reduce_with_axis<Reducer, true>(in, 1)));
+  });
 }
 
 // return a shape of ReduceChannel output
@@ -184,13 +194,16 @@ void ReduceAxisImpl_(const TBlob &src,
   bool keepdims) {
   using namespace mshadow::expr;
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  CHECK_EQ(src.type_flag_, ret->type_flag_);
   if (-1 == axis) {
     // Reduce all dimensions if axis == -1
-    mshadow::Tensor<xpu, 2> in =
-      src.get_with_shape<xpu, 2, real_t>(mshadow::Shape2(1, src.shape_.Size()), s);
-    mshadow::Tensor<xpu, 1> out =
-      ret->get_with_shape<xpu, 1, real_t>(mshadow::Shape1(ret->shape_.Size()), s);
-    out = mshadow::expr::reduce_except_dim<0, Reducer>(in);
+    MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, {
+      mshadow::Tensor<xpu, 2, DType> in =
+        src.get_with_shape<xpu, 2, DType>(mshadow::Shape2(src.shape_.Size(), 1), s);
+      mshadow::Tensor<xpu, 1, DType> out =
+        ret->get_with_shape<xpu, 1, DType>(mshadow::Shape1(ret->shape_.Size()), s);
+      ASSIGN_DISPATCH(out, req, (reduce_except_dim<1, Reducer>(in)));
+    });
     return;
   }
   int trailing = 1;
@@ -202,11 +215,46 @@ void ReduceAxisImpl_(const TBlob &src,
       trailing *= src.shape_[i];
     }
   }
-  mshadow::Tensor<xpu, 3> in =
-    src.get_with_shape<xpu, 3, real_t>(mshadow::Shape3(leading, src.shape_[axis], trailing), s);
-  mshadow::Tensor<xpu, 2> out =
-    ret->get_with_shape<xpu, 2, real_t>(mshadow::Shape2(leading, trailing), s);
-  out = mshadow::expr::reduce_with_axis<Reducer, get_mask>(in, 1);
+  if (get_mask) {
+    // If get_mask is on, we have to use the slower `reduce_with_axis`
+    // since reduce_except_dim does not support the flag.
+    MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, {
+      mshadow::Tensor<xpu, 3, DType> in =
+        src.get_with_shape<xpu, 3, DType>(mshadow::Shape3(leading, src.shape_[axis], trailing), s);
+      mshadow::Tensor<xpu, 2, DType> out =
+        ret->get_with_shape<xpu, 2, DType>(mshadow::Shape2(leading, trailing), s);
+      CHECK(req != kAddTo) << "AddTo is not supported for `get_mask = true`";
+      ASSIGN_DISPATCH(out, req, (reduce_with_axis<Reducer, true>(in, 1)));
+    });
+    return;
+  }
+  if (1 == leading) {
+    MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, {
+      mshadow::Tensor<xpu, 2, DType> in =
+        src.get_with_shape<xpu, 2, DType>(mshadow::Shape2(src.shape_[axis], trailing), s);
+      mshadow::Tensor<xpu, 1, DType> out =
+        ret->get_with_shape<xpu, 1, DType>(mshadow::Shape1(trailing), s);
+      ASSIGN_DISPATCH(out, req, (reduce_except_dim<1, Reducer>(in)));
+    });
+  } else if (1 == trailing) {
+    MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, {
+      mshadow::Tensor<xpu, 2, DType> in =
+        src.get_with_shape<xpu, 2, DType>(mshadow::Shape2(leading, src.shape_[axis]), s);
+      mshadow::Tensor<xpu, 1, DType> out =
+        ret->get_with_shape<xpu, 1, DType>(mshadow::Shape1(leading), s);
+      ASSIGN_DISPATCH(out, req, (reduce_except_dim<1, Reducer>(in.T())));
+    });
+  } else {
+    MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, {
+      mshadow::Tensor<xpu, 3, DType> in =
+        src.get_with_shape<xpu, 3, DType>(mshadow::Shape3(leading, src.shape_[axis], trailing), s);
+      mshadow::Tensor<xpu, 1, DType> out =
+        ret->get_with_shape<xpu, 1, DType>(mshadow::Shape1(leading * trailing), s);
+      ASSIGN_DISPATCH(out, req,
+        (reduce_except_dim<1, Reducer>(reshape(swapaxis<1, 0>(in),
+        mshadow::Shape2(src.shape_[axis], leading * trailing)))));
+    });
+  }
 }
 
 // Broadcast the given axis to the given broadcasting size
@@ -240,11 +288,13 @@ void BroadcastAxisImpl_(const TBlob &src,
       trailing *= ret->shape_[i];
     }
   }
-  mshadow::Tensor<xpu, 2> in =
-    src.get_with_shape<xpu, 2, real_t>(mshadow::Shape2(leading, trailing), s);
-  mshadow::Tensor<xpu, 3> out =
-    ret->get_with_shape<xpu, 3, real_t>(mshadow::Shape3(leading, bsize, trailing), s);
-  out = mshadow::expr::broadcast_with_axis(in, 0, bsize);
+  MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
+    mshadow::Tensor<xpu, 2, DType> in =
+    src.get_with_shape<xpu, 2, DType>(mshadow::Shape2(leading, trailing), s);
+    mshadow::Tensor<xpu, 3, DType> out =
+      ret->get_with_shape<xpu, 3, DType>(mshadow::Shape3(leading, bsize, trailing), s);
+    ASSIGN_DISPATCH(out, req, broadcast_with_axis(in, 0, bsize));
+  });
 }
 
 // Forward pass of reduce over the given axis
@@ -386,7 +436,7 @@ MXNET_REGISTER_SIMPLE_OP(sum_axis, XPU)
 
 // argmax channel
 MXNET_REGISTER_SIMPLE_OP(argmax_channel, XPU)
-.set_function(XPU::kDevMask, ReduceChannel<XPU, mshadow::red::maximum, true>,
+.set_function(XPU::kDevMask, ReduceChannel<XPU, mshadow::red::maximum>,
               kNoInplace, kNotRegisterSymbolic)
 .set_shape_function(ReduceChannelShape)
 .describe("Take argmax indices of each channel of the src."
diff --git a/src/operator/broadcast_reduce_op_common.h b/src/operator/broadcast_reduce_op_common.h
new file mode 100644
index 000000000000..4ec50d4b3b56
--- /dev/null
+++ b/src/operator/broadcast_reduce_op_common.h
@@ -0,0 +1,165 @@
+/*!
+* Copyright (c) 2016 by Contributors
+* \file broadcast_reduce_op_common.h
+* \brief common function used for broadcasting and reducing
+* \author Xingjian Shi
+*/
+#ifndef MXNET_OPERATOR_BROADCAST_REDUCE_OP_COMMON_H_
+#define MXNET_OPERATOR_BROADCAST_REDUCE_OP_COMMON_H_
+#include <dmlc/logging.h>
+#include <mxnet/operator.h>
+#include <mxnet/operator_util.h>
+#include <vector>
+
+namespace mxnet {
+namespace op {
+
+/*!
+* \brief Check if the axes are contiguous + get reducing size. E.g (1, 3) -> false, (1,2,3) -> true
+* \param is_contiguous_axes whether the axes is contiguous
+* \param reducing_size product of source shape in the given axes
+* \param axes the axes to check, must be in strictly increasing order
+* \param src_shape shape of the source tensor
+*/
+inline void CheckContiguousAxes_(bool *is_contiguous_axes, index_t *reducing_size,
+  const mshadow::TShape &axes, const mshadow::TShape &src_shape) {
+  *is_contiguous_axes = true;
+  *reducing_size = 1;
+  for (index_t i = 0; i < axes.ndim(); ++i) {
+    *reducing_size *= src_shape[axes[i]];
+    if (i > 0) {
+      *is_contiguous_axes = *is_contiguous_axes && (axes[i] == (axes[i - 1] + 1));
+      CHECK(axes[i - 1] < axes[i]) << "axes must be in increasing order, received axes=" << axes;
+    }
+  }
+}
+
+template<int dimsrc>
+inline void CheckContiguousAxes_(bool *is_contiguous_axes, index_t *reducing_size,
+  const mshadow::TShape &axes, const mshadow::Shape<dimsrc> &src_shape) {
+  CheckContiguousAxes_(is_contiguous_axes, reducing_size, axes,
+    TShape(src_shape.shape_, src_shape.shape_ + dimsrc));
+}
+
+inline TShape GetBroadcastingAxes_(const mshadow::TShape &src_shape,
+  const mshadow::TShape &target_shape) {
+  std::vector<index_t> axes_vec;
+  CHECK_EQ(target_shape.ndim(), src_shape.ndim());
+  for (int i = 0; i < src_shape.ndim(); ++i) {
+    if (src_shape[i] != target_shape[i]) {
+      CHECK_EQ(src_shape[i], 1) << "broadcastsing axis must have size 1, received src_shape="
+        << src_shape << " target_shape=" << target_shape;
+      axes_vec.push_back(i);
+    }
+  }
+  TShape axes = TShape(axes_vec.begin(), axes_vec.end());
+  return axes;
+}
+
+/*!
+* \brief reduce over multiple axes and assign the result to the output tensor.
+* \param out output tensor, must have dim 1; it is written according to req
+* \param src_ the source expression
+* \param axes the axes to reduce over, must be in increasing order
+* \tparam Reducer type of the reducing operation
+* \tparam xpu device type of the output tensor
+* \tparam SrcExp the src expression template
+* \tparam DType data type of the output tensor
+*/
+template<typename Reducer, typename xpu, typename SrcExp, typename DType>
+void ReduceAxesAssign(mshadow::Tensor<xpu, 1, DType> out, const OpReqType req,
+  const SrcExp &src_, const TShape &axes) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  static const int dimsrc = ExpInfo<SrcExp>::kDim;
+  CHECK(axes.ndim() <= dimsrc);
+  Shape<dimsrc> src_shape = ShapeCheck<dimsrc, SrcExp>::Check(src_);
+
+  // 1. Check if the axes has size 0, if so, no reducing is needed.
+  if (0 == axes.ndim()) {
+    ASSIGN_DISPATCH(out, req, reshape(src_, Shape1(src_shape.ProdShape(0, dimsrc))));
+    return;
+  }
+
+  // 2. Check if we want to reduce over contiguous axes and get the reducing size.
+  //  e.g. (1,2,3) --> contiguous, (1,3) --> noncontiguous
+  bool is_contiguous_axes = true;
+  index_t reducing_size = 1;
+  CheckContiguousAxes_(&is_contiguous_axes, &reducing_size, axes, src_shape);
+
+  // 3. For contiguous axes, we can always reshape them to (leading, reducing_size, trailing)
+  //  and we can then simplify the combination of mshadow symbols.
+  if (is_contiguous_axes) {
+    index_t leading = 1;
+    index_t trailing = 1;
+    for (index_t i = 0; i < dimsrc; ++i) {
+      if (i < axes[0]) {
+        leading *= src_shape[i];
+      } else if (i > axes[axes.ndim() - 1]) {
+        trailing *= src_shape[i];
+      }
+    }
+    if (1 == leading) {
+      ASSIGN_DISPATCH(out, req,
+        (reduce_except_dim<1, Reducer>(reshape(src_, Shape2(reducing_size, trailing)))));
+    } else {
+      ASSIGN_DISPATCH(out, req, (reduce_except_dim<1, Reducer>(
+        reshape(swapaxis<1, 0>(reshape(src_, Shape3(leading, reducing_size, trailing))),
+        Shape2(reducing_size, leading * trailing)))));
+    }
+    return;
+  }
+  // 4. For non-contiguous axes, we need to push axes to the front of the shape vector then reduce.
+  //   E.g axes = (1, 3), dim = 6 => transpose_shape = (1, 3, 0, 2, 4, 5)
+  Shape<dimsrc> transpose_shape = src_shape;
+  index_t remaining_size = 1;
+  for (index_t i = 0; i < axes.ndim(); ++i) {
+    transpose_shape[i] = axes[i];
+    if (i > 0) {
+      for (index_t j = axes[i - 1] + 1; j < axes[i]; ++j) {
+        transpose_shape[axes.ndim() - i + j] = j;
+        remaining_size *= src_shape[j];
+      }
+    }
+    if (axes.ndim() - 1 == i) {
+      for (index_t j = axes[axes.ndim() - 1] + 1; j < dimsrc; ++j) {
+        transpose_shape[j] = j;
+        remaining_size *= src_shape[j];
+      }
+    }
+    if (0 == i) {
+      for (index_t j = 0; j < axes[0]; ++j) {
+        transpose_shape[axes.ndim() - i + j] = j;
+        remaining_size *= src_shape[j];
+      }
+    }
+  }
+  ASSIGN_DISPATCH(out, req,
+    (reduce_except_dim<1, Reducer>(reshape(transpose(src_, transpose_shape),
+    Shape2(reducing_size, remaining_size)))));
+}
+
+/*!
+* \brief reduce the source to the given shape and assign the result to the output tensor.
+* \param out output tensor, must have dim 1; it is written according to req
+* \param target_shape shape of the target tensor, must have size 1 for the reduction axes
+* \param src_ the source expression
+* \tparam Reducer type of the reducing operation
+* \tparam xpu device type of the output tensor
+* \tparam SrcExp the src expression template
+* \tparam DType data type of the output tensor
+*/
+template<typename Reducer, typename xpu, typename SrcExp, typename DType>
+void ReduceToAssign(mshadow::Tensor<xpu, 1, DType> out, const OpReqType req,
+  const TShape &target_shape, const SrcExp &src_) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  static const int dimsrc = ExpInfo<SrcExp>::kDim;
+  Shape<dimsrc> src_shape = ShapeCheck<dimsrc, SrcExp>::Check(src_);
+  TShape axes = GetBroadcastingAxes_(target_shape,
+    TShape(src_shape.shape_, src_shape.shape_ + dimsrc));
+  ReduceAxesAssign<Reducer>(out, req, src_, axes);
+}
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_BROADCAST_REDUCE_OP_COMMON_H_
diff --git a/src/operator/elementwise_binary_broadcast_op-inl.h b/src/operator/elementwise_binary_broadcast_op-inl.h
index b210998e2775..89fedf5cc0c9 100644
--- a/src/operator/elementwise_binary_broadcast_op-inl.h
+++ b/src/operator/elementwise_binary_broadcast_op-inl.h
@@ -1,5 +1,5 @@
 /*!
- *  Copyright (c) 2015 by Contributors
+ *  Copyright (c) 2016 by Contributors
  * \file elementwise_binary_broadcast_op-inl.h
  * \brief Function defintion of elementwise binary operators with broadcast
  *
@@ -26,16 +26,13 @@
  *
  * Here are examples of shapes that do not broadcast:
  *
- *   A      (3d tensor):  15 x 3 x 5
- *   B      (3d tensor):  15 x 1 x 5  # the diminsions for broadcasting should be continous
- *
  *   A      (1d tensor):  3
  *   B      (1d tensor):  4 # trailing dimensions do not match
  *
  *   A      (2d tensor):  1 x 2 x 1
  *   B      (3d tensor):  8 x 4 x 3 # second from last dimensions mismatched
  *
- * When no broadcast is need, it fails back to elementwise_binary_op-inl.h
+ * When no broadcast is needed, it falls back to elementwise_binary_op-inl.h
  */
 #ifndef MXNET_OPERATOR_ELEMENTWISE_BINARY_BROADCAST_OP_INL_H_
 #define MXNET_OPERATOR_ELEMENTWISE_BINARY_BROADCAST_OP_INL_H_
@@ -44,6 +41,7 @@
 #include <algorithm>
 #include <vector>
 #include "./mshadow_op.h"
+#include "./broadcast_reduce_op_common.h"
 
 #if defined(__CUDACC__)
 #define XPU gpu
@@ -56,8 +54,7 @@ namespace op {
 
 inline bool IsBroadcastNeeded_(const TShape& lhs,
                               const TShape& rhs) {
-  // force ndim to be equal. do not smartly padding dims with 1s, which may
-  // confuse users
+  // force ndim to be equal. do not smartly pad dims with 1s, which may confuse users
   CHECK_EQ(lhs.ndim(), rhs.ndim());
   for (index_t i = 0; i < lhs.ndim(); ++i) {
     if (lhs[i] != rhs[i]) return true;
@@ -65,7 +62,6 @@ inline bool IsBroadcastNeeded_(const TShape& lhs,
   return false;
 }
 
-
 inline TShape BinaryBroadcastShape_(const TShape& lhs,
                                     const TShape& rhs,
                                     const EnvArguments& env) {
@@ -74,96 +70,66 @@ inline TShape BinaryBroadcastShape_(const TShape& lhs,
   for (size_t i = 0; i < ret.size(); ++i) {
     ret[i] = std::max(lhs[i], rhs[i]);
   }
-  // check
-  for (int h = 0; h < 2; ++h) {
-    const TShape& inp = h == 0 ? lhs : rhs;
-    int contdim = 0;
-    for (size_t i = 0; i < inp.ndim(); ++i) {
-      if (inp[i] != 1) {
-        CHECK_EQ(inp[i], ret[i]) << "broadcast error on index " << i << ". "
-                                 << "lhs = " << lhs << "; rhs = " << rhs;
-      }
-      if (inp[i] == ret[i]) {
-        if (i == 0 || inp[i-1] != ret[i-1]) ++contdim;
-      }
-    }
-    CHECK_LE(contdim, 1) << "broadcast dimensions are not continuous. "
-                         << "lhs = " << lhs << "; rhs = " << rhs;
-  }
   return TShape(ret.begin(), ret.end());
 }
 
-inline void GetBroadcastShape_(const TShape& lhs,
-                               const TShape& rhs,
-                               TShape* ret_reshaped,
-                               int* lhs_broadcast_axis,
-                               int* rhs_broadcast_axis) {
-  TShape ret = BinaryBroadcastShape_(lhs, rhs, EnvArguments());
-  int n = static_cast<int>(ret.ndim());
-  int pos[4] = {0, n, n, n};
-  for (int h = 0; h < 2; ++h) {
-    const TShape& inp = h == 0 ? lhs : rhs;
-    for (int i = 0; i < n; ++i) {
-      if (inp[i] == ret[i]) {
-        pos[h*2] = i; break;
-      }
-    }
-    for (int i = n; i > 0; --i) {
-      if (inp[i-1] == ret[i-1]) {
-        pos[h*2+1] = i; break;
-      }
-    }
-  }
-  bool no_broadcast_lhs = pos[0] == 0 && pos[1] == n;
-  bool no_broadcast_rhs = pos[2] == 0 && pos[3] == n;
-  int pos_ordered[4] = {0, -1, -1, n};
-  if (no_broadcast_lhs && no_broadcast_rhs) {
-    // no broadcast
-    LOG(FATAL) << "no broadcast is needed";
-  } else if (no_broadcast_lhs && !no_broadcast_rhs) {
-    // only broadcast rhs
-    *rhs_broadcast_axis = 1;
-    *lhs_broadcast_axis = -1;
-    pos_ordered[1] = pos[2];
-    pos_ordered[2] = pos[3];
-  } else if (!no_broadcast_lhs && no_broadcast_rhs) {
-    // only broadcast lhs
-    *rhs_broadcast_axis = -1;
-    *lhs_broadcast_axis = 1;
-    pos_ordered[1] = pos[0];
-    pos_ordered[2] = pos[1];
-  } else {
-    // broadcast both lhs and rhs
-    int p;
-    if (pos[0] <= pos[2]) {
-      CHECK(pos[0] == 0 && pos[1] == pos[2] && pos[3] == n)
-        << "broadcast shape error: lhs = " << lhs << "; rhs = " << rhs;
-      *lhs_broadcast_axis = 0;
-      *rhs_broadcast_axis = 1;
-      p = pos[1];
+inline void InferBroadcastNewShapes_(bool *do_opt,  // out: true when the 3-D path is chosen
+  TShape *new_lhs_shape, TShape *new_rhs_shape, TShape *new_out_shape,  // out: new shapes
+  const TShape &lhs_shape, const TShape &rhs_shape, const TShape &out_shape) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  CHECK((lhs_shape.ndim() == rhs_shape.ndim()) && (rhs_shape.ndim() == out_shape.ndim())) <<
+    "ndim inconsistency, lhs_shape=" << lhs_shape << ", rhs_shape=" << rhs_shape <<
+    ", out_shape=" << out_shape;
+  *do_opt = false;  // default; only the branch below turns the 3-D path on
+  TShape lhs_axes = GetBroadcastingAxes_(lhs_shape, out_shape);
+  TShape rhs_axes = GetBroadcastingAxes_(rhs_shape, out_shape);
+  bool lhs_contiguous, rhs_contiguous;
+  index_t lhs_broadcasting_size, rhs_broadcasting_size;
+  CheckContiguousAxes_(&lhs_contiguous, &lhs_broadcasting_size, lhs_axes, out_shape);
+  CheckContiguousAxes_(&rhs_contiguous, &rhs_broadcasting_size, rhs_axes, out_shape);
+  if (lhs_contiguous && rhs_contiguous && (lhs_axes.ndim() == 0 || rhs_axes.ndim() == 0)) {
+    *do_opt = true;  // one operand has no broadcasting axes: express every shape as 3-D
+    if (lhs_axes.ndim() == 0) {  // lhs has no broadcasting axes; split around rhs_axes
+      index_t leading =
+        rhs_shape.ProdShape(0, rhs_axes[0]);
+      index_t trailing =
+        rhs_shape.ProdShape(rhs_axes[rhs_axes.ndim() - 1] + 1, rhs_shape.ndim());
+      *new_lhs_shape = Shape3(leading, rhs_broadcasting_size, trailing);
+      *new_rhs_shape = Shape3(leading, 1, trailing);  // middle dim of 1 on the rhs side
+      *new_out_shape = Shape3(leading, rhs_broadcasting_size, trailing);
     } else {
-      CHECK(pos[2] == 0 && pos[3] == pos[0] && pos[1] == n)
-        << "broadcast shape error: lhs = " << lhs << "; rhs = " << rhs;
-      *lhs_broadcast_axis = 1;
-      *rhs_broadcast_axis = 0;
-      p = pos[0];
+      index_t leading =
+        lhs_shape.ProdShape(0, lhs_axes[0]);
+      index_t trailing =
+        lhs_shape.ProdShape(lhs_axes[lhs_axes.ndim() - 1] + 1, lhs_shape.ndim());
+      *new_lhs_shape = Shape3(leading, 1, trailing);  // middle dim of 1 on the lhs side
+      *new_rhs_shape = Shape3(leading, lhs_broadcasting_size, trailing);
+      *new_out_shape = Shape3(leading, lhs_broadcasting_size, trailing);
     }
-    std::vector<index_t> dim(2, 1);
-    for (int i = 0; i < p; ++i) dim[0] *= ret[i];
-    for (int i = p; i < n; ++i) dim[1] *= ret[i];
-    *ret_reshaped = TShape(dim.begin(), dim.end());
-    return;
-  }
-  std::vector<index_t> dim(3, 1);
-  for (int i = 0; i < 3; ++i) {
-    for (int j = pos_ordered[i]; j < pos_ordered[i+1]; ++j) {
-      dim[i] *= ret[j];
+  } else {  // general path: copy the shapes into MXNET_SPECIAL_MAX_NDIM-dim TShapes
+    *do_opt = false;
+    CHECK(lhs_shape.ndim() <= MXNET_SPECIAL_MAX_NDIM)
+      << "Only support input dimension up to " << MXNET_SPECIAL_MAX_NDIM
+      << ", lhs_shape=" << lhs_shape << ", rhs_shape=" << rhs_shape
+      << ", out_shape=" << out_shape;
+    *new_lhs_shape = TShape(MXNET_SPECIAL_MAX_NDIM);  // NOTE(review): assumes dims start at 1
+    *new_rhs_shape = TShape(MXNET_SPECIAL_MAX_NDIM);
+    *new_out_shape = TShape(MXNET_SPECIAL_MAX_NDIM);
+    for (int i = 0; i < lhs_shape.ndim(); i++) {
+      (*new_lhs_shape)[i] = lhs_shape[i];
+      (*new_rhs_shape)[i] = rhs_shape[i];
+      (*new_out_shape)[i] = out_shape[i];
     }
   }
-  *ret_reshaped = TShape(dim.begin(), dim.end());
+  CHECK(((*new_lhs_shape).Size() == lhs_shape.Size())  // new shapes must keep element counts
+    && ((*new_rhs_shape).Size() == rhs_shape.Size())
+    && ((*new_out_shape).Size() == out_shape.Size()))
+    << "new_lhs_shape:" << *new_lhs_shape << ",lhs_shape:" << lhs_shape
+    << "new_rhs_shape:" << *new_rhs_shape << ",rhs_shape:" << rhs_shape
+    << "new_out_shape:" << *new_out_shape << ",out_shape:" << out_shape;
 }
 
-
 template<typename xpu, typename OP>
 void BinaryBroadcastForward_(const TBlob& lhs,
                              const TBlob& rhs,
@@ -171,94 +137,61 @@ void BinaryBroadcastForward_(const TBlob& lhs,
                              TBlob *ret,
                              OpReqType req,
                              RunContext ctx) {
+  using namespace mshadow;
   using namespace mshadow::expr;
-  using mshadow::Shape;
-  using mshadow::Shape1;
-  using mshadow::Tensor;
-  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  Stream<xpu> *s = ctx.get_stream<xpu>();
   CHECK_EQ(ret->type_flag_, lhs.type_flag_)
     << "Binary function only support input/output with the same type";
   CHECK_EQ(ret->type_flag_, rhs.type_flag_)
     << "Binary function only support input/output with the same type";
-
+  CHECK_EQ(lhs.shape_.ndim(), rhs.shape_.ndim()) << "the ndim of lhs and rhs must be equal,"
+    " shape of lhs=" << lhs.shape_ << " shape of rhs=" << rhs.shape_;
   if (!IsBroadcastNeeded_(lhs.shape_, rhs.shape_)) {
     // no broadcast
     MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
-        Tensor<xpu, 2, DType> out = ret->FlatTo2D<xpu, DType>(s);
-        ASSIGN_DISPATCH(out, req,
-                        F<OP>(lhs.FlatTo2D<xpu, DType>(s),
-                              rhs.FlatTo2D<xpu, DType>(s)));
-      });
+      mshadow::Tensor<xpu, 2, DType> out = ret->FlatTo2D<xpu, DType>(s);
+      ASSIGN_DISPATCH(out, req,
+        F<OP>(lhs.FlatTo2D<xpu, DType>(s),
+        rhs.FlatTo2D<xpu, DType>(s)));
+    });
     return;
   }
-
-  TShape ret_reshaped;
-  int lhs_broadcast_axis;
-  int rhs_broadcast_axis;
-  GetBroadcastShape_(lhs.shape_, rhs.shape_, &ret_reshaped,
-                     &lhs_broadcast_axis, &rhs_broadcast_axis);
+  bool do_opt;
+  TShape lhs_new_shape_, rhs_new_shape_, out_new_shape_;
+  InferBroadcastNewShapes_(&do_opt, &lhs_new_shape_, &rhs_new_shape_, &out_new_shape_,
+    lhs.shape_, rhs.shape_, ret->shape_);
   MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
-      if (lhs_broadcast_axis >= 0) {
-        // broadcast lhs
-        Tensor<xpu, 1, DType> mlhs =
-            lhs.get_with_shape<xpu, 1, DType>(Shape1(lhs.shape_.Size()), s);
-        if (rhs_broadcast_axis >= 0) {
-          // broadcast both
-          Tensor<xpu, 1, DType> mrhs =
-              rhs.get_with_shape<xpu, 1, DType>(Shape1(rhs.shape_.Size()), s);
-
-          Shape<2> ret_mshape = ret_reshaped.get<2>();
-          Tensor<xpu, 2, DType> out =
-              ret->get_with_shape<xpu, 2, DType>(ret_mshape, s);
-          if (lhs_broadcast_axis == 0) {
-            ASSIGN_DISPATCH(out, req,
-                            F<OP>(broadcast<0>(mlhs, ret_mshape),
-                                  broadcast<1>(mrhs, ret_mshape)));
-          } else {
-            ASSIGN_DISPATCH(out, req,
-                            F<OP>(broadcast<1>(mlhs, ret_mshape),
-                                  broadcast<0>(mrhs, ret_mshape)));
-          }
-        } else {
-          // only lhs
-          Shape<3> ret_mshape = ret_reshaped.get<3>();
-          Tensor<xpu, 3, DType> out =
-              ret->get_with_shape<xpu, 3, DType>(ret_mshape, s);
-          Tensor<xpu, 3, DType> mrhs =
-              rhs.get_with_shape<xpu, 3, DType>(ret_mshape, s);
-          if (lhs.shape_.Size() == 1) {
-            ASSIGN_DISPATCH(out, req,
-                            F<OP>(broadcast_scalar(mlhs, ret_mshape), mrhs));
-          } else {
-            ASSIGN_DISPATCH(out, req,
-                            F<OP>(broadcast<1>(mlhs, ret_mshape), mrhs));
-          }
-        }
-      } else {
-        Tensor<xpu, 1, DType> mrhs =
-            rhs.get_with_shape<xpu, 1, DType>(mshadow::Shape1(rhs.shape_.Size()), s);
-        if (rhs_broadcast_axis >= 0) {
-          // only rhs
-          Shape<3> ret_mshape = ret_reshaped.get<3>();
-          Tensor<xpu, 3, DType> out =
-              ret->get_with_shape<xpu, 3, DType>(ret_mshape, s);
-          Tensor<xpu, 3, DType> mlhs =
-              lhs.get_with_shape<xpu, 3, DType>(ret_mshape, s);
-          if (lhs.shape_.Size() == 1) {
-            ASSIGN_DISPATCH(out, req,
-                            F<OP>(mlhs, broadcast_scalar(mrhs, ret_mshape)));
-          } else {
-            ASSIGN_DISPATCH(out, req,
-                            F<OP>(mlhs, broadcast<1>(mrhs, ret_mshape)));
-          }
-        } else {
-          LOG(FATAL) << "no broadcast is needed";
-        }
+    if (do_opt) {
+      Shape<3> lhs_new_shape, rhs_new_shape, out_new_shape;
+      for (index_t i = 0; i < 3; i++) {
+        lhs_new_shape[i] = lhs_new_shape_[i];
+        rhs_new_shape[i] = rhs_new_shape_[i];
+        out_new_shape[i] = out_new_shape_[i];
       }
-    });
+      Tensor<xpu, 3, DType> out = ret->get_with_shape<xpu, 3, DType>(out_new_shape, s);
+      Tensor<xpu, 3, DType> mlhs = lhs.get_with_shape<xpu, 3, DType>(lhs_new_shape, s);
+      Tensor<xpu, 3, DType> mrhs = rhs.get_with_shape<xpu, 3, DType>(rhs_new_shape, s);
+      ASSIGN_DISPATCH(out, req,
+        F<OP>(broadcast_to(mlhs, out_new_shape_), broadcast_to(mrhs, out_new_shape_)));
+    } else {
+      Shape<MXNET_SPECIAL_MAX_NDIM> lhs_new_shape, rhs_new_shape, out_new_shape;
+      for (index_t i = 0; i < MXNET_SPECIAL_MAX_NDIM; i++) {
+        lhs_new_shape[i] = lhs_new_shape_[i];
+        rhs_new_shape[i] = rhs_new_shape_[i];
+        out_new_shape[i] = out_new_shape_[i];
+      }
+      Tensor<xpu, MXNET_SPECIAL_MAX_NDIM, DType> out =
+        ret->get_with_shape<xpu, MXNET_SPECIAL_MAX_NDIM, DType>(out_new_shape, s);
+      Tensor<xpu, MXNET_SPECIAL_MAX_NDIM, DType> mlhs =
+        lhs.get_with_shape<xpu, MXNET_SPECIAL_MAX_NDIM, DType>(lhs_new_shape, s);
+      Tensor<xpu, MXNET_SPECIAL_MAX_NDIM, DType> mrhs =
+        rhs.get_with_shape<xpu, MXNET_SPECIAL_MAX_NDIM, DType>(rhs_new_shape, s);
+      ASSIGN_DISPATCH(out, req,
+        F<OP>(broadcast_to(mlhs, out_new_shape_), broadcast_to(mrhs, out_new_shape_)));
+    }
+  });
 }
 
-
 template<typename xpu, typename LHS_OP, typename RHS_OP>
 void BinaryBroadcastBackward_(const OutputGrad& out_grad,
                               const EnvArguments& env,
@@ -267,13 +200,16 @@ void BinaryBroadcastBackward_(const OutputGrad& out_grad,
                               OpReqType req_lhs_grad,
                               OpReqType req_rhs_grad,
                               RunContext ctx) {
+  using namespace mshadow;
   using namespace mshadow::expr;
-  using mshadow::Shape;
-  using mshadow::Shape1;
-  using mshadow::Shape2;
-  using mshadow::Tensor;
-  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  CHECK_EQ(out_grad.data.type_flag_, lhs_grad->type_flag_)
+    << "Binary function only support ingrad/outgrad with the same type";
+  CHECK_EQ(out_grad.data.type_flag_, rhs_grad->type_flag_)
+    << "Binary function only support ingrad/outgrad with the same type";
+  CHECK_EQ(lhs_grad->shape_.ndim(), rhs_grad->shape_.ndim()) <<  // fail early, readable message
+    "the ndim of lhs_grad and rhs_grad must be equal,"
+    " shape of lhs_grad=" << lhs_grad->shape_ << " shape of rhs_grad=" << rhs_grad->shape_;
   if (!IsBroadcastNeeded_(lhs_grad->shape_, rhs_grad->shape_)) {
     // no broadcast
     MSHADOW_TYPE_SWITCH(lhs_grad->type_flag_, DType, {
@@ -285,63 +221,39 @@ void BinaryBroadcastBackward_(const OutputGrad& out_grad,
       });
     return;
   }
-
-  TShape ret_reshaped;
-  int lhs_broadcast_axis;
-  int rhs_broadcast_axis;
-  GetBroadcastShape_(lhs_grad->shape_, rhs_grad->shape_, &ret_reshaped,
-                     &lhs_broadcast_axis, &rhs_broadcast_axis);
-  index_t lhs_size = lhs_grad->shape_.Size();
-  index_t rhs_size = rhs_grad->shape_.Size();
-
+  bool do_opt;
+  TShape lhs_new_shape_, rhs_new_shape_, out_new_shape_;
+  InferBroadcastNewShapes_(&do_opt, &lhs_new_shape_, &rhs_new_shape_, &out_new_shape_,
+    lhs_grad->shape_, rhs_grad->shape_, out_grad.data.shape_);
   MSHADOW_REAL_TYPE_SWITCH(lhs_grad->type_flag_, DType, {
-      if (lhs_broadcast_axis >= 0) {
-        Tensor<xpu, 1, DType> mlhs_grad =
-            lhs_grad->get_with_shape<xpu, 1, DType>(Shape1(lhs_size), s);
-        if (rhs_broadcast_axis >= 0) {
-          // broadcast both
-          Tensor<xpu, 2, DType> mout_grad =
-              out_grad.data.get_with_shape<xpu, 2, DType>(ret_reshaped.get<2>(), s);
-          Tensor<xpu, 1, DType> mrhs_grad =
-              rhs_grad->get_with_shape<xpu, 1, DType>(Shape1(rhs_size), s);
-          if (lhs_broadcast_axis == 0) {
-            ASSIGN_DISPATCH(
-                mlhs_grad, req_lhs_grad, sumall_except_dim<0>(F<LHS_OP>(mout_grad)));
-            ASSIGN_DISPATCH(
-                mrhs_grad, req_rhs_grad, sumall_except_dim<1>(F<RHS_OP>(mout_grad)));
-          } else {
-            ASSIGN_DISPATCH(
-                mlhs_grad, req_lhs_grad, sumall_except_dim<1>(F<LHS_OP>(mout_grad)));
-            ASSIGN_DISPATCH(
-                mrhs_grad, req_rhs_grad, sumall_except_dim<0>(F<RHS_OP>(mout_grad)));
-          }
-        } else {
-          // only broadcast lhs
-          Tensor<xpu, 3, DType> mout_grad =
-              out_grad.data.get_with_shape<xpu, 3, DType>(ret_reshaped.get<3>(), s);
-          Tensor<xpu, 3, DType> mrhs_grad =
-              rhs_grad->get_with_shape<xpu, 3, DType>(ret_reshaped.get<3>(), s);
-          ASSIGN_DISPATCH(
-              mlhs_grad, req_lhs_grad, sumall_except_dim<1>(F<LHS_OP>(mout_grad)));
-          ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, F<RHS_OP>(mout_grad));
-        }
-      } else {
-        if (rhs_broadcast_axis >= 0) {
-          // only broadcast rhs
-          Tensor<xpu, 3, DType> mlhs_grad =
-              lhs_grad->get_with_shape<xpu, 3, DType>(ret_reshaped.get<3>(), s);
-          Tensor<xpu, 1, DType> mrhs_grad =
-              rhs_grad->get_with_shape<xpu, 1, DType>(Shape1(rhs_size), s);
-          Tensor<xpu, 3, DType> mout_grad =
-              out_grad.data.get_with_shape<xpu, 3, DType>(ret_reshaped.get<3>(), s);
-          ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, F<LHS_OP>(mout_grad));
-          ASSIGN_DISPATCH(
-              mrhs_grad, req_rhs_grad, sumall_except_dim<1>(F<RHS_OP>(mout_grad)));
-        } else {
-          LOG(FATAL) << "no broadcast is needed";
-        }
+    if (do_opt) {
+      Shape<3> out_new_shape;
+      for (index_t i = 0; i < 3; i++) {
+        out_new_shape[i] = out_new_shape_[i];
       }
-    });
+      Tensor<xpu, 3, DType> mout_grad =
+        out_grad.data.get_with_shape<xpu, 3, DType>(out_new_shape, s);
+      Tensor<xpu, 1, DType> mlhs_grad =
+        lhs_grad->get_with_shape<xpu, 1, DType>(Shape1(lhs_grad->Size()), s);
+      Tensor<xpu, 1, DType> mrhs_grad =
+        rhs_grad->get_with_shape<xpu, 1, DType>(Shape1(rhs_grad->Size()), s);
+      ReduceToAssign<red::sum>(mlhs_grad, req_lhs_grad, lhs_new_shape_, F<LHS_OP>(mout_grad));
+      ReduceToAssign<red::sum>(mrhs_grad, req_rhs_grad, rhs_new_shape_, F<RHS_OP>(mout_grad));
+    } else {
+      Shape<MXNET_SPECIAL_MAX_NDIM> out_new_shape;
+      for (index_t i = 0; i < MXNET_SPECIAL_MAX_NDIM; i++) {
+        out_new_shape[i] = out_new_shape_[i];
+      }
+      Tensor<xpu, MXNET_SPECIAL_MAX_NDIM, DType> mout_grad =
+        out_grad.data.get_with_shape<xpu, MXNET_SPECIAL_MAX_NDIM, DType>(out_new_shape, s);
+      Tensor<xpu, 1, DType> mlhs_grad =
+        lhs_grad->get_with_shape<xpu, 1, DType>(Shape1(lhs_grad->Size()), s);
+      Tensor<xpu, 1, DType> mrhs_grad =
+        rhs_grad->get_with_shape<xpu, 1, DType>(Shape1(rhs_grad->Size()), s);
+      ReduceToAssign<red::sum>(mlhs_grad, req_lhs_grad, lhs_new_shape_, F<LHS_OP>(mout_grad));
+      ReduceToAssign<red::sum>(mrhs_grad, req_rhs_grad, rhs_new_shape_, F<RHS_OP>(mout_grad));
+    }
+  });
 }
 
 template<typename xpu>
@@ -354,112 +266,71 @@ void BroadcastMulBackward_(const OutputGrad& out_grad,
                             OpReqType req_lhs_grad,
                             OpReqType req_rhs_grad,
                             RunContext ctx) {
+  using namespace mshadow;
   using namespace mshadow::expr;
-  using mshadow::Shape;
-  using mshadow::Shape1;
-  using mshadow::Shape2;
-  using mshadow::Tensor;
-  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-
+  Stream<xpu> *s = ctx.get_stream<xpu>();
   if (!IsBroadcastNeeded_(lhs_grad->shape_, rhs_grad->shape_)) {
     MSHADOW_TYPE_SWITCH(lhs_grad->type_flag_, DType, {
-        Tensor<xpu, 2, DType> mout_grad = out_grad.data.FlatTo2D<xpu, DType>(s);
-        Tensor<xpu, 2, DType> mlhs_data = lhs.data.FlatTo2D<xpu, DType>(s);
-        Tensor<xpu, 2, DType> mrhs_data = rhs.data.FlatTo2D<xpu, DType>(s);
-        Tensor<xpu, 2, DType> mlhs_grad = lhs_grad->FlatTo2D<xpu, DType>(s);
-        Tensor<xpu, 2, DType> mrhs_grad = rhs_grad->FlatTo2D<xpu, DType>(s);
-        CHECK_NE(req_rhs_grad, kWriteInplace);
-        ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, mlhs_data * mout_grad);
-        ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, mrhs_data * mout_grad);
-      });
+      mshadow::Tensor<xpu, 2, DType> mout_grad = out_grad.data.FlatTo2D<xpu, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> mlhs_data = lhs.data.FlatTo2D<xpu, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> mrhs_data = rhs.data.FlatTo2D<xpu, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> mlhs_grad = lhs_grad->FlatTo2D<xpu, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> mrhs_grad = rhs_grad->FlatTo2D<xpu, DType>(s);
+      CHECK_NE(req_rhs_grad, kWriteInplace);
+      ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, mlhs_data * mout_grad);
+      ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, mrhs_data * mout_grad);
+    });
     return;
   }
-
-  TShape ret_reshaped;
-  int lhs_broadcast_axis;
-  int rhs_broadcast_axis;
-  GetBroadcastShape_(lhs_grad->shape_, rhs_grad->shape_, &ret_reshaped,
-                     &lhs_broadcast_axis, &rhs_broadcast_axis);
-  index_t lhs_size = lhs_grad->shape_.Size();
-  index_t rhs_size = rhs_grad->shape_.Size();
-
+  bool do_opt;
+  TShape lhs_new_shape_, rhs_new_shape_, out_new_shape_;
+  InferBroadcastNewShapes_(&do_opt, &lhs_new_shape_, &rhs_new_shape_, &out_new_shape_,
+    lhs_grad->shape_, rhs_grad->shape_, out_grad.data.shape_);
   MSHADOW_REAL_TYPE_SWITCH(lhs_grad->type_flag_, DType, {
-      if (lhs_broadcast_axis >= 0) {
-        Tensor<xpu, 1, DType> mlhs_data =
-            lhs.data.get_with_shape<xpu, 1, DType>(Shape1(lhs_size), s);
-        Tensor<xpu, 1, DType> mlhs_grad =
-            lhs_grad->get_with_shape<xpu, 1, DType>(Shape1(lhs_size), s);
-
-        if (rhs_broadcast_axis >= 0) {
-          // broadcast both
-          Tensor<xpu, 2, DType> mout_grad =
-              out_grad.data.get_with_shape<xpu, 2, DType>(ret_reshaped.get<2>(), s);
-          Tensor<xpu, 1, DType> mrhs_grad =
-              rhs_grad->get_with_shape<xpu, 1, DType>(Shape1(rhs_size), s);
-          Tensor<xpu, 1, DType> mrhs_data =
-              rhs.data.get_with_shape<xpu, 1, DType>(Shape1(rhs_size), s);
-          if (lhs_broadcast_axis == 0) {
-            ASSIGN_DISPATCH(
-                mlhs_grad, req_lhs_grad, sumall_except_dim<0>(
-                    mout_grad * broadcast<1>(mrhs_data, ret_reshaped.get<2>())));
-            ASSIGN_DISPATCH(
-                mrhs_grad, req_rhs_grad, sumall_except_dim<1>(
-                    mout_grad * broadcast<0>(mlhs_data, ret_reshaped.get<2>())));
-          } else {
-            ASSIGN_DISPATCH(
-                mlhs_grad, req_lhs_grad, sumall_except_dim<1>(
-                    mout_grad * broadcast<0>(mrhs_data, ret_reshaped.get<2>())));
-            ASSIGN_DISPATCH(
-                mrhs_grad, req_rhs_grad, sumall_except_dim<0>(
-                    mout_grad * broadcast<1>(mlhs_data, ret_reshaped.get<2>())));
-          }
-        } else {
-          // only broadcast lhs
-          Tensor<xpu, 3, DType> mout_grad =
-              out_grad.data.get_with_shape<xpu, 3, DType>(ret_reshaped.get<3>(), s);
-          Tensor<xpu, 3, DType> mrhs_grad =
-              rhs_grad->get_with_shape<xpu, 3, DType>(ret_reshaped.get<3>(), s);
-          Tensor<xpu, 3, DType> mrhs_data =
-              rhs.data.get_with_shape<xpu, 3, DType>(ret_reshaped.get<3>(), s);
-
-          ASSIGN_DISPATCH(
-              mlhs_grad, req_lhs_grad, sumall_except_dim<1>(mout_grad * mrhs_data));
-          if (lhs_size == 1) {
-            ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad,
-                            mout_grad * broadcast_scalar(mlhs_data, ret_reshaped.get<3>()));
-          } else {
-            ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad,
-                            mout_grad * broadcast<1>(mlhs_data, ret_reshaped.get<3>()));
-          }
-        }
-      } else {
-        if (rhs_broadcast_axis >= 0) {
-          // only broadcast rhs
-          Tensor<xpu, 3, DType> mlhs_grad =
-              lhs_grad->get_with_shape<xpu, 3, DType>(ret_reshaped.get<3>(), s);
-          Tensor<xpu, 3, DType> mlhs_data =
-              lhs.data.get_with_shape<xpu, 3, DType>(ret_reshaped.get<3>(), s);
-          Tensor<xpu, 1, DType> mrhs_grad =
-              rhs_grad->get_with_shape<xpu, 1, DType>(Shape1(rhs_size), s);
-          Tensor<xpu, 1, DType> mrhs_data =
-              rhs.data.get_with_shape<xpu, 1, DType>(Shape1(rhs_size), s);
-          Tensor<xpu, 3, DType> mout_grad =
-              out_grad.data.get_with_shape<xpu, 3, DType>(ret_reshaped.get<3>(), s);
-
-          if (rhs_size == 1) {
-            ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad,
-                            mout_grad * broadcast_scalar(mrhs_data, ret_reshaped.get<3>()));
-          } else {
-            ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad,
-                            mout_grad * broadcast<1>(mrhs_data, ret_reshaped.get<3>()));
-          }
-          ASSIGN_DISPATCH(
-              mrhs_grad, req_rhs_grad, sumall_except_dim<1>(mout_grad * mlhs_data));
-        } else {
-          LOG(FATAL) << "no broadcast is needed";
-        }
+    if (do_opt) {
+      Shape<3> lhs_new_shape, rhs_new_shape, out_new_shape;
+      for (index_t i = 0; i < 3; i++) {
+        lhs_new_shape[i] = lhs_new_shape_[i];
+        rhs_new_shape[i] = rhs_new_shape_[i];
+        out_new_shape[i] = out_new_shape_[i];
       }
-    });
+      mshadow::Tensor<xpu, 3, DType> mout_grad =
+        out_grad.data.get_with_shape<xpu, 3, DType>(out_new_shape, s);
+      mshadow::Tensor<xpu, 3, DType> mlhs_data =
+        lhs.data.get_with_shape<xpu, 3, DType>(lhs_new_shape, s);
+      mshadow::Tensor<xpu, 3, DType> mrhs_data =
+        rhs.data.get_with_shape<xpu, 3, DType>(rhs_new_shape, s);
+      mshadow::Tensor<xpu, 1, DType> mlhs_grad =
+        lhs_grad->get_with_shape<xpu, 1, DType>(Shape1(lhs_grad->Size()), s);
+      mshadow::Tensor<xpu, 1, DType> mrhs_grad =
+        rhs_grad->get_with_shape<xpu, 1, DType>(Shape1(rhs_grad->Size()), s);
+      ReduceToAssign<red::sum>(mrhs_grad, req_rhs_grad, rhs_new_shape_,
+        broadcast_to(mlhs_data, out_new_shape_) * mout_grad);
+      ReduceToAssign<red::sum>(mlhs_grad, req_lhs_grad, lhs_new_shape_,
+        broadcast_to(mrhs_data, out_new_shape_) * mout_grad);
+    } else {
+      Shape<MXNET_SPECIAL_MAX_NDIM> lhs_new_shape, rhs_new_shape, out_new_shape;
+      for (index_t i = 0; i < MXNET_SPECIAL_MAX_NDIM; i++) {
+        lhs_new_shape[i] = lhs_new_shape_[i];
+        rhs_new_shape[i] = rhs_new_shape_[i];
+        out_new_shape[i] = out_new_shape_[i];
+      }
+      mshadow::Tensor<xpu, MXNET_SPECIAL_MAX_NDIM, DType> mout_grad =
+        out_grad.data.get_with_shape<xpu, MXNET_SPECIAL_MAX_NDIM, DType>(out_new_shape, s);
+      mshadow::Tensor<xpu, MXNET_SPECIAL_MAX_NDIM, DType> mlhs_data =
+        lhs.data.get_with_shape<xpu, MXNET_SPECIAL_MAX_NDIM, DType>(lhs_new_shape, s);
+      mshadow::Tensor<xpu, MXNET_SPECIAL_MAX_NDIM, DType> mrhs_data =
+        rhs.data.get_with_shape<xpu, MXNET_SPECIAL_MAX_NDIM, DType>(rhs_new_shape, s);
+      mshadow::Tensor<xpu, 1, DType> mlhs_grad =
+        lhs_grad->get_with_shape<xpu, 1, DType>(Shape1(lhs_grad->Size()), s);
+      mshadow::Tensor<xpu, 1, DType> mrhs_grad =
+        rhs_grad->get_with_shape<xpu, 1, DType>(Shape1(rhs_grad->Size()), s);
+      ReduceToAssign<red::sum>(mrhs_grad, req_rhs_grad, rhs_new_shape_,
+        broadcast_to(mlhs_data, out_new_shape_) * mout_grad);
+      ReduceToAssign<red::sum>(mlhs_grad, req_lhs_grad, lhs_new_shape_,
+        broadcast_to(mrhs_data, out_new_shape_) * mout_grad);
+    }
+  });
 }
 
 template<typename xpu>
@@ -472,122 +343,73 @@ void BroadcastDivBackward_(const OutputGrad& out_grad,
   OpReqType req_lhs_grad,
   OpReqType req_rhs_grad,
   RunContext ctx) {
+  using namespace mshadow;
   using namespace mshadow::expr;
-  using mshadow::Shape;
-  using mshadow::Shape1;
-  using mshadow::Shape2;
-  using mshadow::Tensor;
-  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-
+  Stream<xpu> *s = ctx.get_stream<xpu>();
   if (!IsBroadcastNeeded_(lhs_grad->shape_, rhs_grad->shape_)) {
     MSHADOW_TYPE_SWITCH(lhs_grad->type_flag_, DType, {
-      Tensor<xpu, 2, DType> mout_grad = out_grad.data.FlatTo2D<xpu, DType>(s);
-      Tensor<xpu, 2, DType> mlhs_data = lhs.data.FlatTo2D<xpu, DType>(s);
-      Tensor<xpu, 2, DType> mrhs_data = rhs.data.FlatTo2D<xpu, DType>(s);
-      Tensor<xpu, 2, DType> mlhs_grad = lhs_grad->FlatTo2D<xpu, DType>(s);
-      Tensor<xpu, 2, DType> mrhs_grad = rhs_grad->FlatTo2D<xpu, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> mout_grad = out_grad.data.FlatTo2D<xpu, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> mlhs_data = lhs.data.FlatTo2D<xpu, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> mrhs_data = rhs.data.FlatTo2D<xpu, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> mlhs_grad = lhs_grad->FlatTo2D<xpu, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> mrhs_grad = rhs_grad->FlatTo2D<xpu, DType>(s);
       CHECK_NE(req_rhs_grad, kWriteInplace);
       ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad,
-                      F<mshadow_op::negation>(mout_grad * mlhs_data)/
-                      F<mshadow_op::square>(mrhs_data));
-      ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, mout_grad /  mrhs_data);    });
+        F<mshadow_op::negation>(mout_grad * mlhs_data) /
+        F<mshadow_op::square>(mrhs_data));
+      ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, mout_grad / mrhs_data);
+    });
     return;
   }
-
-  TShape ret_reshaped;
-  int lhs_broadcast_axis;
-  int rhs_broadcast_axis;
-  GetBroadcastShape_(lhs_grad->shape_, rhs_grad->shape_, &ret_reshaped,
-    &lhs_broadcast_axis, &rhs_broadcast_axis);
-  index_t lhs_size = lhs_grad->shape_.Size();
-  index_t rhs_size = rhs_grad->shape_.Size();
-
+  bool do_opt;
+  TShape lhs_new_shape_, rhs_new_shape_, out_new_shape_;
+  InferBroadcastNewShapes_(&do_opt, &lhs_new_shape_, &rhs_new_shape_, &out_new_shape_,
+    lhs_grad->shape_, rhs_grad->shape_, out_grad.data.shape_);
   MSHADOW_REAL_TYPE_SWITCH(lhs_grad->type_flag_, DType, {
-    if (lhs_broadcast_axis >= 0) {
-      Tensor<xpu, 1, DType> mlhs_data =
-        lhs.data.get_with_shape<xpu, 1, DType>(Shape1(lhs_size), s);
-      Tensor<xpu, 1, DType> mlhs_grad =
-        lhs_grad->get_with_shape<xpu, 1, DType>(Shape1(lhs_size), s);
-
-      if (rhs_broadcast_axis >= 0) {
-        // broadcast both
-        Shape<2> rshape = ret_reshaped.get<2>();
-        Tensor<xpu, 2, DType> mout_grad =
-          out_grad.data.get_with_shape<xpu, 2, DType>(rshape, s);
-        Tensor<xpu, 1, DType> mrhs_grad =
-          rhs_grad->get_with_shape<xpu, 1, DType>(Shape1(rhs_size), s);
-        Tensor<xpu, 1, DType> mrhs_data =
-          rhs.data.get_with_shape<xpu, 1, DType>(Shape1(rhs_size), s);
-        if (lhs_broadcast_axis == 0) {
-          ASSIGN_DISPATCH(
-            mlhs_grad, req_lhs_grad, sumall_except_dim<0>(
-            mout_grad / broadcast<1>(mrhs_data, rshape)));
-          ASSIGN_DISPATCH(
-            mrhs_grad, req_rhs_grad, sumall_except_dim<1>(
-            F<mshadow_op::negation>(mout_grad * broadcast<0>(mlhs_data, rshape)) /
-            F<mshadow_op::square>(broadcast<1>(mrhs_data, rshape))));
-        } else {
-          ASSIGN_DISPATCH(
-            mlhs_grad, req_lhs_grad, sumall_except_dim<1>(
-            mout_grad / broadcast<0>(mrhs_data, rshape)));
-          ASSIGN_DISPATCH(
-            mrhs_grad, req_rhs_grad, sumall_except_dim<0>(
-            F<mshadow_op::negation>(mout_grad * broadcast<1>(mlhs_data, rshape)) /
-            F<mshadow_op::square>(broadcast<0>(mrhs_data, rshape))));
-        }
-      } else {
-        // only broadcast lhs
-        Shape<3> rshape = ret_reshaped.get<3>();
-        Tensor<xpu, 3, DType> mout_grad =
-          out_grad.data.get_with_shape<xpu, 3, DType>(rshape, s);
-        Tensor<xpu, 3, DType> mrhs_grad =
-          rhs_grad->get_with_shape<xpu, 3, DType>(rshape, s);
-        Tensor<xpu, 3, DType> mrhs_data =
-          rhs.data.get_with_shape<xpu, 3, DType>(rshape, s);
-
-        ASSIGN_DISPATCH(
-          mlhs_grad, req_lhs_grad, sumall_except_dim<1>(mout_grad / mrhs_data));
-        if (lhs_size == 1) {
-          ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad,
-            F<mshadow_op::negation>(mout_grad * broadcast_scalar(mlhs_data, rshape)) /
-            F<mshadow_op::square>(mrhs_data));
-        } else {
-          ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad,
-            F<mshadow_op::negation>(mout_grad * broadcast<1>(mlhs_data, rshape)) /
-            F<mshadow_op::square>(mrhs_data));
-        }
+    if (do_opt) {
+      Shape<3> lhs_new_shape, rhs_new_shape, out_new_shape;
+      for (index_t i = 0; i < 3; i++) {
+        lhs_new_shape[i] = lhs_new_shape_[i];
+        rhs_new_shape[i] = rhs_new_shape_[i];
+        out_new_shape[i] = out_new_shape_[i];
       }
+      mshadow::Tensor<xpu, 3, DType> mout_grad =
+        out_grad.data.get_with_shape<xpu, 3, DType>(out_new_shape, s);
+      mshadow::Tensor<xpu, 3, DType> mlhs_data =
+        lhs.data.get_with_shape<xpu, 3, DType>(lhs_new_shape, s);
+      mshadow::Tensor<xpu, 3, DType> mrhs_data =
+        rhs.data.get_with_shape<xpu, 3, DType>(rhs_new_shape, s);
+      mshadow::Tensor<xpu, 1, DType> mlhs_grad =
+        lhs_grad->get_with_shape<xpu, 1, DType>(Shape1(lhs_grad->Size()), s);
+      mshadow::Tensor<xpu, 1, DType> mrhs_grad =
+        rhs_grad->get_with_shape<xpu, 1, DType>(Shape1(rhs_grad->Size()), s);
+      ReduceToAssign<red::sum>(mrhs_grad, req_rhs_grad, rhs_new_shape_,
+        F<mshadow_op::negation>(mout_grad * broadcast_to(mlhs_data, out_new_shape_)) /
+        F<mshadow_op::square>(broadcast_to(mrhs_data, out_new_shape_)));
+      ReduceToAssign<red::sum>(mlhs_grad, req_lhs_grad, lhs_new_shape_, mout_grad /
+        broadcast_to(mrhs_data, out_new_shape_));
     } else {
-      if (rhs_broadcast_axis >= 0) {
-        // only broadcast rhs
-        Shape<3> rshape = ret_reshaped.get<3>();
-        Tensor<xpu, 3, DType> mlhs_grad = lhs_grad->get_with_shape<xpu, 3, DType>(rshape, s);
-        Tensor<xpu, 3, DType> mlhs_data = lhs.data.get_with_shape<xpu, 3, DType>(rshape, s);
-        Tensor<xpu, 1, DType> mrhs_grad =
-          rhs_grad->get_with_shape<xpu, 1, DType>(Shape1(rhs_size), s);
-        Tensor<xpu, 1, DType> mrhs_data =
-          rhs.data.get_with_shape<xpu, 1, DType>(Shape1(rhs_size), s);
-        Tensor<xpu, 3, DType> mout_grad =
-          out_grad.data.get_with_shape<xpu, 3, DType>(rshape, s);
-
-        if (rhs_size == 1) {
-          ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad,
-            mout_grad / broadcast_scalar(mrhs_data, rshape));
-          ASSIGN_DISPATCH(
-            mrhs_grad, req_rhs_grad, sumall_except_dim<1>(
-            F<mshadow_op::negation>(mout_grad * mlhs_data) /
-            F<mshadow_op::square>(broadcast_scalar(mrhs_data, rshape))));
-        } else {
-          ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad,
-            mout_grad / broadcast<1>(mrhs_data, rshape));
-          ASSIGN_DISPATCH(
-            mrhs_grad, req_rhs_grad, sumall_except_dim<1>(
-            F<mshadow_op::negation>(mout_grad * mlhs_data) /
-            F<mshadow_op::square>(broadcast<1>(mrhs_data, rshape))));
-        }
-      } else {
-        LOG(FATAL) << "no broadcast is needed";
+      Shape<MXNET_SPECIAL_MAX_NDIM> lhs_new_shape, rhs_new_shape, out_new_shape;
+      for (index_t i = 0; i < MXNET_SPECIAL_MAX_NDIM; i++) {
+        lhs_new_shape[i] = lhs_new_shape_[i];
+        rhs_new_shape[i] = rhs_new_shape_[i];
+        out_new_shape[i] = out_new_shape_[i];
       }
+      mshadow::Tensor<xpu, MXNET_SPECIAL_MAX_NDIM, DType> mout_grad =
+        out_grad.data.get_with_shape<xpu, MXNET_SPECIAL_MAX_NDIM, DType>(out_new_shape, s);
+      mshadow::Tensor<xpu, MXNET_SPECIAL_MAX_NDIM, DType> mlhs_data =
+        lhs.data.get_with_shape<xpu, MXNET_SPECIAL_MAX_NDIM, DType>(lhs_new_shape, s);
+      mshadow::Tensor<xpu, MXNET_SPECIAL_MAX_NDIM, DType> mrhs_data =
+        rhs.data.get_with_shape<xpu, MXNET_SPECIAL_MAX_NDIM, DType>(rhs_new_shape, s);
+      mshadow::Tensor<xpu, 1, DType> mlhs_grad =
+        lhs_grad->get_with_shape<xpu, 1, DType>(Shape1(lhs_grad->Size()), s);
+      mshadow::Tensor<xpu, 1, DType> mrhs_grad =
+        rhs_grad->get_with_shape<xpu, 1, DType>(Shape1(rhs_grad->Size()), s);
+      ReduceToAssign<red::sum>(mrhs_grad, req_rhs_grad, rhs_new_shape_,
+        F<mshadow_op::negation>(mout_grad * broadcast_to(mlhs_data, out_new_shape_)) /
+        F<mshadow_op::square>(broadcast_to(mrhs_data, out_new_shape_)));
+      ReduceToAssign<red::sum>(mlhs_grad, req_lhs_grad, lhs_new_shape_, mout_grad /
+        broadcast_to(mrhs_data, out_new_shape_));
     }
   });
 }
diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py
index 8a8049590b34..b0273f288091 100644
--- a/tests/python/unittest/test_ndarray.py
+++ b/tests/python/unittest/test_ndarray.py
@@ -205,7 +205,7 @@ def test_dot():
     assert reldiff(c, C.asnumpy()) < 1e-5
 
 def test_reduce():
-    sample_num = 1000
+    sample_num = 200
     def test_reduce_inner(numpy_reduce_func, nd_reduce_func):
         for i in range(sample_num):
             ndim = np.random.randint(1, 8)
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 4638f9c905d1..2e34869d92ba 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -733,24 +733,27 @@ def test_convolution_grouping():
         np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3)
 
 def _gen_broadcast_data():
-    testing_shapes = [(2, 3, 4), (3, 5, 7), (4, 2, 6)]
-    shape_pairs = []
-    for n, m, k in testing_shapes:
-        shape_pairs += [((1,), (1,)),
-                       ((n,), (n,)),
-                       ((n,m), (n,m)),
-                       ((n,m,k), (n,m,k)),
-                       ((n,1), (1,n)),
-                       ((n,m,k), (n,1,1)),
-                       ((n,m,k), (1,m,1)),
-                       ((n,m,k), (1,m,k)),
-                       ((n,m,k), (n,m,1)),
-                       ((n,m,k), (1,1,k))]
-    shape_pairs += [(v, u) for (u, v) in shape_pairs]
-    return [(np.random.random(u), np.random.random(v)) for (u,v) in shape_pairs]
+    # Generate random data that has ndim between 1-7 and all the shape dims between 1-10
+    ndim = np.random.randint(1, 8)
+    shape = np.random.randint(1, 11, size=(ndim,))
+    l_same_dim = np.random.randint(0, 5)
+    r_same_dim = np.random.randint(0, 5)
+    l_axis_flags = np.random.randint(0, 2, size=ndim)
+    r_axis_flags = np.random.randint(0, 2, size=ndim)
+    if l_same_dim == 4:
+        l_axis_flags = np.ones(ndim)
+    if r_same_dim == 4:
+        r_axis_flags = np.ones(ndim)
+    l_shape = shape.copy()
+    r_shape = shape.copy()
+    l_shape[np.where(l_axis_flags == 0)] = 1
+    r_shape[np.where(r_axis_flags == 0)] = 1
+    return [np.random.random(l_shape), np.random.random(r_shape)]
 
 def _check_broadcast_op_forward(symbol, baseline):
-    for d in _gen_broadcast_data():
+    sample_num = 200
+    for i in range(sample_num):
+        d = _gen_broadcast_data()
         x = baseline(d[0], d[1])
         y = symbol.bind(mx.cpu(), args={'a': mx.nd.array(d[0]), 'b' : mx.nd.array(d[1])})
         y.forward()
@@ -759,8 +762,10 @@ def _check_broadcast_op_forward(symbol, baseline):
             err, d[0].shape, d[1].shape)
 
 def _check_broadcast_op_backward(symbol, baseline):
-    for d in _gen_broadcast_data():
-        out = d[0] + d[1]
+    sample_num = 200
+    for i in range(sample_num):
+        d = _gen_broadcast_data()
+        out = np.random.random((d[0] + d[1]).shape)
         def reduce_op(shape, x):
             if shape == x.shape:
                 return x
@@ -782,7 +787,7 @@ def reduce_op(shape, x):
         err = lambda x, y: np.sum(np.abs(x-y)) / np.sum(np.abs(x))
         err_1 = err(x_1, y_1.asnumpy())
         err_2 = err(x_2, y_2.asnumpy())
-        assert err_1 < 1e-6 and err_2 < 1e-6, 'lhs error %f, rhs error %f, shapes are %s %s' % (
+        assert err_1 < 1e-5 and err_2 < 1e-5, 'lhs error %f, rhs error %f, shapes are %s %s' % (
             err_1, err_2, d[0].shape, d[1].shape)
 
 def test_broadcast_binary_op():
@@ -927,7 +932,7 @@ def test_reshape_new(src_shape, shape_args, dst_shape):
     assert(output_shape[0] == (2, 75))
 
 def test_reduce():
-    sample_num = 1000
+    sample_num = 200
     def test_reduce_inner(numpy_reduce_func, numpy_reduce_grad_func, mx_reduce_sym):
         for i in range(sample_num):
             # Generate random data that has ndim between 1-7 and all the shape dims between 1-10
@@ -969,7 +974,7 @@ def test_reduce_inner(numpy_reduce_func, numpy_reduce_grad_func, mx_reduce_sym):
                       mx.symbol.sum)
 
 def test_broadcast():
-    sample_num = 1000
+    sample_num = 200
     def test_broadcast_axis():
         for i in range(sample_num):
             # Generate random data that has ndim between 1-7 and all the shape dims between 1-10

From e721aa53bd5c80fc2ac587d6fa8623ce84d5702f Mon Sep 17 00:00:00 2001
From: Yuqi Li <ziyeqinghan@gmail.com>
Date: Sat, 18 Jun 2016 00:14:01 +0800
Subject: [PATCH 025/126] fix some typos (#2450)

---
 R-package/vignettes/CallbackFunctionTutorial.Rmd             | 2 +-
 R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd | 2 +-
 R-package/vignettes/fiveMinutesNeuralNetwork.Rmd             | 2 +-
 R-package/vignettes/mnistCompetition.Rmd                     | 2 +-
 docs/packages/r/CallbackFunctionTutorial.md                  | 2 +-
 docs/packages/r/classifyRealImageWithPretrainedModel.md      | 2 +-
 docs/packages/r/fiveMinutesNeuralNetwork.md                  | 2 +-
 docs/packages/r/mnistCompetition.md                          | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/R-package/vignettes/CallbackFunctionTutorial.Rmd b/R-package/vignettes/CallbackFunctionTutorial.Rmd
index 85cd78be90b0..97b6ce3161a0 100644
--- a/R-package/vignettes/CallbackFunctionTutorial.Rmd
+++ b/R-package/vignettes/CallbackFunctionTutorial.Rmd
@@ -6,7 +6,7 @@ which can very useful in model training.
 
 This tutorial is written in Rmarkdown.
 
-- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/R-package/CallbackFunctionTutorial.html)
+- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.io/en/latest/packages/r/CallbackFunctionTutorial.html)
 
 - You can find the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/CallbackFunctionTutorial.Rmd)
 
diff --git a/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd b/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd
index 32fdafd38145..ad7c8f5eefad 100644
--- a/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd
+++ b/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd
@@ -13,7 +13,7 @@ This model gives the recent state-of-art prediction accuracy on image net datase
 Preface
 -------
 This tutorial is written in Rmarkdown.
-- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/R-package/classifyRealImageWithPretrainedModel.html)
+- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.io/en/latest/packages/r/classifyRealImageWithPretrainedModel.html)
 - You can find the download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd)
 
 Pacakge Loading
diff --git a/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd b/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd
index efb0dba98109..66ac18ef3806 100644
--- a/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd
+++ b/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd
@@ -8,7 +8,7 @@ We will show you how to do classification and regression tasks respectively. The
 Preface
 -------
 This tutorial is written in Rmarkdown.
-- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/R-package/fiveMinutesNeuralNetwork.html)
+- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.io/en/latest/packages/r/fiveMinutesNeuralNetwork.html)
 - You can find the download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd)
 
 ## Classification
diff --git a/R-package/vignettes/mnistCompetition.Rmd b/R-package/vignettes/mnistCompetition.Rmd
index a81613b4a59e..6387b4ba1694 100644
--- a/R-package/vignettes/mnistCompetition.Rmd
+++ b/R-package/vignettes/mnistCompetition.Rmd
@@ -5,7 +5,7 @@ Handwritten Digits Classification Competition
 We will present the basic usage of [mxnet](https://github.com/dmlc/mxnet/tree/master/R-package) to compete in this challenge.
 
 This tutorial is written in Rmarkdown. You can download the source [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/mnistCompetition.Rmd) and view a
-hosted version of tutorial [here](http://mxnet.readthedocs.org/en/latest/R-package/mnistCompetition.html).
+hosted version of tutorial [here](http://mxnet.readthedocs.io/en/latest/packages/r/mnistCompetition.html).
 
 ## Data Loading
 
diff --git a/docs/packages/r/CallbackFunctionTutorial.md b/docs/packages/r/CallbackFunctionTutorial.md
index c27e009fea7c..c60833a75bcd 100644
--- a/docs/packages/r/CallbackFunctionTutorial.md
+++ b/docs/packages/r/CallbackFunctionTutorial.md
@@ -6,7 +6,7 @@ which can very useful in model training.
 
 This tutorial is written in Rmarkdown.
 
-- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/R-package/CallbackFunctionTutorial.html)
+- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.io/en/latest/packages/r/CallbackFunctionTutorial.html)
 
 - You can find the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/CallbackFunctionTutorial.Rmd)
 
diff --git a/docs/packages/r/classifyRealImageWithPretrainedModel.md b/docs/packages/r/classifyRealImageWithPretrainedModel.md
index 6d73bb409bce..2d9276c212d2 100644
--- a/docs/packages/r/classifyRealImageWithPretrainedModel.md
+++ b/docs/packages/r/classifyRealImageWithPretrainedModel.md
@@ -12,7 +12,7 @@ This model gives the recent state-of-art prediction accuracy on image net datase
 Preface
 -------
 This tutorial is written in Rmarkdown.
-- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/R-package/classifyRealImageWithPretrainedModel.html)
+- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.io/en/latest/packages/r/classifyRealImageWithPretrainedModel.html)
 - You can find the download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd)
 
 Pacakge Loading
diff --git a/docs/packages/r/fiveMinutesNeuralNetwork.md b/docs/packages/r/fiveMinutesNeuralNetwork.md
index 1d56c7984d7d..6102eaee7569 100644
--- a/docs/packages/r/fiveMinutesNeuralNetwork.md
+++ b/docs/packages/r/fiveMinutesNeuralNetwork.md
@@ -8,7 +8,7 @@ We will show you how to do classification and regression tasks respectively. The
 Preface
 -------
 This tutorial is written in Rmarkdown.
-- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/R-package/fiveMinutesNeuralNetwork.html)
+- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.io/en/latest/packages/r/fiveMinutesNeuralNetwork.html)
 - You can find the download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd)
 
 ## Classification
diff --git a/docs/packages/r/mnistCompetition.md b/docs/packages/r/mnistCompetition.md
index 4a0a0d71f854..a84ecb5ec326 100644
--- a/docs/packages/r/mnistCompetition.md
+++ b/docs/packages/r/mnistCompetition.md
@@ -5,7 +5,7 @@ Handwritten Digits Classification Competition
 We will present the basic usage of [mxnet](https://github.com/dmlc/mxnet/tree/master/R-package) to compete in this challenge.
 
 This tutorial is written in Rmarkdown. You can download the source [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/mnistCompetition.Rmd) and view a
-hosted version of tutorial [here](http://mxnet.readthedocs.org/en/latest/R-package/mnistCompetition.html).
+hosted version of tutorial [here](http://mxnet.readthedocs.io/en/latest/packages/r/mnistCompetition.html).
 
 ## Data Loading
 

From 8d572112a44c0fcaba070fa9e13e1fd9b1f6321b Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Fri, 17 Jun 2016 15:02:12 -0700
Subject: [PATCH 026/126] [OP] Support softmax with probability label (#2456)

---
 .gitignore                                    |  1 +
 src/operator/broadcast_reduce_op_common.h     |  4 +--
 .../elementwise_binary_broadcast_op-inl.h     |  2 +-
 src/operator/softmax_output-inl.h             | 29 +++++++++++++------
 src/operator/softmax_output.cc                |  4 +--
 tests/python/unittest/test_operator.py        | 25 +++++++++++-----
 6 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2d1e5d842da4..6d497340ad78 100644
--- a/.gitignore
+++ b/.gitignore
@@ -108,3 +108,4 @@ scala-package/*/*/target/
 *.project
 *.settings
 !scala-package/*/bin
+*.bak
diff --git a/src/operator/broadcast_reduce_op_common.h b/src/operator/broadcast_reduce_op_common.h
index 4ec50d4b3b56..37ad8adbc12d 100644
--- a/src/operator/broadcast_reduce_op_common.h
+++ b/src/operator/broadcast_reduce_op_common.h
@@ -18,7 +18,7 @@ namespace op {
 * \brief Check if the axes are continuous + get reducing size. E.g (1, 3) -> false, (1,2,3) -> true
 * \param is_contiguous_axes whether the axes is contiguous
 * \param reducing_size product of source shape in the given axes
-* \param axes 
+* \param axes
 * \param src_shape shape of the source tensor
 */
 inline void CheckContiguousAxes_(bool *is_contiguous_axes, index_t *reducing_size,
@@ -45,7 +45,7 @@ inline TShape GetBroadcastingAxes_(const mshadow::TShape &src_shape,
   const mshadow::TShape &target_shape) {
   std::vector<index_t> axes_vec;
   CHECK_EQ(target_shape.ndim(), src_shape.ndim());
-  for (int i = 0; i < src_shape.ndim(); ++i) {
+  for (index_t i = 0; i < src_shape.ndim(); ++i) {
     if (src_shape[i] != target_shape[i]) {
       CHECK_EQ(src_shape[i], 1) << "broadcastsing axis must have size 1, received src_shape="
         << src_shape << " target_shape=" << target_shape;
diff --git a/src/operator/elementwise_binary_broadcast_op-inl.h b/src/operator/elementwise_binary_broadcast_op-inl.h
index 89fedf5cc0c9..5cf73257b3d9 100644
--- a/src/operator/elementwise_binary_broadcast_op-inl.h
+++ b/src/operator/elementwise_binary_broadcast_op-inl.h
@@ -116,7 +116,7 @@ inline void InferBroadcastNewShapes_(bool *do_opt,
     *new_lhs_shape = TShape(MXNET_SPECIAL_MAX_NDIM);
     *new_rhs_shape = TShape(MXNET_SPECIAL_MAX_NDIM);
     *new_out_shape = TShape(MXNET_SPECIAL_MAX_NDIM);
-    for (int i = 0; i < lhs_shape.ndim(); i++) {
+    for (index_t i = 0; i < lhs_shape.ndim(); i++) {
       (*new_lhs_shape)[i] = lhs_shape[i];
       (*new_rhs_shape)[i] = rhs_shape[i];
       (*new_out_shape)[i] = out_shape[i];
diff --git a/src/operator/softmax_output-inl.h b/src/operator/softmax_output-inl.h
index 14f1f795cfe6..546f70010056 100644
--- a/src/operator/softmax_output-inl.h
+++ b/src/operator/softmax_output-inl.h
@@ -103,7 +103,14 @@ class SoftmaxOutputOp : public Operator {
     CHECK_GE(req.size(), 1);
     Stream<xpu> *s = ctx.get_stream<xpu>();
 
-    if (param_.multi_output) {
+    if (out_data[softmaxout_enum::kOut].shape_ ==
+        in_data[softmaxout_enum::kLabel].shape_) {
+      // use probability as label
+      Tensor<xpu, 2, DType> label = in_data[softmaxout_enum::kLabel].FlatTo2D<xpu, DType>(s);
+      Tensor<xpu, 2, DType> out = out_data[softmaxout_enum::kOut].FlatTo2D<xpu, DType>(s);
+      Tensor<xpu, 2, DType> grad = in_grad[softmaxout_enum::kData].FlatTo2D<xpu, DType>(s);
+      grad = (out - label) * scalar<DType>(param_.grad_scale);
+    } else if (param_.multi_output) {
       int n = out_data[softmaxout_enum::kOut].size(0);
       int k = out_data[softmaxout_enum::kOut].size(1);
       Shape<3> s3 = Shape3(n, k, static_cast<int>(out_data[softmaxout_enum::kOut].Size()/n/k));
@@ -204,14 +211,18 @@ class SoftmaxOutputProp : public OperatorProperty {
     CHECK_EQ(in_shape->size(), 2) << "Input:[data, label]";
     const TShape &dshape = in_shape->at(0);
     if (dshape.ndim() == 0) return false;
-    if (param_.multi_output) {
-      SHAPE_ASSIGN_CHECK(*in_shape, softmaxout_enum::kLabel,
-                         Shape2(dshape[0], dshape.Size()/dshape[0]/dshape[1]));
-    } else {
-      TShape label_shape(dshape.ndim() - 1);
-      for (index_t i = 0; i + 1 < dshape.ndim(); ++i)
-        label_shape[i] = dshape[i];
-      SHAPE_ASSIGN_CHECK(*in_shape, softmaxout_enum::kLabel, label_shape);
+
+    // label.shape == data.shape: use probability as label
+    if (dshape != (*in_shape)[softmaxout_enum::kLabel]) {
+      if (param_.multi_output) {
+        SHAPE_ASSIGN_CHECK(*in_shape, softmaxout_enum::kLabel,
+                           Shape2(dshape[0], dshape.Size()/dshape[0]/dshape[1]));
+      } else {
+        TShape label_shape(dshape.ndim() - 1);
+        for (index_t i = 0; i + 1 < dshape.ndim(); ++i)
+          label_shape[i] = dshape[i];
+        SHAPE_ASSIGN_CHECK(*in_shape, softmaxout_enum::kLabel, label_shape);
+      }
     }
     out_shape->clear();
     out_shape->push_back(dshape);
diff --git a/src/operator/softmax_output.cc b/src/operator/softmax_output.cc
index c4b14e3ac5b4..439a400b4f99 100644
--- a/src/operator/softmax_output.cc
+++ b/src/operator/softmax_output.cc
@@ -32,7 +32,8 @@ DMLC_REGISTER_PARAMETER(SoftmaxOutputParam);
 MXNET_REGISTER_OP_PROPERTY(SoftmaxOutput, SoftmaxOutputProp)
 .describe("Perform a softmax transformation on input, backprop with logloss.")
 .add_argument("data", "Symbol", "Input data to softmax.")
-.add_argument("label", "Symbol", "Label data.")
+.add_argument("label", "Symbol", "Label data, can also be "\
+              "probability value with same shape as data")
 .add_arguments(SoftmaxOutputParam::__FIELDS__());
 
 MXNET_REGISTER_OP_PROPERTY(Softmax, DeprecatedSoftmaxProp)
@@ -42,4 +43,3 @@ MXNET_REGISTER_OP_PROPERTY(Softmax, DeprecatedSoftmaxProp)
 
 }  // namespace op
 }  // namespace mxnet
-
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 2e34869d92ba..eadbb1bbe0a3 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -10,6 +10,13 @@
 def same(a, b):
     return np.sum(a != b) == 0
 
+def np_softmax(x):
+    x = x - np.max(x, axis=1).reshape(x.shape[0], 1)
+    x = np.exp(x)
+    x /= np.sum(x, axis=1).reshape(x.shape[0], 1)
+    return x
+
+
 def check_elementwise_sum_with_shape(shape, n):
     # forward
     inputs = [mx.symbol.Variable('arg%d' % i) for i in range(n)]
@@ -235,20 +242,23 @@ def check_softmax_with_ignore_label(xpu):
     assert(reldiff(grad0[int(shape[0]/2):], grad1[int(shape[0]/2):]) < 1e-5)
 
 def check_softmax_with_shape(shape, xpu):
+    # bind with label
     X = mx.symbol.Variable('X')
     L = mx.symbol.Variable('L')
     Y = mx.symbol.SoftmaxOutput(data=X, label=L)
     x = mx.random.uniform(-1, 1, shape, ctx = xpu)
-    l = mx.nd.empty((shape[0],), ctx = xpu)
-    l[:] = np.random.randint(0, shape[1]-1, (shape[0],))
+    l = mx.random.uniform(-1, 1, shape, ctx = xpu)
+    l[:] = np_softmax(l.asnumpy())
     grad = mx.nd.empty(shape, ctx = xpu)
-
     exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad})
-    print('foward')
     exec1.forward()
-    print(exec1.outputs[0].asnumpy())
+    out = exec1.outputs[0].asnumpy()
+    assert_allclose(out, np_softmax(x.asnumpy()))
     exec1.backward()
-    print(grad.asnumpy())
+    assert_allclose(grad.asnumpy(), np_softmax(x.asnumpy()) - l.asnumpy())
+
+def test_softmax():
+    check_softmax_with_shape((3, 4), mx.cpu())
 
 def check_multi_softmax_with_shape(shape, xpu):
     X = mx.symbol.Variable('X')
@@ -1047,6 +1057,7 @@ def test_flip():
             assert_allclose(x.asnumpy()[idx], y.asnumpy())
 
 if __name__ == '__main__':
+    test_softmax()
     test_broadcast_binary_op()
     test_flip()
     test_crop()
@@ -1077,5 +1088,3 @@ def test_flip():
     test_reshape()
     test_reduce()
     test_broadcast()
-    #check_softmax_with_shape((3,4), mx.cpu())
-    #check_multi_softmax_with_shape((3,4,5), mx.cpu())

From 1afa3543d1bd65ff30b01dd3bd87148e6b1eda51 Mon Sep 17 00:00:00 2001
From: Xiang Liang <xlvector@gmail.com>
Date: Sat, 18 Jun 2016 06:09:34 +0800
Subject: [PATCH 027/126] add warpctc config to config.mk (#2451)

---
 make/config.mk | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/make/config.mk b/make/config.mk
index bbd19e56b5d7..aa3986a21673 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -116,6 +116,9 @@ EXTRA_OPERATORS =
 # TORCH_PATH = $(HOME)/torch
 # MXNET_PLUGINS += plugin/torch/torch.mk
 
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
 # whether to use sframe integration. This requires build sframe
 # git@github.com:dato-code/SFrame.git
 # SFRAME_PATH = $(HOME)/SFrame

From 72121a5c46722d4dd8e218dbf14af782028d40c1 Mon Sep 17 00:00:00 2001
From: Yan Li <godricly_li@126.com>
Date: Sat, 18 Jun 2016 06:09:55 +0800
Subject: [PATCH 028/126] update lstm.py reshape using shape (#2446)

Replace the `target_shape=(0,)` argument of the label Reshape with `shape=(-1,)`.
---
 example/warpctc/lstm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example/warpctc/lstm.py b/example/warpctc/lstm.py
index 97fda6b9c9d4..32ba2455e11d 100644
--- a/example/warpctc/lstm.py
+++ b/example/warpctc/lstm.py
@@ -72,7 +72,7 @@ def lstm_unroll(num_lstm_layer, seq_len,
     hidden_concat = mx.sym.Concat(*hidden_all, dim=0)
     pred = mx.sym.FullyConnected(data=hidden_concat, num_hidden=11)
 
-    label = mx.sym.Reshape(data=label, target_shape=(0,))
+    label = mx.sym.Reshape(data=label, shape=(-1,))
     label = mx.sym.Cast(data = label, dtype = 'int32')
     sm = mx.sym.WarpCTC(data=pred, label=label, label_length = num_label, input_length = seq_len)
     return sm

From 7a2b67e002561f8fea23fe343f85204285b12add Mon Sep 17 00:00:00 2001
From: wzl12356 <wzlsuccess@163.com>
Date: Sat, 18 Jun 2016 07:25:07 +0800
Subject: [PATCH 029/126] fix worker node could not exit (#2440)

---
 src/kvstore/kvstore_dist.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h
index 270d85101d9f..1fa3cd7fd322 100644
--- a/src/kvstore/kvstore_dist.h
+++ b/src/kvstore/kvstore_dist.h
@@ -37,6 +37,7 @@ class KVStoreDist : public KVStoreLocal {
   virtual ~KVStoreDist() {
     Engine::Get()->WaitForAll();
     if (IsWorkerNode()) {
+      ps::Postoffice::Get()->Barrier(ps::kWorkerGroup);
       if (get_rank() == 0) {
         // stop the executor at servers
         SendCommandToServers(kStopServer, "");

From 86d71e9f7bd7ed1e58012c5605d30ffb1eb47556 Mon Sep 17 00:00:00 2001
From: "Qiang Kou (KK)" <qkou@umail.iu.edu>
Date: Fri, 17 Jun 2016 17:23:25 -0700
Subject: [PATCH 030/126] don't run char rnn test in R (#2459)

---
 tests/travis/run_test.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh
index 33fe0072ff26..aaf908d92cc8 100755
--- a/tests/travis/run_test.sh
+++ b/tests/travis/run_test.sh
@@ -79,7 +79,11 @@ if [ ${TASK} == "r_test" ]; then
     wget https://s3-us-west-2.amazonaws.com/mxnet/train.csv -O train.csv
     wget https://s3-us-west-2.amazonaws.com/mxnet/test.csv -O test.csv
 
-    cat *.R > r_test.R
+    cat CallbackFunctionTutorial.R \
+    fiveMinutesNeuralNetwork.R \
+    mnistCompetition.R \
+    classifyRealImageWithPretrainedModel.R \
+    ndarrayAndSymbolTutorial.R > r_test.R
 
     Rscript r_test.R || exit -1
 

From 23d428da16eda44a8a9b846a6ce69a4755fb33b3 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Fri, 17 Jun 2016 22:21:45 -0700
Subject: [PATCH 031/126] [OP] add slice_axis (#2458)

* [OP] add slice

* optimization
---
 mshadow                                |   2 +-
 src/operator/broadcast_reduce_op-inl.h |  12 +--
 src/operator/broadcast_reduce_op.cc    |   6 +-
 src/operator/matrix_op-inl.h           | 127 ++++++++++++++++++++++++-
 src/operator/matrix_op.cc              |   1 +
 tests/python/unittest/test_operator.py |  30 ++++++
 6 files changed, 168 insertions(+), 10 deletions(-)

diff --git a/mshadow b/mshadow
index 65da7de8b59f..02a4a0ef942c 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit 65da7de8b59fd1736e0c2d71508937ef25b91686
+Subproject commit 02a4a0ef942ce49fb1729882e5da2b67c46aa723
diff --git a/src/operator/broadcast_reduce_op-inl.h b/src/operator/broadcast_reduce_op-inl.h
index fa6b7fbf106a..ba6d08320053 100644
--- a/src/operator/broadcast_reduce_op-inl.h
+++ b/src/operator/broadcast_reduce_op-inl.h
@@ -186,12 +186,12 @@ inline TShape ReduceChannelShape(const TShape& ishape,
 // Reduce the given axis
 template<typename xpu, typename Reducer, bool get_mask>
 void ReduceAxisImpl_(const TBlob &src,
-  const EnvArguments& env,
-  TBlob *ret,
-  OpReqType req,
-  RunContext ctx,
-  int axis,
-  bool keepdims) {
+                     const EnvArguments& env,
+                     TBlob *ret,
+                     OpReqType req,
+                     RunContext ctx,
+                     int axis,
+                     bool keepdims) {
   using namespace mshadow::expr;
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
   CHECK_EQ(src.type_flag_, ret->type_flag_);
diff --git a/src/operator/broadcast_reduce_op.cc b/src/operator/broadcast_reduce_op.cc
index 213aabb4b033..bd604ca2ad1e 100644
--- a/src/operator/broadcast_reduce_op.cc
+++ b/src/operator/broadcast_reduce_op.cc
@@ -7,7 +7,9 @@
 #include "./broadcast_reduce_op-inl.h"
 namespace mxnet {
 namespace op {
-  DMLC_REGISTER_PARAMETER(ReduceAxisParam);
-  DMLC_REGISTER_PARAMETER(BroadcastAxisParam);
+
+DMLC_REGISTER_PARAMETER(ReduceAxisParam);
+DMLC_REGISTER_PARAMETER(BroadcastAxisParam);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/matrix_op-inl.h b/src/operator/matrix_op-inl.h
index 1ec78ede6141..cb0172b478f4 100644
--- a/src/operator/matrix_op-inl.h
+++ b/src/operator/matrix_op-inl.h
@@ -241,7 +241,7 @@ struct SimpleCropParam : public dmlc::Parameter<SimpleCropParam> {
   }
 };
 
-// matrix crop
+// matrix crop for multi dimensional cropping: see also slice_axis
 template<typename xpu>
 void Crop(const TBlob &src,
           const EnvArguments& env,
@@ -310,6 +310,122 @@ inline TShape CropShape(const TShape& shp,
 }
 
 
+struct SliceParam : public dmlc::Parameter<SliceParam> {
+  int axis;
+  int begin;
+  int end;
+  DMLC_DECLARE_PARAMETER(SliceParam) {
+    DMLC_DECLARE_FIELD(axis).set_lower_bound(0)
+      .describe("The axis to be sliced");
+    DMLC_DECLARE_FIELD(begin).set_lower_bound(0)
+      .describe("The beginning index to be sliced");
+    DMLC_DECLARE_FIELD(end).set_lower_bound(0)
+      .describe("The end index to be sliced");
+  }
+};
+
+inline TShape SliceShape(const TShape& ishape,
+                         const EnvArguments& env) {
+  SliceParam param;
+  param.Init(env.kwargs);
+  CHECK(param.axis < static_cast<int>(ishape.ndim())) <<
+    "axis must be smaller than the source ndim! Recieved axis=" <<
+      param.axis << ", src_ndim=" << ishape.ndim();
+  int axis_size = static_cast<int>(ishape[param.axis]);
+  CHECK_LE(param.end, axis_size);
+  CHECK_LT(param.begin, param.end);
+
+  std::vector<mshadow::index_t> shape;
+  for (index_t i = 0; i < ishape.ndim(); ++i) {
+    if (static_cast<int>(i) == param.axis) {
+      shape.push_back(static_cast<index_t>(param.end - param.begin));
+    } else {
+      shape.push_back(ishape[i]);
+    }
+  }
+  return TShape(shape.begin(), shape.end());
+}
+
+
+template<typename xpu>
+void Slice(const TBlob &src,
+           const EnvArguments& env,
+           TBlob *ret,
+           OpReqType req,
+           RunContext ctx) {
+  using namespace mshadow::expr;
+  SliceParam param;
+  param.Init(env.kwargs);
+
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  int ndim = static_cast<int>(ret->shape_.ndim());
+
+  if (param.axis + 1 == ndim) {
+    MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
+        mshadow::Tensor<xpu, 2, DType> in =
+            src.FlatTo2D<xpu, DType>(s);
+        mshadow::Tensor<xpu, 2, DType> out =
+            ret->FlatTo2D<xpu, DType>(s);
+        ASSIGN_DISPATCH(out, req, slice<1>(in, param.begin, param.end));
+      });
+  } else {
+    MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
+        mshadow::Tensor<xpu, 3, DType> in =
+            src.FlatTo3D<xpu, DType>(param.axis, s);
+        mshadow::Tensor<xpu, 3, DType> out =
+            ret->FlatTo3D<xpu, DType>(param.axis, s);
+        ASSIGN_DISPATCH(out, req, slice<1>(in, param.begin, param.end));
+      });
+  }
+}
+
+// Backward pass of slice_axis: scatter out_grad into the [begin, end) range of in_grad
+template<typename xpu>
+void SliceGrad_(const OutputGrad& out_grad,
+                const EnvArguments& env,
+                TBlob *in_grad,
+                OpReqType req,
+                RunContext ctx) {
+  using namespace mshadow::op;
+  using namespace mshadow::expr;
+  SliceParam param;
+  param.Init(env.kwargs);
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  int ndim = static_cast<int>(in_grad->shape_.ndim());
+
+  if (param.axis + 1 == ndim) {
+    MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, {
+        mshadow::Tensor<xpu, 2, DType> ograd =
+            out_grad.data.FlatTo2D<xpu, DType>(s);
+        mshadow::Tensor<xpu, 2, DType> igrad =
+            in_grad->FlatTo2D<xpu, DType>(s);
+        if (req == kAddTo) {
+          slice<1>(igrad, param.begin, param.end) += F<identity>(ograd);
+        } else if (req == kWriteTo) {
+          igrad = 0.0f;
+          slice<1>(igrad, param.begin, param.end) = F<identity>(ograd);
+        } else {
+          CHECK_EQ(req, kNullOp);
+        }
+      });
+  } else {
+    MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, {
+        mshadow::Tensor<xpu, 3, DType> ograd =
+            out_grad.data.FlatTo3D<xpu, DType>(param.axis, s);
+        mshadow::Tensor<xpu, 3, DType> igrad =
+            in_grad->FlatTo3D<xpu, DType>(param.axis, s);
+        if (req == kAddTo) {
+          slice<1>(igrad, param.begin, param.end) += F<identity>(ograd);
+        } else if (req == kWriteTo) {
+          igrad = 0.0f;
+          slice<1>(igrad, param.begin, param.end) = F<identity>(ograd);
+        } else {
+          CHECK_EQ(req, kNullOp);
+        }
+      });
+  }
+}
+
 struct FlipParam : public dmlc::Parameter<FlipParam> {
   int axis;
   DMLC_DECLARE_PARAMETER(FlipParam) {
@@ -396,6 +512,15 @@ MXNET_REGISTER_SIMPLE_OP(crop, XPU)
 .set_shape_function(CropShape)
 .describe("Crop the input matrix and return a new one");
 
+// slice_axis
+MXNET_REGISTER_SIMPLE_OP(slice_axis, XPU)
+.set_enable_kwargs(true)
+.set_function(XPU::kDevMask, Slice<XPU>,
+              kNoInplace, kRegisterSymbolic)
+.set_gradient(XPU::kDevMask, SliceGrad_<XPU>, kNoInplace)
+.set_shape_function(SliceShape)
+.describe("Slice the input along certain axis and return a sliced array.");
+
 // flip
 MXNET_REGISTER_SIMPLE_OP(flip, XPU)
 .set_enable_kwargs(true)
diff --git a/src/operator/matrix_op.cc b/src/operator/matrix_op.cc
index ff6d01546497..ae263121e669 100644
--- a/src/operator/matrix_op.cc
+++ b/src/operator/matrix_op.cc
@@ -10,6 +10,7 @@ namespace mxnet {
 namespace op {
 DMLC_REGISTER_PARAMETER(TransposeParam);
 DMLC_REGISTER_PARAMETER(SimpleCropParam);
+DMLC_REGISTER_PARAMETER(SliceParam);
 DMLC_REGISTER_PARAMETER(FlipParam);
 }  // op
 }  // mxnet
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index eadbb1bbe0a3..9d67dad20143 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -1046,6 +1046,35 @@ def test_crop():
             y = mx.nd.crop(x, begin=tuple(begin), end=tuple(end))
             assert_allclose(x.asnumpy()[idx], y.asnumpy())
 
+
+def test_slice_axis():
+    for ndim in range(1, 6):
+        shape = np.random.randint(1, 11, size=(ndim,))
+        for t in range(ndim):
+            d = shape[t]
+            b = random.randint(0, d-1)
+            e = random.randint(b+1, d)
+            idx = []
+            for i in range(ndim):
+                idx.append(slice(0, shape[i]))
+            idx[t] = slice(b, e)
+
+            X = mx.symbol.Variable('X')
+            x = mx.nd.array(np.random.normal(size=shape))
+            Y = mx.symbol.slice_axis(data=X, axis=t, begin=b, end=e)
+
+            xgrad = mx.nd.empty(x.shape)
+            exec1 = Y.bind(mx.cpu(), args = [x], args_grad = {'X': xgrad})
+            exec1.forward()
+            y = exec1.outputs[0]
+            assert_allclose(x.asnumpy()[idx], y.asnumpy())
+            exec1.backward([y])
+            xx = x.asnumpy()
+            xx[:] = 0.0
+            xx[idx] = x.asnumpy()[idx]
+            assert_allclose(xx, xgrad.asnumpy())
+
+
 def test_flip():
     for ndim in range(1, 6):
         for t in range(5):
@@ -1057,6 +1086,7 @@ def test_flip():
             assert_allclose(x.asnumpy()[idx], y.asnumpy())
 
 if __name__ == '__main__':
+    test_slice_axis()
     test_softmax()
     test_broadcast_binary_op()
     test_flip()

From 7cb047464b331430d0ef347cb6f6c10791077ab2 Mon Sep 17 00:00:00 2001
From: tornadomeet <tornadomeet@users.noreply.github.com>
Date: Sat, 18 Jun 2016 13:43:10 +0800
Subject: [PATCH 032/126] add target_shape and adj to deconv for friendly using
 (#2445)

* add target_shape and adj to deconv for friendly using

* fix typo
---
 src/operator/deconvolution-inl.h       | 35 +++++++++++++++++++++++---
 tests/python/unittest/test_operator.py | 24 ++++++++++++++++++
 2 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/src/operator/deconvolution-inl.h b/src/operator/deconvolution-inl.h
index a1590956e8c7..97a1768d733b 100644
--- a/src/operator/deconvolution-inl.h
+++ b/src/operator/deconvolution-inl.h
@@ -31,6 +31,8 @@ struct DeconvolutionParam : public dmlc::Parameter<DeconvolutionParam> {
   TShape kernel;
   TShape stride;
   TShape pad;
+  TShape adj;
+  TShape target_shape;
   uint32_t num_filter;
   uint32_t num_group;
   uint64_t workspace;
@@ -42,7 +44,14 @@ struct DeconvolutionParam : public dmlc::Parameter<DeconvolutionParam> {
     .describe("deconvolution stride: (y, x)");
     shape[0] = shape[1] = 0;
     DMLC_DECLARE_FIELD(pad).set_default(TShape(shape, shape + 2))
-    .describe("pad for deconvolution: (y, x)");
+    .describe("pad for deconvolution: (y, x), a good number is : (kernel-1)/2, "
+              "if target_shape set, pad will be ignored and will be computed "
+              "automatically");
+    DMLC_DECLARE_FIELD(adj).set_default(TShape(shape, shape + 2))
+    .describe("adjustment for output shape: (y, x), if target_shape set, adj "
+               "will be ignored and will be computed automatically");
+    DMLC_DECLARE_FIELD(target_shape).set_default(TShape(shape, shape + 2))
+    .describe("output shape with targe shape : (y, x)");
     DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000)
     .describe("deconvolution filter(channel) number");
     DMLC_DECLARE_FIELD(num_group).set_default(1)
@@ -325,6 +334,16 @@ class DeconvolutionProp : public OperatorProperty {
     out_shape->push_back(dshape);
     const index_t ksize_y = static_cast<index_t>(param_.kernel[0]);
     const index_t ksize_x = static_cast<index_t>(param_.kernel[1]);
+    const index_t pad_y = static_cast<index_t>(param_.target_shape[0] > 0 ?
+        (ksize_y - 1) / 2 : param_.pad[0]);
+    const index_t pad_x = static_cast<index_t>(param_.target_shape[1] > 0 ?
+        (ksize_x - 1) / 2 : param_.pad[1]);
+    const index_t adj_y = static_cast<index_t>(param_.target_shape[0] > 0 ?
+        (param_.target_shape[0] + 2 * pad_y - ksize_y) %
+        param_.stride[0] : param_.adj[0]);
+    const index_t adj_x = static_cast<index_t>(param_.target_shape[1] > 0 ?
+        (param_.target_shape[1] + 2 * pad_x - ksize_x) %
+        param_.stride[1] : param_.adj[1]);
     CHECK_EQ(dshape[1] % param_.num_group, 0) \
         << "input num_filter must divide group size";
     CHECK_EQ(param_.num_filter % param_.num_group, 0) \
@@ -333,11 +352,21 @@ class DeconvolutionProp : public OperatorProperty {
         << "incorrect kernel size: " << param_.kernel;
     CHECK_GT(param_.stride.Size(), 0) \
         << "incorrect stride size: " << param_.stride;
+    CHECK_GE(ksize_y-1, adj_y) << "adj(y) must be samller than kernel(h)";
+    CHECK_GE(ksize_x-1, adj_x) << "adj(x) must be samller than kernel(w)";
     (*out_shape)[deconv::kOut][1] = param_.num_filter;
     (*out_shape)[deconv::kOut][2] = param_.stride[0] * (dshape[2] - 1) +
-        ksize_y - 2 * param_.pad[0];
+        ksize_y - 2 * pad_y + adj_y;
     (*out_shape)[deconv::kOut][3] = param_.stride[1] * (dshape[3] - 1) +
-        ksize_x - 2 * param_.pad[1];
+        ksize_x - 2 * pad_x + adj_x;
+    if (param_.target_shape[0] > 0) {
+      CHECK_EQ(param_.target_shape[0], (*out_shape)[deconv::kOut][2]) \
+          << "param_.target_shape[0] was not reasonable, pelase set it carefully";
+    }
+    if (param_.target_shape[1] > 0) {
+      CHECK_EQ(param_.target_shape[1], (*out_shape)[deconv::kOut][3]) \
+          << "param_.target_shape[1] was not reasonable, pelase set it carefully";
+    }
     return true;
   }
 
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 9d67dad20143..6ea951ced92f 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -635,7 +635,31 @@ def check_deconvolution_gradient(input_shape, num_filter, pad):
     exe_deconv.backward(deconv_out_grad)
     assert reldiff(conv_args_grad[1].asnumpy(), deconv_args_grad[1].asnumpy()) < 1e-6
 
+def check_deconvolution_target_shape(input_shape, kernel, stride, pad, adj, target_shape=None):
+    data = mx.sym.Variable(name="data")
+    deconv = mx.sym.Deconvolution(
+        data=data, kernel=kernel, stride=stride, pad=pad, adj=adj, num_filter=5,
+        target_shape = target_shape if target_shape is not None else (0, 0))
+    arg_names = deconv.list_arguments()
+    arg_shapes, out_shapes, _ = deconv.infer_shape(data=input_shape)
+    assert out_shapes[0] == (input_shape[0], 5, 8, 8)
+
 def test_deconvolution():
+    check_deconvolution_target_shape(
+        input_shape         = (2,3,4,4),
+        kernel              = (3,3),
+        stride              = (2,2),
+        target_shape        = (8,8),
+        pad                 = (99,99),  # will be ignored
+        adj                 = (101,101),  # will be ignored
+    )
+    check_deconvolution_target_shape(
+        input_shape         = (2,3,4,4),
+        kernel              = (3,3),
+        stride              = (2,2),
+        pad                 = (1,1),
+        adj                 = (1,1),
+    )
     check_deconvolution_forward_backward(
         input_shape         = (1,1,5,5),
         num_filter          = 1,

From 3bd0c61c8779c1437aa451b61f97aa08777847fd Mon Sep 17 00:00:00 2001
From: Yuqi Li <ziyeqinghan@gmail.com>
Date: Sat, 18 Jun 2016 14:08:03 +0800
Subject: [PATCH 033/126] Add RNN model in R (#2417)

* refactor lstm model to separate reusable rnn functions like training from lstm model

* add Custom RNN model in R

* modify the documentation to add explanation of RNN and fix some typos

* remove some unneeded comments

* fix typo
---
 R-package/NAMESPACE                           |   3 +
 R-package/R/lstm.R                            | 305 +++-------------
 R-package/R/rnn.R                             | 342 ++++++++++++++++++
 R-package/R/rnn_model.R                       | 244 +++++++++++++
 R-package/man/mx.lstm.inference.Rd            |   5 +-
 R-package/man/mx.rnn.Rd                       |  69 ++++
 R-package/man/mx.rnn.forward.Rd               |  25 ++
 R-package/man/mx.rnn.inference.Rd             |  49 +++
 R-package/vignettes/CharRnnModel.Rmd          |  54 +--
 .../r/{CharRnnModel.Rmd => CharRnnModel.md}   | 138 ++++---
 docs/packages/r/index.md                      |   2 +-
 example/rnn/README.md                         |   5 -
 12 files changed, 881 insertions(+), 360 deletions(-)
 create mode 100644 R-package/R/rnn.R
 create mode 100644 R-package/R/rnn_model.R
 create mode 100644 R-package/man/mx.rnn.Rd
 create mode 100644 R-package/man/mx.rnn.forward.Rd
 create mode 100644 R-package/man/mx.rnn.inference.Rd
 rename docs/packages/r/{CharRnnModel.Rmd => CharRnnModel.md} (67%)

diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index 355233385cbf..956ce9767d3a 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -80,6 +80,9 @@ export(mx.nd.transpose)
 export(mx.nd.zeros)
 export(mx.opt.create)
 export(mx.opt.get.updater)
+export(mx.rnn)
+export(mx.rnn.forward)
+export(mx.rnn.inference)
 export(mx.rnorm)
 export(mx.runif)
 export(mx.set.seed)
diff --git a/R-package/R/lstm.R b/R-package/R/lstm.R
index 3fcd0e831751..27c1c2e96eac 100644
--- a/R-package/R/lstm.R
+++ b/R-package/R/lstm.R
@@ -56,9 +56,7 @@ lstm.unroll <- function(num.lstm.layer, seq.len, input.size,
 
     last.hidden <- list()
     for (seqidx in 1:seq.len) {
-
-        hidden = wordvec[[seqidx]]
-
+        hidden <- wordvec[[seqidx]]
         # stack lstm
         for (i in 1:num.lstm.layer) {
             dp <- ifelse(i==1, 0, dropout)
@@ -90,6 +88,7 @@ lstm.unroll <- function(num.lstm.layer, seq.len, input.size,
     return (loss.all)
 }
 
+# lstm inference model symbol
 lstm.inference.symbol <- function(num.lstm.layer, input.size,
                                   num.hidden, num.embed, num.label, dropout=0.) {
     seqidx <- 0
@@ -99,9 +98,9 @@ lstm.inference.symbol <- function(num.lstm.layer, input.size,
 
     param.cells <- lapply(1:num.lstm.layer, function(i) {
         cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")),
-                                 i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")),
-                                 h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")),
-                                 h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias")))
+                     i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")),
+                     h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")),
+                     h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias")))
         return (cell)
     })
     last.states <- lapply(1:num.lstm.layer, function(i) {
@@ -148,250 +147,7 @@ lstm.inference.symbol <- function(num.lstm.layer, input.size,
     return (mx.symbol.Group(list.all))
 }
 
-is.param.name <- function(name) {
-    return (grepl('weight$', name) || grepl('bias$', name) ||
-           grepl('gamma$', name) || grepl('beta$', name) )
-}
-
-# Initialize parameters
-mx.model.init.params.rnn <- function(symbol, input.shape, initializer, ctx) {
-  if (!is.mx.symbol(symbol)) stop("symbol need to be MXSymbol")
-  slist <- symbol$infer.shape(input.shape)
-  if (is.null(slist)) stop("Not enough information to get shapes")
-  arg.params <- mx.init.create(initializer, slist$arg.shapes, ctx, skip.unknown=TRUE)
-  aux.params <- mx.init.create(initializer, slist$aux.shapes, ctx, skip.unknown=FALSE)
-  return(list(arg.params=arg.params, aux.params=aux.params))
-}
-
-# Initialize the data iter
-mx.model.init.iter.rnn <- function(X, y, batch.size, is.train) {
-  if (is.MXDataIter(X)) return(X)
-  shape <- dim(data)
-  if (is.null(shape)) {
-    num.data <- length(X)
-  } else {
-    ndim <- length(shape)
-    num.data <- shape[[ndim]]
-  }
-  if (is.null(y)) {
-    if (is.train) stop("Need to provide parameter y for training with R arrays.")
-    y <- c(1:num.data) * 0
-  }
-
-  batch.size <- min(num.data, batch.size)
-
-  return(mx.io.arrayiter(X, y, batch.size=batch.size, shuffle=is.train))
-}
-
-# set up rnn model with lstm cells
-setup.rnn.model <- function(rnn.sym, ctx,
-                            num.lstm.layer, seq.len,
-                            num.hidden, num.embed, num.label,
-                            batch.size, input.size,
-                            initializer=mx.init.uniform(0.01),
-                            dropout=0) {
-
-    arg.names <- rnn.sym$arguments
-    input.shapes <- list()
-    for (name in arg.names) {
-        if (grepl('init.c$', name) || grepl('init.h$', name)) {
-            input.shapes[[name]] <- c(num.hidden, batch.size)
-        }
-        else if (grepl('data$', name) || grepl('label$', name) ) {
-            if (seq.len == 1) {
-                input.shapes[[name]] <- c(batch.size)
-            } else {
-            input.shapes[[name]] <- c(seq.len, batch.size)
-            }
-        }
-    }
-    params <- mx.model.init.params.rnn(rnn.sym, input.shapes, initializer, mx.cpu())
-    args <- input.shapes
-    args$symbol <- rnn.sym
-    args$ctx <- ctx
-    args$grad.req <- "add"
-    rnn.exec <- do.call(mx.simple.bind, args)
-
-    mx.exec.update.arg.arrays(rnn.exec, params$arg.params, match.name=TRUE)
-    mx.exec.update.aux.arrays(rnn.exec, params$aux.params, match.name=TRUE)
-
-    grad.arrays <- list()
-    for (name in names(rnn.exec$ref.grad.arrays)) {
-        if (is.param.name(name))
-            grad.arrays[[name]] <- rnn.exec$ref.arg.arrays[[name]]*0
-    }
-    mx.exec.update.grad.arrays(rnn.exec, grad.arrays, match.name=TRUE)
-
-    return (list(rnn.exec=rnn.exec, symbol=rnn.sym,
-                 num.lstm.layer=num.lstm.layer, num.hidden=num.hidden,
-                 seq.len=seq.len, batch.size=batch.size,
-                 num.embed=num.embed))
-
-}
-
-
-calc.nll <- function(seq.label.probs, batch.size) {
-    nll = - sum(log(seq.label.probs)) / batch.size
-    return (nll)
-}
-
-get.label <- function(label, ctx) {
-    label <- as.array(label)
-    seq.len <- dim(label)[[1]]
-    batch.size <- dim(label)[[2]]
-    sm.label <- array(0, dim=c(seq.len*batch.size))
-    for (seqidx in 1:seq.len) {
-        sm.label[((seqidx-1)*batch.size+1) : (seqidx*batch.size)] <- label[seqidx,]
-    }
-    return (mx.nd.array(sm.label, ctx))
-}
-
-
-
-train.lstm <- function(model, train.data, eval.data,
-                       num.round, update.period,
-                       optimizer='sgd', ctx=mx.ctx.default(), ...) {
-    m <- model
-    seq.len <- m$seq.len
-    batch.size <- m$batch.size
-    num.lstm.layer <- m$num.lstm.layer
-    num.hidden <- m$num.hidden
-
-    opt <- mx.opt.create(optimizer, rescale.grad=(1/batch.size), ...)
-
-    updater <- mx.opt.get.updater(opt, m$rnn.exec$ref.arg.arrays)
-    epoch.counter <- 0
-    log.period <- max(as.integer(1000 / seq.len), 1)
-    last.perp <- 10000000.0
-
-    for (iteration in 1:num.round) {
-        nbatch <- 0
-        train.nll <- 0
-        # reset states
-        init.states <- list()
-        for (i in 1:num.lstm.layer) {
-            init.states[[paste0("l", i, ".init.c")]] <- mx.nd.zeros(c(num.hidden, batch.size))
-            init.states[[paste0("l", i, ".init.h")]] <- mx.nd.zeros(c(num.hidden, batch.size))
-        }
-        mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE)
-
-        tic <- Sys.time()
-
-        train.data$reset()
-
-        while (train.data$iter.next()) {
-            # set rnn input
-            rnn.input <- train.data$value()
-            mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE)
-
-            mx.exec.forward(m$rnn.exec, is.train=TRUE)
-            seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$ref.outputs[["sm_output"]], get.label(m$rnn.exec$ref.arg.arrays[["label"]], ctx))
-
-            mx.exec.backward(m$rnn.exec)
-            init.states <- list()
-            for (i in 1:num.lstm.layer) {
-                init.states[[paste0("l", i, ".init.c")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0
-                init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
-            }
-            mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE)
-            # update epoch counter
-            epoch.counter <- epoch.counter + 1
-            if (epoch.counter %% update.period == 0) {
-                # the gradient of initial c and inital h should be zero
-                init.grad <- list()
-                for (i in 1:num.lstm.layer) {
-                    init.grad[[paste0("l", i, ".init.c")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0
-                    init.grad[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
-                }
-                mx.exec.update.grad.arrays(m$rnn.exec, init.grad, match.name=TRUE)
-
-                arg.blocks <- updater(m$rnn.exec$ref.arg.arrays, m$rnn.exec$ref.grad.arrays)
-
-                mx.exec.update.arg.arrays(m$rnn.exec, arg.blocks, skip.null=TRUE)
-
-                grad.arrays <- list()
-                for (name in names(m$rnn.exec$ref.grad.arrays)) {
-                    if (is.param.name(name))
-                        grad.arrays[[name]] <- m$rnn.exec$ref.grad.arrays[[name]]*0
-                }
-                mx.exec.update.grad.arrays(m$rnn.exec, grad.arrays, match.name=TRUE)
-
-            }
-
-            train.nll <- train.nll + calc.nll(as.array(seq.label.probs), batch.size)
-
-            nbatch <- nbatch + seq.len
-            if ((epoch.counter %% log.period) == 0) {
-                cat(paste0("Epoch [", epoch.counter,
-                           "] Train: NLL=", train.nll / nbatch,
-                           ", Perp=", exp(train.nll / nbatch), "\n"))
-            }
-        }
-        train.data$reset()
-        # end of training loop
-        toc <- Sys.time()
-        cat(paste0("Iter [", iteration,
-                   "] Train: Time: ", as.numeric(toc - tic, units="secs"),
-                   " sec, NLL=", train.nll / nbatch,
-                   ", Perp=", exp(train.nll / nbatch), "\n"))
-
-        if (!is.null(eval.data)) {
-            val.nll <- 0.0
-            # validation set, reset states
-            init.states <- list()
-            for (i in 1:num.lstm.layer) {
-                init.states[[paste0("l", i, ".init.c")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0
-                init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
-            }
-            mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE)
-
-            eval.data$reset()
-            nbatch <- 0
-            while (eval.data$iter.next()) {
-                # set rnn input
-                rnn.input <- eval.data$value()
-                mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE)
-                mx.exec.forward(m$rnn.exec, is.train=FALSE)
-                # probability of each label class, used to evaluate nll
-                seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$ref.outputs[["sm_output"]], get.label(m$rnn.exec$ref.arg.arrays[["label"]], ctx))
-                # transfer the states
-                init.states <- list()
-                for (i in 1:num.lstm.layer) {
-                    init.states[[paste0("l", i, ".init.c")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0
-                    init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
-                }
-                mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE)
-                val.nll <- val.nll + calc.nll(as.array(seq.label.probs), batch.size)
-                nbatch <- nbatch + seq.len
-            }
-            eval.data$reset()
-            perp <- exp(val.nll / nbatch)
-            cat(paste0("Iter [", iteration,
-                       "] Val: NLL=", val.nll / nbatch,
-                       ", Perp=", exp(val.nll / nbatch), "\n"))
-        }
-    }
-
-    return (m)
-}
-
 
-check.data <- function(data, batch.size, is.train) {
-    if (!is.null(data) && !is.list(data) && !is.mx.dataiter(data)) {
-        stop("The dataset should be either a mx.io.DataIter or a R list")
-    }
-    if (is.list(data)) {
-        if (is.null(data$data) || is.null(data$label)){
-            stop("Please provide dataset as list(data=R.array, label=R.array)")
-        }
-    data <- mx.model.init.iter.rnn(data$data, data$label, batch.size=batch.size, is.train = is.train)
-    }
-    if (!is.null(data) && !data$iter.next()) {
-        data$reset()
-        if (!data$iter.next()) stop("Empty input")
-    }
-    return (data)
-}
 
 #' Training LSTM Unrolled Model
 #'
@@ -450,24 +206,36 @@ mx.lstm <- function(train.data, eval.data=NULL,
                            num.embed=num.embed,
                            num.label=num.label,
                            dropout=dropout)
+    init.states.c <- lapply(1:num.lstm.layer, function(i) {
+        state.c <- paste0("l", i, ".init.c")
+        return (state.c)
+    })
+    init.states.h <- lapply(1:num.lstm.layer, function(i) {
+        state.h <- paste0("l", i, ".init.h")
+        return (state.h)
+    })
+    init.states.name <- c(init.states.c, init.states.h)
+
     # set up lstm model
     model <- setup.rnn.model(rnn.sym=rnn.sym,
                              ctx=ctx,
-                             num.lstm.layer=num.lstm.layer,
+                             num.rnn.layer=num.lstm.layer,
                              seq.len=seq.len,
                              num.hidden=num.hidden,
                              num.embed=num.embed,
                              num.label=num.label,
                              batch.size=batch.size,
                              input.size=input.size,
+                             init.states.name=init.states.name,
                              initializer=initializer,
                              dropout=dropout)
 
     # train lstm model
-    model <- train.lstm(model, train.data, eval.data,
+    model <- train.rnn( model, train.data, eval.data,
                         num.round=num.round,
                         update.period=update.period,
                         ctx=ctx,
+                        init.states.name=init.states.name,
                         ...)
     # change model into MXFeedForwardModel
     model <- list(symbol=model$symbol, arg.params=model$rnn.exec$ref.arg.arrays, aux.params=model$rnn.exec$ref.aux.arrays)
@@ -487,7 +255,7 @@ mx.lstm <- function(train.data, eval.data=NULL,
 #'      The output dim of embedding.
 #' @param num.label  integer
 #'      The number of labels.
-#' @param batch.size integer
+#' @param batch.size integer, default=1
 #'      The batch size used for R array training.
 #' @param arg.params list
 #'      The batch size used for R array training.
@@ -495,7 +263,8 @@ mx.lstm <- function(train.data, eval.data=NULL,
 #'      Model parameter, list of name to NDArray of net's weights.
 #' @param dropout float, default=0
 #'      A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.
-#' @return model a lstm inference model.
+#' @return model list(rnn.exec=integer, symbol=mxnet symbol, num.rnn.layer=integer, num.hidden=integer, seq.len=integer, batch.size=integer, num.embed=integer) 
+#'      A lstm inference model.
 #'
 #' @export
 mx.lstm.inference <- function(num.lstm.layer,
@@ -507,24 +276,35 @@ mx.lstm.inference <- function(num.lstm.layer,
                               arg.params,
                               ctx=mx.cpu(),
                               dropout=0.) {
-    sym <- lstm.inference.symbol(num.lstm.layer,
-                                 input.size,
-                                 num.hidden,
-                                 num.embed,
-                                 num.label,
-                                 dropout)
+    sym <- lstm.inference.symbol(num.lstm.layer=num.lstm.layer,
+                                 input.size=input.size,
+                                 num.hidden=num.hidden,
+                                 num.embed=num.embed,
+                                 num.label=num.label,
+                                 dropout=dropout)
+
+    init.states.c <- lapply(1:num.lstm.layer, function(i) {
+        state.c <- paste0("l", i, ".init.c")
+        return (state.c)
+    })
+    init.states.h <- lapply(1:num.lstm.layer, function(i) {
+        state.h <- paste0("l", i, ".init.h")
+        return (state.h)
+    })
+    init.states.name <- c(init.states.c, init.states.h)
 
     seq.len <- 1
     # set up lstm model
     model <- setup.rnn.model(rnn.sym=sym,
                              ctx=ctx,
-                             num.lstm.layer=num.lstm.layer,
+                             num.rnn.layer=num.lstm.layer,
                              seq.len=seq.len,
                              num.hidden=num.hidden,
                              num.embed=num.embed,
                              num.label=num.label,
                              batch.size=batch.size,
                              input.size=input.size,
+                             init.states.name=init.states.name,
                              initializer=mx.init.uniform(0.01),
                              dropout=dropout)
     arg.names <- names(model$rnn.exec$ref.arg.arrays)
@@ -557,11 +337,10 @@ mx.lstm.inference <- function(num.lstm.layer,
 #' @return result A list(prob=prob, model=model) containing the result probability of each label and the model.
 #'
 #' @export
-
 mx.lstm.forward <- function(model, input.data, new.seq=FALSE) {
     if (new.seq == TRUE) {
         init.states <- list()
-        for (i in 1:num.lstm.layer) {
+        for (i in 1:model$num.rnn.layer) {
             init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0
             init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
         }
@@ -572,7 +351,7 @@ mx.lstm.forward <- function(model, input.data, new.seq=FALSE) {
     mx.exec.update.arg.arrays(model$rnn.exec, data, match.name=TRUE)
     mx.exec.forward(model$rnn.exec, is.train=FALSE)
     init.states <- list()
-    for (i in 1:num.lstm.layer) {
+    for (i in 1:model$num.rnn.layer) {
         init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.c_output")]]
         init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.h_output")]]
     }
diff --git a/R-package/R/rnn.R b/R-package/R/rnn.R
new file mode 100644
index 000000000000..b89559a58570
--- /dev/null
+++ b/R-package/R/rnn.R
@@ -0,0 +1,342 @@
+# Single vanilla RNN cell symbol: h = tanh(i2h(x) + h2h(h.prev)), optional BatchNorm
+rnn <- function(num.hidden, indata, prev.state, param, seqidx, 
+                layeridx, dropout=0., batch.norm=FALSE) {
+    # every named layer created here is called "t<seqidx>.l<layeridx>.<kind>"
+    node.prefix <- paste0("t", seqidx, ".l", layeridx)
+    # optional dropout on the cell input
+    cell.input <- if (dropout > 0.) mx.symbol.Dropout(data=indata, p=dropout) else indata
+
+    input.proj <- mx.symbol.FullyConnected(data=cell.input, weight=param$i2h.weight,
+                                           bias=param$i2h.bias, num.hidden=num.hidden,
+                                           name=paste0(node.prefix, ".i2h"))
+    state.proj <- mx.symbol.FullyConnected(data=prev.state$h, weight=param$h2h.weight,
+                                           bias=param$h2h.bias, num.hidden=num.hidden,
+                                           name=paste0(node.prefix, ".h2h"))
+
+    # new hidden state: tanh of the summed input and recurrent projections
+    new.h <- mx.symbol.Activation(data=input.proj + state.proj, act.type="tanh")
+    if (batch.norm) {
+        new.h <- mx.symbol.BatchNorm(data=new.h)
+    }
+    return (list(h=new.h))
+}
+
+# unrolled rnn network: training symbol covering seq.len time steps of a stacked rnn
+rnn.unroll <- function(num.rnn.layer, seq.len, input.size, num.hidden, 
+                       num.embed, num.label, dropout=0., batch.norm=FALSE) {
+    embed.weight <- mx.symbol.Variable("embed.weight")
+    cls.weight <- mx.symbol.Variable("cls.weight")
+    cls.bias <- mx.symbol.Variable("cls.bias")
+    param.cells <- lapply(1:num.rnn.layer, function(i) {  # one weight set per layer, reused at every time step
+        cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")),
+                     i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")),
+                     h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")),
+                     h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias")))
+        return (cell)
+    })
+    last.states <- lapply(1:num.rnn.layer, function(i) {  # initial hidden state variable "l<i>.init.h" per layer
+        state <- list(h=mx.symbol.Variable(paste0("l", i, ".init.h")))
+        return (state)
+    })
+
+    # embedding layer
+    label <- mx.symbol.Variable("label")
+    data <- mx.symbol.Variable("data")
+    embed <- mx.symbol.Embedding(data=data, input_dim=input.size,
+                                 weight=embed.weight, output_dim=num.embed, name="embed")
+    wordvec <- mx.symbol.SliceChannel(data=embed, num_outputs=seq.len, squeeze_axis=1)  # one output per time step
+
+    last.hidden <- list()  # collects the top-layer output of every time step
+    for (seqidx in 1:seq.len) { 
+        hidden <- wordvec[[seqidx]]
+        # stack RNN
+        for (i in 1:num.rnn.layer) {
+            dp <- ifelse(i==1, 0, dropout)  # no dropout on the input of the first layer
+            next.state <- rnn(num.hidden, indata=hidden,
+                              prev.state=last.states[[i]],
+                              param=param.cells[[i]],
+                              seqidx=seqidx, layeridx=i, 
+                              dropout=dp, batch.norm=batch.norm)
+            hidden <- next.state$h
+            last.states[[i]] <- next.state  # becomes prev.state of this layer at the next time step
+        }
+        # decoder
+        if (dropout > 0.)
+            hidden <- mx.symbol.Dropout(data=hidden, p=dropout)
+        last.hidden <- c(last.hidden, hidden)
+    }
+    last.hidden$dim <- 0  # the two named entries are handed to Concat together with the symbols
+    last.hidden$num.args <- seq.len
+    concat <-mxnet:::mx.varg.symbol.Concat(last.hidden)
+    fc <- mx.symbol.FullyConnected(data=concat,
+                                   weight=cls.weight,
+                                   bias=cls.bias,
+                                   num.hidden=num.label)
+    label <- mx.symbol.transpose(data=label)  # NOTE(review): presumably reorders the label to match the
+    label <- mx.symbol.Reshape(data=label, target.shape=c(0))  # step-wise concatenated outputs - confirm
+
+    loss.all <- mx.symbol.SoftmaxOutput(data=fc, label=label, name="sm")
+    return (loss.all)
+}
+
+# rnn inference model symbol: a single time step; the seq.len argument is not used in the body
+rnn.inference.symbol <- function(num.rnn.layer, seq.len, input.size, num.hidden, 
+                                 num.embed, num.label, dropout=0., batch.norm=FALSE) {
+    seqidx <- 0  # only used to name the layers ("t0.l<i>....")
+    embed.weight <- mx.symbol.Variable("embed.weight")
+    cls.weight <- mx.symbol.Variable("cls.weight")
+    cls.bias <- mx.symbol.Variable("cls.bias")
+    param.cells <- lapply(1:num.rnn.layer, function(i) {  # same variable names as in rnn.unroll
+        cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")),
+                     i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")),
+                     h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")),
+                     h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias")))
+        return (cell)
+    })
+    last.states <- lapply(1:num.rnn.layer, function(i) {  # initial hidden state variable "l<i>.init.h" per layer
+        state <- list(h=mx.symbol.Variable(paste0("l", i, ".init.h")))
+        return (state)
+    })
+
+    # embedding layer
+    data <- mx.symbol.Variable("data")
+    hidden <- mx.symbol.Embedding(data=data, input_dim=input.size,
+                                 weight=embed.weight, output_dim=num.embed, name="embed")
+    # stack RNN
+    for (i in 1:num.rnn.layer) {
+        dp <- ifelse(i==1, 0, dropout)  # no dropout on the input of the first layer
+        next.state <- rnn(num.hidden, indata=hidden,
+                          prev.state=last.states[[i]],
+                          param=param.cells[[i]],
+                          seqidx=seqidx, layeridx=i, 
+                          dropout=dp, batch.norm=batch.norm)
+        hidden <- next.state$h
+        last.states[[i]] <- next.state  # kept so the new state can be exported below
+    }
+    # decoder
+    if (dropout > 0.)
+        hidden <- mx.symbol.Dropout(data=hidden, p=dropout)
+
+    fc <- mx.symbol.FullyConnected(data=hidden,
+                                   weight=cls.weight,
+                                   bias=cls.bias,
+                                   num_hidden=num.label)
+    sm <- mx.symbol.SoftmaxOutput(data=fc, name='sm')
+    unpack.h <- lapply(1:num.rnn.layer, function(i) {  # export each layer's new hidden state as "l<i>.last.h"
+        state <- last.states[[i]]
+        state.h <- mx.symbol.BlockGrad(state$h, name=paste0("l", i, ".last.h"))
+        return (state.h)
+    })
+    list.all <- c(sm, unpack.h)  # grouped outputs: softmax first, then the per-layer states
+    return (mx.symbol.Group(list.all))
+}
+
+#' Training RNN Unrolled Model
+#'
+#' @param train.data mx.io.DataIter or list(data=R.array, label=R.array)
+#'      The Training set.
+#' @param eval.data mx.io.DataIter or list(data=R.array, label=R.array), optional
+#'      The validation set used for validation evaluation during the progress.
+#' @param num.rnn.layer integer
+#'      The number of the layer of rnn.
+#' @param seq.len integer
+#'      The length of the input sequence.
+#' @param num.hidden integer
+#'      The number of hidden nodes.
+#' @param num.embed integer
+#'      The output dim of embedding.
+#' @param num.label  integer
+#'      The number of labels.
+#' @param batch.size integer
+#'      The batch size used for R array training.
+#' @param input.size integer
+#'       The input dim of one-hot encoding of embedding
+#' @param ctx mx.context, optional
+#'      The device used to perform training.
+#' @param num.round integer, default=10
+#'      The number of iterations over training data to train the model.
+#' @param update.period integer, default=1
+#'      The number of iterations to update parameters during training period.
+#' @param initializer initializer object. default=mx.init.uniform(0.01)
+#'      The initialization scheme for parameters.
+#' @param dropout float, default=0
+#'      A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.
+#' @param optimizer string, default="sgd"
+#'      The optimization method.
+#' @param batch.norm boolean, default=FALSE
+#'      Whether to use batch normalization.
+#' @param ... other parameters passed on to the optimizer (\code{mx.opt.create}).
+#' @return model A trained rnn unrolled model.
+#'
+#' @export
+mx.rnn <- function( train.data, eval.data=NULL,
+                    num.rnn.layer, seq.len,
+                    num.hidden, num.embed, num.label,
+                    batch.size, input.size,
+                    ctx=mx.ctx.default(),
+                    num.round=10, update.period=1,
+                    initializer=mx.init.uniform(0.01),
+                    dropout=0, optimizer='sgd',
+                    batch.norm=FALSE,
+                    ...) {
+    # check data and change data into iterator
+    train.data <- check.data(train.data, batch.size, TRUE)
+    eval.data <- check.data(eval.data, batch.size, FALSE)
+
+    # get unrolled rnn symbol
+    rnn.sym <- rnn.unroll( num.rnn.layer=num.rnn.layer,
+                           num.hidden=num.hidden,
+                           seq.len=seq.len,
+                           input.size=input.size,
+                           num.embed=num.embed,
+                           num.label=num.label,
+                           dropout=dropout,
+                           batch.norm=batch.norm)
+    init.states.name <- lapply(1:num.rnn.layer, function(i) {  # "l<i>.init.h", one per layer
+        state <- paste0("l", i, ".init.h")
+        return (state)
+    })
+    # set up rnn model
+    model <- setup.rnn.model(rnn.sym=rnn.sym,
+                             ctx=ctx,
+                             num.rnn.layer=num.rnn.layer,
+                             seq.len=seq.len,
+                             num.hidden=num.hidden,
+                             num.embed=num.embed,
+                             num.label=num.label,
+                             batch.size=batch.size,
+                             input.size=input.size,
+                             init.states.name=init.states.name,
+                             initializer=initializer,
+                             dropout=dropout)
+    # train rnn model; the optimizer name must be forwarded explicitly, `...` only carries its options
+    model <- train.rnn( model, train.data, eval.data,
+                        num.round=num.round, update.period=update.period,
+                        ctx=ctx,
+                        optimizer=optimizer,
+                        init.states.name=init.states.name,
+                        ...)
+    # change model into MXFeedForwardModel
+    model <- list(symbol=model$symbol, arg.params=model$rnn.exec$ref.arg.arrays, aux.params=model$rnn.exec$ref.aux.arrays)
+    return(structure(model, class="MXFeedForwardModel"))
+}
+
+#' Create a RNN Inference Model
+#'
+#' @param num.rnn.layer integer
+#'      The number of the layer of rnn.
+#' @param input.size integer
+#'       The input dim of one-hot encoding of embedding
+#' @param num.hidden integer
+#'      The number of hidden nodes.
+#' @param num.embed integer
+#'      The output dim of embedding.
+#' @param num.label  integer
+#'      The number of labels.
+#' @param batch.size integer, default=1
+#'      The batch size used for R array training.
+#' @param arg.params list
+#'      Model parameter, list of name to NDArray of net's weights.
+#' @param ctx mx.context, optional
+#'      The device used to perform inference.
+#' @param dropout float, default=0
+#'      A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.
+#' @param batch.norm boolean, default=FALSE
+#'      Whether to use batch normalization.
+#' @return model list(rnn.exec=integer, symbol=mxnet symbol, num.rnn.layer=integer, num.hidden=integer, seq.len=integer, batch.size=integer, num.embed=integer) 
+#'      A rnn inference model.
+#'
+#' @export
+mx.rnn.inference <- function( num.rnn.layer,
+                              input.size,
+                              num.hidden,
+                              num.embed,
+                              num.label,
+                              batch.size=1,
+                              arg.params,
+                              ctx=mx.cpu(),
+                              dropout=0.,
+                              batch.norm=FALSE) {
+    sym <- rnn.inference.symbol( num.rnn.layer=num.rnn.layer,
+                                 input.size=input.size,
+                                 num.hidden=num.hidden,
+                                 num.embed=num.embed,
+                                 num.label=num.label,
+                                 dropout=dropout,
+                                 batch.norm=batch.norm)
+    # Names of the initial-state arguments, one "l<i>.init.h" per layer.
+    # rnn.inference.symbol only declares a hidden state for each layer,
+    # so no "l<i>.init.c" entries are listed here.
+    # setup.rnn.model gives every name in this list the shape
+    # c(num.hidden, batch.size).
+    init.states.name <- lapply(1:num.rnn.layer, function(i) {
+        state <- paste0("l", i, ".init.h")
+        return (state)
+    })
+    
+    seq.len <- 1  # the inference executor processes one time step per forward call
+    # set up rnn model
+    model <- setup.rnn.model(rnn.sym=sym,
+                             ctx=ctx,
+                             num.rnn.layer=num.rnn.layer,
+                             seq.len=seq.len,
+                             num.hidden=num.hidden,
+                             num.embed=num.embed,
+                             num.label=num.label,
+                             batch.size=batch.size,
+                             input.size=input.size,
+                             init.states.name=init.states.name,
+                             initializer=mx.init.uniform(0.01),
+                             dropout=dropout)
+    arg.names <- names(model$rnn.exec$ref.arg.arrays)
+    for (k in names(arg.params)) {  # overwrite the fresh initialisation with the supplied weights
+        if ((k %in% arg.names) && is.param.name(k) ) {
+            rnn.input <- list()
+            rnn.input[[k]] <- arg.params[[k]]
+            mx.exec.update.arg.arrays(model$rnn.exec, rnn.input, match.name=TRUE)
+        }
+    }
+    init.states <- list()  # start from all-zero hidden states
+    for (i in 1:num.rnn.layer) {
+        init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
+    }
+    mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
+
+    return (model)
+}
+
+#' Using forward function to predict in rnn inference model
+#'
+#' @param model rnn model
+#'      A rnn inference model
+#' @param input.data array.matrix
+#'      The input data for forward function
+#' @param new.seq boolean, default=FALSE
+#'      Whether the input is the start of a new sequence
+#'
+#' @return result A list(prob=prob, model=model) containing the result probability of each label and the model.
+#'
+#' @export
+mx.rnn.forward <- function(model, input.data, new.seq=FALSE) {
+    # a new sequence starts from all-zero hidden states
+    if (new.seq == TRUE) {
+        init.states <- list()
+        for (i in 1:model$num.rnn.layer) {
+            init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
+        }
+        mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
+    }
+    # feed one input value per batch entry and run a single forward step
+    dim(input.data) <- c(model$batch.size)
+    data <- list(data=mx.nd.array(input.data))
+    mx.exec.update.arg.arrays(model$rnn.exec, data, match.name=TRUE)
+    mx.exec.forward(model$rnn.exec, is.train=FALSE)
+    # write the "l<i>.last.h" outputs back as the initial states used by the next call
+    init.states <- list()
+    for (i in 1:model$num.rnn.layer) {
+        init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.h_output")]]
+    }
+    mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
+    prob <- model$rnn.exec$ref.outputs[["sm_output"]]
+    return (list(prob=prob, model=model))
+}
diff --git a/R-package/R/rnn_model.R b/R-package/R/rnn_model.R
new file mode 100644
index 000000000000..19f53b0f6407
--- /dev/null
+++ b/R-package/R/rnn_model.R
@@ -0,0 +1,244 @@
+is.param.name <- function(name) {  # TRUE when the name ends in weight, bias, gamma or beta
+    linear.param <- grepl('weight$', name) || grepl('bias$', name)
+    linear.param || grepl('gamma$', name) || grepl('beta$', name)
+}
+
+# Initialize parameters: infer all shapes from the input shapes, then create arg/aux arrays
+mx.model.init.params.rnn <- function(symbol, input.shape, initializer, ctx) {
+  if (!is.mx.symbol(symbol)) stop("symbol need to be MXSymbol")
+  shapes <- symbol$infer.shape(input.shape)
+  if (is.null(shapes)) stop("Not enough information to get shapes")
+  # note: skip.unknown is TRUE for the arguments but FALSE for the auxiliary states
+  list(arg.params=mx.init.create(initializer, shapes$arg.shapes, ctx, skip.unknown=TRUE),
+       aux.params=mx.init.create(initializer, shapes$aux.shapes, ctx, skip.unknown=FALSE))
+}
+
+# Initialize the data iter: wrap R arrays X (data) and y (label) into an array iterator
+mx.model.init.iter.rnn <- function(X, y, batch.size, is.train) {
+  if (is.MXDataIter(X)) return(X)  # already an iterator, returned unchanged
+  # number of samples = extent of the last dimension of X, or its length when X has no dim
+  shape <- dim(X)  # read the shape from X itself (there is no `data` argument in this function)
+  if (is.null(shape)) {
+    num.data <- length(X)
+  } else {
+    ndim <- length(shape)
+    num.data <- shape[[ndim]]
+  }
+  if (is.null(y)) {
+    if (is.train) stop("Need to provide parameter y for training with R arrays.")
+    y <- c(1:num.data) * 0  # all-zero placeholder labels when none are given outside training
+  }
+  # never request a batch larger than the number of samples
+  batch.size <- min(num.data, batch.size)
+  return(mx.io.arrayiter(X, y, batch.size=batch.size, shuffle=is.train))
+}
+
+# set up rnn model with rnn cells: bind an executor for rnn.sym and initialise its arrays
+setup.rnn.model <- function(rnn.sym, ctx,
+                            num.rnn.layer, seq.len,
+                            num.hidden, num.embed, num.label,
+                            batch.size, input.size,
+                            init.states.name,
+                            initializer=mx.init.uniform(0.01),
+                            dropout=0) {
+
+    arg.names <- rnn.sym$arguments
+    input.shapes <- list()  # shapes of the non-parameter inputs: initial states, data and label
+    for (name in arg.names) {
+        if (name %in% init.states.name) {
+            input.shapes[[name]] <- c(num.hidden, batch.size)
+        }
+        else if (grepl('data$', name) || grepl('label$', name) ) {
+            if (seq.len == 1) {  # one time step: data/label are plain vectors of length batch.size
+                input.shapes[[name]] <- c(batch.size)
+            } else {
+            input.shapes[[name]] <- c(seq.len, batch.size)
+            }
+        }
+    }
+    params <- mx.model.init.params.rnn(rnn.sym, input.shapes, initializer, mx.cpu())  # created on the CPU
+    args <- input.shapes  # argument list for mx.simple.bind: the shapes plus symbol, ctx and grad.req
+    args$symbol <- rnn.sym
+    args$ctx <- ctx
+    args$grad.req <- "add"
+    rnn.exec <- do.call(mx.simple.bind, args)
+
+    mx.exec.update.arg.arrays(rnn.exec, params$arg.params, match.name=TRUE)  # copy the initial values in
+    mx.exec.update.aux.arrays(rnn.exec, params$aux.params, match.name=TRUE)
+
+    grad.arrays <- list()  # zero arrays for the gradient of every parameter (see is.param.name)
+    for (name in names(rnn.exec$ref.grad.arrays)) {
+        if (is.param.name(name))
+            grad.arrays[[name]] <- rnn.exec$ref.arg.arrays[[name]]*0
+    }
+    mx.exec.update.grad.arrays(rnn.exec, grad.arrays, match.name=TRUE)
+
+    return (list(rnn.exec=rnn.exec, symbol=rnn.sym,
+                 num.rnn.layer=num.rnn.layer, num.hidden=num.hidden,
+                 seq.len=seq.len, batch.size=batch.size,
+                 num.embed=num.embed))
+
+}
+
+
+calc.nll <- function(seq.label.probs, batch.size) {
+    # summed negative log-probability of all entries, divided by the batch size
+    -sum(log(seq.label.probs)) / batch.size
+}
+
+get.label <- function(label, ctx) {
+    label.mat <- as.array(label)
+    n.step <- dim(label.mat)[[1]]
+    n.batch <- dim(label.mat)[[2]]
+    flat <- array(0, dim=c(n.step*n.batch))
+    for (step in 1:n.step) {  # copy one row (first-dimension index) after another
+        flat[(step-1)*n.batch + 1:n.batch] <- label.mat[step,]
+    }
+    return (mx.nd.array(flat, ctx))
+}
+
+
+# training rnn model: runs num.round passes over train.data, optionally evaluating on eval.data
+train.rnn <- function (model, train.data, eval.data,
+                       num.round, update.period,
+                       init.states.name,
+                       optimizer='sgd', ctx=mx.ctx.default(), ...) {
+    m <- model
+    seq.len <- m$seq.len
+    batch.size <- m$batch.size
+    num.rnn.layer <- m$num.rnn.layer  # not read again in this function
+    num.hidden <- m$num.hidden  # not read again in this function
+
+    opt <- mx.opt.create(optimizer, rescale.grad=(1/batch.size), ...)  # `...` carries the optimizer options
+
+    updater <- mx.opt.get.updater(opt, m$rnn.exec$ref.arg.arrays)
+    epoch.counter <- 0  # counts processed batches; never reset between rounds
+    log.period <- max(as.integer(1000 / seq.len), 1)
+    last.perp <- 10000000.0  # not read again in this function
+
+    for (iteration in 1:num.round) {
+        nbatch <- 0
+        train.nll <- 0
+        # reset states
+        init.states <- list()
+        for (name in init.states.name) {
+            init.states[[name]] <- m$rnn.exec$ref.arg.arrays[[name]]*0
+        }
+
+        mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE)
+
+        tic <- Sys.time()
+
+        train.data$reset()
+
+        while (train.data$iter.next()) {
+            # set rnn input
+            rnn.input <- train.data$value()
+            mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE)
+
+            mx.exec.forward(m$rnn.exec, is.train=TRUE)
+            seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$ref.outputs[["sm_output"]], get.label(m$rnn.exec$ref.arg.arrays[["label"]], ctx))
+
+            mx.exec.backward(m$rnn.exec)
+            init.states <- list()  # states are zeroed again after every batch
+            for (name in init.states.name) {
+                init.states[[name]] <- m$rnn.exec$ref.arg.arrays[[name]]*0
+            }
+
+            mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE)
+            # update epoch counter
+            epoch.counter <- epoch.counter + 1
+            if (epoch.counter %% update.period == 0) {
+                # the gradients of the initial states are set to zero before the update
+                init.grad <- list()
+                for (name in init.states.name) {
+                    init.grad[[name]] <- m$rnn.exec$ref.arg.arrays[[name]]*0
+                }
+
+                mx.exec.update.grad.arrays(m$rnn.exec, init.grad, match.name=TRUE)
+
+                arg.blocks <- updater(m$rnn.exec$ref.arg.arrays, m$rnn.exec$ref.grad.arrays)
+
+                mx.exec.update.arg.arrays(m$rnn.exec, arg.blocks, skip.null=TRUE)
+
+                grad.arrays <- list()  # zero the parameter gradients after the update
+                for (name in names(m$rnn.exec$ref.grad.arrays)) {
+                    if (is.param.name(name))
+                        grad.arrays[[name]] <- m$rnn.exec$ref.grad.arrays[[name]]*0
+                }
+                mx.exec.update.grad.arrays(m$rnn.exec, grad.arrays, match.name=TRUE)
+
+            }
+
+            train.nll <- train.nll + calc.nll(as.array(seq.label.probs), batch.size)
+
+            nbatch <- nbatch + seq.len  # grows by seq.len per batch, so NLL below is reported per time step
+            if ((epoch.counter %% log.period) == 0) {
+                cat(paste0("Epoch [", epoch.counter,
+                           "] Train: NLL=", train.nll / nbatch,
+                           ", Perp=", exp(train.nll / nbatch), "\n"))
+            }
+        }
+        train.data$reset()
+        # end of training loop
+        toc <- Sys.time()
+        cat(paste0("Iter [", iteration,
+                   "] Train: Time: ", as.numeric(toc - tic, units="secs"),
+                   " sec, NLL=", train.nll / nbatch,
+                   ", Perp=", exp(train.nll / nbatch), "\n"))
+
+        if (!is.null(eval.data)) {
+            val.nll <- 0.0
+            # validation set, reset states
+            init.states <- list()
+            for (name in init.states.name) {
+                init.states[[name]] <- m$rnn.exec$ref.arg.arrays[[name]]*0
+            }
+            mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE)
+
+            eval.data$reset()
+            nbatch <- 0
+            while (eval.data$iter.next()) {
+                # set rnn input
+                rnn.input <- eval.data$value()
+                mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE)
+                mx.exec.forward(m$rnn.exec, is.train=FALSE)
+                # probability of each label class, used to evaluate nll
+                seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$ref.outputs[["sm_output"]], get.label(m$rnn.exec$ref.arg.arrays[["label"]], ctx))
+                # zero the states again before the next batch (they are not carried over)
+                init.states <- list()
+                for (name in init.states.name) {
+                    init.states[[name]] <- m$rnn.exec$ref.arg.arrays[[name]]*0
+                }
+                mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE)
+                val.nll <- val.nll + calc.nll(as.array(seq.label.probs), batch.size)
+                nbatch <- nbatch + seq.len
+            }
+            eval.data$reset()
+            perp <- exp(val.nll / nbatch)  # not read again; the message below recomputes it
+            cat(paste0("Iter [", iteration,
+                       "] Val: NLL=", val.nll / nbatch,
+                       ", Perp=", exp(val.nll / nbatch), "\n"))
+        }
+    }
+
+    return (m)
+}
+
+# validate a dataset argument and wrap list(data=, label=) input in an iterator; NULL passes through
+check.data <- function(data, batch.size, is.train) {
+    if (is.null(data)) return (data)
+    if (is.list(data)) {
+        if (is.null(data$data) || is.null(data$label)) {
+            stop("Please provide dataset as list(data=R.array, label=R.array)")
+        }
+        data <- mx.model.init.iter.rnn(data$data, data$label, batch.size=batch.size, is.train=is.train)
+    } else if (!is.mx.dataiter(data)) {
+        stop("The dataset should be either a mx.io.DataIter or a R list")
+    }
+    if (!data$iter.next()) {
+        data$reset()
+        if (!data$iter.next()) stop("Empty input")
+    }
+    return (data)
+}
\ No newline at end of file
diff --git a/R-package/man/mx.lstm.inference.Rd b/R-package/man/mx.lstm.inference.Rd
index af572ee28590..19fe3b7fa368 100644
--- a/R-package/man/mx.lstm.inference.Rd
+++ b/R-package/man/mx.lstm.inference.Rd
@@ -23,7 +23,7 @@ The output dim of embedding.}
 \item{num.label}{integer
 The number of labels.}
 
-\item{batch.size}{integer
+\item{batch.size}{integer, default=1
 The batch size used for R array training.}
 
 \item{arg.params}{list
@@ -36,7 +36,8 @@ Model parameter, list of name to NDArray of net's weights.}
 A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.}
 }
 \value{
-model a lstm inference model.
+model list(rnn.exec=integer, symbol=mxnet symbol, num.rnn.layer=integer, num.hidden=integer, seq.len=integer, batch.size=integer, num.embed=integer) 
+     A lstm inference model.
 }
 \description{
 Create a LSTM Inference Model
diff --git a/R-package/man/mx.rnn.Rd b/R-package/man/mx.rnn.Rd
new file mode 100644
index 000000000000..c40915c98275
--- /dev/null
+++ b/R-package/man/mx.rnn.Rd
@@ -0,0 +1,69 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/rnn.R
+\name{mx.rnn}
+\alias{mx.rnn}
+\title{Training RNN Unrolled Model}
+\usage{
+mx.rnn(train.data, eval.data = NULL, num.rnn.layer, seq.len, num.hidden,
+  num.embed, num.label, batch.size, input.size, ctx = mx.ctx.default(),
+  num.round = 10, update.period = 1, initializer = mx.init.uniform(0.01),
+  dropout = 0, optimizer = "sgd", batch.norm = FALSE, ...)
+}
+\arguments{
+\item{train.data}{mx.io.DataIter or list(data=R.array, label=R.array)
+The Training set.}
+
+\item{eval.data}{mx.io.DataIter or list(data=R.array, label=R.array), optional
+The validation set used for validation evaluation during the progress.}
+
+\item{num.rnn.layer}{integer
+The number of the layer of rnn.}
+
+\item{seq.len}{integer
+The length of the input sequence.}
+
+\item{num.hidden}{integer
+The number of hidden nodes.}
+
+\item{num.embed}{integer
+The output dim of embedding.}
+
+\item{num.label}{integer
+The number of labels.}
+
+\item{batch.size}{integer
+The batch size used for R array training.}
+
+\item{input.size}{integer
+The input dim of one-hot encoding of embedding}
+
+\item{ctx}{mx.context, optional
+The device used to perform training.}
+
+\item{num.round}{integer, default=10
+The number of iterations over training data to train the model.}
+
+\item{update.period}{integer, default=1
+The number of iterations to update parameters during training period.}
+
+\item{initializer}{initializer object. default=mx.init.uniform(0.01)
+The initialization scheme for parameters.}
+
+\item{dropout}{float, default=0
+A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.}
+
+\item{optimizer}{string, default="sgd"
+The optimization method.}
+
+\item{batch.norm}{boolean, default=FALSE
+Whether to use batch normalization.}
+
+\item{...}{other parameters passed on to the optimizer (\code{mx.opt.create}).}
+}
+\value{
+model A trained rnn unrolled model.
+}
+\description{
+Training RNN Unrolled Model
+}
+
diff --git a/R-package/man/mx.rnn.forward.Rd b/R-package/man/mx.rnn.forward.Rd
new file mode 100644
index 000000000000..c8763b6c1ad7
--- /dev/null
+++ b/R-package/man/mx.rnn.forward.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/rnn.R
+\name{mx.rnn.forward}
+\alias{mx.rnn.forward}
+\title{Using forward function to predict in rnn inference model}
+\usage{
+mx.rnn.forward(model, input.data, new.seq = FALSE)
+}
+\arguments{
+\item{model}{rnn model
+A rnn inference model}
+
+\item{input.data}{array.matrix
+The input data for forward function}
+
+\item{new.seq}{boolean, default=FALSE
+Whether the input is the start of a new sequence}
+}
+\value{
+result A list(prob=prob, model=model) containing the result probability of each label and the model.
+}
+\description{
+Using forward function to predict in rnn inference model
+}
+
diff --git a/R-package/man/mx.rnn.inference.Rd b/R-package/man/mx.rnn.inference.Rd
new file mode 100644
index 000000000000..56e00e62f620
--- /dev/null
+++ b/R-package/man/mx.rnn.inference.Rd
@@ -0,0 +1,49 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/rnn.R
+\name{mx.rnn.inference}
+\alias{mx.rnn.inference}
+\title{Create a RNN Inference Model}
+\usage{
+mx.rnn.inference(num.rnn.layer, input.size, num.hidden, num.embed, num.label,
+  batch.size = 1, arg.params, ctx = mx.cpu(), dropout = 0,
+  batch.norm = FALSE)
+}
+\arguments{
+\item{num.rnn.layer}{integer
+The number of the layer of rnn.}
+
+\item{input.size}{integer
+The input dim of one-hot encoding of embedding}
+
+\item{num.hidden}{integer
+The number of hidden nodes.}
+
+\item{num.embed}{integer
+The output dim of embedding.}
+
+\item{num.label}{integer
+The number of labels.}
+
+\item{batch.size}{integer, default=1
+The batch size used for R array training.}
+
+\item{arg.params}{list
+Model parameter, list of name to NDArray of net's weights.}
+
+\item{ctx}{mx.context, optional
+The device used to perform inference.}
+
+\item{dropout}{float, default=0
+A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.}
+
+\item{batch.norm}{boolean, default=FALSE
+Whether to use batch normalization.}
+}
+\value{
+model list(rnn.exec=integer, symbol=mxnet symbol, num.rnn.layer=integer, num.hidden=integer, seq.len=integer, batch.size=integer, num.embed=integer) 
+     A rnn inference model.
+}
+\description{
+Create a RNN Inference Model
+}
+
diff --git a/R-package/vignettes/CharRnnModel.Rmd b/R-package/vignettes/CharRnnModel.Rmd
index 9066d60f7513..1dd30ff12b4d 100644
--- a/R-package/vignettes/CharRnnModel.Rmd
+++ b/R-package/vignettes/CharRnnModel.Rmd
@@ -3,12 +3,12 @@ Char RNN Example
 
 This example aims to show how to use lstm model to build a char level language model, and generate text from it. We use a tiny shakespeare text for demo purpose.
 
-Data can be found at https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare. 
+Data can be found [here](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare).
 
 Preface
 -------
 This tutorial is written in Rmarkdown.
-- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/package/r/CharRnnModel.html)
+- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.io/en/latest/packages/r/CharRnnModel.html)
 - You can find the download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/CharRnnModel.Rmd)
 
 Load Data 
@@ -21,10 +21,10 @@ Set basic network parameters.
 ```{r}
 batch.size = 32
 seq.len = 32
-num.hidden = 256
-num.embed = 256
-num.lstm.layer = 2
-num.round = 3
+num.hidden = 16
+num.embed = 16
+num.lstm.layer = 1
+num.round = 1
 learning.rate= 0.1
 wd=0.00001
 clip_gradient=1
@@ -161,33 +161,7 @@ model <- mx.lstm(X.train, X.val,
                  clip_gradient=clip_gradient)
 
 ```
-Setting the parameters ctx=mx.gpu(0) and num.round=5 can get the following result.
-```
-Epoch [31] Train: NLL=3.47213018872144, Perp=32.2052727363657
-...
-Epoch [961] Train: NLL=2.32060007657895, Perp=10.181782322355
-Iter [1] Train: Time: 186.397065639496 sec, NLL=2.31135356537961, Perp=10.0880702804858
-Iter [1] Val: NLL=1.94184484060012, Perp=6.97160060607419
-Epoch [992] Train: NLL=1.84784553299322, Perp=6.34613225095329
-...
-Epoch [1953] Train: NLL=1.70175791172558, Perp=5.48357857093351
-Iter [2] Train: Time: 188.929051160812 sec, NLL=1.70103940328978, Perp=5.47963998859367
-Iter [2] Val: NLL=1.74979316010449, Perp=5.75341251767988
-...
-Epoch [2914] Train: NLL=1.54738185300295, Perp=4.69915099483974
-Iter [3] Train: Time: 185.425321578979 sec, NLL=1.54604189517013, Perp=4.69285854740519
-Iter [3] Val: NLL=1.67780240235925, Perp=5.35377758479576
-Epoch [2945] Train: NLL=1.48868466087876, Perp=4.43126307034767
-...
-Iter [4] Train: Time: 185.487086296082 sec, NLL=1.4744973925858, Perp=4.36883940994296
-Iter [4] Val: NLL=1.64488167325603, Perp=5.18039689118454
-Epoch [3937] Train: NLL=1.46355541021581, Perp=4.32129622881604
-...
-Epoch [4898] Train: NLL=1.42900458455642, Perp=4.17454171976281
-Iter [5] Train: Time: 185.070136785507 sec, NLL=1.42909226256273, Perp=4.17490775130428
-Iter [5] Val: NLL=1.62716655804022, Perp=5.08943365437187
 
-```
 Inference from model
 --------------------
 helper function for random sample.
@@ -225,15 +199,12 @@ choice <- function(weights) {
 ```
 we can use random output or fixed output by choosing largest probability.
 ```{r}
-make.output <- function(prob, sample=FALSE, temperature=1.) {
+make.output <- function(prob, sample=FALSE) {
     if (!sample) {
         idx <- which.max(as.array(prob))
     }
     else {
-        scale_prob <- mx.nd.clip(prob, 1e-6, 1 - 1e-6)
-        rescale <- mx.nd.exp(mx.nd.log(scale_prob) / temperature)
-        rescale <- rescale / (as.array(mx.nd.sum(rescale))[1])
-        idx <- choice(rescale)
+        idx <- choice(prob)
     }
     return (idx)
 
@@ -252,7 +223,7 @@ infer.model <- mx.lstm.inference(num.lstm.layer=num.lstm.layer,
                                  ctx=mx.cpu())
 ```
 generate a sequence of 75 chars using function `mx.lstm.forward`.
-```
+```{r}
 start <- 'a'
 seq.len <- 75
 random.sample <- TRUE
@@ -273,4 +244,9 @@ The result:
 ```
 ah not a drobl greens
 Settled asing lately sistering sounted to their hight
-```
\ No newline at end of file
+```
+
+Other RNN models
+----------------
+In `mxnet`, other RNN models such as a custom RNN are also provided.
+- For a **custom RNN model**, you can replace `mx.lstm` with `mx.rnn` to train an RNN model. You can also replace `mx.lstm.inference` and `mx.lstm.forward` with `mx.rnn.inference` and `mx.rnn.forward` to build an inference model from the trained RNN and get the forward result from it.
\ No newline at end of file
diff --git a/docs/packages/r/CharRnnModel.Rmd b/docs/packages/r/CharRnnModel.md
similarity index 67%
rename from docs/packages/r/CharRnnModel.Rmd
rename to docs/packages/r/CharRnnModel.md
index 9066d60f7513..4623386ff66c 100644
--- a/docs/packages/r/CharRnnModel.Rmd
+++ b/docs/packages/r/CharRnnModel.md
@@ -3,35 +3,46 @@ Char RNN Example
 
 This example aims to show how to use lstm model to build a char level language model, and generate text from it. We use a tiny shakespeare text for demo purpose.
 
-Data can be found at https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare. 
+Data can be found [here](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare).
 
 Preface
 -------
 This tutorial is written in Rmarkdown.
-- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/package/r/CharRnnModel.html)
+- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/packages/r/CharRnnModel.html)
 - You can find the download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/CharRnnModel.Rmd)
 
 Load Data 
 ---------
 First of all, load in the data and preprocess it.
-```{r}
+
+```r
 require(mxnet)
 ```
+
+```
+## Loading required package: mxnet
+```
+
+```
+## Loading required package: methods
+```
 Set basic network parameters.
-```{r}
+
+```r
 batch.size = 32
 seq.len = 32
-num.hidden = 256
-num.embed = 256
-num.lstm.layer = 2
-num.round = 3
+num.hidden = 16
+num.embed = 16
+num.lstm.layer = 1
+num.round = 1
 learning.rate= 0.1
 wd=0.00001
 clip_gradient=1
 update.period = 1
 ```
 download the data.
-```{r}
+
+```r
 download.data <- function(data_dir) {
     dir.create(data_dir, showWarnings = FALSE)
     if (!file.exists(paste0(data_dir,'input.txt'))) {
@@ -41,7 +52,8 @@ download.data <- function(data_dir) {
 }
 ```
 Make dictionary from text.
-```{r}
+
+```r
 make.dict <- function(text, max.vocab=10000) {
     text <- strsplit(text, '')
     dic <- list()
@@ -59,7 +71,8 @@ make.dict <- function(text, max.vocab=10000) {
 }
 ```
 Transfer text into data feature.
-```{r}
+
+```r
 make.data <- function(file.path, seq.len=32, max.vocab=10000, dic=NULL) {
     fi <- file(file.path, "r")
     text <- paste(readLines(fi), collapse="\n")
@@ -92,7 +105,8 @@ make.data <- function(file.path, seq.len=32, max.vocab=10000, dic=NULL) {
 }
 ```
 Move tail text.
-```{r}
+
+```r
 drop.tail <- function(X, batch.size) {
     shape <- dim(X)
     nstep <- as.integer(shape[2] / batch.size)
@@ -100,7 +114,8 @@ drop.tail <- function(X, batch.size) {
 }
 ```
 get the label of X
-```{r}
+
+```r
 get.label <- function(X) {
     label <- array(0, dim=dim(X))
     d <- dim(X)[1]
@@ -114,9 +129,17 @@ get.label <- function(X) {
 }
 ```
 get training data and eval data
-```{r}
+
+```r
 download.data("./data/")
 ret <- make.data("./data/input.txt", seq.len=seq.len)
+```
+
+```
+## Total unique char: 65
+```
+
+```r
 X <- ret$data
 dic <- ret$dic
 lookup.table <- ret$lookup.table
@@ -143,7 +166,8 @@ Training Model
 --------------
 In `mxnet`, we have a function called `mx.lstm` so that users can build a general lstm model. 
 
-```{r}
+
+```r
 model <- mx.lstm(X.train, X.val, 
                  ctx=mx.cpu(),
                  num.round=num.round, 
@@ -159,39 +183,49 @@ model <- mx.lstm(X.train, X.val,
                  learning.rate=learning.rate,
                  wd=wd,
                  clip_gradient=clip_gradient)
+```
 
 ```
-Setting the parameters ctx=mx.gpu(0) and num.round=5 can get the following result.
+## Epoch [31] Train: NLL=3.53787130224343, Perp=34.3936275728271
+## Epoch [62] Train: NLL=3.43087958036949, Perp=30.903813186055
+## Epoch [93] Train: NLL=3.39771238228587, Perp=29.8956319855751
+## Epoch [124] Train: NLL=3.37581711716687, Perp=29.2481732041015
+## Epoch [155] Train: NLL=3.34523331338447, Perp=28.3671933405139
+## Epoch [186] Train: NLL=3.30756356274787, Perp=27.31848454823
+## Epoch [217] Train: NLL=3.25642968403829, Perp=25.9566978956055
+## Epoch [248] Train: NLL=3.19825967486207, Perp=24.4898727477925
+## Epoch [279] Train: NLL=3.14013971549828, Perp=23.1070950525017
+## Epoch [310] Train: NLL=3.08747601837462, Perp=21.9216781782189
+## Epoch [341] Train: NLL=3.04015595674863, Perp=20.9085038031042
+## Epoch [372] Train: NLL=2.99839339255659, Perp=20.0532932584534
+## Epoch [403] Train: NLL=2.95940091012609, Perp=19.2864139984503
+## Epoch [434] Train: NLL=2.92603311380224, Perp=18.6534872738302
+## Epoch [465] Train: NLL=2.89482756896395, Perp=18.0803835531869
+## Epoch [496] Train: NLL=2.86668230478397, Perp=17.5786009078994
+## Epoch [527] Train: NLL=2.84089368534943, Perp=17.1310684830416
+## Epoch [558] Train: NLL=2.81725862932279, Perp=16.7309220880514
+## Epoch [589] Train: NLL=2.79518870141492, Perp=16.3657166956952
+## Epoch [620] Train: NLL=2.77445683225304, Perp=16.0299176962855
+## Epoch [651] Train: NLL=2.75490970113174, Perp=15.719621374694
+## Epoch [682] Train: NLL=2.73697900634351, Perp=15.4402696117257
+## Epoch [713] Train: NLL=2.72059739336781, Perp=15.1893935780915
+## Epoch [744] Train: NLL=2.70462837571585, Perp=14.948760335793
+## Epoch [775] Train: NLL=2.68909904683828, Perp=14.7184093476224
+## Epoch [806] Train: NLL=2.67460054451836, Perp=14.5065539595711
+## Epoch [837] Train: NLL=2.66078997776751, Perp=14.3075873113043
+## Epoch [868] Train: NLL=2.6476781639279, Perp=14.1212134100373
+## Epoch [899] Train: NLL=2.63529039846876, Perp=13.9473621677371
+## Epoch [930] Train: NLL=2.62367693518974, Perp=13.7863219168709
+## Epoch [961] Train: NLL=2.61238282674384, Perp=13.6314936713501
+## Iter [1] Train: Time: 10301.6818172932 sec, NLL=2.60536539345356, Perp=13.5361704272949
+## Iter [1] Val: NLL=2.26093848746227, Perp=9.59208699731232
 ```
-Epoch [31] Train: NLL=3.47213018872144, Perp=32.2052727363657
-...
-Epoch [961] Train: NLL=2.32060007657895, Perp=10.181782322355
-Iter [1] Train: Time: 186.397065639496 sec, NLL=2.31135356537961, Perp=10.0880702804858
-Iter [1] Val: NLL=1.94184484060012, Perp=6.97160060607419
-Epoch [992] Train: NLL=1.84784553299322, Perp=6.34613225095329
-...
-Epoch [1953] Train: NLL=1.70175791172558, Perp=5.48357857093351
-Iter [2] Train: Time: 188.929051160812 sec, NLL=1.70103940328978, Perp=5.47963998859367
-Iter [2] Val: NLL=1.74979316010449, Perp=5.75341251767988
-...
-Epoch [2914] Train: NLL=1.54738185300295, Perp=4.69915099483974
-Iter [3] Train: Time: 185.425321578979 sec, NLL=1.54604189517013, Perp=4.69285854740519
-Iter [3] Val: NLL=1.67780240235925, Perp=5.35377758479576
-Epoch [2945] Train: NLL=1.48868466087876, Perp=4.43126307034767
-...
-Iter [4] Train: Time: 185.487086296082 sec, NLL=1.4744973925858, Perp=4.36883940994296
-Iter [4] Val: NLL=1.64488167325603, Perp=5.18039689118454
-Epoch [3937] Train: NLL=1.46355541021581, Perp=4.32129622881604
-...
-Epoch [4898] Train: NLL=1.42900458455642, Perp=4.17454171976281
-Iter [5] Train: Time: 185.070136785507 sec, NLL=1.42909226256273, Perp=4.17490775130428
-Iter [5] Val: NLL=1.62716655804022, Perp=5.08943365437187
 
-```
 Inference from model
 --------------------
 helper function for random sample.
-```{r}
+
+```r
 cdf <- function(weights) {
     total <- sum(weights)
     result <- c()
@@ -224,16 +258,14 @@ choice <- function(weights) {
 }
 ```
 we can use random output or fixed output by choosing largest probability.
-```{r}
-make.output <- function(prob, sample=FALSE, temperature=1.) {
+
+```r
+make.output <- function(prob, sample=FALSE) {
     if (!sample) {
         idx <- which.max(as.array(prob))
     }
     else {
-        scale_prob <- mx.nd.clip(prob, 1e-6, 1 - 1e-6)
-        rescale <- mx.nd.exp(mx.nd.log(scale_prob) / temperature)
-        rescale <- rescale / (as.array(mx.nd.sum(rescale))[1])
-        idx <- choice(rescale)
+        idx <- choice(prob)
     }
     return (idx)
 
@@ -242,7 +274,8 @@ make.output <- function(prob, sample=FALSE, temperature=1.) {
 
 In `mxnet`, we have a function called `mx.lstm.inference` so that users can build a inference from lstm model and then use function `mx.lstm.forward` to get forward output from the inference.
 Build inference from model.
-```{r}
+
+```r
 infer.model <- mx.lstm.inference(num.lstm.layer=num.lstm.layer,
                                  input.size=vocab,
                                  num.hidden=num.hidden,
@@ -252,7 +285,7 @@ infer.model <- mx.lstm.inference(num.lstm.layer=num.lstm.layer,
                                  ctx=mx.cpu())
 ```
 generate a sequence of 75 chars using function `mx.lstm.forward`.
-```
+```r
 start <- 'a'
 seq.len <- 75
 random.sample <- TRUE
@@ -273,4 +306,9 @@ The result:
 ```
 ah not a drobl greens
 Settled asing lately sistering sounted to their hight
-```
\ No newline at end of file
+```
+
+Other RNN models
+----------------
+In `mxnet`, other RNN models such as a custom RNN are also provided.
+- For a **custom RNN model**, you can replace `mx.lstm` with `mx.rnn` to train an RNN model. You can also replace `mx.lstm.inference` and `mx.lstm.forward` with `mx.rnn.inference` and `mx.rnn.forward` to build an inference model from the trained RNN and get the forward result from it.
\ No newline at end of file
diff --git a/docs/packages/r/index.md b/docs/packages/r/index.md
index ef427abc4899..829ca1d995e7 100644
--- a/docs/packages/r/index.md
+++ b/docs/packages/r/index.md
@@ -20,7 +20,7 @@ Tutorials
 * [Handwritten Digits Classification Competition](mnistCompetition.md)
 * [Tutorial on NDArray and Symbol](ndarrayAndSymbolTutorial.md)
 * [Tutorial on Callback Functions](CallbackFunctionTutorial.md)
-* [Character Language Model using RNN Model](CharRnnModel.Rmd)
+* [Character Language Model using RNN Model](CharRnnModel.md)
 
 Resources
 ---------
diff --git a/example/rnn/README.md b/example/rnn/README.md
index c3b6e225add8..294e7726268e 100644
--- a/example/rnn/README.md
+++ b/example/rnn/README.md
@@ -10,11 +10,6 @@ This folder contains RNN examples using low level symbol interface.
 - [gru_bucketing.py](gru_bucketing.py) PennTreeBank language model by using GRU
 - [char-rnn.ipynb](char-rnn.ipynb) Notebook to demo how to train a character LSTM by using ```lstm.py```
 
-## R
-
-- [lstm.R](lstm.R) Functions for building a LSTM Network
-- [char_lstm.R](char_lstm.R) demo how to train a character LSTM by using ```lstm.R```
-
 
 Performance Note:
 More ```MXNET_GPU_WORKER_NTHREADS``` may lead to better performance. For setting ```MXNET_GPU_WORKER_NTHREADS```, please refer to [Environment Variables](https://mxnet.readthedocs.org/en/latest/how_to/env_var.html).

From 68bda2a953024c2421db94f00dec95045e8f95f6 Mon Sep 17 00:00:00 2001
From: Jianfeng Zhu <eric.zjf@gmail.com>
Date: Sat, 18 Jun 2016 22:50:41 +0800
Subject: [PATCH 034/126] Update warpctc-inl.h (#2437)

* Update warpctc-inl.h

avoid compile error in cpu-only mode

* [OP] Support softmax with probability label (#2456)

* don't run char rnn test in R (#2459)

* [OP] add slice_axis (#2458)

* [OP] add slice

* optimization

* fix some typos (#2450)

* add warpctc config to config.mk (#2451)

* update lstm.py reshape using shape (#2446)

changing target_shape to shape

* fix worker node could not exit (#2440)

* add target_shape and adj to deconv for friendly using (#2445)

* add target_shape and adj to deconv for friendly using

* fix typo

* Add RNN model in R (#2417)

* refactor lstm model to separate reusable rnn function like training from lstm model

* add Custom RNN model in R

* modify the documentation to add explanation of RNN and fix some typos

* remove some unuseful comments

* fix typo
---
 plugin/warpctc/warpctc-inl.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/plugin/warpctc/warpctc-inl.h b/plugin/warpctc/warpctc-inl.h
index 8ed155e53480..aac8ff9e5568 100644
--- a/plugin/warpctc/warpctc-inl.h
+++ b/plugin/warpctc/warpctc-inl.h
@@ -125,7 +125,9 @@ class WarpCTCOp : public Operator {
                    "Error: get_workspace_size in inf_test");
     void* ctc_workspace;
 
+#if MXNET_USE_CUDA
     cudaError_t cuda_status;
+#endif
     float* activations = static_cast<float*>(data.dptr_);
     int* flat_labels = static_cast<int*>(label.dptr_);
     int* cpu_labels = flat_labels;

From 7693b93319d74462d53a30c04033a5104e5e0db1 Mon Sep 17 00:00:00 2001
From: tmatas <matassini.tommaso@gmail.com>
Date: Sat, 18 Jun 2016 19:46:01 +0200
Subject: [PATCH 035/126] Deleted make_list.py, fixed a typo in im2rec.py
 (#2463)

---
 tools/im2rec.py    |  8 ++---
 tools/make_list.py | 73 ----------------------------------------------
 2 files changed, 4 insertions(+), 77 deletions(-)
 delete mode 100644 tools/make_list.py

diff --git a/tools/im2rec.py b/tools/im2rec.py
index 81e388e0f363..c602f773bb03 100644
--- a/tools/im2rec.py
+++ b/tools/im2rec.py
@@ -190,9 +190,9 @@ def main():
     cgroup.add_argument('--exts', type=list, default=['.jpeg', '.jpg'],
                         help='list of acceptable image extensions.')
     cgroup.add_argument('--chunks', type=int, default=1, help='number of chunks.')
-    cgroup.add_argument('--train_ratio', type=float, default=1.0,
+    cgroup.add_argument('--train-ratio', type=float, default=1.0,
                         help='Ratio of images to use for training.')
-    cgroup.add_argument('--test_ratio', type=float, default=0,
+    cgroup.add_argument('--test-ratio', type=float, default=0,
                         help='Ratio of images to use for testing.')
     cgroup.add_argument('--recursive', type=bool, default=False,
                         help='If true recursively walk through subdirs and assign an unique label\
@@ -203,7 +203,7 @@ def main():
     rgroup.add_argument('--resize', type=int, default=0,
                         help='resize the shorter edge of image to the newsize, original images will\
         be packed by default.')
-    rgroup.add_argument('--center_crop', type=bool, default=False,
+    rgroup.add_argument('--center-crop', type=bool, default=False,
                         help='specify whether to crop the center image to make it rectangular.')
     rgroup.add_argument('--quality', type=int, default=80,
                         help='JPEG quality for encoding, 1-100; or PNG compression for encoding, 1-9')
@@ -218,7 +218,7 @@ def main():
         -1:Loads image as such including alpha channel.')
     rgroup.add_argument('--encoding', type=str, default='.jpg', choices=['.jpg', '.png'],
                         help='specify the encoding of the images.')
-    rgroup.add_argument('--saving_folder', type=str, default='.',
+    rgroup.add_argument('--saving-folder', type=str, default='.',
                         help='folder in which .rec files will be saved.')
     rgroup.add_argument('--shuffle', default=True, help='If this is set as True, \
         im2rec will randomize the image order in <prefix>.lst')
diff --git a/tools/make_list.py b/tools/make_list.py
deleted file mode 100644
index 578f2c4c3283..000000000000
--- a/tools/make_list.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import os
-import random
-import numpy as np
-import argparse
-
-def list_image(root, recursive, exts):
-    image_list = []
-    if recursive:
-        cat = {}
-        for path, subdirs, files in os.walk(root):
-            print path
-            for fname in files:
-                fpath = os.path.join(path, fname)
-                suffix = os.path.splitext(fname)[1].lower()
-                if os.path.isfile(fpath) and (suffix in exts):
-                    if path not in cat:
-                        cat[path] = len(cat)
-                    image_list.append((os.path.relpath(fpath, root), cat[path]))
-    else:
-        for fname in os.listdir(root):
-            fpath = os.path.join(root, fname)
-            suffix = os.path.splitext(fname)[1].lower()
-            if os.path.isfile(fpath) and (suffix in exts):
-                image_list.append((os.path.relpath(fpath, root), 0))
-    return image_list
-
-def write_list(path_out, image_list):
-    with open(path_out, 'w') as fout:
-        for i in xrange(len(image_list)):
-            fout.write('%d\t%d\t%s\n'%(i, image_list[i][1], image_list[i][0]))
-
-
-def make_list(prefix_out, root, recursive, exts, num_chunks, train_ratio):
-    image_list = list_image(root, recursive, exts)
-    random.shuffle(image_list)
-    N = len(image_list)
-    chunk_size = (N+num_chunks-1)/num_chunks
-    for i in xrange(num_chunks):
-        chunk = image_list[i*chunk_size:(i+1)*chunk_size]
-        if num_chunks > 1:
-            str_chunk = '_%d'%i
-        else:
-            str_chunk = ''
-        if train_ratio < 1:
-            sep = int(chunk_size*train_ratio)
-            write_list(prefix_out+str_chunk+'_train.lst', chunk[:sep])
-            write_list(prefix_out+str_chunk+'_val.lst', chunk[sep:])
-        else:
-            write_list(prefix_out+str_chunk+'.lst', chunk)
-
-def main():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        description='Make image list files that are\
-        required by im2rec')
-    parser.add_argument('root', help='path to folder that contain images.')
-    parser.add_argument('prefix', help='prefix of output list files.')
-    parser.add_argument('--exts', type=str, nargs='+', default=['.jpeg','.jpg'],
-        help='list of acceptable image extensions.')
-    parser.add_argument('--chunks', type=int, default=1, help='number of chunks.')
-    parser.add_argument('--train_ratio', type=float, default=1.0,
-        help='Percent of images to use for training.')
-    parser.add_argument('--recursive', type=bool, default=False,
-        help='If true recursively walk through subdirs and assign an unique label\
-        to images in each folder. Otherwise only include images in the root folder\
-        and give them label 0.')
-    args = parser.parse_args()
-    
-    make_list(args.prefix, args.root, args.recursive,
-        args.exts, args.chunks, args.train_ratio)
-
-if __name__ == '__main__':
-    main()

From 30b97b6ef193e8d265144a4e7b4c8154415619f5 Mon Sep 17 00:00:00 2001
From: yuwei wang <yuweiw823@gmail.com>
Date: Sat, 18 Jun 2016 10:46:17 -0700
Subject: [PATCH 036/126] start work on the docs (#2431)

---
 .gitignore                                  |   1 +
 docs/_static/js/auto_module_index.js        |   3 +-
 docs/_static/mxnet-theme/index.html         |  14 +-
 docs/_static/mxnet-theme/layout.html        |   2 +-
 docs/_static/mxnet-theme/navbar.html        |  57 ++-
 docs/_static/mxnet.css                      | 399 +++++++++++++++++---
 docs/_static/selectlang.js                  |  25 ++
 docs/zh/system/{engine.md => dep_engine.md} |   0
 docs/zh/system/index.md                     |   2 +-
 9 files changed, 433 insertions(+), 70 deletions(-)
 create mode 100644 docs/_static/selectlang.js
 rename docs/zh/system/{engine.md => dep_engine.md} (100%)

diff --git a/.gitignore b/.gitignore
index 6d497340ad78..3eda10de52c5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -109,3 +109,4 @@ scala-package/*/*/target/
 *.settings
 !scala-package/*/bin
 *.bak
+*/node_modules/
diff --git a/docs/_static/js/auto_module_index.js b/docs/_static/js/auto_module_index.js
index b918ecdc1635..e0238ed391f8 100644
--- a/docs/_static/js/auto_module_index.js
+++ b/docs/_static/js/auto_module_index.js
@@ -21,5 +21,4 @@ function auto_index(module) {
     html += "</ul>";
     li_node.append(html);
   });
-}
-
+}
\ No newline at end of file
diff --git a/docs/_static/mxnet-theme/index.html b/docs/_static/mxnet-theme/index.html
index e0898a8b0567..a0901e42783d 100644
--- a/docs/_static/mxnet-theme/index.html
+++ b/docs/_static/mxnet-theme/index.html
@@ -1,8 +1,12 @@
-<div class="splash">
+<div id="splash">
   <div class="container">
     <div class="row">
       <div class="col-lg-12">
-        <h1>Flexible and Efficient Library for Deep Learning</h1>
+        <div id="banner-title">Flexible and Efficient Library for Deep Learning</div>
+        <div id="get_start">
+          <a href="get_started/" id="get_start_btn">Get Started</a>
+        </div>
+        <!-- end of get started button -->
         <div id="social">
           <span>
             <iframe src="https://ghbtns.com/github-btn.html?user=dmlc&repo=mxnet&type=star&count=true&v=2"
@@ -10,10 +14,8 @@ <h1>Flexible and Efficient Library for Deep Learning</h1>
             <iframe src="https://ghbtns.com/github-btn.html?user=dmlc&repo=mxnet&type=fork&count=true&v=2"
                     frameborder="0" scrolling="0" width="100px" height="20px"></iframe>
           </span>
-        </div> <!-- end of social -->
-        <div class="get_start">
-          <a href="get_started/" class="get_start_btn">Get Started</a>
-        </div> <!-- end of get started button -->
+        </div>
+        <!-- end of social -->
       </div>
     </div>
   </div>
diff --git a/docs/_static/mxnet-theme/layout.html b/docs/_static/mxnet-theme/layout.html
index 62ebf399e936..62d784adf8ee 100644
--- a/docs/_static/mxnet-theme/layout.html
+++ b/docs/_static/mxnet-theme/layout.html
@@ -56,7 +56,7 @@
       };
     </script>
 
-    {% for name in ['jquery.js', 'underscore.js', 'doctools.js', 'searchtools.js'] %}
+    {% for name in ['jquery.js', 'underscore.js', 'doctools.js', 'searchtools.js', 'selectlang.js'] %}
     <script type="text/javascript" src="{{ pathto('_static/' + name, 1) }}"></script>
     {% endfor %}
 
diff --git a/docs/_static/mxnet-theme/navbar.html b/docs/_static/mxnet-theme/navbar.html
index 587665f7912c..127cb40b2045 100644
--- a/docs/_static/mxnet-theme/navbar.html
+++ b/docs/_static/mxnet-theme/navbar.html
@@ -1,3 +1,4 @@
+<!-- Previous Navbar Layout
 <div class="navbar navbar-default navbar-fixed-top">
   <div class="container">
     <div class="navbar-header">
@@ -35,6 +36,60 @@
         <li> <a href="{{url_root}}index.html"><span class="flag-icon flag-icon-us"></span></a> </li>
         <li> <a href="{{url_root}}/zh/index.html"><span class="flag-icon flag-icon-cn"></span></a> </li>
       </ul>
-    </div> <!-- navbar -->
+    </div>
+  </div>
+</div>
+Previous Navbar Layout End -->
+
+<div class="navbar navbar-fixed-top">
+  <div class="container">
+    <div id="header-inner" class="innder">
+      <h1 id="logo-wrap">
+        <a href="{{ url_root }}" id="logo"><img src="http://data.dmlc.ml/theme/mxnet.png"></a>
+      </h1>
+      <nav id="main-nav">
+        {% for name in ['Get Started', 'Tutorials', 'How To'] %}
+        <a class="main-nav-link" href="{{url_root}}{{name.lower()|replace(" ", "_")}}/index.html">{{name}}</a>
+        {% endfor %}
+
+        {% for name in ['Packages'] %}
+        <span id="dropdown-menu-position-anchor">
+          <a href="#" class="main-nav-link dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">{{name}} <span class="caret"></span></a>
+          <ul id="package-dropdown-menu" class="dropdown-menu">
+            {% for lang in ['Python', 'R', 'Julia', 'C++', 'Scala'] %}
+            <li><a href="{{url_root}}{{name.lower()|replace(" ", "_")}}/{{lang.lower()}}/index.html">
+                {{lang}}
+            </a></li>
+            {% endfor %}
+          </ul>
+          {% endfor %}
+        </span>
+        <a class="main-nav-link" href="{{url_root}}system/index.html">System</a>
+        <div id="search-input-wrap">
+          <div id="search-input-icon">
+            <i class="fa fa-search"></i>
+          </div>
+          <input type="search" id="search-input" placeholder="Search...">
+          {{searchform('', False)}}
+        </div>
+
+      </nav>
+      <script> function getRootPath(){ return "{{url_root}}" } </script>
+      <div id="lang-select-wrap"> 
+        <label id="lang-select-label">
+          <!-- <i class="fa fa-globe"></i> -->
+          <span></span>
+        </label>
+        <select id="lang-select">
+          <option value="en">English</option>
+          <option value="zh">简体中文</option>
+        </select>
+      </div>
+  <!--     <a id="mobile-nav-toggle">
+        <span class="mobile-nav-toggle-bar"></span>
+        <span class="mobile-nav-toggle-bar"></span>
+        <span class="mobile-nav-toggle-bar"></span>
+      </a> -->
+    </div>
   </div>
 </div>
diff --git a/docs/_static/mxnet.css b/docs/_static/mxnet.css
index f4862a706b9b..83de570c5683 100644
--- a/docs/_static/mxnet.css
+++ b/docs/_static/mxnet.css
@@ -1,6 +1,300 @@
+/* basic style */
+a, abbr, acronym, address, applet, big, blockquote, body, caption, cite, code, dd, del, dfn, div, dl, dt, em, fieldset, form, h1, h2, h3, h4, h5, h6, html, iframe, img, ins, kbd, label, legend, li, object, ol, p, q, s, samp, small, span, strike, strong, sub, sup, table, tbody, td, tfoot, th, thead, tr, tt, ul, var {
+    margin: 0;
+    padding: 0;
+    border: 0;
+    outline: 0;
+    font-weight: inherit;
+    font-style: inherit;
+    font-family: inherit;
+    font-size: 100%; /* no space before the unit: "100 %" is not a valid value and the declaration is ignored */
+    vertical-align: baseline
+}
+
+body {
+    background: #fff;
+    color: #000;
+    font-family: Lato, Helvetica, "Helvetica Neue", Arial, sans-serif;
+    font-size: 16px;
+    font-weight: 400;
+    line-height: 1.6;
+    text-rendering: optimizeLegibility;
+    -webkit-font-smoothing: antialiased;
+    -moz-osx-font-smoothing: grayscale;
+}
+
+ol,
+ul {
+    list-style: none
+}
+
+table {
+    border-collapse: separate;
+    border-spacing: 0
+}
+
+caption,
+table,
+td,
+th {
+    vertical-align: middle
+}
+
+caption,
+td,
+th {
+    text-align: left;
+    font-weight: 400
+}
+
+a:hover, 
+a:focus,
+a:active {
+    text-decoration: none;
+}
+
+a img {
+    border: none
+}
+
+
+
+html {
+    box-sizing: border-box;
+}
+
+*,
+*:after,
+*:before {
+    box-sizing: inherit /* pseudo-class colon must touch the name; one invalid selector in the list voids the whole rule */
+}
+
+button::-moz-focus-inner,
+input[type=button]::-moz-focus-inner,
+input[type=reset]::-moz-focus-inner,
+input[type=submit]::-moz-focus-inner {
+    padding: 0;
+    margin: 0;
+    border: 0
+}
+
+button,
+input,
+select {
+    margin: 0;
+    padding: 0;
+    border: 0
+}
+
+@media screen {
+    body,
+    html {
+        height: 100%; /* "100 %" (with a space) is not a valid percentage and is ignored */
+    }
+}
+/* basic end */
+
+/*** code style ***/
+/*code block style*/
+.highlight {
+    border-radius: 4px;
+}
+
+pre {
+    border: 0;
+    line-height: 1.6;
+    margin: 0 0 16px;
+    padding: 10px 16px;
+    word-break: break-all;
+    word-wrap: break-word;
+}
+
+/*code inline style*/
+code.docutils, code.literal {
+    padding: 3px 5px;
+}
+/*** code style end ***/
+
+body > .container {
+    padding-top: 80px
+}
+
 /* header section */
-.splash{
-    padding:5em 0 1em 0;
+/* navbar */
+.navbar {
+    background-color:#0079b2;
+    opacity: 0.9;
+    border: 0px;
+    height: 60px;
+    padding: 0 80px;
+    margin-bottom: 0px;
+}
+
+#header-inner {
+    display: -webkit-box;
+    display: -webkit-flex;
+    display: -ms-flexbox;
+    display: box;
+    display: flex;
+    -webkit-box-orient: horizontal;
+    -moz-box-orient: horizontal;
+    -webkit-box-lines: single;
+    -moz-box-lines: single;
+    -webkit-flex-flow: row nowrap;
+    -ms-flex-flow: row nowrap;
+    flex-flow: row;
+    -webkit-box-align: center;
+    -ms-flex-align: center;
+    -webkit-align-items: center;
+    align-items: center
+}
+
+@media screen and (max-width: 768px) { /* needs a space after "and" and no space inside "768px", otherwise the query never matches */
+    #header-inner {
+        -webkit-box-pack: center;
+        -ms-flex-pack: center;
+        -webkit-justify-content: center;
+        justify-content: center
+    }
+}
+
+#logo-wrap {
+    -webkit-box-flex: 1;
+    box-flex: 1;
+    -webkit-flex: 0 50px; /* length must be written without a space ("50 px" is invalid and drops the declaration) */
+    -ms-flex: 0 50px;
+    flex: 0 50px
+}
+
+#logo {
+    width: 150px;
+    display: block;
+    float: left;
+    height: 60px;
+    padding: 10px 0 0 0;
+}
+
+#logo > img {
+  display: block;
+  width: 110px;
+}
+
+#main-nav {
+    display: none;
+    -webkit-box-flex: 1;
+    box-flex: 1;
+    -webkit-flex: 1 auto;
+    -ms-flex: 1 auto;
+    flex: 1 auto;
+}
+
+@media screen and (min-width:769px) {
+    #main-nav {
+        display: block
+    }
+}
+
+.main-nav-link {
+    color: #fff;
+    text-decoration: none;
+    line-height: 50px;
+    opacity: .7;
+    -webkit-transition: .2s;
+    transition: .2s;
+    font-family: Lato, "Helvetica Neue", Helvetica, Arial, sans-serif;
+    display: inline-block;
+    padding: 0 15px
+}
+
+.main-nav-link:hover {
+    opacity: 1;
+    color: #1094e8;
+    text-decoration: none;
+}
+
+#dropdown-menu-position-anchor {
+    position: relative;
+}
+
+#package-dropdown-menu {
+    top: 36px;
+    border-radius: 4px; 
+    padding: 0;
+}
+
+#package-dropdown-menu > li > a {
+    color: #0079b2;
+    padding: 6px 16px;
+
+}
+
+#search-input-wrap {
+    display: none;
+    padding-left: 6px;
+    padding-bottom: 8px;
+    border-bottom: 1px solid #999
+}
+
+#search-input-icon,
+#search-input-wrap.on {
+    display: inline-block
+}
+
+#search-input-icon {
+    color: #fff;
+    padding-right: .5em;
+    opacity: .7
+}
+
+#search-input {
+    background: none;
+    font-size: inherit;
+    font-family: Lato, Helvetica Neue, Helvetica, Arial, sans-serif;
+    color: #fff;
+    outline: none;
+    -webkit-appearance: none
+}
+
+#lang-select-wrap {
+    display: none;
+    position: relative
+}
+
+@media screen and (min-width:769px) {
+    #lang-select-wrap {
+        display: block
+    }
+}
+
+#lang-select-label {
+    color: #fff;
+    opacity: .7;
+    font-family: Lato, Helvetica Neue, Helvetica, Arial, sans-serif;
+    line-height: 50px
+}
+
+#lang-select-label span {
+    padding-left: 8px
+}
+
+#lang-select-label i {
+    opacity: .7
+}
+
+#lang-select {
+    opacity: 0;
+    position: absolute;
+    top: 0;
+    left: 0;
+    width: 100%;
+    height: 100%;
+    -webkit-appearance: menulist-button;
+    font-size: inherit
+}
+
+/* banner */
+#splash{
+    padding:60px 0 0 0;
     background-color:#0079b2;
     /* background-image:url(../img/bg.jpg); */
     background-size:cover;
@@ -9,27 +303,59 @@
     text-align:center
 }
 
-.splash h1{
+#splash #banner {
+    text-align: center
+}
+
+#splash #banner-title {
+    padding: 20px 0 10px 0;
+    font-size: 40px;
+    line-height: 1.15;
+    font-weight: 300;
+    font-family: Lato, Helvetica Neue, Helvetica, Arial, sans-serif;
+}
+
+@media screen and (min-width:769px) {
+    #splash #banner-title {
+        padding-top: 100px;
+    }
+}
+
+#splash h1{
     font-size: 40px;
     margin-bottom: 20px;
 }
-.splash .social{
-    margin:2em 0
+
+#splash #social{
+    margin:2em 0 4em 0;
 }
 
-.splash .get_start {
+#splash #get_start {
     margin:2em 0
 }
 
-.splash .get_start_btn {
-  border: 2px solid #FFFFFF;
-  border-radius: 5px;
-  color: #FFFFFF;
-  display: inline-block;
-  font-size: 26px;
-  padding: 9px 20px;
+#splash #get_start_btn {
+    border: 1.8px solid #FFFFFF;
+    border-radius: 2px;
+    color: #FFFFFF;
+    display: inline-block;
+    font-size: 22px;
+    font-family: Helvetica, Helvetica Neue, Arial, sans-serif;
+    padding: 8px 20px;
+    -webkit-transition: .2s;
+    transition: .2s;
+}
+
+#splash #get_start_btn:hover {
+    background-color: #FFFFFF;
+    color: #0079b2;
+    opacity: 0.9;
 }
 
+
+
+
+
 .section-tout{
     padding:3em 0 3em;
     border-bottom:1px solid rgba(0,0,0,.05);
@@ -112,45 +438,11 @@ div.sphinxsidebar ul ul { margin-left: 15px }
     padding-right: 15px
 }
 
-body>.container {
-    padding-top: 80px
-}
 
-body {
-    font-size: 16px;
-}
 
-pre {
-    font-size: 14px;
-}
 
-/* navbar */
-.navbar {
-    background-color:#0079b2;
-    border: 0px;
-    height: 65px;
-}
-.navbar-right li {
-    display:inline-block;
-    vertical-align:top;
-    padding: 22px 4px;
-}
-
-.navbar-left li {
-    display:inline-block;
-    vertical-align:top;
-    padding: 17px 10px;
-    /* margin: 0 5px; */
-}
+/*embed end */
 
-.navbar-left li a {
-    font-size: 22px;
-    color: #fff;
-}
-
-.navbar-left > li > a:hover{
-    color:#fff;
-}
 .flag-icon {
   background-size: contain;
   background-position: 50%;
@@ -179,18 +471,7 @@ pre {
 /*    padding: 10px; */
 /* } */
 
-.navbar-brand >img {
-    width: 110px;
-}
 
-.dropdown-menu li  {
-    padding: 0px 0px;
-    width: 120px;
-}
-.dropdown-menu li a {
-    color: #0079b2;
-    font-size: 20px;
-}
 
 .section h1 {
     padding-top: 90px;
diff --git a/docs/_static/selectlang.js b/docs/_static/selectlang.js
new file mode 100644
index 000000000000..25337abcb22b
--- /dev/null
+++ b/docs/_static/selectlang.js
@@ -0,0 +1,25 @@
+function changeLanguage(langSelect, langSelectLabel, rootpath){
+	langSelect.change(function() {
+		var lang = langSelect.val();
+		if(lang == 'zh'){
+			location.href = rootpath + 'zh/index.html';
+		} else {
+			location.href = rootpath + 'index.html';	
+		}
+	});
+}
+
+$(document).ready(function () {
+	var langSelect = $("#lang-select");
+	var langSelectLabel = $("#lang-select-label > span");
+	currHref = location.href;
+	
+	if(/\/zh\//.test(currHref)){
+		langSelect.val("zh");
+	} else {
+		langSelect.val("en");
+	}
+	langSelectLabel.text($("option:selected").text());
+
+	changeLanguage(langSelect, langSelectLabel, getRootPath());
+})
\ No newline at end of file
diff --git a/docs/zh/system/engine.md b/docs/zh/system/dep_engine.md
similarity index 100%
rename from docs/zh/system/engine.md
rename to docs/zh/system/dep_engine.md
diff --git a/docs/zh/system/index.md b/docs/zh/system/index.md
index 2798b9531daa..31811ca34547 100644
--- a/docs/zh/system/index.md
+++ b/docs/zh/system/index.md
@@ -21,7 +21,7 @@
 
 上面显示的是 mxnet 的主要的模块以及它们之间如何进行交互. 这些模块是
 
-- [运行时依赖引擎](engine.md): 根据操作的读写依赖关系来调度和执行这些操作.
+- [运行时依赖引擎](dep_engine.md): 根据操作的读写依赖关系来调度和执行这些操作.
 - Storage Allocator: 可以高效的申请内存和重复利用内存, 包括 CPU 的主存和 GPU 的显存.
 - Resource Manager: 管理全局资源, 包括 随机数产生器以及临时空间.
 - NDArray: 动态的,异步的n维数组, 为MXNet 提供命令式编程模型.

From 2d0ca16c76b6ff3cd325cb59a2108f01cfef1d64 Mon Sep 17 00:00:00 2001
From: yuwei wang <yuweiw823@gmail.com>
Date: Sat, 18 Jun 2016 19:46:19 -0700
Subject: [PATCH 037/126] add type="text/css" for mxnet.css (#2466)

---
 docs/_static/mxnet-theme/layout.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/_static/mxnet-theme/layout.html b/docs/_static/mxnet-theme/layout.html
index 62d784adf8ee..1f444c640dbf 100644
--- a/docs/_static/mxnet-theme/layout.html
+++ b/docs/_static/mxnet-theme/layout.html
@@ -76,7 +76,7 @@
     <link rel="stylesheet" href="{{ pathto('_static/pygments.css', 1) }}" type="text/css" />
     {%- endif %}
 
-    <link rel="stylesheet" href="{{ pathto('_static/mxnet.css', 1) }}">
+    <link rel="stylesheet" href="{{ pathto('_static/mxnet.css', 1) }}" type="text/css">
 {%- endmacro %}
 
 <html lang="en">

From 8c4c00558d539a1c3b1900af5914aec73a6180fc Mon Sep 17 00:00:00 2001
From: Sheng Li <qcl6355@gmail.com>
Date: Sun, 19 Jun 2016 20:31:10 +0800
Subject: [PATCH 038/126] rm lstm-model in this branch

---
 example/lstm-word-segment/data_helper.py      | 102 ------
 example/lstm-word-segment/lstm.py             | 295 ------------------
 .../lstm-word-segment/predict/cpp/Makefile    |  31 --
 .../predict/cpp/lstm-word-segment-predict     | Bin 36554 -> 0 bytes
 .../predict/cpp/lstm-word-segment-predict.cc  | 205 ------------
 .../predict/lstm-word-segment-predict.cc      | 220 -------------
 .../lstm-word-segment/predict/lstm_predict.py |  68 ----
 .../predict/mxnet_predict.py                  | 210 -------------
 example/lstm-word-segment/train.py            |  54 ----
 9 files changed, 1185 deletions(-)
 delete mode 100755 example/lstm-word-segment/data_helper.py
 delete mode 100644 example/lstm-word-segment/lstm.py
 delete mode 100644 example/lstm-word-segment/predict/cpp/Makefile
 delete mode 100755 example/lstm-word-segment/predict/cpp/lstm-word-segment-predict
 delete mode 100644 example/lstm-word-segment/predict/cpp/lstm-word-segment-predict.cc
 delete mode 100644 example/lstm-word-segment/predict/lstm-word-segment-predict.cc
 delete mode 100755 example/lstm-word-segment/predict/lstm_predict.py
 delete mode 100644 example/lstm-word-segment/predict/mxnet_predict.py
 delete mode 100755 example/lstm-word-segment/train.py

diff --git a/example/lstm-word-segment/data_helper.py b/example/lstm-word-segment/data_helper.py
deleted file mode 100755
index 0632a7e7aba6..000000000000
--- a/example/lstm-word-segment/data_helper.py
+++ /dev/null
@@ -1,102 +0,0 @@
-#!/usr/bin/env python
-import sys
-import codecs
-import numpy as np
-
-LabelVocab = {'B':0, 'M':1, 'E':2, 'S':3}
-
-def gold_to_conll(infile):
-    for line in codecs.open(infile, 'r', 'utf-8'):
-        words = line.strip().split()
-        for word in words:
-            num_chars = len(word)
-            for idx, char in enumerate(word):
-                char = char.encode('utf-8')
-                if num_chars == 1:
-                    print '%s\t%s' % (char, 'S')
-                else:
-                    if idx == 0:
-                        print '%s\t%s' % (char, 'B')
-                    elif idx == num_chars - 1:
-                        print '%s\t%s' % (char, 'E')
-                    else:
-                        print '%s\t%s' % (char, 'M')
-        print
-
-def load_data(infile, vocab=None, train=True):
-    if vocab is None:
-        vocab = {}
-        vocab['#_beg_#'] = 0
-        vocab['#_end_#'] = 1
-        vocab['#_unknown_#'] = 2
-    X_data = []
-    y_data = []
-    x = []
-    y = []
-    for line in open(infile):
-        line = line.strip()
-        if line == "": # begin a new sentence:
-            if len(x) != 0:
-                X_data.append(x)
-                y_data.append(y)
-                x = []
-                y = []
-        else:
-            w, label = line.split('\t')
-            y.append(LabelVocab[label])
-            if w not in vocab:
-                if train:
-                    vocab[w] = len(vocab)
-                    x.append(vocab[w])
-                else:
-                    x.append(vocab['#_unknown_#'])
-            else:
-                x.append(vocab[w])
-    
-    if len(x) != 0:
-        X_data.append(x)
-        y_data.append(y)
-    return X_data, y_data, vocab
-
-def reshape_data(sentences, labels, vocab, context_size=5, step=10):
-    padding_num = int((context_size - 1) / 2)
-    x = []
-    y = []
-    for sen, label in zip(sentences, labels):
-        predict_word_num = len(sen)
-        add_num = step - predict_word_num % step
-        for i in range(add_num):
-            sen.append(vocab['#_end_#'])
-            label.append(LabelVocab['S'])
-
-        for _ in range(padding_num):
-            sen.insert(0, vocab['#_beg_#'])
-            sen.append(vocab['#_end_#'])
-        
-        x_t = []
-        y_t = []
-        for i in range(padding_num, len(sen)-padding_num):
-            if len(x_t) == step:
-                x.append(x_t)
-                y.append(y_t)
-                x_t = []
-                y_t = []
-            x_t.append(sen[i-padding_num:i+padding_num+1])
-            y_t.append(label[i-padding_num])
-
-        if len(x_t) == step:
-            x.append(x_t)
-            y.append(y_t)
-    
-    return np.array(x), np.array(y)
-    
-
-if __name__ == '__main__':
-    test_path = "test.conll"
-    x, y, vocab = load_data(test_path)
-    print 'vocab size %d' % (len(vocab))
-    X_data, y_data = reshape_data(x, y, vocab)
-    print X_data.shape, y_data.shape
-    print X_data[0]
-    print y_data[0]
-    
diff --git a/example/lstm-word-segment/lstm.py b/example/lstm-word-segment/lstm.py
deleted file mode 100644
index 20aced9f58bd..000000000000
--- a/example/lstm-word-segment/lstm.py
+++ /dev/null
@@ -1,295 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-import mxnet as mx
-import numpy as np
-import time
-import math
-from collections import namedtuple
-
-logs = sys.stderr
-
-LSTMState = namedtuple("LSTMState", ['c', 'h'])
-LSTMParam = namedtuple('LSTMParam', ['i2h_weight', 'i2h_bias', 'h2h_weight', 'h2h_bias'])
-LSTMModel = namedtuple('LSTMModel', ['lstm_exec', 'symbol', 'init_states', 'last_states', 'seq_data', 'seq_labels', 'seq_outputs', 'param_blocks'])
-
-
-def lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout):
-    """LSTM Memory Unit"""
-    i2h = mx.sym.FullyConnected(data=indata, weight=param.i2h_weight, bias=param.i2h_bias,
-                                num_hidden=num_hidden * 4, name='t%d_l%d_i2h' % (seqidx, layeridx))
-    h2h = mx.sym.FullyConnected(data=prev_state.h, weight=param.h2h_weight, bias=param.h2h_bias,
-                                num_hidden=num_hidden * 4, name='t%d_l%d_h2h' % (seqidx, layeridx))
-    gates = i2h + h2h
-    slice_gates = mx.sym.SliceChannel(gates, num_outputs=4, name='t%d_l%d_slice' % (seqidx, layeridx))
-
-    # input gate
-    input_gate = mx.sym.Activation(slice_gates[0], act_type='sigmoid')
-    input_transform = mx.sym.Activation(slice_gates[1], act_type='tanh')
-    # forget gate
-    forget_gate = mx.sym.Activation(slice_gates[2], act_type='sigmoid')
-    # output gate
-    output_gate = mx.sym.Activation(slice_gates[3], act_type='sigmoid')
-    next_c = (forget_gate * prev_state.c) + (input_gate * input_transform)
-    next_h = output_gate * mx.sym.Activation(next_c, act_type='tanh')
-
-    return LSTMState(c=next_c, h=next_h)
-
-
-def unroll_lstm(num_lstm_layer, num_hidden, step_size, context_size, vocab_size, num_embed, num_label, dropout=0.):
-    # initialize the parameter sysmbols
-    embed_weight = mx.sym.Variable('embed_weight')
-    cls_weight = mx.sym.Variable('cls_weight')
-    cls_bias = mx.sym.Variable('cls_bias')
-
-    param_cells = []
-    last_states = []
-    for i in range(num_lstm_layer):
-        param_cells.append(LSTMParam(i2h_weight=mx.sym.Variable('l%d_i2h_weight' % i),
-                                     i2h_bias=mx.sym.Variable('l%d_i2h_bias' % i),
-                                     h2h_weight=mx.sym.Variable('l%d_h2h_weight' % i),
-                                     h2h_bias=mx.sym.Variable('l%d_h2h_bias' % i)))
-        state = LSTMState(c=mx.sym.Variable('l%d_init_c' % i), h=mx.sym.Variable('l%d_init_h' % i))
-        last_states.append(state)
-
-    # embedding layer
-    # data = mx.sym.Variable('data')
-    # label = mx.sym.Variable('label')
-    # embed = mx.sym.Embedding(data=data, weight=embed_weight,
-    #         input_dim=vocab_size, output_dim=num_embed, name='embed')
-    # wordvec = mx.sym.SliceChannel(data=embed, num_outputs=context_size, squeeze_axis=1)
-    last_hidden = []
-    for seqidx in range(step_size):
-        # embedding layer
-        data = mx.sym.Variable("t%d_data" % seqidx)
-        hidden = mx.sym.Embedding(data=data, weight=embed_weight,
-                input_dim=vocab_size, output_dim=num_embed, name='t%d_embed' % seqidx)
-
-        # stack LSTM
-        for i in range(num_lstm_layer):
-            if i == 0:
-                dp = 0.
-            else:
-                dp = dropout
-            next_state = lstm(num_hidden, indata=hidden, prev_state=last_states[i],
-                              param=param_cells[i], seqidx=seqidx, layeridx=i, dropout=dropout)
-            hidden = next_state.h
-            last_states[i] = next_state
-
-        # decoder
-        if dropout > 0.:
-            hidden = mx.sym.Dropout(data=hidden, p=dropout)
-        last_hidden.append(hidden)
-
-    out_prob = []
-    for seqidx in range(step_size):
-        fc = mx.sym.FullyConnected(data=last_hidden[seqidx], weight=cls_weight,
-                bias=cls_bias, num_hidden=num_label, name='t%d_cls' % seqidx)
-        label = mx.sym.Variable('t%d_label' % seqidx)
-        sm = mx.sym.SoftmaxOutput(data=fc, label=label, name='t%d_sm' % seqidx)
-        out_prob.append(sm)
-
-    # concat = mx.sym.Concat(*last_hidden, dim=0)
-    # fc = mx.sym.FullyConnected(data=concat, weight=cls_weight, bias=cls_bias, num_hidden=num_label)
-    # label = mx.sym.Variable("label")
-    # sm = mx.sym.SoftmaxOutput(data=fc, label=label, name='sm')
-
-    # hidden_concat = mx.sym.Concat(*last_hidden, dim=0)
-    # use last hidden h as feature
-    # fc = mx.sym.FullyConnected(data=last_hidden[-1], weight=cls_weight, bias=cls_bias, num_hidden=num_label)
-    # sm = mx.sym.SoftmaxOutput(data=fc, label=label, name='sm')
-
-    # out_prob = [sm]
-
-    for i in range(num_lstm_layer):
-        state = last_states[i]
-        state = LSTMState(c=mx.sym.BlockGrad(state.c, name='l%d_last_c' % i),
-                          h=mx.sym.BlockGrad(state.h, name='l%d_last_h' % i))
-        last_states[i] = state
-
-    unpack_c = [state.c for state in last_states]
-    unpack_h = [state.h for state in last_states]
-    list_all = out_prob + unpack_c + unpack_h
-    return mx.sym.Group(list_all)
-
-
-def is_param_name(name):
-    return name.endswith('weight') or name.endswith('bias') or \
-        name.endswith('gamma') or name.endswith('beta')
-
-def setup_lstm_model(ctx, num_lstm_layer, step_size, context_size, num_hidden, num_embed, num_label,
-        batch_size, vocab_size, initializer, dropout=0.):
-
-    lstm_sym = unroll_lstm(num_lstm_layer=num_lstm_layer, num_hidden=num_hidden, step_size=step_size,
-                           context_size=context_size, vocab_size=vocab_size,
-                           num_embed=num_embed, num_label=num_label, dropout=dropout)
-
-    arg_names = lstm_sym.list_arguments()
-
-    input_shapes = {}
-    for name in arg_names:
-        if name.endswith('init_c') or name.endswith('init_h'):
-            input_shapes[name] = (batch_size, num_hidden)
-        elif name.endswith('data'):
-            input_shapes[name] = (batch_size, context_size)
-        elif name == 'label':
-            input_shapes[name] = (batch_size * step_size, )
-        elif name.endswith('label'):
-            input_shapes[name] = (batch_size,)
-        else:
-            pass
-
-    arg_shape, out_shape, aux_shape = lstm_sym.infer_shape(**input_shapes)
-    arg_arrays = [mx.nd.zeros(s, ctx) for s in arg_shape]
-    args_grad = {}
-    for shape, name, in zip(arg_shape, arg_names):
-        if is_param_name(name):
-            print >> logs, 'parameter argument', name, shape
-            args_grad[name] = mx.nd.zeros(shape, ctx)
-        else:
-            print >> logs, 'input argument', name, shape
-
-    lstm_exec = lstm_sym.bind(ctx=ctx, args=arg_arrays, args_grad=args_grad, grad_req='add')
-
-    param_blocks = []
-    arg_dict = dict(zip(arg_names, lstm_exec.arg_arrays))
-    for i, name in enumerate(arg_names):
-        if is_param_name(name):
-            initializer(name, arg_dict[name])
-            param_blocks.append( (i, arg_dict[name], args_grad[name], name) )
-        else:
-            assert name not in args_grad
-
-    out_dict = dict(zip(lstm_sym.list_outputs(), lstm_exec.outputs))
-
-    init_states = [LSTMState(c=arg_dict['l%d_init_c' % i],
-                             h=arg_dict['l%d_init_h' % i]) for i in range(num_lstm_layer)]
-    seq_data = [arg_dict['t%d_data' % i] for i in range(step_size)]
-    last_states = [LSTMState(c=out_dict['l%d_last_c_output' % i],
-                             h=out_dict['l%d_last_h_output' % i]) for i in range(num_lstm_layer)]
-    seq_outputs = [out_dict['t%d_sm_output' % i] for i in range(step_size)]
-    seq_labels = [arg_dict['t%d_label' % i] for i in range(step_size)]
-
-    return LSTMModel(lstm_exec=lstm_exec, symbol=lstm_sym, init_states=init_states,
-                     last_states=last_states, seq_data=seq_data, seq_labels=seq_labels,
-                     seq_outputs=seq_outputs, param_blocks=param_blocks)
-
-
-def set_lstm_inputs(m, x_batch, y_batch):
-    step_size = len(m.seq_data)
-    batch_size = m.seq_data[0].shape[0]
-    # print 'x batch shape %s' % str(x_batch[:, 0, :].shape)
-    # print 'y batch shape %s' % str(y_batch.shape)
-    for seqidx in range(step_size):
-        m.seq_data[seqidx][:] = x_batch[:, seqidx, :]
-        m.seq_labels[seqidx][:] = y_batch[:, seqidx]
-
-
-# shape : num-instance * context-size
-def train_lstm(model, X_train_batch, y_train_batch, X_val_batch, y_val_batch,
-        num_epoch, optimizer='RMSProp', max_grad_norm=5.0, learning_rate=0.001, **kwargs):
-    print >> logs, 'Training with train shape=%s' % str(X_train_batch.shape)
-    print >> logs, 'Training with dev shape=%s' % str(X_val_batch.shape)
-
-    m = model
-    batch_size = m.seq_data[0].shape[0]
-    step_size = len(m.seq_data)
-    print >> logs, 'batch_size=%d' % batch_size
-    print >> logs, 'step_size=%d' % step_size
-    eta = 1e-4
-
-    opt = mx.optimizer.create(optimizer, **kwargs)
-    opt.lr = learning_rate
-    updater = mx.optimizer.get_updater(opt)
-
-    for iteration in range(num_epoch):
-        # reset states
-        for state in m.init_states:
-            state.c[:] = 0.0
-            state.h[:] = 0.0
-
-        tic = time.time()
-        num_correct = 0.
-        num_total = 0.
-        for begin in range(0, X_train_batch.shape[0], batch_size):
-            batchX = X_train_batch[begin:begin+batch_size]
-            batchY = y_train_batch[begin:begin+batch_size]
-            if batchX.shape[0] != batch_size:
-                continue
-
-            # m.seq_data[:] = batchX
-            # m.seq_labels[:] = batchY
-            set_lstm_inputs(m, batchX, batchY)
-
-            m.lstm_exec.forward(is_train=True)
-
-            m.lstm_exec.backward()
-            # transfer the states
-            for init, last in zip(m.init_states, m.last_states):
-                last.c.copyto(init.c)
-                last.h.copyto(init.h)
-
-            # update parameters
-            norm = 0.
-            for idx, weight, grad, name in m.param_blocks:
-                grad /= batch_size
-                l2_norm = mx.nd.norm(grad).asscalar()
-                norm += l2_norm * l2_norm;
-            norm = math.sqrt(norm)
-            for idx, weight, grad, name in m.param_blocks:
-                if norm > max_grad_norm:
-                    grad *= (max_grad_norm / norm)
-                updater(idx, grad, weight)
-                # reset gradient to zero
-                grad[:] = 0.0
-
-            pred = np.array([np.argmax(ypred.asnumpy(), axis=1) for ypred in m.seq_outputs])
-            pred = pred.transpose()
-            num_correct += sum((batchY == pred).flatten())
-            num_total += batch_size * step_size
-
-        # end of training epoch
-        toc = time.time()
-        train_acc = num_correct * 100.0 / num_total
-
-        # saving checkpoint
-        prefix = 'lstm'
-        m.symbol.save('checkpoint/%s-symbol.json' % prefix)
-        save_dict = { ('arg:%s' % k) :v  for k, v in m.lstm_exec.arg_dict.items() if is_param_name(k) }
-        save_dict.update({('aux:%s' % k) : v for k, v in m.lstm_exec.aux_dict.items()})
-        param_name = 'checkpoint/%s-%04d.params' % (prefix, iteration)
-        mx.nd.save(param_name, save_dict)
-        print >> logs, 'Saved checkpoint to %s' % param_name
-
-        # evaluate on dev data
-        num_correct = 0.
-        num_total = 0.
-        for begin in range(0, X_val_batch.shape[0], batch_size):
-            batchX = X_val_batch[begin:begin+batch_size]
-            batchY = y_val_batch[begin:begin+batch_size]
-            if batchX.shape[0] != batch_size:
-                continue
-
-            # m.seq_data[:] = batchX
-            # m.seq_labels[:] = batchY
-            set_lstm_inputs(m, batchX, batchY)
-
-            m.lstm_exec.forward(is_train=False)
-            pred = np.array([np.argmax(ypred.asnumpy(), axis=1) for ypred in m.seq_outputs])
-            pred = pred.transpose()
-            num_correct += sum((batchY == pred).flatten())
-            num_total += batch_size * step_size
-    
-        dev_acc = num_correct * 100 / float(num_total)
-        print >> logs, 'Iter [%d] Train: Time: %.3fs, Training Accuracy:%.3f---Dev Accuracy thus far: %.3f' \
-            % (iteration, toc - tic, train_acc, dev_acc)
-    
-
-if __name__ == '__main__':
-    lstm_model = setup_lstm_model(ctx=mx.cpu(0), num_lstm_layer=1,
-                                  context_size = 7,
-                                  num_hidden=100, num_embed=300,
-                                  num_label=4, batch_size=50,
-                                  vocab_size=1000,
-                                  initializer=mx.initializer.Uniform(0.1),
-                                  dropout=0.5)
diff --git a/example/lstm-word-segment/predict/cpp/Makefile b/example/lstm-word-segment/predict/cpp/Makefile
deleted file mode 100644
index 5047a15bba65..000000000000
--- a/example/lstm-word-segment/predict/cpp/Makefile
+++ /dev/null
@@ -1,31 +0,0 @@
-# Special thanks to https://github.com/pertusa for the Makefile
-CFLAGS=-std=c++11 -Wno-unknown-pragmas -Wall
-
-# Added for openblas
-# export OPENBLAS_ROOT=/usr/local/opt/openblas
-
-# CFLAGS+= -I${OPENBLAS_ROOT}/include
-# LDFLAGS=-L${OPENBLAS_ROOT}/lib -lopenblas
-
-# Added for opencv
-# CFLAGS+= `pkg-config --cflags opencv`
-# LDFLAGS+=`pkg-config --libs opencv`
-
-# Added for mxnet
-export MXNET_ROOT=/home/lisheng.ls/mxnet
-
-CFLAGS+= -I$(MXNET_ROOT)/include 
-LDFLAGS+=$(MXNET_ROOT)/lib/libmxnet.so
-
-lstm-word-segment-predict: lstm-word-segment-predict.o
-	g++ -O3 -o lstm-word-segment-predict lstm-word-segment-predict.o $(LDFLAGS)
-
-lstm-word-segment-predict.o: lstm-word-segment-predict.cc
-	g++ -O3 -c lstm-word-segment-predict.cc ${CFLAGS}
-	
-clean: 
-	rm lstm-word-segment-predict
-	rm -f *.d *.o
-
-lint:
-	python ../../../dmlc-core/scripts/lint.py mxnet "cpp" ./
diff --git a/example/lstm-word-segment/predict/cpp/lstm-word-segment-predict b/example/lstm-word-segment/predict/cpp/lstm-word-segment-predict
deleted file mode 100755
index 950408da9309b5681c56777f595be9f8f4087810..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 36554
zcmeHw3wTu3wf~u!Oo#}X0Z}6gjA>9%i5Uo(AeIS9;0#VMCE-zT>0~mQKqN0SGZ1`~
z*d*w27^T#|Ra^V#dRxoYwsKo*f!YwBL9dU1Lh(_FigJcX4Fw@SbAM}}y(ecTGgo@Q
z{`-CZ`}M%g-fQi(*Is+=wb$NfpUGKki(PY#27@Fnqx4OQpd(Wq45fWblEceDj?0rS
zkQ~wk=?W<o7&AT&Rzj(k6KOwZEz&ejXF{5SPo7aj<QW|-Bho80G+NLuAt9yI-r(in
zSuLHrL<2=CNj8R{dL--CWeh}G!7?J9MZ(}}yg*AuJra-V9pUwk@OmP(ars3m`bX^O
zQ_joFjSfHrJ1CKg*!N({o0m#I2vkwNZ-GY8x4^+NBHg2*C6S7H-$Ff-^L#HWCtS$e
zOQsi@%P&$f4%gS#RL;n~zP4&gZB2c!c}jEMj43m6v;7U(Q(3=>Khe%xu!yxJ^k@S_
zGLONBY=FvtanlC_hrT>^QA)b?^cTzTyX3RU`xlHt*(`jBmUKF4SB~L{D>oT8XF8+`
zX~ZZD>%uhKD5qpd`<69D6O!m1_{_nl2p<`r`S_IJa|=FneM?KEg-FYIzMQ8gUi&!R
z_)6upYTs6;|KVZAt>IJY|8cUae9XBE{cqg9^on<~c0RQI{f&3+e|y60iFG3i)83sh
z+w$qz13$f@ack}`-uvSk=gQ~i6d(F%a`7S8`foc<m2|z-?SDRX=A+-+JL2L;Mil<;
z(F_tT;q@#!Iyyngv#`fR_zN%z65)%J@cGfO#Pl~M;h&y_{$G>O+mgsLkc7|S;fdur
zl>~o0NxLVK(7&04K0k?^tCHYrlkn+F(y#xMME`$IBIh44X%mg(?MdiwNFt9LlOz$J
z%}MzGTax~&N#uVb3I9p3Sz7|T9Z1q{TatEvk_5jjiJoiWToUznVUl*2CgJ~T5<UMY
z37>;W<QbQQo}76iIWJ1m?zKtyY)`@`9Zob6|MDdC{Yl#GOk!78Ch6~#B=qJa{K+>b
zvXkeM&@WDcmo7})-=#_T-vfLi{VY#He?OLz(Nem!r^Eq@!RwpQbEY&>YMkd_$YG$P
zs|9+_MC!<N;Ct|T0eTxHjg>m9Di|^Y_Da`!d`6=a{o@@3Mz3qp-z!1y;PfrLWCPlv
zb~oMZV3;R3pUKdJ74&C36`WE_?m#)ww>TXPaS5lN&gmyDuh0nj_Y0hVcY%Xpp62*&
zj4#RG<Evm8E9dhKu7`N~Q<z^Qe`lpeU&{Glqp0<Z<6Ag>BT|yHXO4qmy4klRg}I-}
z$<**1EoCzKFB{U|nS8!<$I;&eJ|w@5w`=2!MgdQ9N?Z>D-^}$d#nB(*`q@(GU>qC<
zjnP@b-&?^DLO<W)^zq{*UB9BC&WBx-e}%7pd3LS;`nu+NUx2nz^kD#YPm+87D)&O)
z@*01@*Hl>R@%w##$?YsJaaZ}8IJ&%~u(qMzSMI5-^+|5`^16n4w?E)%3b@@GX+px1
zrKL^2s<}-*)Tybj2}tf+7nB9;Id0kGUlAZ)&N6>lAh*#|)8xFl%wJPn=5dxybr*vw
z&yD5-o|;;am-%Xa-hka7C<fkld(czs4m6-Dz}yvJMcD;qxo&%&yQHDY-Q=lX?sHQ?
zP5p9KFg3?r<g4}sYXe$g6)P1*rA;+;K6jIkYPcI4YHPe}G<G)(3hE9td1?ZFr>oLl
z?5fPc$6kzUy0_NnX)0bN@vb&Bt@bokY5k1$Am3f$X>?b5{0X}|jk@bT?_MVBq&sOJ
zsU&e1-S)5TA?s&NbMZn9i`K0ft1uP~O*C>CEPrva9sP6r>Kbc3ULShtUFr4&o1Ltv
zcvVs3E#;_Kjp6^Bs?R6R(duM2L)tB}L+{L@L{T*)s)j%}K%go<-k!!rUwu_^9Uq~2
zzQ8TPKw~gaw!+g0vrNo+s^I*;$yTS@p_ZC@zYnuESYLB{(C4misPYxp)s?czqfJYO
zqe-Y)I9FSlFW{_)*5b9u;ld`LC*YIdRQ!P|@3q&`J=qNSiN?SRlvKG_`qsF;8VzaO
zp9_D}2;I^!`2$T2Yl;`%jIr|;V?gRXb=t7lr(w#}cwsxWzRF;=(+k&O_pb0XiMi-4
zE*3U$b6H@zH?XGB2Ojom;Jdn}%2&^v4jW3U$|keBa!tVJFTHtH9kl}V4Q@AtY*-D;
zt#W&6Ya6_pQR!u@@i#X3eL5Kzl+{f4`+O^x7uSNtRJXIi4=3eYSyj`-WN(;WT^sbT
z5KYg}x8e2p1Jp#7r<4-ZM{xtR;Pce+;Yrxlg`ngb;ndD@cX^JxxNL^I^yac@?pkmw
zUks{TO>Z^T8q<W_FgV_351bzBJQLbqXs0gvnP$iX*PvJAwQ`;C-9po4fqc%s!Jm+G
zE_wRmQphn~Bh;kOM$qo2A*R_3MSE&}QLfZ@>CN6jGXRa~?tf<hxeb`!RFlZrINSn6
z9qgL5Q<F>fS6u2P<>>oV7@gu@(;e>N!Co6?40V8P9qeFqQ6Hb!$J%pYiuFDp)+DyN
zV6OcQ#*L=Q>@+Kx=B=qmqh$fK#4WZV7;raKYbzQwpo!$qg|DqD_M#m&!j)LT>)l>Y
zz`Ftj<>lze`PoNwo}$4r^6!B)=tU!z{e~vbavvYZ^BE%yE&AYI-rVeN^fmb#>OHkJ
zV74lU_2-7=z5vd1FmoYV-27%f0c%gx6tG93c_J+T2KU0UN+(wSvgz(}*cNh(XRa5z
zq3+UjNMMFT8q#{xQ0p(IWvYI)7^U-3m9XLSFbDF$^}gnSyK-d!OH!;!773GVm<CT<
zjPYaxh}A1PeAw@JeNCE=5!01*c+e<nvDFK#aD%F$i5mr+RlrwUyPR2)eiE>Wt&NVR
zs`K!;n$VaG<QlklhSTLrsH$a7jr9IC^NJRWI&GamZn@X%_7h~6+>7d0!-BCw_=9z9
zP4Q}E82Ug%t%L-9<jTQJY{tH_&R6I6vBX=~NKjqFDojx~ID;?Sr`CBee*p6(iCDt^
zd{v%+2ZWV=zeWdCeU&t?u+TjX<?~$5Ifd@2*;BKpYx$*1fzQph+hf4|7|5O{6)s&`
zFvsb(XHU~*V|ic)Z^S>`+OU-JMw~pF@z2OU1gA1AVZ^^-IGMzDpToEjVL6Qgw}l*!
z40xnOq2b8kW{F(}oRMAt*uXvn(LW8Q?-T}U?G4gw#=Q!iuiEe_y=o2ajD|}y@l6-i
zAI<KSwDREC8r*?hAkER0@72}U%FVj+ROxmIPyFff^#2H5Op_LHp32(EfQL!TIJ|<D
zr%Ee1{Ms`s(YX}q4h{zoJcFD`YUglZ@<!Y_8KsSY>FRe^u#~iy&vN{W--nT=#F;IX
zi2FIhi!)uqJ9PB(3Bj*U9o{LBNNwH<1`!`|4onDf|4sM;Lhwu6#}j_GKq3|Q$Aq7!
z!;AY%!bk6gk<X~8U{p9Bz(@^VHXWX9N?en4c#}XP&C%h>*2I;k!&BSha_I1+YjMds
zJjo=kQXL)x7QL40@Gz|CRiVSfyrS0%9i9(4D{Iu@M?|S5sab~~sl&JE@aa1IeLB2F
zhu@&X3tOh*M|Aj6I{HmIy#Bmsvkrfej(&>{PtP*MwN;0|SRj#h>hPE7@cVW6OLh2e
z9X>;cKcd55ro;E>@HpLwUaAg%c@&H1OFH}&I()wlKTe02JQWNg8=0WPr|IxE9p0+L
zU#Y`q=<u03yiJGyh7LbThtJaCb9DHLI((iEf0YjJ(BZGv;bk3uk`7<0!%x=Xm+J7>
z=<pRf{Ixp#3LXAB9llYApQ6Jz>+sn+e2Wf$y$*k$4xgjLZ_wfGI{YI#yi14Qq{Dwx
zhu^HjPu1bK=<w5Y_^mp8t`6U+!_Uy+_v`RC=<wY-e4Y+}M2FX}7d<-sOdY+d!}CX~
ztW<9OC`}HTKXKS3xxF)BR{za}2ZpVC(st`ABReqtJ&+7cJce(}1P5{iuOOmG&j7B8
zZxU=ExSPQT2{sbk$>0|W26HL0g~7WBCYK%A#Nh1&ldFzwVDN7UCKnxPVenG~)3l2;
zGWZFC$t6cB82lK)<ccGu4BkjEx!{O{!S@qPt~Zjy;O`SmE;nLh@Ert`tBqJ0+(a<B
z*oeg7+XyDt8tMBAh>4X1lS_^CF!)x2$(2UB8N85Sa-oq<2G1v$TxVnpgXa)TE;F);
z!7~XaR~gyB;Hd<Yi;T1|_!@%AHAWg4oJlab#7G5$#}Z7gFjC6k(FBtVj5ruPoM3W&
zksJma2_}~pu`&3w*?`H_MXU_|E5YLlmKc13U~+kpzAvf&?-6VxxQD@S5_~1W-3&fR
za3;Z>41SSda%qt*4BkaBxw6P625%>rTv%iSgMULXxvoeHgP$UpTvnu!!A}rOt}0T&
z;Kv9i7ZoXG@J52kHANf@zMo)nNs$}|f1hA-MG+f=?;x05P{hjMCW6WJL?i~^MliXY
zNZ%K1{0Syk6X{{_tpt;ciF7k~A;IJ-BApDLPcXTN$QB0AA(&i4WD|pD5<H#Y4Gf-2
zFu90G3xlsAm|R1ok-?b+lS_zHFnBD%<O(9C3?5A|xqyfRaH$*`CYzeo&}v)jD+YBL
zdT{IR>TJt|Hc&{?hJqy>G*=217c43$UsNXF^U+ewT{--GmX*@3wyb^wqhxt(T-inV
zG@eJ6NF}LdHlZ^(T73_#%iGZ&T@z$Q<0R*ITkhS1=_QA++e@`%<zw|yDB6BjR=Q;6
zgskjVms8d7vspIqB2oGvf~@?6rD%fCbg2WGR31yWPnyF?NN)A*gd`=LgczyY5|R{i
z67pQ?!ywrwmC0ePai%ir>?C4<i}E=lA8xi0EPmO8tb-M4`Vl`QQFnbFuOH&Z^z#th
zksQ8)>*uI?j!2a^xr+9x^9T+<l(mKQ622?T(ebMV4ru6?Of{;?iiKKGtR9B{l0!vV
z4$@XPtJSQkHqJ&v+?7?@kp?#EzsI2~I&uiTk)Z=wrPN}h`r2nCLm;b>2B;f4Q0;I^
z)Ce%TM*hI*Du}Ly=pJR5XBogomyl=R;p9*oN9yX|$*9}1Hh?PhP*y(@IrMCnM4bpd
zpVb4nU>glmvAt7Pin20jkaAGEjg(FUVFj?A^n-i`@;jKWin6Rk*~y7)D0l${pdl1p
z$nHVH*nn{wkrOZA_XAH$7_u)Rq16gmY8C%65Gh#zbCIWY3)wtgg6u40ufqpf#~`+d
zV@XEpDP5{HW+0x=Dg{vo)m7KQzmoxU10WSPMoVhrN;XglNue&J60qL{Ag{85<+q@e
zF2Zls<vT?_Wd89@HYnPm%ArTL_)x3;Elb<2a2YLk=1G>e&0uYLuCS1Xd=nU1I=Xkz
zt)lwSr!bF@RSqpw&lXv2W=s3kSfgE`pJ_%L`Xy`6ng3BBd#fYUA-R<Iu@AAd9Rh1P
zWRk<ps(jA?-R)Z1A17Sv?ld|7&%yVpB@&yuO)a{VpD|YILaezmETohQVM&chw~@BU
zOh~64+Q2`QwUm(XN6kppe>q2jKE%icC9Bp!=Ah1E#hP~HYumJ@clMi@ux>LLbrogN
zYQ!?+ma$DLqDC>0)DFa?l#Ky3`Uo{T1M9IILPh&qu5bl(Ta=Y2hgW>r+eD+<0wz>%
z3(}6)Aca$T$7On!@lh}Ar+Eh*a3XSs>ftLT>9pk<)+!a8F0hPBjnUzJ3>0*jBDd}}
zII+!>TR*?T()J{TaiQ~Dkdnk?1Fp&4GIcOtwXCa$>t=FpA68@ODhbarIzzMN&}vm3
zbCychGY@D#=v4N~=S~p^5=RcDbUT#;w6C1+3gyU3ZI(?|{_IjdFHt^rDla=jt1`p_
z<jg;3Y5P4yE(y&jX+IknMTPc*veM@~_lYzAK;Ru&>6JsoL8qB+$k}lSnsLDukb2cK
zBu~`uJg+&z9kxNWP!&>3%6PqA+EnauwmC8#Q1}w7-2Q^4&5GH)mDa32%T6Oq^1;0z
z5gJ#oLpzb*0%Mub0;$p08zC{a(o0nN#>J|oJr^}yiYqHez2}pGf%Z;U2zg8UIEMaE
z#xOEvV|baWc<1yZ92u`k%?2?tkF|w%9@6ZI8}l}r(;ej2)JM<IP-r%;*0OQY06Dxq
ztA%8wr2;9|56pAQ(WRaODMqXXd760-!iiEV%&?RLt?ydooe`sa?v2)WP5JvRbN9FQ
zo0V6TotAsA24(A~hR|3`TPd=krWD!o%sZ{08ZB)z7@VfTCQI8D3?8AuZ~{gKTM6#K
z<dML*?JA5+fwD_^wP5m##ma#Kv^qDO5=N<1OoF|OOtt0XV{&-bcKZv}mIm|H-ylqb
z+Pc#qw{}^PUD8*Ok0Spe8-UNMEw{a1ZK?bM`J?!}iqE-f%b7nTJ%MyT2oHns%)V;N
z>$?H;0eBtQhFyZi8z|k0&&xo(Mx?KS;za<bh;--bdqCWe&ygYnnD~#TA^#_QsHx+k
z4meUh|LeqH59$$n(bi!XOxp6ycd~57`Q3qu%Bj$Z`vJcZUSP=I>Ax6I>pmkP^7~c|
z3#I%9*&}eiG=gNxbW#3@F5koRmiFn$QK`V#PZryc2^&$~a4GN6UgDRmw)!dA52j8Z
zRf1g-d-Yk&f?36Z(Uu4Ph?ZuRSlU=x7PQkWXC@58p0vZ}t$@|!Vgf8y)wQPw2DXzB
z9ThN4?B~=vV5nW<DYp;cKJ#*hiF2r}@P8OeIK(u~_W-the=9BhA@iE)HZ;LjAS;#@
zwgTM*H>0gU7lJdbK$ea^CX9O9DbzyK3;&9ldLx#hS#vDy&6s}wjo|x;ww!3#%tvOn
zoWI+0?*S4!eBI;d61wxTS`RuoJVt{Jd)bD}sq7)Y&Uy%ylFHR5Awy(6W^VggOZ!CF
zy!{3H**~B}ij2lyw}bW$?DnDqWm4oU_PZUl&xm|PIY!5}&q)URk5pejz8dRWsRPPU
zHI1YUnY3N1a&(|HC-M&YFch66<-`^h(1Id{NZGB8Qjq9q38LO?W-F_<b09-$q1LfL
zt4I5^svAJ1)w_xD*ES>?=30&^jnZ7kdG~>h>cX8fGdcoh6M(g_f;N^8*2G<`i6Yu{
z3RDfDxc^kIz@0To&z8X*yrhq$eu7D@-8`s2rxNr(4ArPxKgQQZB!#j&atTw~ZZeK;
z5;3~8qJIM*jC=ysg-ZLw^9K~UU+IoSU{d-OBTy8xTKtIA$rmE^D`L=27$}zJ4bz-%
zcl$UI{4lnK-mPM$s^gix@nN`<X=iX4G;ty3QZQjvqAGrlV;ZBFpK(lS6tj^qP$L_L
zy>x>>bDxG`E5BR7Fz7`6>tCo9HWE7Q+k_Q^@gS3s55`o%A1$gE^O8OSGN|O9J#?dK
z@BEi#1>Y%HR`A^d_ip7><mXVX+Dzpbn%`ooQ2Ixnc4;XovL#-T<&Yy<;b}n_e>0Lz
zRF83X8+rQpAT?}G*Efl%1=anavP_WFJ5OptJ;dRafVabjXuD9_-+s{Y;J2{u3qI->
zD2~h~Ow6V>m@YQ7$0OIER5P&lgD|nRA4DtfK;_=U)wrTJ>~wCoBWrnuq#B{6?MF%+
zQU`%Mx-%W@{4VksDphZ=pTrLAyR!10+<L69wA|U%*$Gm)YiB7-x}<M7B=!G-#hL02
zgWRh(Y#s1aNQ>kt{Yd22{&ZRSV6}1C?yb-_rg&%I%(5L+G|<EMKU)bB^t5eMK9H4N
z^3D^pWka`opg)Mp+qKFEQCU{@Y4zXZ^;>Rwbec_)f}<9pruseTaoIleI-m@vchh;J
z+<McbYd{bf9~EKDq8;7KAf58TK=-o9ov4CwiTJgY9b?GKCuQ0J=cRIJUYg8yw6vFd
zU<X7|cG};vcUqQo_L{3VSSIZ{ZLyB{z9oCVWzvDu7I(L0-VwRA6N~WL_g&szSEwY-
ze$b_;PTJoV!`IN>_H|OlHF7AeC{&Qf_WMQoy8~r%=mKmZ1EUCwO|iEVm7Mvn2j1IZ
zMZ0RwM=%3aOqaU=LXf3t`y3<Ck)1B(giHC5R+#UAQVz=_dWTnQ_Yti-M_8W2+PEi8
zn<=}C?I-PTMXFeL9qi!7seHoDZj{e)d{b(qE&UBNCml4|o<2dgH|2_{HfbL}I+4RT
z6L5tlT7jcggy^i56?}gv4sbwyC3vC7#VS!HJnH~noO3WvRDtSX%kOY0RVfa-J(3y9
zbhoO$(@Rr>&LMzy96_6+eaVY_xY?>+9wiAk;tqYQM4fn2Ah)N1mrFUzNIR)(?VYSU
z<e40nw$~s49e3H!s!h=L*|w|-)W^p2Ix0Y;R=V}=T>#~_kyQNVH9&^}Pzr8eU07#q
z>LI4J!Q0p1aw@ml53<Esy#&Rr{R7HwOFQj9SmW9j<tHgQgrK|R@1aTBqXd79l}BS+
z&)Aw6pkJQvz-aREChf3t*2R}q)<sKOI?0bdLQegiF<~KCX<~h%zon(vGO6o`4J?A@
zkoqW7=JQ$i0ib%NrBI3LXQE-Kx#1)lLY=LsUSfIA9x^xK!ZBebE;=dH`=2y%&Z#ww
zGei%U8D#H4`NmS!a&H@xoa;RYehC^FP`}AY@5;)tpWRN!lIqJ5v_flVQC3b&&uM1{
zJ8)sjY{S~3WDto>PtsP6WmUgWTiX7FN->LdAxLOrk)xKQJCR&~+%K`_JKC^~-9eAI
z)t^y!L(>?*_PRd=f>~q7gW6sfVyfR`R3<V7SLj(2Ev>kz#W4vQE@=5)o)j2YV0msX
zZB}VFvg1SQU6=Zc<7oW5SWrk^^TERzN@eB&7`wOL?&t|_iuz+{B$7o-7Fj&AT21HT
zkT*qbCX!c)9kb=hB$TE@t2T91bs}11b=5nW3`bQzX*#q>W#bNV%-%6<nrgFGScTpM
zDtOgAZIF83)Jze2VD_dQ>HW1{|A7TDx-X(WFx?*kpU8B8G1NE3P}8#sqPD0{5w!uJ
zZd2C#YaI~Dj4Ry1dk^yf1F{iyA~87ux*d<uE%Z842X-0}VAM}h7WEI>!MIcT0_zi7
zRBW!$3|FW=tt9_1f$RCv)oFMkOWSQQ=?$^Vyrq2%**VUxoOE{OQi5sD(1<MiTLph~
zWI8aRZ&$@Rg7-B!|F8ZAOGnD6w$A2eQl)hp%m!Bep?Vd`7b;GZZ!ET2+GL=~?}f^<
ztbtLoVvRbs?d09CdZoz<h{Jabo}n#99csFDH|F4L)G(TB`T^Dzn{p&NfvYztUH2UC
zL96!+bmBXZF_CQerXI=%P?y~#v2~frht4ti)<I>1^3fvux61c}sC-5eF7gpZVz3>$
zOCmjt*R>O9`dizGfQzOB%uuv(AKD<3u+qZ&L$>LKMk9Ol=)0om5;&+RwNIdXA1`S?
zDTn7_SbhigVkc--x?vIz-p^TC)HCnH;O=CdY5`r8P}?%T$>)MqeTnG)lR!RW2W|Ft
z6p`0>;REandtwbOxK<e1@yNrd-P_5<3gpCz^m$Yy4>d)44$bKg%LB^%{)mh|Mxowr
z!DCJwkFqF_am1sDc(6Mpwt=-g*eFz`e)%5CJuzH62Dh77FU0&hgJ(}{|3c4_)K#cP
z3lS~;uFy2~!*>CPrYxDvHZeWGVzc53O<4d0?I4Jbb}*C|CnDItyi0^)|DqB=FlgvD
zMCAK*d5H`fm#kcu<x;Lop^25o5VQ#j&q`6Rpg|zpm%CE3#0G|%rwxqyT_!hM)imMN
zj~E8@+ed%~?pGa77@EB)S!~Myu9miTaKN6W1KhD^u>mp%^0sH9N=FYi?AjV+LrrHW
zH*0VaN(W{;O^%MG+9ITW0ZnbEUeMXZarJI$3~4(UD}~sUYUpa9U7>Fjh5TtH?Z+%_
z^jA_YkahIX8ibALF0I916E`}i>-Y}%Yq-aND^kvC0FD24b`)931-O%j%6n&;dhjS^
z%HWC$F~Rnc6c17vDNp@`3dw2i0rRB#WPXipVD=I{YBsBHQIC+5wZckheJinL1EeW^
z=;-N8M16`_-Lr@6Y|!sT@2}W%(iC+%T8T7~nTO2VCfZ;`wCn1ktpI<LdJ$3dfP(Ma
zbi>JcoPt>FHwN0WdR>;R?1fQ{88j+x;d%>JFC_6oH_8}=){isPy9k7PdY$^H90v>a
zg?BK&-$t&tgJ}*2Q0m1fb}2o*53yO)`Vn@nYjK=S2W~SlrsOQT3h5>U%8u?IgQ;-;
z)_>U@v+Q2lx}TqXuYH&K9yK2#^d8bJu)XgOPT_)bx&5p*ce}_AFumq!Gp1`EYcDv?
zeiAsXZDl39i_txR)*ra-TRq(s&LTI;jM0F0=ZCMe$qMc&3)A?aC7uzlw5r9>0Wuml
z_4uEo`?i8|TC3^q4=QNDvE&3iH3}54g&GDISS;4t*xT`m(c5unXjYb{xZ9}>QV>ok
zc4G{B)olZ~aGx*tx4*FVWlcMm(`f9B^TroO+4Vl7^NefZLud;_^7g*X?(eXz_>$kN
zvH=TT&^v~$d&*JBuAT%XZYF|#Y;_IYP18C&W-qBoEugZl;tT?iY8qzui8}qp=)(<@
zcq@=2lo2%d-p-z`6hcqe>J7s|>&)$}?UTW`o!Dt&-!dyp*Z7{d;fvrMS?iP$&1>q4
z(bR9WUY|mh-Y4bIkk&?1YbLF=z(z>a`=(xL+zfw1)ejXciT+eUInD0z@zCgcS7<r9
zFb{s_OV{Kc93dO!{C%shm3O{x#5zIK=1rI3eboXMva$>hzr@`%J9i>|yYO6#t#TOQ
z*|O4IGWiSV&M%C_Eq|Zo-d{t<&ipg<SgLE^7oc%k3ir4SJ^r<{WnTimE({JPdKw;u
zxh8)m@BGY&V*=;SV@A3CU|_UM`3m++cPl?eKZ&-)@R24S$)+49@fzP67%0J<g{cpy
zrvbPW{Hd7X6A~F&>kzSvF!1zyie?A|;|^YoHj3?MaYE4BzT-AHWOXEQ3Fo$xjYPf$
zvVtW}<s7NdsT?BPc!}*X$qME0S{t{-CH?e_&lOsorj9_@+Tf?pkdd(+Ihah473@}b
zS?<lIbt63M5V0qr7oa&>VY*z(XHI3G`qMWdSSV<<lc6#TafKJb2+syD?_I>~-x)H<
zN~%oXIVG5%vt3v!ymRTyC6p?kJM9dQIP3~tP}2TF@O8`qjBu)~1aTiTN^bq8WNG^u
zT1GvmlKYjc{3(cuc(<hf9?h~t8*I2GFd+pxl|MOm9yi?_PWdLzZRnULPY#W8Dr2w;
zLw1|oTAaa-S>Q&+x_~oDS12gO)(h?@Ih;xgb_hPJll(qA)A2eWw0tg@R-$Ab1_?av
zBI|r*OkVHLu+M>Tkt?7&^~N{I2D{Wkq!4eCIu<&^W>4M7(xd7%tl+r%SETBfq(Bq~
zdEv`Q2c0kNpUUkAg%7b7g)VW0Zb>Wg!qR%!fz>cNucFa{h%x6?#reAe-bCkBa{g=l
zyh>IAS!wE>Brj@TEQbK{ldI4rba17VVsXdW)qApnr?eldo4y2TYv+jG^g$8jaKVVE
z6D4`S*8XrOud~1PXrAovi}SL-SB}Km-*~|1V}DNkQbCzYMloAfUXep{=@E1&9i~$f
zPB~l>!Y(P+{z`Du6}3Oqi?zQS&@7G&=`l8(&AHQLhTQ&2a59#mGs5c=8evAP5khZd
zhrJ!uxW8Nk+Z)4e55^1cf}xp10>k!PN<T9{lV*OnH|%{{J4Y1u_lLt|QC;e8q|E*v
zBOSrSM$<sU5kG|j=JE9l!bsX3R{RMTxf0|t8-}X~Hw-ucpye$MbiJ<-&kn}>sCo%_
z>o*X1D$38_m_4vx7ZfaZDqrzmp|KyQVxHoM22UR%aq2O*=s_EP69Bis&R`FbRc6Z}
z*vrisQGU++mjWy0(B-tyjiM47g#A=6{}Alj1~XDOlM1i{S9S+7(Me&{@X_j*XiaRC
zx(c<Wl|3(YDSshPf0Jh7*51i{AHjYP;m6{mJ^v53srQ2gcDC)S$tj1i_l;d_NcZzV
zLaMev5OplHMY`I^(xd7Lq=WP4YxNU){qL@y`$#+}H_I8iluvjWbKTiBjE)z&9NJRf
z<>2Ze55Uzk9!n!UfVKA;F}_3fa~lNyZ}c<Rb!NA0z3-qMC7FiU+f8D}`qvI)4~vP}
zNf2OsG?%rXsiZb5tpv`Q)?*rDMRiT?lv{iHT?Q5RH-XWbKZ7+XM!74MdgjDWUfQ-Y
zPP<%6M4LD82_xmuR9UghT}JJAWfwLOJMn9z6Tx?#;ibe{o_s{!shUc{ckQ)CcURWP
zli(%xn{);QFEe;16I?9k_d1m$!DEA4&aYm>4gmL_SoBMjyU+o63o@Wdx)q7vZ-mFt
z+P~^r>__@xX_2qKqWRp_``lJQv`L-Q-`=_QCp04`sH?!YcZ0l*ig001{Nn0;SfHId
zKTKig9nMh9jlpP5G)tQY)U=oFeP_syY=UYdu7~skhUXDW3X}nw%4hi&!xf%bYPI1J
zQFzKyq-CzjAK2fLcYbN2&6^rLK&EywFkaXTdlGWC2s{SHyp5I(kR*5_ZVu_5hn?OH
zpGfWkvuroiO@Ae(zY0C|R|jnLhq<=E3ZISsd^g)xiWepLeKzk3pLeBAqt4F09{(Ap
zv7x3OFQ)JZ>ZbVD)KxarX5WUt6Xi@3)8*vk=Vv#1nml!WL1!B-*|M{<ZTO2V`YZ3@
zlJ=^CnKRiRnrrWW(B20iG3iJ$5Le?Z6TYf!Df;Kicx}YYnf@UDn7v6t<E!we%9ZXq
zPou>CTCg^U{av}+D~a3+sZ?6Tg+PK08oQ$D<)$0=07E}0&*>f*u%e(3<Curkih0wC
zyZuhw)Muo^Q6lXDUC*(Bfo7aCbmQRZz7*8M1B7PKZNT%<M%3%Uqx4RsrSx!}V5Ic4
zXv67&feNI3Nbf`1c?NVeJZDi({LT#wY(?4v16Gl4_zZG@|CY}O298h)Z!!t(RKT=z
zAWt{aW~6k{59sMK+_6wHG+Paqj~JG=!7$89c=q=^ko_7)p?xK$m!vFCpYbYWgA7u-
zb#8jbe9MKa(^{n2V{g1>T2>};p!!AlY&keEa4{hY(ye9bX@l?QyQ9>_BpGJP$#h62
zO&&NUkWw2Ncq&UjmP%<`<dEj1TOTmaNzVwI=A_%!rxc}6T4ydu&uLAapWc~bT45ZX
zo>P!MsUY138qm&3PrI4^fPpUJ^%6cA_(4N8As|PqabCLZw&Cg4g7mZkPB;riXYi>2
zokDcrv))*gp0UmZzO5-Mho@(tDr0SQL;+D<%0(6W0qYjje>y?^qVyc2JUrc2AlUGl
zBp<z-Y7%T>G1WsO>!{sUQ&GCjIG>S2PU8O%=qf<hO>}Yc5&z{0_*a4SCDiLjJqJ>%
z_aN!=0n%kSWls8}^`y&nsRikIt;6P}JDP{5=K%qQf*`8U;tP$N6Bf+X6iDOtHu!hL
zmUkmf*{Y4(gT~@$TgB9tliHH0t)g^?aR59adR$}B4UGpq^Qyo>%}2<S++mV?J&ngY
zvVm4Jb^4XzY~-O=P+)ktUav!yJ*aQR9QqZnzus6#;|aaCqJEC?$GYJl`F{fX4WOrY
zHKp{Nhy10c6hp!e6kSOCJ_4Uo%w0MyOi5ep&~%R>eZW+lK=-7NmlNt3eIkCdFdrPy
z#}dr(lpKv8pBLb_J_TBg&WjeasUg9<C=&C6_^k)OZpe2E`INiPPrgL9f$B`Y+!XjG
z_$c<ysX;rFqkV<axt$fIR~Y*e_H~{nVG(#<2Yy@NL(7;Si0SX39{|tR)S~oK;{kYq
z!dNd*c%f7dJ^}ENUl|zqH#~$$`MK!lyr^9cnFAZlrkZp}SD45$skfSc%fdKp!tuma
zgeXe4E`xs~xn}`C2|n6Kc%}z72iTaiIUq;pKuNlzHla$#penult`vM=MlsjGfFb*0
zls#iGZ%dV4Hkgl@rK5&v9jVgO#%{weQ>BN^W&m5wS0n$6R8ZpY(o7Q#mtT+UvdzYJ
zli>#@^TQ_7uTAFPnxqaB&@j=vlhSpX%qL9JC-HP+{iyhW!F<YK__YDqdyM9FM(GDe
zjh!YkWR-tqFyD89)NL@ooF;u>$PZs29X5syzetmwFq`rHvU%puhfBv(?|yN()HlqG
z;vc4s0e*j)Lz4CmH>2?2a5D<QE5$z>eHk!}Yd3_U7S&|_y%~Ko|Hv#ooMQeoMcQaa
z>mc}v;oCWq^t|Du;dkSu*G%T0kC#3$neQDhZA_v3zo(c7#z|XK%;&~QFQ%B^7$^NT
z#e86#bg$XGWt{ZmVG}lwlU^NW-ZV~%3^PA6PWn^YJH{_BllER{K0a3J8fixU?UCm8
zvC>9M9rE8F<5`y>9k_TT^6i(Hk^j{tuNwgT<uZtK`m&#!kl!-i3|TtI1M|=8pBnh5
z2L7pme`?^L8u-U(VAC`QYiP?<Ep6ejXdUZ6yW&%Nw+meY|5N&znGU9R%jjWxSBxH}
zcgxT<LQA6WF&)Wa@!nMWgKWCQ`_;v}C)n8;@c3gqcJ0S6S?OZ;tVr3jQ>6ISc?01e
zF*+!TzA<OTg&NFngQZ5326Hi`pK|%czL=fSp+383MJg_ALA9%*f~8_lb{}8OX}?An
zd!~z&&I{;jyv)H;fp5vsU?F!OE2TH?XqRXox9+SzTQ1e;ajwK*J{%HmGZ~!9%X#(x
z=ZScaeSAYR@Jx`dxjbFW(`ufs;^{h`KFrf6dHNhr_wn>FPe0`8S)Qiaw017x=~X<P
z!PB`sUCh&Jp048QI-Wkv(<gcQ98dT0^e|69<mp+Sre4Y0=jl~Eox#(&JYCGwYM!p*
z={lZ1%+n`%`W#R9@$@iHKji6Io~CB<_IY|0PiOFSE>9Qpw3?@@c)E_K5A*a%o<7IZ
zeLOwPQwbN|s5R3z32)w?Y|G8g%bsSNnq$w+$+z2WlNS1_Y_cby;iu$JmZWT)0pj(r
zNb$-qmaY&v925GQ8l~*|hJY^{@7S6W@GR%h^7>$QWv~X}0cxtGY=Y>8u2Oc@ntJfl
zQaaG1l2v#QT@4Nx^?(~?O}<(WRp7bC+JKZz?=zIL@uD$&v-g;0H#M+>z--?NcQwu*
z@vhk_)D;kz;%#QWX0NX?;AU^R)p)o)O--IP8V`}b&5Md?&r?_9MZE?zfv;@VuGR+L
zOeSS}8|vzO^#M}k-{B&^NWR#}dF2@$EGK?dF8m@dj{0o@e*>9e_=x)DMhBq5xDz%8
z4SorKBQSPk^z{Qq2P+D2hFJ_6{EDw%gd!X{G>G<vzkY-xj_|7DCQ0~7!EpgH<f}z}
z;m;dQ8bv<|hl~6@=}FWVewEJHh*Z=U>jQh^3GiY)K$cx%-4pP4ks)6!>I?sydZ|Y7
z6ekq&3p$amL^=6(0So`Wh1c)q^+|rxosgfSr8_uY)EDbX3-6#<Pel8ozUY5jTz#=V
zHSz^mtY=i8*6jHD4**9x5$maVFM0oXAaRLxT7$Gch&-iqwyLjh<9^*nZ%o0}$;VNH
zaDK-#B7F$3zJ5=pgHiNkvP3#|jh2dUQBS~+A)~J^_8)snGzzf}3;KBb-Nfq){dbo*
z7-=`JZzJZqE57~BC?gV4U&Lt<@mu!j>QXko{pWzw*WY@JgUF<<x3ENdNesLOM#qnX
zQYSL{`gFWbSKA_%NW0?Xk8l4V$kOo<!zA`=A|6b9e}(;M+$HH9WGuS+Vn4TknMM&$
z9}k}Z8Hp&`r@w5XtNT3be+*i(T~S}`XGFXn8y^^9-=clduYT0kw=dpD*zZOGE<vwB
zgRsDVi2{9nsiHz7<w;+h{zW~JrlMS5zt5vp?DJ@B;>Rl<9)ZflOY~o=)asw%_38PM
z{<7%-<VT@C{!2M=cv1hlI6TqPCDKvIN84s4+Lx_TOUDXAUKR&3K4R|Zs!I|%R$TXy
zapEF(L~E7MoA7@$zZ128-x?I)niIG2k8fY2O8jktba_U{kU|q?yI5!Sg(gTM*3Vdc
zO7uP^7H^K;zr^BG@y0HP9>ww|tkhzC(ifVb6|wHb;xCZIdJ>Bt9=(r=#Se{3LFFc_
z9Kx^1(j$IDrbCZnc~gS;8B}0`rx5;CU&(}3LHNyB{3!TAhaSc9CPdH>eokL#k}itg
zZ^q)sNWw40;xC4usnDZX-XvXucwdR|mrBAv=qs5dSfiNtx&o7QnIz`34h=nERjo|K
zU)&`RQj@)=NT&t{F<Or1gOTEAWI7l^m^ryk!i#f(t5~Tdy@u3)cIj_r=_0;#y@yXC
z{wIMq;I27dKc4|l{H<~44HhsYydw^u$?@^$6ZE{D<gvxcGmqhij#oYBvnh`M9gKeH
zxF_q^ql|u()WX+EbcbEPVECc^rNTt|`7NU#YA4SFFToDu?d@fb7waog(bdEF47H!L
zz~c}2hg=uHSti0~GW<xXGuOfD&@)B4nv>8!o&^6(;8}c>$_iHB#=a%#XcGGIaK4!s
z_s+QvM)@mF|1t1Z*#8+%1;=Q~S>O}NKNSmGqIfMH;1ki`1H3K4{92!c{+EpZp#K5L
zu0JH9e>Vw!APN2==sS^|R|8M_iI0~vhtZFe=<m?!qGy?O&Exd(@nc$&;I}b8T3!7&
z9w8=_ryuxHMvFv$IIq9V*dQg6e?k)c&A^kLIJlj(aAxgE=r;jRdWerpv>Nw!iQ0Xh
z;fKay+LeU<oh10-*f3DLjsgdhknUILn#A!fH#rzgzfGZw;szy>XGs$L<|O#%Isa~l
zgYm6q-`Ll3e9s&W-_7w)VIwmda*DW{94-9^@TBKXuIEx-aDvebs=@TjB=|fWkTU(x
zaWG=`+Z5!|ar2!h9@Q9zXQVM-xxkMyUMA6N5A>HK3H{?q@UJAnUyqyAMB`NgyiMOA
z&wGGRG+uWn!9M{!^E2KG#=&aP7;Q^J|8f%irI@;j+MSpLzd8y23E;_3#_toJVfe99
ze7vq_fhYaM$07Rz_Q{Flc{K_CB*TxCY%?890rZ;;y2ii{CE_zX3I5x_CyGNxe;~^I
zLsbRi+rqvj>G33do=Sp$mElK9^x6)(eC!+RQ4;#kli+9IhA&aSZU&ysi#R_}k%Yc6
z3BElE{$DvhC)2?t*OFbpXCmv!bZ7|veT?BpN_|BdemA$z<D)b=H_g}Zr#Sr|(J7KY
z-p?EWK2d+iUX&PbPl9&=Z{uz06MCa>BK|E5kNp52FC(wC0eB`S_rp(f{L{eGd=&53
z&*1&t$?4<cuJtnhdP%hWMPo=F=z;GKKjI{JF#J&ZM}fBuMN6BL(Em0G{ma0U{2~s3
z@F&NBXZxt-6|5D});Mg`NpIbm4t(n__Iuma4NX;3{J!OMz&xc9@!D#<fov~DlPL?-
z`kh0AvgqTnAQ%pd$KowV@RjL^L*%8O<OUGx!adJ*%bWt2+l}ARx}9$Kt@axbgsh%N
z-=g3iwE;x?iH7y!vAbBTmtZ|)LhQLJ`q8bk27zI;FmH$tr;GolMT-;B<7yBMilVrb
z1zZTqQ-j|V<J8{U6z~Uv_~|kQQ<-15(Cu=Tl_O-`U|>6Upw5jjJM{>8CqmfiP>4a-
z;9g$aP>G0hRfv1zr_g2+PU9PEeF6Mrc*eA8H-K{jA{OyXJLN}2kwDWLNqbUI6|AdU
zgDN^m`!z9pIKWBW?zsyKN{SH}u8896G0Azp=-UelN}Pp2QU{|AAmp4L!9t!yhv1gE
zbIXbmE2?0QtC-^B;Aw-y!9ts8;Y?Ua7anvbzQ8_pAp$5Zre8oKh#iH8N-#o*+$V+z
zVcYa$L?EKX@j)tc*N8zH8jj217>t?%`_TgGMdOZXv3$NJrcAu>h;?&5@rgv7r3BTZ
z@tt%r{HU*5#6s8{_3mpT_>t8L`=v;KiGvjW9pm>=bV3oKP{i-M1#yjdup5elbzU)`
zD5lR)ooIp4k_3esN<J7yj6!b3meUh15q5{9ibfoxP)M<)Zs#qatU@TOpx;LU7SV?U
zq1)mr#Ye!>+ZYcd#B1;%KL{O!2tu(qEnZoC)I%1ni3Jr(T%I^8;(3HMOk6QGoMKEE
zu(*g%u`19aBgJ*dUFB~;1kCy>3g!vLi%?RrRauEHQeSKoym;F4549Cvb3s`&T;9;4
z!3b-*8JHkw=-13a=9YA^LjM7?P#-5$%t8@G^KT9n7NeMKM3{<3)PzSC5#M5nw1L$P
zUu>Yop|tTUG2(*N!Qbc5im||+GY8A4uW2s+QU>$53;|-<+|uH2ab|oDvH5U5em8mS
zzIf><tXMoQMy6=&wEvyaWus2=yyH8@YYQRm;!)?hI#PVNL{;@se1|NF=NrQ}c5VCz
zMex<HkAo3>+5Rc&4zZKvu_)vE?nWqNu`#1X=<gN0Gp3pBd=dm+B6(2LSb7Acqb2e0
z97{2#4W^nQPGV}*NhAs)jkRSk0%troO?iXCS6N`xp+qc}V?3IIr^e&{hw(UL*pO>Y
z5X$*$_MKlB@-(IirpKh=W8+BO`YjSgCe)m(e(oh$49|D=#jlG0p!ug?8H@jCQHkm2
zU=6igS^9NH7gsh;mABe+5d3*j`P{t4KAPKfBO?MwM`KyDSu_|-GcLkv97nbS@(8*y
zJIk1OlII;+GnTt%g*>n_8};b!I2yJywgwM(>GcO=T^aSY?tkk2hu#Bdf(?zM9kWHS
z(-t9)QpDxtQj1ZG4RRW%$DuJ};|PXEn2ux5?soL^<^SWg>TjMaIoj<u-CE(4!WY0F
zI{$CjM&CmF&)7|=2*Ufdac7gnbB>!zuo@N&cqlp=OE<2_Sx&*s#TgPd33RAb?y00(
zteA+lsK=wr>Lx7vT`8O8LtGN?Sy_N0K*u#o#w{xF12o(flOXy(b{CO`69(;#=lG8~
zri+iv%cY3=gynS&_1dNv2Vi_bZt_8uq~^p*#`cE=M~?5%d564?uc$weh=xFqCHgM}
Cxwk_A

diff --git a/example/lstm-word-segment/predict/cpp/lstm-word-segment-predict.cc b/example/lstm-word-segment/predict/cpp/lstm-word-segment-predict.cc
deleted file mode 100644
index 865bd1122df7..000000000000
--- a/example/lstm-word-segment/predict/cpp/lstm-word-segment-predict.cc
+++ /dev/null
@@ -1,205 +0,0 @@
-/*!
- * Copyright (c) 2016
- * \file lstm-word-segment-predict.cc
- * \brief C++ predict example of mxnet : lstm word segment
- */
-
-#include <stdio.h>
-
-// Path for c_predict_api
-#include <mxnet/c_predict_api.h>
-
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <string>
-#include <unordered_map>
-
-using namespace std;
-
-class BufferFile {
-    public:
-        std::string file_path_;
-        int length_;
-        char *buffer_;
-
-        explicit BufferFile(std::string file_path): file_path_(file_path) {
-            std::ifstream ifs(file_path_.c_str(), std::ios::in | std::ios::binary);
-            if (!ifs) {
-                std::cerr << "Can't open the file. Please check " << file_path << ". \n";
-            }
-            ifs.seekg(0, std::ios::end);
-            length_ = ifs.tellg();
-            ifs.seekg(0, std::ios::beg);
-            std::cerr << file_path_.c_str() << " ... " << length_ << " bytes\n";
-
-            buffer_ = new char[sizeof(char) * length_];
-            ifs.read(buffer_, length_);
-            ifs.close();
-        }
-
-        int GetLength() {
-            return length_;
-        }
-
-        char *GetBuffer() {
-            return buffer_;
-        }
-
-        ~BufferFile() {
-            delete [] buffer_;
-            buffer_ = NULL;
-        }
-};
-
-char PrintOutputResult(const std::vector<float> &data, unordered_map<int, char> &idx2label) {
-    float best_acc = 0.0;
-    int best_idx = 0;
-
-    for (int i = 0; i < static_cast<int>(data.size()); ++i) {
-        if (data[i] > best_acc) {
-            best_acc = data[i];
-            best_idx = i;
-        }
-    }
-
-    return idx2label[best_idx];
-}
-
-void ReadVocabMap(const std::string &map_file, unordered_map<string, int> &dict) {
-    ifstream m_file(map_file.c_str());
-    if (!m_file) {
-        cerr << "open file " << map_file << " failed." << endl;
-    }
-
-    string line;
-    while (getline(m_file, line)) {
-        int idx = line.find_first_of('');
-        if (idx != -1) {
-            string w = line.substr(0, idx);
-            int index = atoi(line.substr(idx+1).c_str());
-            dict[w] = index;
-        }
-    }
-
-    m_file.close();
-}
-
-int GetUTF8Vec(const string &text, vector<std::string> &utf8_array) {
-    utf8_array.clear();
-    int idx = 0;
-    while (idx < text.size()) {
-        if ((text[idx] & 0x80) == 0) {  // single byte character
-            utf8_array.push_back(text.substr(idx, 1));
-            ++idx;
-        } else if ((text[idx] & 0xE0) == 0xC0) {  // double bytes
-            utf8_array.push_back(text.substr(idx, 2));
-            idx += 2;
-        } else if ((text[idx] & 0xF0) == 0xE0) {  // triple bytes
-            utf8_array.push_back(text.substr(idx, 3));
-            idx += 3;
-        } else {
-            ++idx;
-        }
-    }
-    return idx;
-}
-
-int main(int argc, char *argv[]) {
-    unordered_map<int, char> idx2label;
-    idx2label[0] = 'B';
-    idx2label[1] = 'M';
-    idx2label[2] = 'E';
-    idx2label[3] = 'S';
-
-    // load vocabulary
-    unordered_map<string, int> vocab_dict;
-    ReadVocabMap("../vocab_map", vocab_dict);
-
-    string symbol_file = "../../checkpoint/lstm-symbol.json";
-    string param_file = "../../checkpoint/lstm-0099.params";
-    BufferFile symbol_data(symbol_file);
-    BufferFile param_data(param_file);
-
-    int dev_type = 2; // 1: cpu, 2: gpu
-    int dev_id = 0; // arbitrary
-    mx_uint num_input_nodes = 3; // data, init_c, init_h
-    const char *input_key[3] = { "data", "l0_init_c", "l0_init_h" };
-    const char **input_keys = input_key;
-
-    mx_uint batch_size = 1;
-    mx_uint num_hidden = 300;
-    mx_uint context_size = 7;
-
-    const mx_uint input_shape_indptr[4] = {0, 2, 4, 6};
-    const mx_uint input_shape_data[6] = { batch_size, context_size, batch_size, num_hidden, batch_size, num_hidden };
-
-    PredictorHandle out = 0; // alias for void *
-
-    // Create Predictor
-    MXPredCreate((const char *)symbol_data.GetBuffer(),
-                (const char *)param_data.GetBuffer(),
-                static_cast<size_t>(param_data.GetLength()),
-                dev_type,
-                dev_id,
-                num_input_nodes,
-                input_keys,
-                input_shape_indptr,
-                input_shape_data,
-                &out);
-
-    vector<mx_float> init_c = vector<mx_float>(batch_size * num_hidden);
-    vector<mx_float> init_h = vector<mx_float>(batch_size * num_hidden);
-    vector<mx_float> data = vector<mx_float>(batch_size * context_size);
-    string input_str;
-    vector<string> utf8_arr;
-    int window = (int) (context_size - 1) / 2;
-    while (getline(cin, input_str)) {
-        utf8_arr.clear();
-        GetUTF8Vec(input_str, utf8_arr);
-
-        init_c.clear(); init_h.clear();
-        MXPredSetInput(out, "l0_init_c", init_c.data(), batch_size * num_hidden);
-        MXPredSetInput(out, "l0_init_h", init_h.data(), batch_size * num_hidden);
-        for (size_t i = 0; i < utf8_arr.size(); ++i) {
-            data.clear();
-            data.resize(batch_size * context_size);
-            for (int j = -window; j <= window; ++j) {
-                if (i+j < 0 || i+j >= utf8_arr.size()) {
-                    data[j+window] = vocab_dict["P"];
-                } else {
-                    if (vocab_dict.find(utf8_arr[i]) != vocab_dict.end()) {
-                        data[j+window] = vocab_dict[utf8_arr[i+j]];
-                    } else {
-                        data[j+window] = vocab_dict["U"];
-                    }
-                }
-            }
-            MXPredSetInput(out, "data", data.data(), batch_size * context_size);
-            // Do Predict
-            MXPredForward(out);
-            // Get Output
-            mx_uint output_index = 0;
-            mx_uint *shape = 0;
-            mx_uint shape_len;
-
-            MXPredGetOutputShape(out, output_index, &shape, &shape_len);
-            size_t size = 1;
-            for (mx_uint k = 0; k < shape_len; ++k) size *= shape[k];
-            vector<float> result(size);
-
-            MXPredGetOutput(out, output_index, &(result[0]), size);
-
-            // Print Output Label
-            char char_label = PrintOutputResult(result, idx2label);
-            switch(char_label) {
-                case 'B': cout << utf8_arr[i]; break;
-                case 'M': cout << utf8_arr[i]; break;
-                case 'E': cout << utf8_arr[i] << " "; break;
-                case 'S': cout << utf8_arr[i] << " "; break;
-            }
-        }
-        cout << endl;
-    }
-    MXPredFree(out);
-}
diff --git a/example/lstm-word-segment/predict/lstm-word-segment-predict.cc b/example/lstm-word-segment/predict/lstm-word-segment-predict.cc
deleted file mode 100644
index f1087c8260d8..000000000000
--- a/example/lstm-word-segment/predict/lstm-word-segment-predict.cc
+++ /dev/null
@@ -1,220 +0,0 @@
-/*!
- * Copyright (c) 2016
- * \file lstm-word-segment-predict.cc
- * \brief C++ predict example of mxnet : lstm word segment
- */
-
-#include <stdio.h>
-
-// Path for c_predict_api
-#include <mxnet/c_predict_api.h>
-
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <string>
-#include <unordered_map>
-
-using namespace std;
-
-class BufferFile {
-    public:
-        std::string file_path_;
-        int length_;
-        char *buffer_;
-
-        explicit BufferFile(std::string file_path): file_path_(file_path) {
-            std::ifstream ifs(file_path_.c_str(), std::ios::in | std::ios::binary);
-            if (!ifs) {
-                std::cerr << "Can't open the file. Please check " << file_path << ". \n";
-            }
-            ifs.seekg(0, std::ios::end);
-            length_ = ifs.tellg();
-            ifs.seekg(0, std::ios::beg);
-            std::cerr << file_path_.c_str() << " ... " << length_ << " bytes\n";
-
-            buffer_ = new char[sizeof(char) * length_];
-            ifs.read(buffer_, length_);
-            ifs.close();
-        }
-
-        int GetLength() {
-            return length_;
-        }
-
-        char *GetBuffer() {
-            return buffer_;
-        }
-
-        ~BufferFile() {
-            delete [] buffer_;
-            buffer_ = NULL;
-        }
-};
-
-void PrintOutputResult(const std::vector<float> &data, unordered_map<int, char> &idx2label, mx_uint shape, vector<char> &res) {
-    float best_acc = 0.0;
-    int best_idx = 0;
-
-    for (int i = 0; i < static_cast<int>(data.size()); ++i) {
-        if (i % shape == 0) {
-            res.push_back(idx2label[best_idx]);
-            best_acc = data[i];
-            best_idx = i % shape;
-        } else {
-            if (data[i] > best_acc) {
-                best_acc = data[i];
-                best_idx = i;
-            }
-        }
-    }
-}
-
-void ReadVocabMap(const std::string &map_file, unordered_map<string, int> &dict) {
-    ifstream m_file(map_file.c_str());
-    if (!m_file) {
-        cerr << "open file " << map_file << " failed." << endl;
-    }
-
-    string line;
-    while (getline(m_file, line)) {
-        int idx = line.find_first_of('');
-        if (idx != -1) {
-            string w = line.substr(0, idx);
-            int index = atoi(line.substr(idx+1).c_str());
-            dict[w] = index;
-        }
-    }
-
-    m_file.close();
-}
-
-int GetUTF8Vec(const string &text, vector<std::string> &utf8_array) {
-    utf8_array.clear();
-    int idx = 0;
-    while (idx < (int) text.size()) {
-        if ((text[idx] & 0x80) == 0) {  // single byte character
-            utf8_array.push_back(text.substr(idx, 1));
-            ++idx;
-        } else if ((text[idx] & 0xE0) == 0xC0) {  // double bytes
-            utf8_array.push_back(text.substr(idx, 2));
-            idx += 2;
-        } else if ((text[idx] & 0xF0) == 0xE0) {  // triple bytes
-            utf8_array.push_back(text.substr(idx, 3));
-            idx += 3;
-        } else {
-            ++idx;
-        }
-    }
-    return idx;
-}
-
-int main(int argc, char *argv[]) {
-    unordered_map<int, char> idx2label;
-    idx2label[0] = 'B';
-    idx2label[1] = 'M';
-    idx2label[2] = 'E';
-    idx2label[3] = 'S';
-
-    // load vocabulary
-    unordered_map<string, int> vocab_dict;
-    ReadVocabMap("../vocab_map", vocab_dict);
-
-    string symbol_file = "../../checkpoint/lstm-symbol.json";
-    string param_file = "../../checkpoint/lstm-0099.params";
-    BufferFile symbol_data(symbol_file);
-    BufferFile param_data(param_file);
-
-    int dev_type = 1; // 1: cpu, 2: gpu
-    int dev_id = 0; // arbitrary
-    mx_uint num_input_nodes = 3; // data, init_c, init_h
-    const char *input_key[3] = { "data", "l0_init_c", "l0_init_h" };
-    const char **input_keys = input_key;
-
-    mx_uint batch_size = 16;
-    mx_uint num_hidden = 300;
-    mx_uint context_size = 7;
-
-    const mx_uint input_shape_indptr[4] = {0, 2, 4, 6};
-    const mx_uint input_shape_data[6] = { batch_size, context_size, batch_size, num_hidden, batch_size, num_hidden };
-
-    PredictorHandle out = 0; // alias for void *
-
-    // Create Predictor
-    MXPredCreate((const char *)symbol_data.GetBuffer(),
-                (const char *)param_data.GetBuffer(),
-                static_cast<size_t>(param_data.GetLength()),
-                dev_type,
-                dev_id,
-                num_input_nodes,
-                input_keys,
-                input_shape_indptr,
-                input_shape_data,
-                &out);
-
-    vector<mx_float> init_c = vector<mx_float>(batch_size * num_hidden);
-    vector<mx_float> init_h = vector<mx_float>(batch_size * num_hidden);
-    string input_str;
-    vector<string> utf8_arr;
-    int window = (int) (context_size - 1) / 2;
-    while (cin>>input_str) {
-        utf8_arr.clear();
-        GetUTF8Vec(input_str, utf8_arr);
-        int num_of_char = utf8_arr.size();
-        int padding_num = batch_size - num_of_char % batch_size;
-        int parts = (num_of_char + padding_num) / batch_size;
-        vector<mx_float> padding_data = vector<mx_float>(parts * batch_size * context_size);
-
-        init_c.clear(); init_h.clear();
-        MXPredSetInput(out, "l0_init_c", init_c.data(), batch_size * num_hidden);
-        MXPredSetInput(out, "l0_init_h", init_h.data(), batch_size * num_hidden);
-        for (size_t i = 0; i < utf8_arr.size(); ++i) {
-            for (int j = -window; j <= window; ++j) {
-                if (i+j < 0 || i+j >= utf8_arr.size()) {
-                    padding_data[i+j+window] = vocab_dict["P"];
-                } else {
-                    if (vocab_dict.find(utf8_arr[i]) != vocab_dict.end()) {
-                        padding_data[i+j+window] = vocab_dict[utf8_arr[i+j]];
-                    } else {
-                        padding_data[i+j+window] = vocab_dict["U"];
-                    }
-                }
-            }
-        }
-        vector<mx_float> data = vector<mx_float>(batch_size * context_size);
-        vector<char> label_result;
-        for (int k = 0; k < parts; ++k) {
-            for (int j = 0; j < batch_size * context_size; ++j) {
-                data[j] = padding_data[k * batch_size + j];
-            }
-
-            MXPredSetInput(out, "data", data.data(), batch_size * context_size);
-            // Do Predict
-            MXPredForward(out);
-            // Get Output
-            mx_uint output_index = 0;
-            mx_uint *shape = 0;
-            mx_uint shape_len;
-
-            MXPredGetOutputShape(out, output_index, &shape, &shape_len);
-            size_t size = 1;
-            for (mx_uint k = 0; k < shape_len; ++k) size *= shape[k];
-            vector<float> result(size);
-
-            MXPredGetOutput(out, output_index, &(result[0]), size);
-            // Print Output Label
-            PrintOutputResult(result, idx2label, shape_len, label_result);
-        }
-
-        for (size_t i = 0; i < utf8_arr.size(); ++i) {
-            switch(label_result[i]) {
-                case 'B': cout << utf8_arr[i]; break;
-                case 'M': cout << utf8_arr[i]; break;
-                case 'E': cout << utf8_arr[i] << " "; break;
-                case 'S': cout << utf8_arr[i] << " "; break;
-            }
-        }
-        cout << endl;
-    }
-    MXPredFree(out);
-}
diff --git a/example/lstm-word-segment/predict/lstm_predict.py b/example/lstm-word-segment/predict/lstm_predict.py
deleted file mode 100755
index abaeba8b646f..000000000000
--- a/example/lstm-word-segment/predict/lstm_predict.py
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import mxnet_predict
-import numpy as np
-import time
-
-symbol_file = '../checkpoint/lstm-symbol.json'
-param_file = '../checkpoint/lstm-0000.params'
-vocab_map = {}
-for line in open('vocab_map'):
-    w, idx = line.strip().split('')
-    vocab_map[w] = int(idx)
-
-# lstm_segment input data format: (batch_size, context_size)
-batch_size = 8
-context_size = 7
-num_hidden = 300
-input_shapes = { 'data':(batch_size, context_size), 
-       'l0_init_c': (batch_size, num_hidden), 'l0_init_h': (batch_size, num_hidden)}
-
-lstm_predict_handle = mxnet_predict.Predictor(open(symbol_file).read(), open(param_file).read(), input_shapes, dev_type='cpu')
-
-input_str = '至于计算机的使用'
-
-def reshape_input(s, context_size):
-    padding_num = int((context_size - 1) / 2)
-    unicode_str = unicode(s, 'utf-8')
-    idx_sen = []
-    for char in unicode_str:
-        schar = char.encode('utf-8')
-        if schar in vocab_map:
-            idx_sen.append(vocab_map[schar])
-        else:
-            # unknown symbol
-            idx_sen.append(vocab_map['U'])
-
-    for _ in range(padding_num):
-        idx_sen.insert(0, vocab_map['P'])
-        idx_sen.append(vocab_map['P'])
-
-    x = []
-    for i in range(len(unicode_str)):
-        x.append(idx_sen[i:i+context_size])
-
-    return np.array(x)
-
-init_c = np.zeros((batch_size, num_hidden))
-init_h = np.zeros((batch_size, num_hidden))
-
-x_data = reshape_input(input_str, context_size)
-num_of_char = x_data.shape[0]
-
-Idx2Label = {0:'B', 1: 'M', 2: 'E', 3: 'S'}
-
-print input_str
-input_data_dict = {'l0_init_c': init_c, 'l0_init_h': init_h, 'data': x_data}
-start = time.time()
-lstm_predict_handle.forward(**input_data_dict)
-output = lstm_predict_handle.get_output(0)
-print [ Idx2Label[x] for x in np.argmax(output, axis=1) ]
-print 'elapsed %.2fs' % (time.time() - start)
-
-# for i in range(num_of_char):
-#     input_data_dict['data'] = x_data[i]
-#     lstm_predict_handle.forward(**input_data_dict)
-#     output = lstm_predict_handle.get_output(0)
-#     print Idx2Label[np.argmax(output, axis=1)[0]]
diff --git a/example/lstm-word-segment/predict/mxnet_predict.py b/example/lstm-word-segment/predict/mxnet_predict.py
deleted file mode 100644
index e0dd766e3b56..000000000000
--- a/example/lstm-word-segment/predict/mxnet_predict.py
+++ /dev/null
@@ -1,210 +0,0 @@
-# coding: utf-8
-# pylint: disable=invalid-name, too-many-arguments
-"""Lightweight API for mxnet prediction.
-
-This is for prediction only, use mxnet python package instead for most tasks.
-"""
-from __future__ import absolute_import
-
-import os
-import sys
-import ctypes
-import numpy as np
-
-__all__ = ["Predictor", "load_ndarray_file"]
-
-if sys.version_info[0] == 3:
-    py_str = lambda x: x.decode('utf-8')
-else:
-    py_str = lambda x: x
-
-def c_str(string):
-    """"Convert a python string to C string."""
-    return ctypes.c_char_p(string.encode('utf-8'))
-
-def c_array(ctype, values):
-    """Create ctypes array from a python array."""
-    return (ctype * len(values))(*values)
-
-def _find_lib_path():
-    """Find mxnet library."""
-    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-    api_path = os.path.join(curr_path, '../../../lib/')
-    dll_path = [curr_path, api_path]
-    dll_path = [os.path.join(p, 'libmxnet.so') for p in dll_path] + \
-        [os.path.join(p, 'libmxnet_predict.so') for p in dll_path]
-    lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
-    if len(lib_path) == 0:
-        raise RuntimeError('Cannot find the files.\n' +
-                           'List of candidates:\n' + str('\n'.join(dll_path)))
-    return lib_path
-
-
-def _load_lib():
-    """Load libary by searching possible path."""
-    lib_path = _find_lib_path()
-    lib = ctypes.cdll.LoadLibrary(lib_path[0])
-    # DMatrix functions
-    lib.MXGetLastError.restype = ctypes.c_char_p
-    return lib
-
-
-def _check_call(ret):
-    """Check the return value of API."""
-    if ret != 0:
-        raise RuntimeError(py_str(_LIB.MXGetLastError()))
-
-_LIB = _load_lib()
-# type definitions
-mx_uint = ctypes.c_uint
-mx_float = ctypes.c_float
-mx_float_p = ctypes.POINTER(mx_float)
-PredictorHandle = ctypes.c_void_p
-NDListHandle = ctypes.c_void_p
-
-devstr2type = {'cpu': 1, 'gpu': 2, 'cpu_pinned': 3}
-
-class Predictor(object):
-    """A predictor class that runs prediction.
-
-    Parameters
-    ----------
-    symbol_json_str : str
-        Path to the symbol file.
-
-    param_raw_bytes : str, bytes
-        The raw parameter bytes.
-
-    input_shapes : dict of str to tuple
-        The shape of input data
-
-    dev_type : str, optional
-        The device type of the predictor.
-
-    dev_id : int, optional
-        The device id of the predictor.
-    """
-    def __init__(self, symbol_file,
-                 param_raw_bytes, input_shapes,
-                 dev_type="cpu", dev_id=0):
-        dev_type = devstr2type[dev_type]
-        indptr = [0]
-        sdata = []
-        keys = []
-        for k, v  in input_shapes.items():
-            if not isinstance(v, tuple):
-                raise ValueError("Expect input_shapes to be dict str->tuple")
-            keys.append(c_str(k))
-            sdata.extend(v)
-            indptr.append(len(sdata))
-        handle = PredictorHandle()
-        param_raw_bytes = bytearray(param_raw_bytes)
-        ptr = (ctypes.c_char * len(param_raw_bytes)).from_buffer(param_raw_bytes)
-        _check_call(_LIB.MXPredCreate(
-            c_str(symbol_file),
-            ptr, len(param_raw_bytes),
-            ctypes.c_int(dev_type), ctypes.c_int(dev_id),
-            mx_uint(len(indptr) - 1),
-            c_array(ctypes.c_char_p, keys),
-            c_array(mx_uint, indptr),
-            c_array(mx_uint, sdata),
-            ctypes.byref(handle)))
-        self.handle = handle
-
-    def __del__(self):
-        _check_call(_LIB.MXPredFree(self.handle))
-
-    def forward(self, **kwargs):
-        """Perform forward to get the output.
-
-        Parameters
-        ----------
-        **kwargs
-            Keyword arguments of input variable name to data.
-
-        Examples
-        --------
-        >>> predictor.forward(data=mydata)
-        >>> out = predictor.get_output(0)
-        """
-        for k, v in kwargs.items():
-            if not isinstance(v, np.ndarray):
-                raise ValueError("Expect numpy ndarray as input")
-            v = np.ascontiguousarray(v, dtype=np.float32)
-            _check_call(_LIB.MXPredSetInput(
-                self.handle, c_str(k),
-                v.ctypes.data_as(mx_float_p),
-                mx_uint(v.size)))
-        _check_call(_LIB.MXPredForward(self.handle))
-
-    def get_output(self, index):
-        """Get the index-th output.
-
-        Parameters
-        ----------
-        index : int
-            The index of output.
-
-        Returns
-        -------
-        out : numpy array.
-            The output array.
-        """
-        pdata = ctypes.POINTER(mx_uint)()
-        ndim = mx_uint()
-        _check_call(_LIB.MXPredGetOutputShape(
-            self.handle, index,
-            ctypes.byref(pdata),
-            ctypes.byref(ndim)))
-        shape = tuple(pdata[:ndim.value])
-        data = np.empty(shape, dtype=np.float32)
-        _check_call(_LIB.MXPredGetOutput(
-            self.handle, mx_uint(index),
-            data.ctypes.data_as(mx_float_p),
-            mx_uint(data.size)))
-        return data
-
-
-def load_ndarray_file(nd_bytes):
-    """Load ndarray file and return as list of numpy array.
-
-    Parameters
-    ----------
-    nd_bytes : str or bytes
-        The internal ndarray bytes
-
-    Returns
-    -------
-    out : dict of str to numpy array or list of numpy array
-        The output list or dict, depending on whether the saved type is list or dict.
-    """
-    handle = NDListHandle()
-    olen = mx_uint()
-    nd_bytes = bytearray(nd_bytes)
-    ptr = (ctypes.c_char * len(nd_bytes)).from_buffer(nd_bytes)
-    _check_call(_LIB.MXNDListCreate(
-        ptr, len(nd_bytes),
-        ctypes.byref(handle), ctypes.byref(olen)))
-    keys = []
-    arrs = []
-
-    for i in range(olen.value):
-        key = ctypes.c_char_p()
-        cptr = mx_float_p()
-        pdata = ctypes.POINTER(mx_uint)()
-        ndim = mx_uint()
-        _check_call(_LIB.MXNDListGet(
-            handle, mx_uint(i), ctypes.byref(key),
-            ctypes.byref(cptr), ctypes.byref(pdata), ctypes.byref(ndim)))
-        shape = tuple(pdata[:ndim.value])
-        dbuffer = (mx_float * np.prod(shape)).from_address(ctypes.addressof(cptr.contents))
-        ret = np.frombuffer(dbuffer, dtype=np.float32).reshape(shape)
-        ret = np.array(ret, dtype=np.float32)
-        keys.append(py_str(key.value))
-        arrs.append(ret)
-    _check_call(_LIB.MXNDListFree(handle))
-
-    if len(keys) == 0 or len(keys[0]) == 0:
-        return arrs
-    else:
-        return {keys[i] : arrs[i] for i in range(len(keys))}
diff --git a/example/lstm-word-segment/train.py b/example/lstm-word-segment/train.py
deleted file mode 100755
index 398481011c77..000000000000
--- a/example/lstm-word-segment/train.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-logs = sys.stderr
-
-import mxnet as mx
-import data_helper
-import lstm
-import time
-
-step_size = 10
-context_size = 5
-batch_size = 64
-num_hidden = 150
-num_embed = 100
-
-print >> logs, 'context size = %d' % context_size
-print >> logs, 'batch size = %d' % batch_size
-print >> logs, 'step size = %d' % step_size
-
-train_path, dev_path = 'train.conll', 'test.conll'
-x, y, vocab = data_helper.load_data(train_path)
-x_dev, y_dev, _ = data_helper.load_data(dev_path, vocab, False)
-# save vocabulary
-vocab_file = open('vocab_map', 'w')
-for k, v in vocab.items():
-    print >> vocab_file, '%s%s' % (k, v)
-vocab_file.close()
-
-print >> logs, 'vocabulary size=%d' % len(vocab)
-num_label = len(data_helper.LabelVocab)
-print >> logs, 'output labels = %d' % num_label
-
-
-X_data, y_data = data_helper.reshape_data(x, y, vocab, context_size, step_size)
-X_dev_data, y_dev_data = data_helper.reshape_data(x_dev, y_dev, vocab, context_size, step_size)
-print >> logs, 'training data shape %s' % str(X_data.shape)
-
-num_epoch = 100
-
-lstm_model = lstm.setup_lstm_model(ctx=mx.gpu(1), num_lstm_layer=1,
-                                   step_size=step_size,
-                                   context_size=context_size,
-                                   num_hidden=num_hidden,
-                                   num_embed=num_embed,
-                                   num_label=num_label,
-                                   batch_size=batch_size,
-                                   vocab_size=len(vocab),
-                                   initializer=mx.initializer.Uniform(0.1),
-                                   dropout=0.5)
-
-# default optimizer is RMSProp, you can choose SGD with learning_rate=0.1
-lstm.train_lstm(lstm_model, X_data, y_data, X_dev_data, y_dev_data, num_epoch=num_epoch,
-		optimizer='rmsprop', learning_rate=0.001)

From 560c3124be99e9a8d6b323f014f2478a60e6376b Mon Sep 17 00:00:00 2001
From: Minjie Wang <wmjlyjemaine@gmail.com>
Date: Mon, 20 Jun 2016 02:17:04 -0400
Subject: [PATCH 039/126] Add minimum operator

---
 python/mxnet/ndarray.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py
index 7a3f5a7f1f67..e8ef02bfdb8c 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray.py
@@ -761,6 +761,32 @@ def maximum(lhs, rhs):
         None)
     # pylint: enable= no-member, protected-access
 
+def minimum(lhs, rhs):
+    """Perform the minimum operation on lhs and rhs.
+
+    Parameters
+    ----------
+    lhs : Array or float value
+        left hand side operand
+
+    rhs : Array or float value
+        right hand side operand
+
+    Returns
+    -------
+    out: Array
+        result array
+    """
+    # pylint: disable= no-member, protected-access
+    return _ufunc_helper(
+        lhs,
+        rhs,
+        NDArray._minimum,
+        lambda x, y: x if x < y else y,
+        NDArray._minimum_scalar,
+        None)
+    # pylint: enable= no-member, protected-access
+
 def true_divide(lhs, rhs):
     """ Same as numpy's true_divide. It adjusts the output type to present the best answer,
     regardless of input types.

From 04754548f392ed9347a1fcd6599343e967c599e5 Mon Sep 17 00:00:00 2001
From: Bing Xu <antinucleon@users.noreply.github.com>
Date: Mon, 20 Jun 2016 14:31:22 -0700
Subject: [PATCH 040/126] allow partial set paramter in module (#2482)

allow partial set parameter in module
---
 python/mxnet/module/base_module.py | 10 ++++++++--
 python/mxnet/module/module.py      |  5 +++--
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/python/mxnet/module/base_module.py b/python/mxnet/module/base_module.py
index 5549965ca702..4465d49437f9 100644
--- a/python/mxnet/module/base_module.py
+++ b/python/mxnet/module/base_module.py
@@ -452,7 +452,7 @@ def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=Non
         """
         raise NotImplementedError()
 
-    def set_params(self, arg_params, aux_params):
+    def set_params(self, arg_params, aux_params, allow_missing=False, force_init=True):
         """Assign parameter and aux state values.
 
         Parameters
@@ -461,9 +461,15 @@ def set_params(self, arg_params, aux_params):
             Dictionary of name to value (`NDArray`) mapping.
         aux_params : dict
             Dictionary of name to value (`NDArray`) mapping.
+        allow_missing : bool
+            If true, params could contain missing values, and the initializer will be
+            called to fill those missing params.
+        force_init : bool
+            If true, will force re-initialize even if already initialized.
+
         """
         self.init_params(initializer=None, arg_params=arg_params, aux_params=aux_params,
-                         allow_missing=False, force_init=True)
+                         allow_missing=allow_missing, force_init=force_init)
 
     def save_params(self, fname):
         """Save model parameters to file.
diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py
index 3156ac98948c..e327a6b4c9f9 100644
--- a/python/mxnet/module/module.py
+++ b/python/mxnet/module/module.py
@@ -169,7 +169,7 @@ def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=Non
         def _impl(name, arr, cache):
             """Internal helper for parameter initialization"""
             if cache is not None:
-                if cache.has_key(name):
+                if name in cache:
                     cache_arr = cache[name]
 
                     # just in case the cached array is just the target itself
@@ -177,7 +177,8 @@ def _impl(name, arr, cache):
                         cache_arr.copyto(arr)
                 else:
                     assert allow_missing
-                    initializer(name, arr)
+                    if initializer != None:
+                        initializer(name, arr)
             else:
                 initializer(name, arr)
 

From c39abe4e117e8ab987efa04b4ad7b1bcd5505317 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Mon, 20 Jun 2016 20:29:04 -0700
Subject: [PATCH 041/126] [OP] Fix target_shape infer in deconv (#2483)

---
 src/operator/cudnn_deconvolution-inl.h |  7 ++-
 src/operator/deconvolution-inl.h       | 82 +++++++++++++++++---------
 src/operator/matrix_op-inl.h           |  1 -
 3 files changed, 59 insertions(+), 31 deletions(-)

diff --git a/src/operator/cudnn_deconvolution-inl.h b/src/operator/cudnn_deconvolution-inl.h
index f804419b9c4f..b937b88e1bcd 100644
--- a/src/operator/cudnn_deconvolution-inl.h
+++ b/src/operator/cudnn_deconvolution-inl.h
@@ -49,6 +49,7 @@ class CuDNNDeconvolutionOp : public Operator {
     Tensor<gpu, 4, DType> data = in_data[deconv::kData].get<gpu, 4, DType>(s);
     Tensor<gpu, 4, DType> wmat = in_data[deconv::kWeight].get<gpu, 4, DType>(s);
     Tensor<gpu, 4, DType> out = out_data[deconv::kOut].get<gpu, 4, DType>(s);
+
     CHECK_EQ(data.CheckContiguous(), true);
     CHECK_EQ(wmat.CheckContiguous(), true);
     CHECK_EQ(out.CheckContiguous(), true);
@@ -216,6 +217,8 @@ class CuDNNDeconvolutionOp : public Operator {
       size_t back_size_w = 0;
       Tensor<gpu, 4, DType> data = in_data[deconv::kData].get<gpu, 4, DType>(s);
       Tensor<gpu, 4, DType> out = out_data[deconv::kOut].get<gpu, 4, DType>(s);
+      index_t pad_y, pad_x, adj_y, adj_x;
+      param_.InferPad(data.size(2), data.size(3), &pad_y, &pad_x, &adj_y, &adj_x);
       data_offset_ = data.shape_[1] / param_.num_group * data.shape_[2] * data.shape_[3];
       out_offset_ = out.shape_[1] /param_.num_group * out.shape_[2] * out.shape_[3];
       weight_offset_ = data.shape_[1] / param_.num_group * param_.num_filter / param_.num_group
@@ -242,8 +245,8 @@ class CuDNNDeconvolutionOp : public Operator {
                                           param_.kernel[1]), CUDNN_STATUS_SUCCESS);
       #endif
       CHECK_EQ(cudnnSetConvolution2dDescriptor(conv_desc_,
-                                               param_.pad[0],
-                                               param_.pad[1],
+                                               pad_y,
+                                               pad_x,
                                                param_.stride[0],
                                                param_.stride[1],
                                                1,
diff --git a/src/operator/deconvolution-inl.h b/src/operator/deconvolution-inl.h
index 97a1768d733b..c8a732f6f42b 100644
--- a/src/operator/deconvolution-inl.h
+++ b/src/operator/deconvolution-inl.h
@@ -41,25 +41,51 @@ struct DeconvolutionParam : public dmlc::Parameter<DeconvolutionParam> {
     int shape[] = {1, 1};
     DMLC_DECLARE_FIELD(kernel).describe("deconvolution kernel size: (y, x)");
     DMLC_DECLARE_FIELD(stride).set_default(TShape(shape, shape + 2))
-    .describe("deconvolution stride: (y, x)");
+        .describe("deconvolution stride: (y, x)");
     shape[0] = shape[1] = 0;
     DMLC_DECLARE_FIELD(pad).set_default(TShape(shape, shape + 2))
-    .describe("pad for deconvolution: (y, x), a good number is : (kernel-1)/2, "
-              "if target_shape set, pad will be ignored and will be computed "
-              "automatically");
+        .describe("pad for deconvolution: (y, x), a good number is : (kernel-1)/2, "
+                  "if target_shape set, pad will be ignored and will be computed "
+                  "automatically");
     DMLC_DECLARE_FIELD(adj).set_default(TShape(shape, shape + 2))
-    .describe("adjustment for output shape: (y, x), if target_shape set, adj "
-               "will be ignored and will be computed automatically");
+        .describe("adjustment for output shape: (y, x), if target_shape set, adj "
+                  "will be ignored and will be computed automatically");
     DMLC_DECLARE_FIELD(target_shape).set_default(TShape(shape, shape + 2))
-    .describe("output shape with targe shape : (y, x)");
+        .describe("output shape with targe shape : (y, x)");
     DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000)
-    .describe("deconvolution filter(channel) number");
+        .describe("deconvolution filter(channel) number");
     DMLC_DECLARE_FIELD(num_group).set_default(1)
-    .describe("number of groups partition");
+        .describe("number of groups partition");
     DMLC_DECLARE_FIELD(workspace).set_default(512).set_range(0, 8192)
-    .describe("Tmp workspace for deconvolution (MB)");
+        .describe("Tmp workspace for deconvolution (MB)");
     DMLC_DECLARE_FIELD(no_bias).set_default(true)
-    .describe("Whether to disable bias parameter.");
+        .describe("Whether to disable bias parameter.");
+  }
+
+  inline void InferPad(index_t input_y, index_t input_x,  // input spatial size (y, x)
+                       index_t* o_pad_y, index_t* o_pad_x,  // out: padding (y, x)
+                       index_t* o_adj_y, index_t* o_adj_x) const {  // out: adjustment (y, x)
+    index_t& pad_y = *o_pad_y;
+    index_t& pad_x = *o_pad_x;
+    index_t& adj_y = *o_adj_y;
+    index_t& adj_x = *o_adj_x;
+    if (target_shape[0] != 0 || target_shape[1] != 0) {  // target_shape set: infer pad/adj
+      pad_y = stride[0] * (input_y - 1) + kernel[0];  // extent with pad = 0 and adj = 0
+      pad_x = stride[1] * (input_x - 1) + kernel[1];
+      CHECK_GE(pad_y, target_shape[0])
+          << "too big target shape";
+      CHECK_GE(pad_x, target_shape[1])
+          << "too big target shape";
+      pad_y -= target_shape[0];  // excess over the target; must equal 2 * pad - adj
+      pad_x -= target_shape[1];
+      adj_y = pad_y % 2; pad_y = (pad_y + 1) / 2;  // odd excess: adj = 1, pad rounded up
+      adj_x = pad_x % 2; pad_x = (pad_x + 1) / 2;
+    } else {  // no target_shape: use the pad and adj parameters as given
+      pad_y = pad[0];
+      pad_x = pad[1];
+      adj_y = adj[0];
+      adj_x = adj[1];
+    }
+  }
 };
 
@@ -86,6 +112,10 @@ class DeconvolutionOp : public Operator {
     Stream<xpu> *s = ctx.get_stream<xpu>();
     Tensor<xpu, 4, DType> data = in_data[deconv::kData].get<xpu, 4, DType>(s);
     Tensor<xpu, 4, DType> out = out_data[deconv::kOut].get<xpu, 4, DType>(s);
+
+    index_t pad_y, pad_x, adj_y, adj_x;
+    param_.InferPad(data.size(2), data.size(3), &pad_y, &pad_x, &adj_y, &adj_x);
+
     Shape<3> wmat_shape =
         Shape3(param_.num_group,
                data.shape_[1] / param_.num_group,
@@ -112,7 +142,7 @@ class DeconvolutionOp : public Operator {
                                            shape_dstunit_[1],
                                            shape_dstunit_[2] * step), s);
       temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_);
-      if (param_.pad[0] == 0 && param_.pad[1] == 0) {
+      if (pad_y == 0 && pad_x == 0) {
         temp_col = unpack_patch2col(out.Slice(i, i + step),
                                     param_.kernel[0],
                                     param_.kernel[1],
@@ -121,7 +151,7 @@ class DeconvolutionOp : public Operator {
                                     1, 1);  // Deconvolution only support dilate equals 1
       } else {
         temp_col = unpack_patch2col(pad(out.Slice(i, i + step),
-                                        param_.pad[0], param_.pad[1]),
+                                        pad_y, pad_x),
                                     param_.kernel[0],
                                     param_.kernel[1],
                                     param_.stride[0],
@@ -134,7 +164,7 @@ class DeconvolutionOp : public Operator {
                                               gstride * (gid + 1));
         tmpc = dot(wmat[gid].T(), temp_dst[gid]);
       }
-      if (param_.pad[0] == 0 && param_.pad[1] == 0) {
+      if (pad_y == 0 && pad_x == 0) {
         out.Slice(i, i + step) = pack_col2patch(temp_col,
                                    out.Slice(i, i + step).shape_,
                                    param_.kernel[0],
@@ -143,8 +173,8 @@ class DeconvolutionOp : public Operator {
                                    1);  // Deconvolution only support dilate equals 1
       } else {
         Shape<4> pshape = out.Slice(i, i + step).shape_;
-        pshape[2] += 2 * param_.pad[0];
-        pshape[3] += 2 * param_.pad[1];
+        pshape[2] += 2 * pad_y;
+        pshape[3] += 2 * pad_x;
         out.Slice(i, i + step) = crop(pack_col2patch(temp_col,
                                         pshape,
                                         param_.kernel[0],
@@ -193,6 +223,9 @@ class DeconvolutionOp : public Operator {
     CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
         << "Must init CuBLAS handle in stream";
 #endif
+    index_t pad_y, pad_x, adj_y, adj_x;
+    param_.InferPad(data.size(2), data.size(3), &pad_y, &pad_x, &adj_y, &adj_x);
+
     const index_t nbatch = data.size(0);
     Tensor<xpu, 1, DType> workspace =
         ctx.requested[deconv::kTempSpace].get_space_typed<xpu, 1, DType>(
@@ -209,7 +242,7 @@ class DeconvolutionOp : public Operator {
                                            shape_dstunit_[1],
                                            shape_dstunit_[2] * step), s);
       temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_);
-      if (param_.pad[0] == 0 && param_.pad[1] == 0) {
+      if (pad_y == 0 && pad_x == 0) {
         temp_col = unpack_patch2col(grad.Slice(i, i + step),
                                      param_.kernel[0],
                                      param_.kernel[1],
@@ -217,7 +250,7 @@ class DeconvolutionOp : public Operator {
                                      param_.stride[1],
                                      1, 1);  // Deconvolution only support dilate equals 1
       } else {
-        temp_col = unpack_patch2col(pad(grad.Slice(i, i + step), param_.pad[0], param_.pad[1]),
+        temp_col = unpack_patch2col(pad(grad.Slice(i, i + step), pad_y, pad_x),
                                      param_.kernel[0],
                                      param_.kernel[1],
                                      param_.stride[0],
@@ -332,18 +365,11 @@ class DeconvolutionProp : public OperatorProperty {
     }
     out_shape->clear();
     out_shape->push_back(dshape);
+    // osize = stride * (isize - 1) + ksize - 2 * pad + adj
     const index_t ksize_y = static_cast<index_t>(param_.kernel[0]);
     const index_t ksize_x = static_cast<index_t>(param_.kernel[1]);
-    const index_t pad_y = static_cast<index_t>(param_.target_shape[0] > 0 ?
-        (ksize_y - 1) / 2 : param_.pad[0]);
-    const index_t pad_x = static_cast<index_t>(param_.target_shape[1] > 0 ?
-        (ksize_x - 1) / 2 : param_.pad[1]);
-    const index_t adj_y = static_cast<index_t>(param_.target_shape[0] > 0 ?
-        (param_.target_shape[0] + 2 * pad_y - ksize_y) %
-        param_.stride[0] : param_.adj[0]);
-    const index_t adj_x = static_cast<index_t>(param_.target_shape[1] > 0 ?
-        (param_.target_shape[1] + 2 * pad_x - ksize_x) %
-        param_.stride[1] : param_.adj[1]);
+    index_t pad_y, pad_x, adj_y, adj_x;
+    param_.InferPad(dshape[2], dshape[3], &pad_y, &pad_x, &adj_y, &adj_x);
     CHECK_EQ(dshape[1] % param_.num_group, 0) \
         << "input num_filter must divide group size";
     CHECK_EQ(param_.num_filter % param_.num_group, 0) \
diff --git a/src/operator/matrix_op-inl.h b/src/operator/matrix_op-inl.h
index cb0172b478f4..de20ff54f2e8 100644
--- a/src/operator/matrix_op-inl.h
+++ b/src/operator/matrix_op-inl.h
@@ -356,7 +356,6 @@ void Slice(const TBlob &src,
   using namespace mshadow::expr;
   SliceParam param;
   param.Init(env.kwargs);
-
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
   int ndim = static_cast<int>(ret->shape_.ndim());
 

From 3655bd539e7e279a84b2969ff7c930935dc5066c Mon Sep 17 00:00:00 2001
From: qiao hai-jun <qiaohaijun@users.noreply.github.com>
Date: Tue, 21 Jun 2016 12:42:40 +0800
Subject: [PATCH 042/126] create mxnet/docs/zh/packages/python/ndarray.md
 (#2487)

---
 docs/zh/packages/python/ndarray.md | 161 +++++++++++++++++++++++++++++
 1 file changed, 161 insertions(+)
 create mode 100644 docs/zh/packages/python/ndarray.md

diff --git a/docs/zh/packages/python/ndarray.md b/docs/zh/packages/python/ndarray.md
new file mode 100644
index 000000000000..3b2fad0a199c
--- /dev/null
+++ b/docs/zh/packages/python/ndarray.md
@@ -0,0 +1,161 @@
+NDArray API
+===========
+
+NDArray 程序包 (`mxnet.ndarray`) 包含类似于 `numpy.ndarray` 的 张量计算包.  它的语法很相近, 除了增加了一些处理 I/O 和多设备的调用.
+
+Create NDArray
+--------------
+
+类似 `numpy`, 你可以按照下面的方式来创建 `mxnet.ndarray` :
+```python
+>>> import mxnet as mx
+>>> # all-zero array of dimension 100x50
+>>> a = mx.nd.zeros((100, 50))
+>>> # all-one array of dimension 256x32x128x1
+>>> b = mx.nd.ones((256, 32, 128, 1))
+>>> # initialize array with contents
+>>> c = mx.nd.array([[1, 2, 3], [4, 5, 6]])
+```
+
+NDArray operations
+-------------------
+
+我们提供了几个基本的 ndarray 操作, 比如说算术和切片. 更多的操作正在开发中!
+
+### 算术操作
+```python
+>>> import mxnet as mx
+>>> a = mx.nd.zeros((100, 50))
+>>> a.shape
+(100L, 50L)
+>>> b = mx.nd.ones((100, 50))
+>>> # c and d will be calculated in parallel here!
+>>> c = a + b
+>>> d = a - b
+>>> # inplace operation, b's contents will be modified, but c and d won't be affected.
+>>> b += d
+```
+
+### 切片操作
+```python
+>>> import mxnet as mx
+>>> a = mx.nd.zeros((100, 50))
+>>> a[0:10] = 1   # first 10 rows will become 1
+```
+
+Conversion from/to `numpy.ndarray`
+----------------------------------
+
+MXNet NDArray 提供了很自然的方式来支持`mxnet.ndarray` 和 `numpy.ndarray` 之间的互相转换:
+
+```python
+>>> import mxnet as mx
+>>> import numpy as np
+>>> a = np.array([1,2,3])
+>>> b = mx.nd.array(a)                  # convert from numpy array
+>>> b
+<mxnet.ndarray.NDArray object at ...>
+>>> b.asnumpy()                         # convert to numpy array
+array([ 1., 2., 3.], dtype=float32)
+```
+
+Save Load NDArray
+-----------------
+
+你始终可以使用 pickle 来保存和加载 NDArray.
+我们也提供了一些函数来简化 NDArray 的列表或者字典的加载与保存操作.
+
+```python
+>>> import mxnet as mx
+>>> a = mx.nd.zeros((100, 200))
+>>> b = mx.nd.zeros((100, 200))
+>>> # save list of NDArrays
+>>> mx.nd.save("/path/to/array/file", [a, b])
+>>> # save dictionary of NDArrays to AWS S3
+>>> mx.nd.save("s3://path/to/s3/array", {'A' : a, 'B' : b})
+>>> # save list of NDArrays to hdfs.
+>>> mx.nd.save("hdfs://path/to/hdfs/array", [a, b])
+>>> from_file = mx.nd.load("/path/to/array/file")
+>>> from_s3 = mx.nd.load("s3://path/to/s3/array")
+>>> from_hdfs = mx.nd.load("hdfs://path/to/hdfs/array")
+```
+
+使用 `save` 和 `load` 的好处是:
+- 你可以在所有的 `mxnet` 的其他编程语言的绑定中使用相同的接口.
+- 已经支持 S3 和 HDFS
+
+Multi-device Support
+--------------------
+设备信息是存储在 `mxnet.Context` 数据结构中. 当我们在 mxnet 中创建 ndarray 的时候, 我们要么使用上下文参数(默认是 CPU 上下文) 在指定的设备上创建, 或者按照下面的例子中的方式使用 `with` 表达式:
+
+```python
+>>> import mxnet as mx
+>>> cpu_a = mx.nd.zeros((100, 200))
+>>> cpu_a.context
+cpu(0)
+>>> with mx.Context(mx.gpu(0)):
+>>>   gpu_a = mx.nd.ones((100, 200))
+>>> gpu_a.context
+gpu(0)
+>>> ctx = mx.Context(mx.gpu(0))
+>>> gpu_b = mx.nd.zeros((100, 200), ctx)
+>>> gpu_b.context
+gpu(0)
+```
+
+现在我们还 *不支持* 涉及不同上下文环境中的多个 ndarray 的操作. 为了支持这种情况下的操作, 我们首先使用 `copyto` 方法将不同的上下文环境中的 ndarray 拷贝到同一个上下文环境中, 然后执行相应的操作:
+
+```python
+>>> import mxnet as mx
+>>> x = mx.nd.zeros((100, 200))
+>>> with mx.Context(mx.gpu(0)):
+>>>   y = mx.nd.zeros((100, 200))
+>>> z = x + y
+mxnet.base.MXNetError: [13:29:12] src/ndarray/ndarray.cc:33: Check failed: lhs.ctx() == rhs.ctx() operands context mismatch
+>>> cpu_y = mx.nd.zeros((100, 200))
+>>> y.copyto(cpu_y)
+>>> z = x + cpu_y
+```
+
+```eval_rst
+.. raw:: html
+
+    <script type="text/javascript" src='../../_static/js/auto_module_index.js'></script>
+```
+
+NDArray API Reference
+---------------------
+
+```eval_rst
+.. automodule:: mxnet.ndarray
+    :members:
+
+.. raw:: html
+
+    <script>auto_index("mxnet.ndarray");</script>
+```
+
+NDArray Random API Reference
+----------------------------
+
+```eval_rst
+.. automodule:: mxnet.random
+    :members:
+
+.. raw:: html
+
+    <script>auto_index("mxnet.random");</script>
+```
+
+
+Context API Reference
+---------------------
+
+```eval_rst
+.. automodule:: mxnet.context
+    :members:
+
+.. raw:: html
+
+    <script>auto_index("mxnet.context");</script>
+```

From 8e88cb7148fb50c1191b64b2bd3f2d33de023c05 Mon Sep 17 00:00:00 2001
From: Bing Xu <antinucleon@users.noreply.github.com>
Date: Tue, 21 Jun 2016 02:33:38 -0700
Subject: [PATCH 043/126] end 2 end neural art example (#2490)

---
 example/neural-style/README.md                |   2 +
 example/neural-style/end_to_end/README.md     |  20 +++
 example/neural-style/end_to_end/basic.py      | 162 ++++++++++++++++++
 .../end_to_end/boost_inference.py             |  38 ++++
 .../neural-style/end_to_end/boost_train.py    | 147 ++++++++++++++++
 .../end_to_end/data_processing.py             |  67 ++++++++
 example/neural-style/end_to_end/gen_v3.py     |  72 ++++++++
 example/neural-style/end_to_end/gen_v4.py     |  86 ++++++++++
 8 files changed, 594 insertions(+)
 create mode 100644 example/neural-style/end_to_end/README.md
 create mode 100644 example/neural-style/end_to_end/basic.py
 create mode 100644 example/neural-style/end_to_end/boost_inference.py
 create mode 100644 example/neural-style/end_to_end/boost_train.py
 create mode 100644 example/neural-style/end_to_end/data_processing.py
 create mode 100644 example/neural-style/end_to_end/gen_v3.py
 create mode 100644 example/neural-style/end_to_end/gen_v4.py

diff --git a/example/neural-style/README.md b/example/neural-style/README.md
index 50402cc4de07..31c29c3513e3 100644
--- a/example/neural-style/README.md
+++ b/example/neural-style/README.md
@@ -21,3 +21,5 @@ It takes 30 secs for a Titan X to generate the above 600x400 image.
 * The current implementation is based the
   [torch implementation](https://github.com/jcjohnson/neural-style). But we may
   change it dramatically in the near future.
+
+* We will release a multi-GPU version soon.
diff --git a/example/neural-style/end_to_end/README.md b/example/neural-style/end_to_end/README.md
new file mode 100644
index 000000000000..2f19bf51abe4
--- /dev/null
+++ b/example/neural-style/end_to_end/README.md
@@ -0,0 +1,20 @@
+# End to End Neural Art
+
+This is an implementation of the blog post: [http://dmlc.ml/mxnet/2016/06/20/end-to-end-neural-style.html](http://dmlc.ml/mxnet/2016/06/20/end-to-end-neural-style.html)
+
+
+We will release multi-GPU training code soon.
+
+## How to use
+
+
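+1. First, use `download.sh` to download the pre-trained model and sample inputs.
+
+2. Then prepare the training dataset as described in the blog post.
+
+3. Modify [boost_train.py](boost_train.py) to match your setup (e.g. the data directory, style image and VGG parameter paths near the top).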
+
+## Pretrained Model
+
+* Weights: [https://github.com/dmlc/web-data/raw/master/mxnet/art/model.zip](https://github.com/dmlc/web-data/raw/master/mxnet/art/model.zip)
+* Inference: [boost_inference.py](boost_inference.py)
diff --git a/example/neural-style/end_to_end/basic.py b/example/neural-style/end_to_end/basic.py
new file mode 100644
index 000000000000..ed9d3f601554
--- /dev/null
+++ b/example/neural-style/end_to_end/basic.py
@@ -0,0 +1,162 @@
+import sys
+sys.path.insert(0, "../../mxnet/python/")
+
+import mxnet as mx
+import numpy as np
+import model_vgg19 as vgg
+
+class PretrainedInit(mx.init.Initializer):  # fills arrays from a dict of pretrained "arg:"/"aux:" params
+    def __init__(self, prefix, params, verbose=False):
+        self.prefix_len = len(prefix) + 1  # chars stripped from each name: the prefix plus one separator
+        self.verbose = verbose
+        self.arg_params = {k : v for k, v in params.items() if k.startswith("arg:")}
+        self.aux_params = {k : v for k, v in params.items() if k.startswith("aux:")}
+        self.arg_names = set([k[4:] for k in self.arg_params.keys()])  # names without the 4-char "arg:" tag
+        self.aux_names = set([k[4:] for k in self.aux_params.keys()])  # names without the 4-char "aux:" tag
+
+    def __call__(self, name, arr):
+        key = name[self.prefix_len:]  # drop the module prefix to get the pretrained name
+        if key in self.arg_names:
+            if self.verbose:
+                print("Init %s" % name)
+            self.arg_params["arg:" + key].copyto(arr)
+        elif key in self.aux_names:  # must test the stripped names: aux_params keys still carry "aux:"
+            if self.verbose:
+                print("Init %s" % name)
+            self.aux_params["aux:" + key].copyto(arr)
+        else:  # not present in the pretrained dict: report it and zero-fill
+            print("Unknown params: %s, init with 0" % name)
+            arr[:] = 0.
+
+
+def style_gram_symbol(input_shape, style):  # returns (Group of one Gram symbol per style output, list of per-output scales)
+    _, output_shapes, _ = style.infer_shape(**input_shape)
+    gram_list = []
+    grad_scale = []
+    for i in range(len(style.list_outputs())):
+        shape = output_shapes[i]
+        x = mx.sym.Reshape(style[i], shape=(int(shape[1]), int(np.prod(shape[2:]))))  # shape[0] is dropped; presumably batch size 1 - TODO confirm
+        # use fully connected to quickly do dot(x, x^T)
+        gram = mx.sym.FullyConnected(x, x, no_bias=True, num_hidden=shape[1])
+        gram_list.append(gram)
+        grad_scale.append(np.prod(shape[1:]) * shape[1])  # callers in this example divide the style weight by this value
+    return mx.sym.Group(gram_list), grad_scale
+
+
+def get_loss(gram, content):
+    gram_loss = []
+    for i in range(len(gram.list_outputs())):
+        gvar = mx.sym.Variable("target_gram_%d" % i)
+        gram_loss.append(mx.sym.sum(mx.sym.square(gvar - gram[i])))
+    cvar = mx.sym.Variable("target_content")
+    content_loss = mx.sym.sum(mx.sym.square(cvar - content))
+    return mx.sym.Group(gram_loss), content_loss
+
+def get_content_module(prefix, dshape, ctx, params):
+    sym = vgg.get_vgg_symbol(prefix, True)
+    init = PretrainedInit(prefix, params)
+    mod = mx.mod.Module(symbol=sym,
+                        data_names=("%s_data" % prefix,),
+                        label_names=None,
+                        context=ctx)
+    mod.bind(data_shapes=[("%s_data" % prefix, dshape)], for_training=False)
+    mod.init_params(init)
+    return mod
+
+def get_style_module(prefix, dshape, ctx, params):
+    input_shape = {"%s_data" % prefix : dshape}
+    style, content = vgg.get_vgg_symbol(prefix)
+    gram, gscale = style_gram_symbol(input_shape, style)
+    init = PretrainedInit(prefix, params)
+    mod = mx.mod.Module(symbol=gram,
+                        data_names=("%s_data" % prefix,),
+                        label_names=None,
+                        context=ctx)
+    mod.bind(data_shapes=[("%s_data" % prefix, dshape)], for_training=False)
+    mod.init_params(init)
+    return mod
+
+
+def get_loss_module(prefix, dshape, ctx, params):
+    input_shape = {"%s_data" % prefix : dshape}
+    style, content = vgg.get_vgg_symbol(prefix)
+    gram, gscale = style_gram_symbol(input_shape, style)
+    style_loss, content_loss = get_loss(gram, content)
+    sym = mx.sym.Group([style_loss, content_loss])
+    init = PretrainedInit(prefix, params)
+    gram_size = len(gram.list_outputs())
+    mod = mx.mod.Module(symbol=sym,
+                        data_names=("%s_data" % prefix,),
+                        label_names=None,
+                        context=ctx)
+    mod.bind(data_shapes=[("%s_data" % prefix, dshape)],
+             for_training=True, inputs_need_grad=True)
+    mod.init_params(init)
+    return mod, gscale
+
+
+
+if __name__ == "__main__":
+    from data_processing import PreprocessContentImage, PreprocessStyleImage
+    from data_processing import PostprocessImage, SaveImage
+    vgg_params = mx.nd.load("./model/vgg19.params")
+    style_weight = 2
+    content_weight = 10
+    long_edge = 384
+    content_np = PreprocessContentImage("./input/IMG_4343.jpg", long_edge)
+    style_np = PreprocessStyleImage("./input/starry_night.jpg", shape=content_np.shape)
+    dshape = content_np.shape
+    ctx = mx.gpu()
+    # style
+    style_mod = get_style_module("style", dshape, ctx, vgg_params)
+    style_mod.forward(mx.io.DataBatch([mx.nd.array(style_np)], [0]), is_train=False)
+    style_array = [arr.copyto(mx.cpu()) for arr in style_mod.get_outputs()]
+    del style_mod
+    # content
+    content_mod = get_content_module("content", dshape, ctx, vgg_params)
+    content_mod.forward(mx.io.DataBatch([mx.nd.array(content_np)], [0]), is_train=False)
+    content_array = content_mod.get_outputs()[0].copyto(mx.cpu())
+    del content_mod
+    # loss
+    mod, gscale = get_loss_module("loss", dshape, ctx, vgg_params)
+    extra_args = {"target_gram_%d" % i : style_array[i] for i in range(len(style_array))}
+    extra_args["target_content"] = content_array
+    mod.set_params(extra_args, {}, True, True)
+    grad_array = []
+    for i in range(len(style_array)):
+        grad_array.append(mx.nd.ones((1,), ctx) * (float(style_weight) / gscale[i]))
+    grad_array.append(mx.nd.ones((1,), ctx) * (float(content_weight)))
+    # train
+    img = mx.nd.zeros(content_np.shape, ctx=ctx)
+    img[:] = mx.rnd.uniform(-0.1, 0.1, img.shape)
+    lr = mx.lr_scheduler.FactorScheduler(step=80, factor=.9)
+    optimizer = mx.optimizer.SGD(
+            learning_rate = 0.001,
+            wd = 0.0005,
+            momentum=0.9,
+            lr_scheduler = lr)
+    optim_state = optimizer.create_state(0, img)
+
+    old_img = img.copyto(ctx)
+    clip_norm = 1 * np.prod(img.shape)
+
+    import logging
+    for e in range(800):
+        mod.forward(mx.io.DataBatch([img], [0]), is_train=True)
+        mod.backward(grad_array)
+        data_grad = mod.get_input_grads()[0]
+        gnorm = mx.nd.norm(data_grad).asscalar()
+        if gnorm > clip_norm:
+            print("Data Grad: ", gnorm / clip_norm)
+            data_grad[:] *= clip_norm / gnorm
+
+        optimizer.update(0, img, data_grad, optim_state)
+        new_img = img
+        eps = (mx.nd.norm(old_img - new_img) / mx.nd.norm(new_img)).asscalar()
+        old_img = new_img.copyto(ctx)
+        logging.info('epoch %d, relative change %f', e, eps)
+        if (e+1) % 50 == 0:
+            SaveImage(new_img.asnumpy(), 'output/tmp_'+str(e+1)+'.jpg')
+
+    SaveImage(new_img.asnumpy(), "./output/out.jpg")
+
diff --git a/example/neural-style/end_to_end/boost_inference.py b/example/neural-style/end_to_end/boost_inference.py
new file mode 100644
index 000000000000..72427bedc7a6
--- /dev/null
+++ b/example/neural-style/end_to_end/boost_inference.py
@@ -0,0 +1,38 @@
+import sys
+sys.path.insert(0, "../mxnet/python")
+
+import mxnet as mx
+import numpy as np
+
+#import basic
+import data_processing
+import gen_v3
+import gen_v4
+
+dshape = (1, 3, 480, 640)
+clip_norm = 1.0 * np.prod(dshape)
+model_prefix = "./model/"
+ctx = mx.gpu(0)
+
+
+
+# generator
+gens = [gen_v4.get_module("g0", dshape, ctx),
+        gen_v3.get_module("g1", dshape, ctx),
+        gen_v3.get_module("g2", dshape, ctx),
+        gen_v4.get_module("g3", dshape, ctx)]
+for i in range(len(gens)):
+    gens[i].load_params("./model/%d/v3_0002-0026000.params" % i)
+
+content_np = data_processing.PreprocessContentImage("../IMG_4343.jpg", min(dshape[2:]), dshape)
+data = [mx.nd.array(content_np)]
+for i in range(len(gens)):
+    gens[i].forward(mx.io.DataBatch([data[-1]], [0]), is_train=False)
+    new_img = gens[i].get_outputs()[0]
+    data.append(new_img.copyto(mx.cpu()))
+    data_processing.SaveImage(new_img.asnumpy(), "out_%d.jpg" % i)
+
+
+import os
+os.system("rm -rf out.zip")
+os.system("zip out.zip out_*")
diff --git a/example/neural-style/end_to_end/boost_train.py b/example/neural-style/end_to_end/boost_train.py
new file mode 100644
index 000000000000..9100cc1875a2
--- /dev/null
+++ b/example/neural-style/end_to_end/boost_train.py
@@ -0,0 +1,147 @@
+import sys
+sys.path.insert(0, "../../mxnet/python")
+
+import mxnet as mx
+import numpy as np
+
+import basic
+import data_processing
+import gen_v3
+import gen_v4
+
+# params
+vgg_params = mx.nd.load("./vgg19.params")
+style_weight = 1.2
+content_weight = 10
+dshape = (1, 3, 384, 384)
+clip_norm = 0.05 * np.prod(dshape)
+model_prefix = "v3"
+ctx = mx.gpu(0)
+
+# init style
+style_np = data_processing.PreprocessStyleImage("../starry_night.jpg", shape=dshape)
+style_mod = basic.get_style_module("style", dshape, ctx, vgg_params)
+style_mod.forward(mx.io.DataBatch([mx.nd.array(style_np)], [0]), is_train=False)
+style_array = [arr.copyto(mx.cpu()) for arr in style_mod.get_outputs()]
+del style_mod
+
+# content
+content_mod = basic.get_content_module("content", dshape, ctx, vgg_params)
+
+# loss
+loss, gscale = basic.get_loss_module("loss", dshape, ctx, vgg_params)
+extra_args = {"target_gram_%d" % i : style_array[i] for i in range(len(style_array))}
+loss.set_params(extra_args, {}, True, True)
+grad_array = []
+for i in range(len(style_array)):
+    grad_array.append(mx.nd.ones((1,), ctx) * (float(style_weight) / gscale[i]))
+grad_array.append(mx.nd.ones((1,), ctx) * (float(content_weight)))
+
+# generator
+gens = [gen_v4.get_module("g0", dshape, ctx),
+        gen_v3.get_module("g1", dshape, ctx),
+        gen_v3.get_module("g2", dshape, ctx),
+        gen_v4.get_module("g3", dshape, ctx)]
+for gen in gens:
+    gen.init_optimizer(
+        optimizer='sgd',
+        optimizer_params={
+            'learning_rate': 1e-4,
+            'momentum' : 0.9,
+            'wd': 5e-3,
+            'clip_gradient' : 5.0
+        })
+
+
+# tv-loss
+def get_tv_grad_executor(img, ctx, tv_weight):
+    """Create the TV gradient executor with its input bound to img; returns None if tv_weight <= 0.
+    """
+    if tv_weight <= 0.0:
+        return None  # NOTE(review): callers must handle None before calling .forward()
+    nchannel = img.shape[1]
+    simg = mx.sym.Variable("img")
+    skernel = mx.sym.Variable("kernel")
+    channels = mx.sym.SliceChannel(simg, num_outputs=nchannel)
+    out = mx.sym.Concat(*[
+        mx.sym.Convolution(data=channels[i], weight=skernel,
+                           num_filter=1,
+                           kernel=(3, 3), pad=(1,1),
+                           no_bias=True, stride=(1,1))
+        for i in range(nchannel)])  # the same 3x3 kernel is applied to every channel separately
+    kernel = mx.nd.array(np.array([[0, -1, 0],
+                                   [-1, 4, -1],
+                                   [0, -1, 0]])
+                         .reshape((1, 1, 3, 3)),
+                         ctx) / 8.0
+    out = out * tv_weight
+    return out.bind(ctx, args={"img": img,
+                               "kernel": kernel})
+tv_weight = 1e-2
+
+start_epoch = 0
+end_epoch = 3
+
+
+# data
+import os
+import random
+import logging
+
+data_root = "../data/"
+file_list = os.listdir(data_root)
+num_image = len(file_list)
+logging.info("Dataset size: %d" % num_image)
+
+
+# train
+
+for i in range(start_epoch, end_epoch):
+    random.shuffle(file_list)
+    for idx in range(num_image):
+        loss_grad_array = []
+        data_array = []
+        path = data_root + file_list[idx]
+        content_np = data_processing.PreprocessContentImage(path, min(dshape[2:]), dshape)
+        data = mx.nd.array(content_np)
+        data_array.append(data)
+        # get content
+        content_mod.forward(mx.io.DataBatch([data], [0]), is_train=False)
+        content_array = content_mod.get_outputs()[0].copyto(mx.cpu())
+        # set target content
+        loss.set_params({"target_content" : content_array}, {}, True, True)
+        # gen_forward
+        for k in range(len(gens)):
+            gens[k].forward(mx.io.DataBatch([data_array[-1]], [0]), is_train=True)
+            data_array.append(gens[k].get_outputs()[0].copyto(mx.cpu()))
+            # loss forward
+            loss.forward(mx.io.DataBatch([data_array[-1]], [0]), is_train=True)
+            loss.backward(grad_array)
+            grad = loss.get_input_grads()[0]
+            loss_grad_array.append(grad.copyto(mx.cpu()))
+        grad = mx.nd.zeros(data.shape)
+        for k in range(len(gens) - 1, -1, -1):
+            tv_grad_executor = get_tv_grad_executor(gens[k].get_outputs()[0],
+                    ctx, tv_weight)
+            tv_grad_executor.forward()
+
+            grad[:] += loss_grad_array[k] + tv_grad_executor.outputs[0].copyto(mx.cpu())
+            gnorm = mx.nd.norm(grad).asscalar()
+            if gnorm > clip_norm:
+                grad[:] *= clip_norm / gnorm
+
+            gens[k].backward([grad])
+            gens[k].update()
+        if idx % 20 == 0:
+            logging.info("Epoch %d: Image %d" % (i, idx))
+            for k in range(len(gens)):
+                logging.info("Data Norm :%.5f" %\
+                        (mx.nd.norm(gens[k].get_input_grads()[0]).asscalar() / np.prod(dshape)))
+        if idx % 1000 == 0:
+            for k in range(len(gens)):
+                gens[k].save_params("./model/%d/%s_%04d-%07d.params" % (k, model_prefix, i, idx))
+
+
+
+
+
diff --git a/example/neural-style/end_to_end/data_processing.py b/example/neural-style/end_to_end/data_processing.py
new file mode 100644
index 000000000000..5469fb008d7a
--- /dev/null
+++ b/example/neural-style/end_to_end/data_processing.py
@@ -0,0 +1,67 @@
+import numpy as np
+from skimage import io, transform
+from skimage.restoration import denoise_tv_chambolle
+import logging
+import random
+FORMAT = '%(asctime)-15s %(message)s'
+logging.basicConfig(level=logging.INFO, format=FORMAT)
+
+def PreprocessContentImage(path, short_edge, dshape=None):  # load, rescale, optionally random-crop, return a (1, 3, H, W) array
+    img = io.imread(path)
+    #logging.info("load the content image, size = %s", img.shape[:2])
+    factor = float(short_edge) / min(img.shape[:2])  # scale so the shorter side becomes short_edge
+    new_size = (int(img.shape[0] * factor), int(img.shape[1] * factor))
+    resized_img = transform.resize(img, new_size)
+    sample = np.asarray(resized_img) * 256  # presumably maps resize()'s [0, 1] floats to pixel range - TODO confirm
+    if dshape is not None:  # identity test: `!= None` is ambiguous for array-like dshape
+        # random crop
+        xx = int((sample.shape[0] - dshape[2]))
+        yy = int((sample.shape[1] - dshape[3]))
+        xstart = random.randint(0, xx)  # NOTE(review): ValueError if the resized image is smaller than dshape
+        ystart = random.randint(0, yy)
+        xend = xstart + dshape[2]
+        yend = ystart + dshape[3]
+        sample = sample[xstart:xend, ystart:yend, :]
+
+    # swap axes to make image from (224, 224, 3) to (3, 224, 224)
+    sample = np.swapaxes(sample, 0, 2)
+    sample = np.swapaxes(sample, 1, 2)
+    # sub mean
+    sample[0, :] -= 123.68
+    sample[1, :] -= 116.779
+    sample[2, :] -= 103.939
+    #logging.info("resize the content image to %s", sample.shape)
+    return np.resize(sample, (1, 3, sample.shape[1], sample.shape[2]))  # prepend a batch axis of size 1
+
+def PreprocessStyleImage(path, shape):
+    img = io.imread(path)
+    resized_img = transform.resize(img, (shape[2], shape[3]))
+    sample = np.asarray(resized_img) * 256
+    sample = np.swapaxes(sample, 0, 2)
+    sample = np.swapaxes(sample, 1, 2)
+
+    sample[0, :] -= 123.68
+    sample[1, :] -= 116.779
+    sample[2, :] -= 103.939
+    return np.resize(sample, (1, 3, sample.shape[1], sample.shape[2]))
+
+def PostprocessImage(img):
+    img = np.resize(img, (3, img.shape[2], img.shape[3]))
+    img[0, :] += 123.68
+    img[1, :] += 116.779
+    img[2, :] += 103.939
+    img = np.swapaxes(img, 1, 2)
+    img = np.swapaxes(img, 0, 2)
+    img = np.clip(img, 0, 255)
+    return img.astype('uint8')
+
+def SaveImage(img, filename, remove_noise=0.02):
+    logging.info('save output to %s', filename)
+    out = PostprocessImage(img)
+    if remove_noise != 0.0:
+        out = denoise_tv_chambolle(out, weight=remove_noise, multichannel=True)
+    io.imsave(filename, out)
+
+
+
+
diff --git a/example/neural-style/end_to_end/gen_v3.py b/example/neural-style/end_to_end/gen_v3.py
new file mode 100644
index 000000000000..dbc83b1ea004
--- /dev/null
+++ b/example/neural-style/end_to_end/gen_v3.py
@@ -0,0 +1,72 @@
+
+# coding: utf-8
+
+# In[1]:
+
+import sys
+sys.path.insert(0, "../../mxnet/python")
+
+
+# In[2]:
+
+import mxnet as mx
+import numpy as np
+
+
+def Conv(data, num_filter, kernel=(5, 5), pad=(2, 2), stride=(2, 2)):
+    sym = mx.sym.Convolution(data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=False)
+    sym = mx.sym.BatchNorm(sym, fix_gamma=False)
+    sym = mx.sym.LeakyReLU(sym, act_type="leaky")
+    return sym
+
+
+def Deconv(data, num_filter, im_hw, kernel=(7, 7), pad=(2, 2), stride=(2, 2), crop=True, out=False):
+    sym = mx.sym.Deconvolution(data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=True)
+    if crop:
+        sym = mx.sym.Crop(sym, offset=(1, 1), h_w=im_hw, num_args=1)
+    sym = mx.sym.BatchNorm(sym, fix_gamma=False)
+    if out == False:
+        sym = mx.sym.LeakyReLU(sym, act_type="leaky")
+    else:
+        sym = mx.sym.Activation(sym, act_type="tanh")
+    return sym
+
+# In[70]:
+
+def get_generator(prefix, im_hw):
+    data = mx.sym.Variable("%s_data" % prefix)
+    conv1 = Conv(data, 64) # 192
+    conv1_1 = Conv(conv1, 48, kernel=(3, 3), pad=(1, 1), stride=(1, 1))
+    conv2 = Conv(conv1_1, 128) # 96
+    conv2_1 = Conv(conv2, 96, kernel=(3, 3), pad=(1, 1), stride=(1, 1))
+    conv3 = Conv(conv2_1, 256) # 48
+    conv3_1 = Conv(conv3, 192, kernel=(3, 3), pad=(1, 1), stride=(1, 1))
+    deconv1 = Deconv(conv3_1, 128, (int(im_hw[0] / 4), int(im_hw[1] / 4))) + conv2
+    conv4_1 = Conv(deconv1, 160, kernel=(3, 3), pad=(1, 1), stride=(1, 1))
+    deconv2 = Deconv(conv4_1, 64, (int(im_hw[0] / 2), int(im_hw[1] / 2))) + conv1
+    conv5_1 = Conv(deconv2, 96, kernel=(3, 3), pad=(1, 1), stride=(1, 1))
+    deconv3 = Deconv(conv5_1, 3, im_hw, kernel=(8,  8), pad=(3, 3), out=True, crop=False)
+    raw_out = (deconv3 * 128) + 128
+    norm = mx.sym.SliceChannel(raw_out, num_outputs=3)
+    r_ch = norm[0] - 123.68
+    g_ch = norm[1] - 116.779
+    b_ch = norm[2] - 103.939
+    norm_out = 0.4 * mx.sym.Concat(*[r_ch, g_ch, b_ch]) + 0.6 * data
+    return norm_out
+
+def get_module(prefix, dshape, ctx, is_train=True):
+    sym = get_generator(prefix, dshape[-2:])
+    mod = mx.mod.Module(symbol=sym,
+                        data_names=("%s_data" % prefix,),
+                        label_names=None,
+                        context=ctx)
+    if is_train:
+        mod.bind(data_shapes=[("%s_data" % prefix, dshape)], for_training=True, inputs_need_grad=True)
+    else:
+        mod.bind(data_shapes=[("%s_data" % prefix, dshape)], for_training=False, inputs_need_grad=False)
+    mod.init_params(initializer=mx.init.Xavier(magnitude=2.))
+    return mod
+
+
+
+
diff --git a/example/neural-style/end_to_end/gen_v4.py b/example/neural-style/end_to_end/gen_v4.py
new file mode 100644
index 000000000000..379e904b9690
--- /dev/null
+++ b/example/neural-style/end_to_end/gen_v4.py
@@ -0,0 +1,86 @@
+
+# coding: utf-8
+
+# In[1]:
+
+import sys
+sys.path.insert(0, "../mxnet/python")
+
+
+# In[2]:
+
+import mxnet as mx
+import numpy as np
+
+
+def Conv(data, num_filter, kernel=(5, 5), pad=(2, 2), stride=(2, 2)):
+    sym = mx.sym.Convolution(data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=False)
+    sym = mx.sym.BatchNorm(sym, fix_gamma=False)
+    sym = mx.sym.LeakyReLU(sym, act_type="leaky")
+    return sym
+
+
+def Deconv(data, num_filter, kernel=(6, 6), pad=(2, 2), stride=(2, 2), out=False):
+    sym = mx.sym.Deconvolution(data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=True)
+    sym = mx.sym.BatchNorm(sym, fix_gamma=False)
+    if out == False:
+        sym = mx.sym.LeakyReLU(sym, act_type="leaky")
+    else:
+        sym = mx.sym.Activation(sym, act_type="tanh")
+    return sym
+
+# In[70]:
+
+def get_generator(prefix, im_hw):
+    data = mx.sym.Variable("%s_data" % prefix)
+
+    conv1_1 = mx.sym.Convolution(data, num_filter=48, kernel=(5, 5), pad=(2, 2), no_bias=False)
+    conv1_1 = mx.sym.BatchNorm(conv1_1, fix_gamma=False)
+    conv1_1 = mx.sym.LeakyReLU(conv1_1, act_type="leaky")
+
+    conv2_1 = mx.sym.Convolution(conv1_1, num_filter=32, kernel=(5, 5), pad=(2, 2), no_bias=False)
+    conv2_1 = mx.sym.BatchNorm(conv2_1, fix_gamma=False)
+    conv2_1 = mx.sym.LeakyReLU(conv2_1, act_type="leaky")
+
+    conv3_1 = mx.sym.Convolution(conv2_1, num_filter=64, kernel=(3, 3), pad=(1, 1), no_bias=False)
+    conv3_1 = mx.sym.BatchNorm(conv3_1, fix_gamma=False)
+    conv3_1 = mx.sym.LeakyReLU(conv3_1, act_type="leaky")
+
+    conv4_1 = mx.sym.Convolution(conv3_1, num_filter=32, kernel=(5, 5), pad=(2, 2), no_bias=False)
+    conv4_1 = mx.sym.BatchNorm(conv4_1, fix_gamma=False)
+    conv4_1 = mx.sym.LeakyReLU(conv4_1, act_type="leaky")
+
+    conv5_1 = mx.sym.Convolution(conv4_1, num_filter=48, kernel=(5, 5), pad=(2, 2), no_bias=False)
+    conv5_1 = mx.sym.BatchNorm(conv5_1, fix_gamma=False)
+    conv5_1 = mx.sym.LeakyReLU(conv5_1, act_type="leaky")
+
+    conv6_1 = mx.sym.Convolution(conv5_1, num_filter=32, kernel=(5, 5), pad=(2, 2), no_bias=True)
+    conv6_1 = mx.sym.BatchNorm(conv6_1, fix_gamma=False)
+    conv6_1 = mx.sym.LeakyReLU(conv6_1, act_type="leaky")
+
+    out = mx.sym.Convolution(conv6_1, num_filter=3, kernel=(3, 3), pad=(1, 1), no_bias=True)
+    out = mx.sym.BatchNorm(out, fix_gamma=False)
+    out = mx.sym.Activation(data=out, act_type="tanh")
+    raw_out = (out * 128) + 128
+    norm = mx.sym.SliceChannel(raw_out, num_outputs=3)
+    r_ch = norm[0] - 123.68
+    g_ch = norm[1] - 116.779
+    b_ch = norm[2] - 103.939
+    norm_out = 0.4 * mx.sym.Concat(*[r_ch, g_ch, b_ch]) + 0.6 * data
+    return norm_out
+
+def get_module(prefix, dshape, ctx, is_train=True):
+    sym = get_generator(prefix, dshape[-2:])
+    mod = mx.mod.Module(symbol=sym,
+                        data_names=("%s_data" % prefix,),
+                        label_names=None,
+                        context=ctx)
+    if is_train:
+        mod.bind(data_shapes=[("%s_data" % prefix, dshape)], for_training=True, inputs_need_grad=True)
+    else:
+        mod.bind(data_shapes=[("%s_data" % prefix, dshape)], for_training=False, inputs_need_grad=False)
+    mod.init_params(initializer=mx.init.Xavier(magnitude=2.))
+    return mod
+
+
+

From 5544a05142564d0c11cdca6766831e1cb06fdd88 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Tue, 21 Jun 2016 10:21:33 -0700
Subject: [PATCH 044/126] [OP] Add expand_dims (#2488)

---
 src/operator/matrix_op-inl.h           | 61 ++++++++++++++++++++++++++
 src/operator/matrix_op.cc              |  1 +
 tests/python/unittest/test_operator.py | 13 ++++++
 3 files changed, 75 insertions(+)

diff --git a/src/operator/matrix_op-inl.h b/src/operator/matrix_op-inl.h
index de20ff54f2e8..24dda9580e62 100644
--- a/src/operator/matrix_op-inl.h
+++ b/src/operator/matrix_op-inl.h
@@ -27,6 +27,7 @@ struct TransposeParam : public dmlc::Parameter<TransposeParam> {
   }
 };
 
+
 template<typename xpu>
 void TransposeImpl(const TBlob &src,
               TBlob *ret,
@@ -141,6 +142,58 @@ inline TShape TransposeShape(const TShape& shp,
 }
 
 
+struct ExpandDimParam : public dmlc::Parameter<ExpandDimParam> {
+  index_t axis;
+  DMLC_DECLARE_PARAMETER(ExpandDimParam) {
+    DMLC_DECLARE_FIELD(axis)
+    .describe("Position (amongst axes) where new axis is to be inserted.");
+  }
+};
+
+
+inline TShape ExpandDimShape(const TShape& shp,
+                             const EnvArguments& env) {
+  ExpandDimParam param;
+  param.Init(env.kwargs);
+  CHECK_LE(param.axis, shp.ndim())
+      << "axis must be smaller equal to the dimension of the array";
+  std::vector<index_t> idx(shp.data(), shp.data() + shp.ndim());
+  idx.insert(idx.begin() + param.axis, 1);
+  return TShape(idx.begin(), idx.end());
+}
+
+
+template<typename xpu>
+void ReshapeImpl(const TBlob &src,
+                 const EnvArguments& env,
+                 TBlob *ret,
+                 OpReqType req,
+                 RunContext ctx) {
+  if (req == kNullOp) return;
+  if (req == kWriteInplace) {
+    CHECK(ret->CheckContiguous() && src.CheckContiguous());
+  }
+  CHECK_EQ(src.type_flag_, ret->type_flag_);
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
+      using namespace mshadow::expr;
+      mshadow::Tensor<xpu, 2, DType> out = ret->FlatTo2D<xpu, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> mout = src.get_with_shape<xpu, 2, DType>(out.shape_, s);
+      ASSIGN_DISPATCH(out, req, F<mshadow::op::identity>(mout));
+    });
+}
+
+template<typename xpu>
+void ReshapeGrad_(const OutputGrad& out_grad,
+                  const EnvArguments& env,
+                  TBlob *in_grad,
+                  OpReqType req,
+                  RunContext ctx) {
+  ReshapeImpl<xpu>(
+      out_grad.data, env, in_grad, req, ctx);
+}
+
+
 template<typename xpu>
 void DotForward_(const TBlob& lhs,
                  const TBlob& rhs,
@@ -504,6 +557,14 @@ MXNET_REGISTER_SIMPLE_OP(transpose, XPU)
 .set_gradient(XPU::kDevMask, TransposeGrad<XPU>, kNoInplace)
 .describe("Transpose the input matrix and return a new one");
 
+// expand_dim
+MXNET_REGISTER_SIMPLE_OP(expand_dims, XPU)
+.set_enable_kwargs(true)
+.set_function(XPU::kDevMask, ReshapeImpl<XPU>, kInplaceInOut)
+.set_shape_function(ExpandDimShape)
+.set_gradient(XPU::kDevMask, ReshapeGrad_<XPU>, kInplaceOutIn)
+.describe("Expand the shape of array by inserting a new axis.");
+
 // crop
 MXNET_REGISTER_SIMPLE_OP(crop, XPU)
 .set_enable_kwargs(true)
diff --git a/src/operator/matrix_op.cc b/src/operator/matrix_op.cc
index ae263121e669..24b72429a169 100644
--- a/src/operator/matrix_op.cc
+++ b/src/operator/matrix_op.cc
@@ -9,6 +9,7 @@
 namespace mxnet {
 namespace op {
 DMLC_REGISTER_PARAMETER(TransposeParam);
+DMLC_REGISTER_PARAMETER(ExpandDimParam);
 DMLC_REGISTER_PARAMETER(SimpleCropParam);
 DMLC_REGISTER_PARAMETER(SliceParam);
 DMLC_REGISTER_PARAMETER(FlipParam);
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 6ea951ced92f..73e58d57465f 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -1037,6 +1037,7 @@ def test_broadcast_axis():
             assert err_backward < 1E-8
     test_broadcast_axis()
 
+
 def test_transpose():
     for ndim in range(1, 6):
         for t in range(5):
@@ -1051,6 +1052,17 @@ def test_transpose():
             y = mx.nd.transpose(x)
             assert_allclose(np.transpose(x.asnumpy()), y.asnumpy())
 
+
+def test_expand_dims():  # compares mx.nd.expand_dims with np.expand_dims on random shapes and axes
+    for ndim in range(1, 6):
+        for t in range(5):
+            dims = list(np.random.randint(1, 10, size=ndim))
+            axis = np.random.randint(0, ndim+1)  # upper bound is exclusive: axis in [0, ndim], covering both ends
+            x = mx.nd.array(np.random.normal(size=dims))
+            y = mx.nd.expand_dims(x, axis=axis)
+            assert_allclose(np.expand_dims(x.asnumpy(), axis=axis), y.asnumpy())
+
+
 def test_crop():
     for ndim in range(1, 6):
         for t in range(5):
@@ -1110,6 +1122,7 @@ def test_flip():
             assert_allclose(x.asnumpy()[idx], y.asnumpy())
 
 if __name__ == '__main__':
+    test_expand_dims()
     test_slice_axis()
     test_softmax()
     test_broadcast_binary_op()

From 2373e6bea7a68d332a34e1cb90cae9c35d5a505a Mon Sep 17 00:00:00 2001
From: Yizhi Liu <javelinjs@gmail.com>
Date: Wed, 22 Jun 2016 10:49:40 +0800
Subject: [PATCH 045/126] [scala]bug fix for example. more logs for native
 library loading (#2493)

---
 scala-package/core/pom.xml                                 | 5 ++++-
 scala-package/core/src/main/scala/ml/dmlc/mxnet/Base.scala | 4 +++-
 scala-package/core/src/test/resources/log4j.properties     | 7 +++++++
 scala-package/examples/src/main/resources/log4j.properties | 2 +-
 .../dmlc/mxnet/examples/imclassification/TrainMnist.scala  | 4 ++--
 5 files changed, 17 insertions(+), 5 deletions(-)
 create mode 100644 scala-package/core/src/test/resources/log4j.properties

diff --git a/scala-package/core/pom.xml b/scala-package/core/pom.xml
index 1cd795978f68..5900a0a710a2 100644
--- a/scala-package/core/pom.xml
+++ b/scala-package/core/pom.xml
@@ -61,7 +61,10 @@
         <groupId>org.scalatest</groupId>
         <artifactId>scalatest-maven-plugin</artifactId>
         <configuration>
-          <argLine>-Djava.library.path=${project.parent.basedir}/native/${platform}/target</argLine> 
+          <argLine>
+            -Djava.library.path=${project.parent.basedir}/native/${platform}/target \
+            -Dlog4j.configuration=file://${project.basedir}/src/test/resources/log4j.properties
+          </argLine> 
         </configuration>
       </plugin>
       <plugin>
diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Base.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Base.scala
index d995176f1796..cf3bee93a98a 100644
--- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Base.scala
+++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Base.scala
@@ -43,7 +43,8 @@ object Base {
           "Copying native library from the archive. " +
           "Consider installing the library somewhere in the path " +
           "(for Windows: PATH, for Linux: LD_LIBRARY_PATH), " +
-          "or specifying by Java cmd option -Djava.library.path=[lib path].")
+          "or specifying by Java cmd option -Djava.library.path=[lib path]." +
+          "Exception:", e)
         NativeLibraryLoader.loadLibrary("mxnet-scala")
     }
   } catch {
@@ -69,6 +70,7 @@ object Base {
       System.loadLibrary(libname)
     } catch {
       case e: UnsatisfiedLinkError =>
+        logger.warn("Failed to load from native path. Exception:", e)
         val os = System.getProperty("os.name")
         // ref: http://lopica.sourceforge.net/os.html
         if (os.startsWith("Linux")) {
diff --git a/scala-package/core/src/test/resources/log4j.properties b/scala-package/core/src/test/resources/log4j.properties
new file mode 100644
index 000000000000..7d7ca36b28a1
--- /dev/null
+++ b/scala-package/core/src/test/resources/log4j.properties
@@ -0,0 +1,7 @@
+# for development debugging
+log4j.rootLogger = debug, stdout
+
+log4j.appender.stdout = org.apache.log4j.ConsoleAppender
+log4j.appender.stdout.Target = System.out
+log4j.appender.stdout.layout = org.apache.log4j.PatternLayout
+log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c] [%p] - %m%n
diff --git a/scala-package/examples/src/main/resources/log4j.properties b/scala-package/examples/src/main/resources/log4j.properties
index 7d7ca36b28a1..7f5be5f70b89 100644
--- a/scala-package/examples/src/main/resources/log4j.properties
+++ b/scala-package/examples/src/main/resources/log4j.properties
@@ -1,5 +1,5 @@
 # for development debugging
-log4j.rootLogger = debug, stdout
+log4j.rootLogger = warn, stdout
 
 log4j.appender.stdout = org.apache.log4j.ConsoleAppender
 log4j.appender.stdout.Target = System.out
diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/imclassification/TrainMnist.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/imclassification/TrainMnist.scala
index f9acac462f10..44792cf4fc00 100644
--- a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/imclassification/TrainMnist.scala
+++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/imclassification/TrainMnist.scala
@@ -102,9 +102,9 @@ object TrainMnist {
         envs.put("DMLC_NUM_WORKER", inst.numWorker.toString)
         require(inst.numServer > 0, "Num of servers must > 0")
         envs.put("DMLC_NUM_SERVER", inst.numServer.toString)
+        logger.info("Init PS environments")
+        KVStoreServer.init(envs.toMap)
       }
-      logger.info("Init PS environments")
-      KVStoreServer.init(envs.toMap)
 
       if (inst.role != "worker") {
         logger.info("Start KVStoreServer for scheduler & servers")

From cde4e0dea623ee1fd2259d79e77c9538392051a0 Mon Sep 17 00:00:00 2001
From: Bing Xu <antinucleon@users.noreply.github.com>
Date: Tue, 21 Jun 2016 23:47:18 -0700
Subject: [PATCH 046/126] Upload VGG19 symbol (#2502)

* add vgg19 symbol
---
 .../neural-style/end_to_end/model_vgg19.py    | 96 +++++++++++++++++++
 1 file changed, 96 insertions(+)
 create mode 100644 example/neural-style/end_to_end/model_vgg19.py

diff --git a/example/neural-style/end_to_end/model_vgg19.py b/example/neural-style/end_to_end/model_vgg19.py
new file mode 100644
index 000000000000..6e287b55b2fa
--- /dev/null
+++ b/example/neural-style/end_to_end/model_vgg19.py
@@ -0,0 +1,96 @@
+import mxnet as mx
+import os, sys
+from collections import namedtuple
+
+ConvExecutor = namedtuple('ConvExecutor', ['executor', 'data', 'data_grad', 'style', 'content', 'arg_dict'])
+
+def get_vgg_symbol(prefix, content_only=False):
+    """Build the conv/relu/avg-pool stack up to relu5_1; return relu4_2 alone or (style, content) groups."""
+    data = mx.sym.Variable("%s_data" % prefix)
+    conv1_1 = mx.symbol.Convolution(name='%s_conv1_1' % prefix, data=data , num_filter=64, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024)
+    relu1_1 = mx.symbol.Activation(data=conv1_1 , act_type='relu')
+    conv1_2 = mx.symbol.Convolution(name='%s_conv1_2' % prefix, data=relu1_1 , num_filter=64, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024)
+    relu1_2 = mx.symbol.Activation(data=conv1_2 , act_type='relu')
+    pool1 = mx.symbol.Pooling(data=relu1_2 , pad=(0,0), kernel=(2,2), stride=(2,2), pool_type='avg')
+    conv2_1 = mx.symbol.Convolution(name='%s_conv2_1' % prefix, data=pool1 , num_filter=128, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024)
+    relu2_1 = mx.symbol.Activation(data=conv2_1 , act_type='relu')
+    conv2_2 = mx.symbol.Convolution(name='%s_conv2_2' % prefix, data=relu2_1 , num_filter=128, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024)
+    relu2_2 = mx.symbol.Activation(data=conv2_2 , act_type='relu')
+    pool2 = mx.symbol.Pooling(data=relu2_2 , pad=(0,0), kernel=(2,2), stride=(2,2), pool_type='avg')
+    conv3_1 = mx.symbol.Convolution(name='%s_conv3_1' % prefix, data=pool2 , num_filter=256, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024)
+    relu3_1 = mx.symbol.Activation(data=conv3_1 , act_type='relu')
+    conv3_2 = mx.symbol.Convolution(name='%s_conv3_2' % prefix, data=relu3_1 , num_filter=256, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024)
+    relu3_2 = mx.symbol.Activation(data=conv3_2 , act_type='relu')
+    conv3_3 = mx.symbol.Convolution(name='%s_conv3_3' % prefix, data=relu3_2 , num_filter=256, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024)
+    relu3_3 = mx.symbol.Activation(data=conv3_3 , act_type='relu')
+    conv3_4 = mx.symbol.Convolution(name='%s_conv3_4' % prefix, data=relu3_3 , num_filter=256, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024)
+    relu3_4 = mx.symbol.Activation(data=conv3_4 , act_type='relu')
+    pool3 = mx.symbol.Pooling(data=relu3_4 , pad=(0,0), kernel=(2,2), stride=(2,2), pool_type='avg')
+    conv4_1 = mx.symbol.Convolution(name='%s_conv4_1' % prefix, data=pool3 , num_filter=512, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024)
+    relu4_1 = mx.symbol.Activation(data=conv4_1 , act_type='relu')
+    conv4_2 = mx.symbol.Convolution(name='%s_conv4_2' % prefix, data=relu4_1 , num_filter=512, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024)
+    relu4_2 = mx.symbol.Activation(data=conv4_2 , act_type='relu')
+    conv4_3 = mx.symbol.Convolution(name='%s_conv4_3' % prefix, data=relu4_2 , num_filter=512, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024)
+    relu4_3 = mx.symbol.Activation(data=conv4_3 , act_type='relu')
+    conv4_4 = mx.symbol.Convolution(name='%s_conv4_4' % prefix, data=relu4_3 , num_filter=512, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024)
+    relu4_4 = mx.symbol.Activation(data=conv4_4 , act_type='relu')
+    pool4 = mx.symbol.Pooling(data=relu4_4 , pad=(0,0), kernel=(2,2), stride=(2,2), pool_type='avg')
+    conv5_1 = mx.symbol.Convolution(name='%s_conv5_1' % prefix, data=pool4 , num_filter=512, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024)
+    relu5_1 = mx.symbol.Activation(data=conv5_1 , act_type='relu')
+
+
+    if content_only:
+        return relu4_2  # a single symbol, not wrapped in a Group
+    # style = the first relu of each of the five stages; content = relu4_2 only
+    style = mx.sym.Group([relu1_1, relu2_1, relu3_1, relu4_1, relu5_1])
+    content = mx.sym.Group([relu4_2])
+    return style, content
+
+
+def get_executor_with_style(style, content, input_size, ctx):
+    out = mx.sym.Group([style, content])  # style outputs come first, content output(s) last
+    # allocate zero-filled args on ctx for a (1, 3, H, W) input; NOTE(review): input must be named "data" - confirm prefix used
+    arg_shapes, output_shapes, aux_shapes = out.infer_shape(data=(1, 3, input_size[0], input_size[1]))
+    arg_names = out.list_arguments()
+    arg_dict = dict(zip(arg_names, [mx.nd.zeros(shape, ctx=ctx) for shape in arg_shapes]))
+    grad_dict = {"data": arg_dict["data"].copyto(ctx)}  # only the input gets a gradient buffer
+    # init with pretrained weight; file entries are keyed "arg:<name>", missing ones are reported and stay zero
+    pretrained = mx.nd.load("./model/vgg19.params")
+    for name in arg_names:
+        if name == "data":
+            continue
+        key = "arg:" + name
+        if key in pretrained:
+            pretrained[key].copyto(arg_dict[name])
+        else:
+            print("Skip argument %s" % name)
+    executor = out.bind(ctx=ctx, args=arg_dict, args_grad=grad_dict, grad_req="write")
+    return ConvExecutor(executor=executor,
+                        data=arg_dict["data"],
+                        data_grad=grad_dict["data"],
+                        style=executor.outputs[:-1],  # every output except the last
+                        content=executor.outputs[-1],  # last output; assumes content has a single output - TODO confirm
+                        arg_dict=arg_dict)
+
+def get_executor_content(content, input_size, ctx):
+    """Bind only the content symbol, without gradient buffers, and wrap it in a ConvExecutor."""
+    arg_shapes, output_shapes, aux_shapes = content.infer_shape(data=(1, 3, input_size[0], input_size[1]))
+    arg_names = content.list_arguments()
+    arg_dict = dict(zip(arg_names, [mx.nd.zeros(shape, ctx=ctx) for shape in arg_shapes]))
+    # init with pretrained weight; file entries are keyed "arg:<name>", the input "data" is skipped
+    pretrained = mx.nd.load("./model/vgg19.params")
+    for name in arg_names:
+        if name == "data":
+            continue
+        key = "arg:" + name
+        if key in pretrained:
+            pretrained[key].copyto(arg_dict[name])
+        else:
+            print("Skip argument %s" % name)
+    # grad_req="null": nothing is differentiated here, so data_grad and style stay None
+    executor = content.bind(ctx=ctx, args=arg_dict, args_grad=[], grad_req="null")
+    # NOTE(review): content is assumed to expose exactly one output - confirm with callers
+    return ConvExecutor(executor=executor, data=arg_dict["data"], data_grad=None,
+                        style=None, content=executor.outputs[0], arg_dict=arg_dict)
+
+

From 91421fc331acf328ce7ee0facaa08dfb21aed68a Mon Sep 17 00:00:00 2001
From: Hu Shiwen <yajiedesign@gmail.com>
Date: Thu, 23 Jun 2016 04:35:09 +0800
Subject: [PATCH 047/126] add WARPCTC plugin compile with cmake (#2477)

fix cmake variable name
---
 CMakeLists.txt | 43 +++++++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index de8d1e85360d..b81b1910c015 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,14 +10,17 @@ mxnet_option(USE_OPENMP  "Build with Openmp support" ON)
 mxnet_option(USE_CUDNN   "Build with cudnn support"  ON) # one could set CUDNN_ROOT for search path
 mxnet_option(USE_CUDA    "Build with CUDA support"   ON)
 mxnet_option(USE_DIST_KVSTORE    "Build with DIST_KVSTORE support"   OFF)
+mxnet_option(USE_PLUGINS_WARPCTC	"Use WARPCTC Plugins" OFF)
 
 SET(EXTRA_OPERATORS "" CACHE PATH "EXTRA OPERATORS PATH")
 
-
 include(mshadow/cmake/mshadow.cmake)
 include(mshadow/cmake/Utils.cmake)
 include(mshadow/cmake/Cuda.cmake)
 
+set(mxnet_LINKER_LIBS "")
+list(APPEND mxnet_LINKER_LIBS ${mshadow_LINKER_LIBS})
+
 include_directories("include")
 include_directories("mshadow")
 include_directories("dmlc-core/include")
@@ -44,7 +47,7 @@ if(USE_OPENCV)
     find_package(OpenCV REQUIRED COMPONENTS core highgui imgproc)
   endif()
   include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS})
-  list(APPEND mshadow_LINKER_LIBS ${OpenCV_LIBS})
+  list(APPEND mxnet_LINKER_LIBS ${OpenCV_LIBS})
   message(STATUS "OpenCV found (${OpenCV_CONFIG_PATH})")
   add_definitions(-DMXNET_USE_OPENCV=1)
 else(USE_OPENCV)
@@ -68,7 +71,7 @@ if(USE_CUDNN)
   if(HAVE_CUDNN)
     add_definitions(-DUSE_CUDNN)
     include_directories(SYSTEM ${CUDNN_INCLUDE})
-    list(APPEND mshadow_LINKER_LIBS ${CUDNN_LIBRARY})
+    list(APPEND mxnet_LINKER_LIBS ${CUDNN_LIBRARY})
 	add_definitions(-DMSHADOW_USE_CUDNN=1)
   endif()
 endif()
@@ -115,12 +118,22 @@ mxnet_source_group("Source\\resource.cc"   GLOB "src/resource.cc/*.cc")
 mxnet_source_group("Source\\storage"   GLOB "src/storage/*.cc")
 mxnet_source_group("Source\\symbol"   GLOB "src/symbol/*.cc")
 
-
-
-
-
 FILE(GLOB_RECURSE SOURCE "src/*.cc" "src/*.h")
-FILE(GLOB_RECURSE cuda "src/*.cu")
+FILE(GLOB_RECURSE CUDA "src/*.cu")
+
+if(USE_PLUGINS_WARPCTC)
+	set(WARPCTC_INCLUDE  "" CACHE PATH "WARPCTC include")
+	set(WARPCTC_LIB  "" CACHE FILEPATH "WARPCTC lib")
+	include_directories(SYSTEM ${WARPCTC_INCLUDE})
+	list(APPEND mxnet_LINKER_LIBS ${WARPCTC_LIB})
+	mxnet_source_group("Include\\plugin\\warpctc"   GLOB "plugin/warpctc/*.h")
+	mxnet_source_group("Source\\plugin\\warpctc"   GLOB "plugin/warpctc/*.cc")
+	mxnet_source_group("Cuda\\plugin\\warpctc"   GLOB "plugin/warpctc/*.cu")
+	FILE(GLOB_RECURSE PLUGINS_SOURCE "plugin/warpctc/*.cc" "plugin/warpctc/*.h")
+	FILE(GLOB_RECURSE PLUGINS_CUSRC "plugin/warpctc/*.cu")
+	list(APPEND SOURCE ${PLUGINS_SOURCE})
+	list(APPEND CUDA ${PLUGINS_CUSRC})
+endif()
 
 if (NOT (EXTRA_OPERATORS STREQUAL ""))
 	mxnet_source_group("Extra"   GLOB_RECURSE "${EXTRA_OPERATORS}/*.cc")
@@ -142,19 +155,18 @@ endif()
 
 if(USE_CUDA)
   # define preprocessor macro so that we will not include the generated forcelink header
-  mshadow_cuda_compile(cuda_objs ${cuda})
+  mshadow_cuda_compile(cuda_objs ${CUDA})
   if(MSVC)
     FIND_LIBRARY(CUDA_nvrtc_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64"  "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32")
-    list(APPEND mshadow_LINKER_LIBS ${CUDA_nvrtc_LIBRARY})
+    list(APPEND mxnet_LINKER_LIBS ${CUDA_nvrtc_LIBRARY})
     set(CUDA_cuda_LIBRARY "${CUDA_nvrtc_LIBRARY}/../cuda.lib")
-    list(APPEND mshadow_LINKER_LIBS ${CUDA_cuda_LIBRARY})
+    list(APPEND mxnet_LINKER_LIBS ${CUDA_cuda_LIBRARY})
   else(MSVC)
-    list(APPEND mshadow_LINKER_LIBS nvrtc cuda)
+    list(APPEND mxnet_LINKER_LIBS nvrtc cuda)
   endif()
-  list(APPEND SOURCE ${cuda_objs} ${cuda})
+  list(APPEND SOURCE ${cuda_objs} ${CUDA})
 endif()
 
-
 if(NOT MSVC)
   # Only add c++11 flags and definitions after cuda compiling
   add_definitions(-DDMLC_USE_CXX11)
@@ -170,10 +182,9 @@ if(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin")
 else()
   add_library(mxnet SHARED ${SOURCE})
 endif()
-target_link_libraries(mxnet ${mshadow_LINKER_LIBS})
+target_link_libraries(mxnet ${mxnet_LINKER_LIBS})
 target_link_libraries(mxnet dmlccore)
 
-
 if(MSVC)
   set_target_properties(mxnet PROPERTIES OUTPUT_NAME "libmxnet")
 endif()

From 2262bb2f7586171e54614f295b8f6f79b2cf7586 Mon Sep 17 00:00:00 2001
From: Xingjian Shi <xshiab@ust.hk>
Date: Thu, 23 Jun 2016 06:15:25 +0800
Subject: [PATCH 048/126] Update MShadow (#2500)

---
 mshadow | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mshadow b/mshadow
index 02a4a0ef942c..f64098619b39 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit 02a4a0ef942ce49fb1729882e5da2b67c46aa723
+Subproject commit f64098619b39cce4b1449ed76674b692dc5d9246

From 1934de547935f59e87e48dbb8d9aef3eb19c620a Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 22 Jun 2016 15:16:51 -0700
Subject: [PATCH 049/126] Enable Bulk Execution of a Subgraph For Faster GPU
 Serving (#2496)

* [Resource] Enable temp space to remain valid until release is called

* [EXEC] Enable bulk execution optimization
---
 include/mxnet/resource.h      |  14 +++
 src/operator/activation-inl.h |  14 ---
 src/resource.cc               |  41 ++++++---
 src/symbol/graph_executor.cc  | 157 ++++++++++++++++++++++++++++++++++
 src/symbol/graph_executor.h   |  13 +++
 5 files changed, 211 insertions(+), 28 deletions(-)

diff --git a/include/mxnet/resource.h b/include/mxnet/resource.h
index 31c380dd8503..da41cb07e52d 100644
--- a/include/mxnet/resource.h
+++ b/include/mxnet/resource.h
@@ -74,6 +74,10 @@ struct Resource {
    * \brief Get space requested as mshadow Tensor.
    *  The caller can request arbitrary size.
    *
+   *  This space can be shared with other calls to this->get_space.
+   *  So the caller needs to serialize the calls when using the shared space.
+   *  The temp space will remain valid until release is called.
+   *
    * \param shape the Shape of returning tensor.
    * \param stream the stream of retruning tensor.
    * \return the mshadow tensor requested.
@@ -132,6 +136,16 @@ struct Resource {
         reinterpret_cast<DType*>(get_host_space_internal(shape.Size() * sizeof(DType))),
         shape, shape[ndim - 1], NULL);
   }
+  /*!
+   * \brief Release all existing allocated space.
+   *  The existing allocated addresses will remain valid
+   *  until release is called.
+   *
+   *  Even if the user does not call release, the space occupied
+   *  by the resource will remain at most two times the maximum
+   *  requested space.
+   */
+  void release() const;
   /*!
    * \brief internal function to get space from resources.
    * \param size The size of the space.
diff --git a/src/operator/activation-inl.h b/src/operator/activation-inl.h
index fd7c1aa283d4..e6da76d90329 100644
--- a/src/operator/activation-inl.h
+++ b/src/operator/activation-inl.h
@@ -60,10 +60,6 @@ class ActivationOp : public Operator {
     Tensor<xpu, 2, DType> data = in_data[activation::kData].FlatTo2D<xpu, DType>(s);
     Tensor<xpu, 2, DType> out = out_data[activation::kOut].FlatTo2D<xpu, DType>(s);
     Assign(out, req[activation::kOut], F<ForwardOp>(data));
-    // Use asynchronize complete notification
-    // This is only intended as an example of async ops
-    if (s != NULL) s->Wait();
-    ctx.async_on_complete();
   }
 
   virtual void Backward(const OpContext &ctx,
@@ -83,16 +79,6 @@ class ActivationOp : public Operator {
     Tensor<xpu, 2, DType> m_out_data = out_data[activation::kOut].FlatTo2D<xpu, DType>(s);
     Tensor<xpu, 2, DType> m_in_grad = in_grad[activation::kData].FlatTo2D<xpu, DType>(s);
     Assign(m_in_grad, req[activation::kData], F<BackwardOp>(m_out_data) * m_out_grad);
-    // Use asynchronize complete notification
-    // This is only intended as an example of async ops
-    if (s != NULL) s->Wait();
-    ctx.async_on_complete();
-  }
-
-  virtual ExecType exec_type() const {
-    // Use asynchronize complete notification
-    // This is only intended as an example of async ops
-    return kAsync;
   }
 };  // class ActivationOp
 
diff --git a/src/resource.cc b/src/resource.cc
index 2ea019f63fd0..bb1842ab83d1 100644
--- a/src/resource.cc
+++ b/src/resource.cc
@@ -24,6 +24,11 @@ struct SpaceAllocator {
   Storage::Handle handle;
   // internal CPU handle
   Storage::Handle host_handle;
+  // The old handles that need to be kept valid
+  // until release is called.
+  // This API allows several CUDA calls using
+  // temp space to get valid space until all the calls finished.
+  std::vector<Storage::Handle> old_handles;
 
   SpaceAllocator() {
     handle.dptr = nullptr;
@@ -33,30 +38,35 @@ struct SpaceAllocator {
   }
 
   inline void Release() {
-    if (handle.size != 0) {
-      Storage::Get()->Free(handle);
-      handle.size = 0;
+    for (const Storage::Handle& handle : old_handles) {
+      if (handle.size != 0) {
+        Storage::Get()->Free(handle);
+      }
     }
+    old_handles.clear();
   }
 
-  inline void ReleaseHost() {
-    if (host_handle.size != 0) {
-      Storage::Get()->Free(host_handle);
-      host_handle.size = 0;
-    }
+  inline void ReleaseAll() {
+    old_handles.push_back(handle);
+    old_handles.push_back(host_handle);
+    this->Release();
+    handle.size = 0;
+    host_handle.size = 0;
   }
 
   inline void* GetSpace(size_t size) {
     if (handle.size >= size) return handle.dptr;
-    this->Release();
-    handle = Storage::Get()->Alloc(size, ctx);
+    old_handles.push_back(handle);
+    handle = Storage::Get()->Alloc(
+        std::max(size, handle.size * 2), ctx);
     return handle.dptr;
   }
 
   inline void* GetHostSpace(size_t size) {
     if (host_handle.size >= size) return host_handle.dptr;
-    this->ReleaseHost();
-    host_handle = Storage::Get()->Alloc(size, Context());
+    old_handles.push_back(host_handle);  // retired block is freed later by Release()
+    host_handle = Storage::Get()->Alloc(  // grow to at least 2x the current host block
+        std::max(size, host_handle.size * 2), Context());
     return host_handle.dptr;
   }
 };
@@ -203,8 +213,7 @@ class ResourceManagerImpl : public ResourceManager {
         Engine::Get()->DeleteVariable(
             [r](RunContext rctx){
               SpaceAllocator rcpy = r;
-              MSHADOW_CATCH_ERROR(rcpy.Release());
-              MSHADOW_CATCH_ERROR(rcpy.ReleaseHost());
+              MSHADOW_CATCH_ERROR(rcpy.ReleaseAll());
             }, ctx, resource[i].var);
       }
     }
@@ -251,6 +260,10 @@ void* Resource::get_host_space_internal(size_t size) const {
   return static_cast<resource::SpaceAllocator*>(ptr_)->GetHostSpace(size);
 }
 
+void Resource::release() const {  // forwards to SpaceAllocator::Release()
+  static_cast<resource::SpaceAllocator*>(ptr_)->Release();
+}
+
 ResourceManager* ResourceManager::Get() {
   static resource::ResourceManagerImpl inst;
   return &inst;
diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc
index b990a36e86f5..d54972b27433 100644
--- a/src/symbol/graph_executor.cc
+++ b/src/symbol/graph_executor.cc
@@ -6,6 +6,7 @@
 #include <dmlc/logging.h>
 #include <mxnet/resource.h>
 #include <mxnet/symbolic.h>
+#include <dmlc/timer.h>
 #include <memory>
 #include <map>
 #include <set>
@@ -288,6 +289,11 @@ GraphExecutor::GetOpExecEntry(uint32_t nid) {
 
 GraphExecutor::~GraphExecutor() {
   Engine::Get()->WaitForAll();
+  for (auto &kv : cached_seg_opr_) {
+    if (kv.second != nullptr) {
+      Engine::Get()->DeleteOperator(kv.second);
+    }
+  }
   // need to delete the operators before delete the NDArray they referenced.
   for (OpNode& node : op_nodes_) {
     node.DeleteOperator();
@@ -825,6 +831,36 @@ void GraphExecutor::InitOpNodes() {
 }
 
 void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) {
+  // heuristic: only enable bulk execution on forward-only graphs
+  bool bulk_exec = prefer_bulk_execution_ && !monitor_callback_
+      && topo_start == 0 && num_forward_nodes_ == topo_order_.size();
+
+  if (bulk_exec) {
+    // encode things into a key
+    size_t key = topo_start * op_nodes_.size() + topo_end;
+    if (cached_seg_opr_.count(key) == 0) {
+      cached_seg_opr_[key] = this->CreateCachedOpr(topo_start, topo_end);
+      if (cached_seg_opr_.at(key) != nullptr) {
+        LOG(INFO) << "Created bulk execution on segment ["
+                  << topo_start << ", " << topo_end << ")";
+      }
+    }
+    auto cached_op = cached_seg_opr_.at(key);
+    if (cached_op != nullptr) {
+      Context* pctx = nullptr;
+      for (size_t i = topo_start; i < topo_end; ++i) {
+        uint32_t nid = topo_order_[i];
+        if (!op_nodes_[nid].activated) continue;
+        if (graph_.nodes[nid].is_variable()) continue;
+        OpNode& opnode = op_nodes_[nid];
+        opnode.op_ctx.is_train = is_train;
+        pctx = &(opnode.ctx);
+      }
+      Engine::Get()->Push(cached_op, *pctx);
+      return;
+    }
+  }
+
   for (size_t i = topo_start; i < topo_end; ++i) {
     uint32_t nid = topo_order_[i];
     if (!op_nodes_[nid].activated) continue;
@@ -943,6 +979,127 @@ void GraphExecutor::Backward(const std::vector<NDArray> &head_grads) {
   RunOps(true, num_forward_nodes_, topo_order_.size());
 }
 
+// Fuse the sync ops in [topo_start, topo_end) into one engine operator; nullptr if not fusable.
+Engine::OprHandle GraphExecutor::CreateCachedOpr(size_t topo_start, size_t topo_end) {
+  std::vector<Engine::VarHandle> read_vars;
+  std::vector<Engine::VarHandle> write_vars;
+  Context *pctx = nullptr;
+
+  for (size_t k = topo_start; k < topo_end; ++k) {
+    uint32_t nid = topo_order_[k];
+    OpNode& op_node = op_nodes_[nid];
+    if (!op_nodes_[nid].activated) continue;
+    if (graph_.nodes[nid].is_variable()) continue;
+    if (op_node.op->exec_type() != Operator::kSync) {
+      return nullptr;
+    }
+    if (pctx == nullptr) pctx = &(op_node.ctx);
+    if (*pctx != op_node.ctx) {
+      return nullptr;
+    }
+    const StaticGraph::Node& gnode = graph_.nodes[nid];
+    // AddTO: index is used to store in-place add resources.
+    const size_t ninput = gnode.inputs.size() - gnode.addto_index.size();
+
+    for (const DataEntryInfo& out : op_node.outputs) {
+      write_vars.push_back(out.data.var());
+      if (out.type == kTobeBindByExternal) return nullptr;
+    }
+
+    for (const DataEntryInfo& aux : op_node.aux_states) {
+      write_vars.push_back(aux.data.var());
+      if (aux.type == kTobeBindByExternal) return nullptr;
+    }
+
+    for (size_t i = 0; i < ninput; ++i) {
+      const StaticGraph::DataEntry& e = graph_.nodes[nid].inputs[i];
+      const DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index];
+      read_vars.push_back(info.data.var());
+      if (info.type == kTobeBindByExternal) return nullptr;
+    }
+
+    for (const Resource& r : op_node.op_ctx.requested) {
+      write_vars.push_back(r.var);
+    }
+  }
+  if (pctx == nullptr) return nullptr;
+  // sort and deduplicate both variable lists
+  std::sort(write_vars.begin(), write_vars.end());
+  write_vars.erase(std::unique(write_vars.begin(), write_vars.end()), write_vars.end());
+  std::sort(read_vars.begin(), read_vars.end());
+  read_vars.erase(std::unique(read_vars.begin(), read_vars.end()), read_vars.end());
+  // drop from read_vars every var that is also written (merge walk over the two sorted lists)
+  auto wit = write_vars.begin();
+  auto rtop = read_vars.begin();
+  for (auto rit = read_vars.begin(); rit != read_vars.end(); ++rit) {
+    while (wit != write_vars.end() && *wit < *rit) ++wit;
+    // wit may have reached end(); it must not be dereferenced in that case
+    if (wit == write_vars.end() || *wit != *rit) {
+      *rtop = *rit;
+      ++rtop;
+    }
+  }
+  read_vars.resize(rtop - read_vars.begin());
+
+  bool is_gpu = pctx->dev_mask() == gpu::kDevMask;
+  auto exec_fun = [this, topo_start, topo_end, is_gpu]
+      (RunContext ctx, Engine::CallbackOnComplete on_complete) {
+    std::vector<OpReqType> req;
+    std::vector<TBlob> in_data, out_data, aux_data;
+    for (size_t k = topo_start; k < topo_end; ++k) {
+      uint32_t nid = topo_order_[k];
+      if (!op_nodes_[nid].activated) continue;
+      if (graph_.nodes[nid].is_variable()) continue;
+      OpNode& op_node = op_nodes_[nid];
+      const StaticGraph::Node& gnode = graph_.nodes[nid];
+      CHECK_NE(op_node.op->exec_type(), Operator::kCrossDeviceCopy);
+      CHECK_NE(op_node.op->exec_type(), Operator::kAsync);
+      // AddTO: index is used to store in-place add resources.
+      const size_t ninput = gnode.inputs.size() - gnode.addto_index.size();
+      req.clear();
+      in_data.clear();
+      out_data.clear();
+      aux_data.clear();
+      for (const DataEntryInfo& out : op_node.outputs) {
+        req.push_back(out.op_req);
+        out_data.push_back(out.data.data());
+      }
+      for (size_t i = 0; i < gnode.addto_index.size(); ++i) {
+        CHECK_EQ(req[gnode.addto_index[i]], kWriteInplace);
+        req[gnode.addto_index[i]] = kAddTo;
+        const StaticGraph::DataEntry& e = graph_.nodes[nid].inputs[i + ninput];
+        const DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index];
+        CHECK_EQ(info.inplace_op_id, static_cast<int>(nid));
+      }
+      // aux states
+      for (const DataEntryInfo& aux : op_node.aux_states) {
+        aux_data.push_back(aux.data.data());
+      }
+      // regular inputs (the trailing add-to entries are excluded by ninput)
+      for (size_t i = 0; i < ninput; ++i) {
+        const StaticGraph::DataEntry& e = graph_.nodes[nid].inputs[i];
+        const DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index];
+        in_data.push_back(info.data.data());
+      }
+      // run the function.
+      Operator* op = op_node.op.get();
+      OpContext* op_ctx_ptr = &op_node.op_ctx;
+      op_ctx_ptr->run_ctx = ctx;
+      op->Forward(*op_ctx_ptr, in_data, req, out_data, aux_data);
+    }
+    if (is_gpu) {
+#if MXNET_USE_CUDA
+      // Wait GPU kernel to finish.
+      ctx.get_stream<gpu>()->Wait();
+#else
+      LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+#endif
+    }
+    on_complete();
+  };
+  return Engine::Get()->NewOperator(
+      exec_fun, read_vars, write_vars, FnProperty::kNormal);
+}
+
 Executor *Executor::Bind(Symbol symbol,
                          const Context& default_ctx,
                          const std::map<std::string, Context>& group2ctx,
diff --git a/src/symbol/graph_executor.h b/src/symbol/graph_executor.h
index c3e1ccb86c7c..50d3a289e9e1 100644
--- a/src/symbol/graph_executor.h
+++ b/src/symbol/graph_executor.h
@@ -46,6 +46,7 @@ class GraphExecutor : public Executor {
                    const std::vector<NDArray> &aux_states,
                    Executor* shared_exec = nullptr) {
     enable_inplace_allocation_ = dmlc::GetEnv("MXNET_EXEC_ENABLE_INPLACE", true);
+    prefer_bulk_execution_ = dmlc::GetEnv("MXNET_EXEC_PREFER_BULK_EXEC", true);
     if (shared_exec != NULL) {
       GraphExecutor* gexec = dynamic_cast<GraphExecutor*>(shared_exec);
       CHECK(gexec) << "Input executor for sharing memory must have GraphExecutor type.";
@@ -191,6 +192,14 @@ class GraphExecutor : public Executor {
    * \return the execution entry.
    */
   inline OpExecEntry GetOpExecEntry(uint32_t node_id);
+  /*!
+   * \brief Try to create a cached operator to run segments between start and end
+   * \param topo_start beginning of segment
+   * \param topo_end end of segment
+   * \return the cached operator.
+   *  Can be nullptr if cached operator cannot be created.
+   */
+  Engine::OprHandle CreateCachedOpr(size_t topo_start, size_t topo_end);
   // initialize the internal graph structure
   void InitGraph(const Symbol &symbol,
                  const Context& default_ctx,
@@ -232,6 +241,8 @@ class GraphExecutor : public Executor {
   size_t total_allocated_temp_;
   // number of forward nodes in the graph
   size_t num_forward_nodes_;
+  // whether to enable bulk execution
+  bool prefer_bulk_execution_;
   // head gradient node in the graph, if there is backward pass
   std::vector<uint32_t> head_grad_nodes_;
   // mirror map of nodes, experimental feature, normally can be ignored.
@@ -246,6 +257,8 @@ class GraphExecutor : public Executor {
   std::shared_ptr<GraphStoragePool> shared_mem_;
   // monitor call back
   std::function<void(const char*, void*)> monitor_callback_;
+  // cached segment operator
+  std::unordered_map<size_t, Engine::OprHandle> cached_seg_opr_;
 };  // class GraphExecutor
 }  // namespace mxnet
 #endif  // MXNET_SYMBOL_GRAPH_EXECUTOR_H_

From bf7798d7720a5ec47ae0b1b942d317fc7586aa21 Mon Sep 17 00:00:00 2001
From: alex-weaver <awsweaver@gmail.com>
Date: Wed, 22 Jun 2016 23:27:54 +0100
Subject: [PATCH 050/126] caffe_converter: fixed bug parsing caffemodel files
 on windows (file should be opened in binary mode) (#2504)

caffe_converter: added instructions and build script for usage on windows.
---
 tools/caffe_converter/README.md               | 20 +++++++++++++++++--
 .../caffe_parse/parse_from_protobuf.py        |  2 +-
 tools/caffe_converter/make_win32.bat          |  3 +++
 3 files changed, 22 insertions(+), 3 deletions(-)
 create mode 100644 tools/caffe_converter/make_win32.bat

diff --git a/tools/caffe_converter/README.md b/tools/caffe_converter/README.md
index 2e6eca1ea40c..3155239daf1d 100644
--- a/tools/caffe_converter/README.md
+++ b/tools/caffe_converter/README.md
@@ -1,6 +1,6 @@
 # Convert Caffe Model to Mxnet Format
 
-### Build
+### Build (Linux)
 
 Either [Caffe's python package](http://caffe.berkeleyvision.org/installation.html) or [Google protobuf](https://developers.google.com/protocol-buffers/?hl=en) is required. The latter is often much easier to install:  
 
@@ -10,9 +10,25 @@ Either [Caffe's python package](http://caffe.berkeleyvision.org/installation.htm
 
 Now we can build the tool by running `make` in the current directory.
 
+### Build (Windows)
+
+Note: this tool currently only works on python 2.
+
+We must make sure that the installed python binding and protobuf compiler are using the same version of protobuf,
+so we install the bindings first, and then install the corresponding compiler.
+
+1. Install the protobuf bindings. At time of writing, the conda package manager has the most up to date version. Either run `conda install -c conda-forge protobuf` or `pip install protobuf`
+2. Download the win32 build of protoc from [Protocol Buffers Releases](https://github.com/google/protobuf/releases). Make sure to download the version that corresponds to the version of the bindings. Extract to any location then add that location to your `PATH`
+3. Run `make_win32.bat` to build the package
+
+
 ### How to use
 
-Use `./run.sh model_name` to download and convert a model. E.g. `./run.sh vgg19`
+Linux: Use `./run.sh model_name` to download and convert a model. E.g. `./run.sh vgg19`
+
+Windows: Use `python convert_model.py prototxt caffemodel outputprefix`  
+For example: `python convert_model.py VGG_ILSVRC_16_layers_deploy.prototxt VGG_ILSVRC_16_layers.caffemodel vgg16`
+
 
 ### Note
 
diff --git a/tools/caffe_converter/caffe_parse/parse_from_protobuf.py b/tools/caffe_converter/caffe_parse/parse_from_protobuf.py
index 6350a20dfb21..865e047507df 100644
--- a/tools/caffe_converter/caffe_parse/parse_from_protobuf.py
+++ b/tools/caffe_converter/caffe_parse/parse_from_protobuf.py
@@ -10,7 +10,7 @@ def parse_caffemodel(filepath):
 
     returns: layers
     '''
-    f = open(filepath)
+    f = open(filepath, 'rb')
     contents = f.read()
 
     netparam = caffe_pb2.NetParameter()
diff --git a/tools/caffe_converter/make_win32.bat b/tools/caffe_converter/make_win32.bat
new file mode 100644
index 000000000000..7d354dcaeb6c
--- /dev/null
+++ b/tools/caffe_converter/make_win32.bat
@@ -0,0 +1,3 @@
+@protoc --python_out=./ ./caffe_parse/caffe.proto
+@echo done.
+@pause

From be9b08e97c483f4a74ad534007a3ebdce0061581 Mon Sep 17 00:00:00 2001
From: qiao hai-jun <qiaohaijun@users.noreply.github.com>
Date: Thu, 23 Jun 2016 07:00:11 +0800
Subject: [PATCH 051/126] create mxnet/docs/zh/packages/python/io.md (#2479)

---
 docs/zh/packages/python/io.md | 185 ++++++++++++++++++++++++++++++++++
 1 file changed, 185 insertions(+)
 create mode 100644 docs/zh/packages/python/io.md

diff --git a/docs/zh/packages/python/io.md b/docs/zh/packages/python/io.md
new file mode 100644
index 000000000000..08165d84ed0d
--- /dev/null
+++ b/docs/zh/packages/python/io.md
@@ -0,0 +1,185 @@
+MXNet Python Data Loading API
+=============================
+* [Introduction](#introduction) 介绍 MXNet 数据加载模块的主要特性.
+* [Parameters For Data Iterator](#parameters-for-data-iterator) 阐述清楚 dataIter 的参数的不同用法.
+* [Create A Data Iterator](#create-a-data-iterator) 介绍如何创建一个 Python 版本的 MXNet Data Iterator.
+* [How To Get Data](#how-to-get-data) 介绍数据源以及数据预处理工具.
+* [IO API Reference](#io-api-reference) IO API 参考文档以及它们的解释.
+
+Introduction
+------------
+这页面介绍 MXNet 的数据输入方式. MXNet 使用迭代器 (iterator)的方式向神经网络输入数据. 迭代器做了一些数据预处理, 同时以 batch 的形式向神经网络提供数据.
+
+
+* 我们为 MNIST 图像和 RecordIO 图像提供了基本的迭代器.
+* 为了掩盖 IO 开销, 我们提供了预处理策略, 它可以让机器学习的过程和取数据的过程并行来做. 我们使用一个单独的线程来做取数据的工作.
+
+Parameters For Data Iterator
+----------------------------
+
+一般地讲, 如果你要创建一个数据迭代器, 你需要实现下面讲到的五种参数:
+
+* **Dataset Param** 提供数据集的基本信息, 比如说, 文件路径, 输入的数据的 shape. 
+* **Batch Param** 提供构建一个 batch 的信息,  比如说 batch size. 
+* **Augmentation Param** 指定输入数据的扩充方式 (e.g. crop, mirror).
+* **Backend Param** 控制后端线程掩盖数据加载开销的行为.
+* **Auxiliary Param** 提供可选项, 用来帮助检查和 debug.
+
+通常来讲, **Dataset Param** 和 **Batch Param** *必须* 提供, 否则 data batch 无法创建. 其他的参数根据算法和性能的需要来设置. 文档的后半部分会提供解释详尽的例子.
+
+Create A Data Iterator
+----------------------
+这个 IO API 提供在 python 中创建数据迭代器的简单方式. 下面的代码是如何创建一个 Cifar 的数据迭代器的代码.
+
+
+```python
+>>>dataiter = mx.io.ImageRecordIter(
+>>>        # Utility Parameter 
+>>>        # 可选
+>>>        # Name of the data, should match the name of the data input of the network 
+>>>        # data_name='data',
+>>>        # Utility Parameter
+>>>        # 可选
+>>>        # Name of the label, should match the name of the label parameter of the network.
+>>>        # Usually, if the loss layer is named 'foo', then the label input has the name
+>>>        # 'foo_label', unless overwritten
+>>>        # label_name='softmax_label',
+>>>        # Dataset Parameter
+>>>        # Compulsory
+>>>        # indicating the data file, please check the data is already there
+>>>        path_imgrec="data/cifar/train.rec",
+>>>        # Dataset Parameter
+>>>        # Compulsory
+>>>        # indicating the image size after preprocessing
+>>>        data_shape=(3,28,28),
+>>>        # Batch Parameter
+>>>        # Compulsory
+>>>        # tells how many images in a batch
+>>>        batch_size=100,
+>>>        # Augmentation Parameter
+>>>        # 可选
+>>>        # when mean_img is given, the mean value is subtracted from each image at each pixel
+>>>        mean_img="data/cifar/cifar10_mean.bin",
+>>>        # Augmentation Parameter
+>>>        # 可选
+>>>        # randomly crop a patch of the data_shape from the original image
+>>>        rand_crop=True,
+>>>        # Augmentation Parameter
+>>>        # Optional
+>>>        # randomly mirror the image horizontally
+>>>        rand_mirror=True,
+>>>        # Augmentation Parameter
+>>>        # Optional
+>>>        # randomly shuffle the data
+>>>        shuffle=False,
+>>>        # Backend Parameter
+>>>        # Optional
+>>>        # Preprocessing thread number
+>>>        preprocess_threads=4,
+>>>        # Backend Parameter
+>>>        # Optional
+>>>        # Prefetch buffer size
+>>>        prefetch_buffer=1)
+```
+
+从上面的代码中, 我们可以学到如何创建一个数据迭代器. 首先, 你需要明确地指出需要取哪种类型的数据(MNIST, ImageRecord 等等). 然后, 提供描述数据的可选参数, 比如 batching, 数据扩充方式, 多线程处理, 预取数据. MXNet 框架会检查参数的有效性, 如果一个必须的参数没有提供, 框架会报错.
+
+
+How To Get Data
+---------------
+
+
+我们提供了 [脚本](../../tests/python/common/get_data.py) 来下载MNIST数据 和Cifar10 ImageRecord 数据.  如果你要创建你自己的数据集, 我们建议您用RecordIO 作为数据格式.
+
+## Create Dataset Using RecordIO
+
+RecordIO 实现了顺序存储 record 的数据格式. 我们建议图像数据按照 record 的格式来存储和打包到一起. 这样做有以下几点好处:
+
+
+* 将图像储存为压缩过的格式, 比如 JPEG, 因为 record 可以大小不同. 压缩过的格式可以极大的减小储存在硬盘上的数据集大小.
+* 将若干 record 打包存储, 可以实现硬盘的连续读取, 避免随机读取硬盘.
+* RecordIO 容易分块, 这样分布式处理的设置会更加简单. 后面会有例子具体来说明.
+
+我们提供了 [im2rec tool](../../tools/im2rec.cc) 来让用户自己来生成 RecordIO 格式的数据集.  下面是具体流程:
+
+### 0.Before you start
+请确定你已经下载了需要的数据集. 你不需要自己来做图像的 resize 操作, 现在 `im2rec` 这个工具可以自动来做这种操作. 你可以查看 `im2rec` 提供的信息来获取更多的内容.
+
+### 1.Make the image list
+当你得到了信息之后, 你首先需要生成一个 image list 的文件. 格式如下
+```
+integer_image_index \t label_index \t path_to_image
+```
+通常, 这个程序会读取一个包含所有图像文件名的列表文件, shuffle 这些文件, 然后将 shuffle 后的图像文件名列表分为训练列表文件和测试列表文件. 按照下面给出的例子的格式存储.
+
+简单的例子文件
+
+```bash
+895099  464     n04467665_17283.JPEG
+10025081        412     ILSVRC2010_val_00025082.JPEG
+74181   789     n01915811_2739.JPEG
+10035553        859     ILSVRC2010_val_00035554.JPEG
+10048727        929     ILSVRC2010_val_00048728.JPEG
+94028   924     n01980166_4956.JPEG
+1080682 650     n11807979_571.JPEG
+972457  633     n07723039_1627.JPEG
+7534    11      n01630670_4486.JPEG
+1191261 249     n12407079_5106.JPEG
+```
+
+### 2.Make the binary file
+
+需要用 *im2rec* 这个程序来生成二进制文件. im2rec 需要你刚刚生成的 _image list file_ 的路径, 图像的 _root_ 路径 和 _output file_ 路径作为参数. 这个过程需要花费几个小时, 所以需要耐心. :)
+
+
+简单的例子:
+```bash
+./bin/im2rec image.lst image_root_dir output.bin resize=256
+```
+要想获得更多的用法, 直接运行 ```./bin/im2rec```命令, 会在终端打印出详细的用法.
+
+### Extension: Multiple Labels for a Single Image
+
+`im2rec` 工具以及 `mx.io.ImageRecordIter` 支持对单个图像打多个标签. 假设你需要为单个图像打四个标签, 你可以按照下面的步骤来使用 RecordIO 相关的工具.
+
+1. 按照下面的格式生成 image list 文件:
+```
+integer_image_index \t label_1 \t label_2 \t label_3 \t label_4 \t path_to_image
+```
+
+2. 使用 `im2rec` 时, 需要增加一个 'label_width=4' 作为命令行参数, 比如.
+```bash
+./bin/im2rec image.lst image_root_dir output.bin resize=256 label_width=4
+```
+
+3. 在你的迭代器初始化的时候, 设置 `label_width=4` 和 `path_imglist=<<The PATH TO YOUR image.lst>>` 作为参数.
+
+```python
+dataiter = mx.io.ImageRecordIter(
+  path_imgrec="data/cifar/train.rec",
+  data_shape=(3,28,28),
+  path_imglist="data/cifar/image.lst",
+  label_width=4
+)
+```
+
+这样你就完成了一个多标签的数据迭代器.
+
+```eval_rst
+.. raw:: html
+
+    <script type="text/javascript" src='../../_static/js/auto_module_index.js'></script>
+```
+
+
+IO API Reference
+----------------
+
+```eval_rst
+.. automodule:: mxnet.io
+    :members:
+
+.. raw:: html
+
+    <script>auto_index("mxnet.io");</script>
+```

From 88151cafbf7e8f89e99d5c33adacf9f7ed34a0d2 Mon Sep 17 00:00:00 2001
From: qiao hai-jun <qiaohaijun@users.noreply.github.com>
Date: Thu, 23 Jun 2016 07:00:22 +0800
Subject: [PATCH 052/126] create mxnet/docs/zh/packages/python/kvstore.md
 (#2480)

---
 docs/zh/packages/python/kvstore.md | 133 +++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 docs/zh/packages/python/kvstore.md

diff --git a/docs/zh/packages/python/kvstore.md b/docs/zh/packages/python/kvstore.md
new file mode 100644
index 000000000000..5d694036d548
--- /dev/null
+++ b/docs/zh/packages/python/kvstore.md
@@ -0,0 +1,133 @@
+KVStore API
+===========
+
+* [基本的 Push 和 Pull 操作](#basic-push-and-pull)
+* [key-value pairs 列表的接口](#interface-for-list-key-value-pairs)
+* [多机]() TODO
+
+## Basic Push and Pull
+
+单机多卡的基本操作.
+
+### Initialization
+
+首先让我们来考虑一个简单的例子. 先将一个 (`int`, `NDArray`) 键值对初始化到 KVStore 里, 然后再将数据 pull 下来.
+
+```python
+>>> kv = mx.kv.create('local') # create a local kv store.
+>>> shape = (2,3)
+>>> kv.init(3, mx.nd.ones(shape)*2)
+>>> a = mx.nd.zeros(shape)
+>>> kv.pull(3, out = a)
+>>> print a.asnumpy()
+[[ 2.  2.  2.]
+ [ 2.  2.  2.]]
+```
+
+### Push, Aggregation, and Updater
+
+对于任意一个被初始化的 key-value 数据, 我们可以向这个 `key` push 一个相同 shape 的数据覆盖掉原来的 value.
+
+
+```python
+>>> kv.push(3, mx.nd.ones(shape)*8)
+>>> kv.pull(3, out = a) # pull out the value
+>>> print a.asnumpy()
+[[ 8.  8.  8.]
+ [ 8.  8.  8.]]
+```
+
+需要做 push 操作的数据可以存储在任意的设备上. 而且, 我们可以向同一个 key 推送多份数据, KVStore 客户端会首先将这些数据做 sum 操作, 然后将聚合后的结果 push 到服务器端, 减少了数据通信.
+
+```python
+>>> gpus = [mx.gpu(i) for i in range(4)]
+>>> b = [mx.nd.ones(shape, gpu) for gpu in gpus]
+>>> kv.push(3, b)
+>>> kv.pull(3, out = a)
+>>> print a.asnumpy()
+[[ 4.  4.  4.]
+ [ 4.  4.  4.]]
+```
+
+对于每一个 push 操作, KVStore 将推送上来的数据通过 `updater` 定义的方式来进行更新操作. 默认的 `updater` 是 `ASSIGN`, 我们可以根据需要来替换掉这个默认的 `updater`.
+
+```python
+>>> def update(key, input, stored):
+>>>     print "update on key: %d" % key
+>>>     stored += input * 2
+>>> kv._set_updater(update)
+>>> kv.pull(3, out=a)
+>>> print a.asnumpy()
+[[ 4.  4.  4.]
+ [ 4.  4.  4.]]
+>>> kv.push(3, mx.nd.ones(shape))
+update on key: 3
+>>> kv.pull(3, out=a)
+>>> print a.asnumpy()
+[[ 6.  6.  6.]
+ [ 6.  6.  6.]]
+```
+
+### Pull
+
+我们已经看到如何 pull 单个的 key-value 对. 类似于 push, 我们也能只用一个调用来将数据 pull 到多个设备中.
+
+```python
+>>> b = [mx.nd.ones(shape, gpu) for gpu in gpus]
+>>> kv.pull(3, out = b)
+>>> print b[1].asnumpy()
+[[ 6.  6.  6.]
+ [ 6.  6.  6.]]
+```
+
+## Interface for list key-value pairs
+
+我们到现在为止所介绍的所有操作都是关于一个 key. KVStore 也提供了对 key-value pair 列表的接口. 
+
+针对单个的设备:
+
+```python
+>>> keys = [5, 7, 9]
+>>> kv.init(keys, [mx.nd.ones(shape)]*len(keys))
+>>> kv.push(keys, [mx.nd.ones(shape)]*len(keys))
+update on key: 5
+update on key: 7
+update on key: 9
+>>> b = [mx.nd.zeros(shape)]*len(keys)
+>>> kv.pull(keys, out = b)
+>>> print b[1].asnumpy()
+[[ 3.  3.  3.]
+ [ 3.  3.  3.]]
+```
+
+针对多个设备:
+
+```python
+>>> b = [[mx.nd.ones(shape, gpu) for gpu in gpus]] * len(keys)
+>>> kv.push(keys, b)
+update on key: 5
+update on key: 7
+update on key: 9
+>>> kv.pull(keys, out = b)
+>>> print b[1][1].asnumpy()
+[[ 11.  11.  11.]
+ [ 11.  11.  11.]]
+```
+
+```eval_rst
+.. raw:: html
+
+    <script type="text/javascript" src='../../_static/js/auto_module_index.js'></script>
+```
+
+
+## API Reference
+
+```eval_rst
+.. automodule:: mxnet.kvstore
+    :members:
+
+.. raw:: html
+
+    <script>auto_index("mxnet.kvstore");</script>
+```

From 721fd800be0dab51c594bf956e0188a78a8a7a70 Mon Sep 17 00:00:00 2001
From: Prem Nair <prem.q.nair@gmail.com>
Date: Wed, 22 Jun 2016 21:15:06 -0700
Subject: [PATCH 053/126] Add ability to checkpoint at lower rates (#1669)

---
 python/mxnet/callback.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/mxnet/callback.py b/python/mxnet/callback.py
index 8265eb39b52c..618d8cd4d783 100644
--- a/python/mxnet/callback.py
+++ b/python/mxnet/callback.py
@@ -8,22 +8,26 @@
 import time
 from .model import save_checkpoint
 
-def do_checkpoint(prefix):
+def do_checkpoint(prefix, period=1):
     """Callback to checkpoint the model to prefix every epoch.
 
     Parameters
     ----------
     prefix : str
         The file prefix to checkpoint to
+    period : int
+        How many epochs to wait before checkpointing. Default is 1.
 
     Returns
     -------
     callback : function
         The callback function that can be passed as iter_end_callback to fit.
     """
+    period = int(max(1, period))
     def _callback(iter_no, sym, arg, aux):
         """The checkpoint function."""
-        save_checkpoint(prefix, iter_no + 1, sym, arg, aux)
+        if (iter_no + 1) % period == 0:
+            save_checkpoint(prefix, iter_no + 1, sym, arg, aux)
     return _callback
 
 

From 388a2a985981136b45bae9cacbcd0dd683000562 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 22 Jun 2016 21:57:03 -0700
Subject: [PATCH 054/126] [OP] Move sample op to operator_util, support source
 function (#2511)

---
 include/mxnet/operator_util.h        |  35 +++
 mshadow                              |   2 +-
 python/mxnet/random.py               |  11 +-
 src/operator/operator_util.cc        | 389 ++++++++++++++++++++-------
 src/operator/sample_op-inl.h         | 112 ++++++++
 src/operator/sample_op.cc            |  16 ++
 src/operator/sample_op.cu            |   7 +
 tests/python/unittest/test_random.py |  34 ++-
 8 files changed, 501 insertions(+), 105 deletions(-)
 create mode 100644 src/operator/sample_op-inl.h
 create mode 100644 src/operator/sample_op.cc
 create mode 100644 src/operator/sample_op.cu

diff --git a/include/mxnet/operator_util.h b/include/mxnet/operator_util.h
index 94eb994d07e1..71276a4bec5f 100644
--- a/include/mxnet/operator_util.h
+++ b/include/mxnet/operator_util.h
@@ -60,6 +60,26 @@ struct EnvArguments {
   std::vector<Resource> resource;
 };
 
+/*!
+ * \brief source function that generate output based on env
+ *  The result container is pre-allocated with the correct shape.
+ * \param env The Environment arguments.
+ * \param ret The container to store return value.
+ * \param req The requirement to store the ret.
+ * \param ctx Runtime context to execute the function.
+ */
+typedef void (*SourceFunction)(const EnvArguments& env,
+                               TBlob* ret,
+                               OpReqType req,
+                               RunContext ctx);
+
+/*!
+ * \brief Shape inference function to get the correct shape.
+ * \param env The Environment arguments.
+ * \return The inferred result shape.
+ */
+typedef TShape (*SourceShapeFunction)(const EnvArguments& env);
+
 /*!
  * \brief Unary function that takes a src and save result to ret.
  *  The result container is pre-allocated with the correct shape.
@@ -265,6 +285,11 @@ class SimpleOpRegEntry {
    * \param req the request.
    */
   virtual TSelf& set_resource_request(ResourceRequest req) = 0;
+  /*!
+   * \brief set shape inference function for source operators.
+   * \param fshapeinfer The source shape function that infers the output shape.
+   */
+  virtual TSelf& set_shape_function(SourceShapeFunction fshapeinfer) = 0;
   /*!
    * \brief set shape inference function.
    *  Default: out_shape = in_shape
@@ -277,6 +302,16 @@ class SimpleOpRegEntry {
    * \param fshapeinfer The binary function that peforms the operation.
    */
   virtual TSelf& set_shape_function(BinaryShapeFunction fshapeinfer) = 0;
+  /*!
+   * \brief set function of the function to be fsource
+   * \param dev_mask The device mask of the function can act on.
+   * \param fsource The source function that performs the operation.
+   * \param register_symbolic Whether register a symbolic operator as well.
+   */
+  virtual TSelf& set_function(
+      int dev_mask,
+      SourceFunction fsource,
+      SimpleOpRegOption register_symbolic = kRegisterSymbolic) = 0;
   /*!
    * \brief set function of the function to be funary
    * \param dev_mask The device mask of the function can act on.
diff --git a/mshadow b/mshadow
index f64098619b39..948abff7d748 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit f64098619b39cce4b1449ed76674b692dc5d9246
+Subproject commit 948abff7d748ae4f83c6ce194ec0174db4858e9a
diff --git a/python/mxnet/random.py b/python/mxnet/random.py
index 489a8bd16097..a3eec80e8122 100644
--- a/python/mxnet/random.py
+++ b/python/mxnet/random.py
@@ -38,17 +38,17 @@ def uniform(low, high, shape=None, ctx=None, out=None):
         if isinstance(shape, int):
             shape = (shape,)
         out = empty(shape, ctx)
-    return NDArray._random_uniform(low, high, out=out)
+    return NDArray._sample_uniform(low=low, high=high, shape=out.shape, out=out)
 
 
-def normal(mean, stdvar, shape=None, ctx=None, out=None):
+def normal(loc, scale, shape=None, ctx=None, out=None):
     """Generate normal(Gaussian) distribution N(mean, stdvar^2) with shape.
 
     Parameters
     ----------
-    mean : float
+    loc : float
         The mean of the normal distribution.
-    stdvar : float
+    scale : float
         The standard deviation of normal distribution.
     shape : tuple, optional
         Output shape of the NDArray generated.
@@ -71,7 +71,7 @@ def normal(mean, stdvar, shape=None, ctx=None, out=None):
         if isinstance(shape, int):
             shape = (shape,)
         out = empty(shape, ctx)
-    return NDArray._random_gaussian(mean, stdvar, out=out)
+    return NDArray._sample_normal(loc=loc, scale=scale, shape=out.shape, out=out)
 
 
 def seed(seed_state):
@@ -96,4 +96,3 @@ def seed(seed_state):
         raise ValueError('sd must be int')
     seed_state = ctypes.c_int(int(seed_state))
     check_call(_LIB.MXRandomSeed(seed_state))
-
diff --git a/src/operator/operator_util.cc b/src/operator/operator_util.cc
index 2b92b3150a97..70e88331d49a 100644
--- a/src/operator/operator_util.cc
+++ b/src/operator/operator_util.cc
@@ -15,6 +15,7 @@ namespace mxnet {
 namespace op {
 
 class SimpleOpPropBase;
+class SimpleSourceOpProp;
 class SimpleUnaryOpProp;
 class SimpleBinaryOpProp;
 
@@ -64,6 +65,12 @@ class SimpleOpRegEntryImpl : public SimpleOpRegEntry {
     return *this;
   }
 
+  TSelf& set_shape_function(SourceShapeFunction fshapeinfer) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    source_shape_ = fshapeinfer;
+    return *this;
+  }
+
   TSelf& set_shape_function(UnaryShapeFunction fshapeinfer) override {
     std::lock_guard<std::mutex> lock(mutex_);
     unary_shape_ = fshapeinfer;
@@ -76,6 +83,21 @@ class SimpleOpRegEntryImpl : public SimpleOpRegEntry {
     return *this;
   }
 
+  TSelf& set_function(int dev_mask,
+                      SourceFunction fsource,
+                      SimpleOpRegOption register_symbolic) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    SetFunction(&fsource_, dev_mask, fsource, "SourceFunction");
+    if (++reg_counter_ == 1) {
+      this->RegisterSourceImperative();
+      register_symbolic_ = (register_symbolic == kRegisterSymbolic);
+      if (register_symbolic_) {
+        this->RegisterSourceSymbolic();
+      }
+    }
+    return *this;
+  }
+
   TSelf& set_function(int dev_mask,
                       UnaryFunction funary,
                       SimpleOpInplaceOption inplace_in_out,
@@ -178,6 +200,7 @@ class SimpleOpRegEntryImpl : public SimpleOpRegEntry {
  protected:
   // make friend with unary op
   friend class SimpleOpPropBase;
+  friend class SimpleSourceOpProp;
   friend class SimpleUnaryOpProp;
   friend class SimpleBinaryOpProp;
   // internal mutex
@@ -196,6 +219,11 @@ class SimpleOpRegEntryImpl : public SimpleOpRegEntry {
   bool enable_kwargs_{false};
   // resource requirements
   std::vector<ResourceRequest> resource_requests_;
+  // ------ source functions ----
+  // source shape inference information.
+  SourceShapeFunction source_shape_{nullptr};
+  // source functions on each device mask
+  std::vector<SourceFunction> fsource_;
   // ------ unary functions -----
   // unary shape inference information.
   UnaryShapeFunction unary_shape_{nullptr};
@@ -266,6 +294,10 @@ class SimpleOpRegEntryImpl : public SimpleOpRegEntry {
     }
     return *op_reg_;
   }
+  // register source function.
+  void RegisterSourceImperative();
+  // register source symbolic function.
+  void RegisterSourceSymbolic();
   // register unary function.
   void RegisterUnaryImperative();
   // register unary symbolic function.
@@ -295,6 +327,264 @@ SimpleOpRegistry::~SimpleOpRegistry() {
     delete kv.second;
   }
 }
+
+// base class
+struct SimpleOpScalarParam :
+      public dmlc::Parameter<SimpleOpScalarParam> {
+  float scalar;
+  DMLC_DECLARE_PARAMETER(SimpleOpScalarParam) {
+    DMLC_DECLARE_FIELD(scalar)
+        .describe("scalar value.");
+  }
+};
+
+DMLC_REGISTER_PARAMETER(SimpleOpScalarParam);
+
+class SimpleOpPropBase : public OperatorProperty {
+ public:
+  std::string name;
+  EnvArguments env;
+  SimpleOpRegEntryImpl* source;
+
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    if (source->enable_kwargs_) {
+      env.kwargs = kwargs;
+    } else if (source->enable_scalar_) {
+      SimpleOpScalarParam param;
+      param.Init(kwargs);
+      env.scalar = param.scalar;
+    } else {
+      CHECK_EQ(kwargs.size(), 0)
+          << "Operator " << source->symbol_name_ << " donot accept any keyword arguments";
+    }
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    if (source->enable_kwargs_) {
+      return std::map<std::string, std::string>(
+          env.kwargs.begin(), env.kwargs.end());
+    } else if (source->enable_scalar_) {
+      SimpleOpScalarParam param;
+      param.scalar = env.scalar;
+      return param.__DICT__();
+    } else {
+      return std::map<std::string, std::string>();
+    }
+  }
+
+  std::vector<ResourceRequest> ForwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return source->resource_requests_;
+  }
+
+  std::vector<ResourceRequest> BackwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return source->resource_requests_;
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_LE(in_type->size(), this->ListArguments().size());
+    int dtype = -1;
+    // reduce dtype to a common one.
+    for (unsigned i = 0; i < in_type->size(); ++i) {
+      if (dtype == -1) {
+        dtype = in_type->at(i);
+      } else {
+        CHECK(in_type->at(i) == -1 ||
+              in_type->at(i) == dtype) <<
+          "Non-uniform input data type. Expected " << dtype << "got " << in_type->at(i);
+      }
+    }
+
+    if (dtype == -1) {
+      LOG(FATAL) << "At least one input type needs to be specified.";
+      return false;
+    }
+
+    int n_in = this->ListArguments().size();
+    in_type->clear();
+    for (int i = 0; i < n_in; ++i) in_type->push_back(dtype);
+
+    int n_out = this->ListOutputs().size();
+    out_type->clear();
+    for (int i = 0; i < n_out; ++i) out_type->push_back(dtype);
+
+    int n_aux = this->ListAuxiliaryStates().size();
+    aux_type->clear();
+    for (int i = 0; i < n_aux; ++i) aux_type->push_back(dtype);
+    return true;
+  }
+
+  std::string TypeString() const override {
+    return name;
+  }
+};
+
+//-------------------------------------
+// source function Implementation
+//-------------------------------------
+void SimpleOpRegEntryImpl::RegisterSourceImperative() {
+  CHECK_EQ(reg_counter_, 1);
+  // The body to be registered
+  auto body = [this] (NDArray** used_vars,
+                      real_t* s,
+                      NDArray** mutate_vars,
+                      int num_params,
+                      char** param_keys,
+                      char** param_vals) {
+    NDArray* out = mutate_vars[0];
+    // setup env.
+    EnvArguments env;
+    if (enable_scalar_) env.scalar = s[0];
+    if (enable_kwargs_) {
+      for (int i = 0; i < num_params; ++i) {
+        env.kwargs.emplace_back(std::make_pair(
+            std::string(param_keys[i]), std::string(param_vals[i])));
+      }
+    } else {
+      CHECK_EQ(num_params, 0)
+        << "operator " << this->name << " do not take keyword arguments";
+    }
+    // shape inference.
+    CHECK(source_shape_ != nullptr);
+    TShape dshape = source_shape_(env);
+    // check output shape.
+    CHECK(!out->is_none());
+    CHECK(out->shape() == dshape) << "target shape mismatch "
+    << out->shape() << " vs. " << dshape;
+
+    // important: callback must always capture by value
+    NDArray ret = *out;
+    // request resources.
+    std::vector<Engine::VarHandle> write_vars = {ret.var()};
+    for (ResourceRequest req : resource_requests_) {
+      env.resource.push_back(ResourceManager::Get()->Request(ret.ctx(), req));
+      write_vars.push_back(env.resource.back().var);
+    }
+    // check if the function exist
+    int dev_mask = ret.ctx().dev_mask();
+    // error message
+    if (static_cast<size_t>(dev_mask) >= fsource_.size() ||
+        fsource_[dev_mask] == nullptr) {
+      if (dev_mask == gpu::kDevMask) {
+        LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+      }
+      LOG(FATAL) << "Function " << this->name
+                 << "not registered for device " << dev_mask;
+    }
+    // invoke the function
+    SourceFunction fun = fsource_[dev_mask];
+    OpReqType req = kWriteTo;
+
+    Engine::Get()->PushSync([ret, fun, dev_mask, req, env](RunContext ctx) {
+        ret.CheckAndAlloc();
+        TBlob tmp = ret.data();
+        (*fun)(env, &tmp, req, ctx);
+#if MXNET_USE_CUDA
+        if (dev_mask == gpu::kDevMask) {
+          ctx.get_stream<gpu>()->Wait();
+        }
+#endif
+      }, ret.ctx(), {}, write_vars);
+  };
+  // register the function.
+  NDArrayReg()
+      .set_body(body)
+      .set_num_use_vars(0)
+      .set_num_mutate_vars(1);
+  if (enable_scalar_) {
+      NDArrayReg()
+          .set_num_scalars(1)
+          .add_argument("scalar", "float", "scalar input to the function");
+  }
+}
+
+// operator to invoke source function.
+struct SimpleSourceOperator : public Operator {
+  EnvArguments env;
+  SourceFunction forward;
+
+  void Forward(const OpContext &ctx,
+               const std::vector<TBlob> &in_data,
+               const std::vector<OpReqType> &req,
+               const std::vector<TBlob> &out_data,
+               const std::vector<TBlob> &aux_args) override {
+    if (ctx.requested.size() != 0) env.resource = ctx.requested;
+    CHECK_EQ(in_data.size(), 0);
+    CHECK_EQ(out_data.size(), 1);
+    TBlob out = out_data[0];
+    (*forward)(env, &out, req[0], ctx.run_ctx);
+  }
+
+  void Backward(const OpContext &ctx,
+                const std::vector<TBlob> &out_grad,
+                const std::vector<TBlob> &in_data,
+                const std::vector<TBlob> &out_data,
+                const std::vector<OpReqType> &req,
+                const std::vector<TBlob> &in_grad,
+                const std::vector<TBlob> &aux_args) override {
+    LOG(FATAL) << "no gradient can be done";
+    // no nothing.
+  }
+};  // struct SimpleSourceOperator
+
+class SimpleSourceOpProp : public SimpleOpPropBase {
+ public:
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    CHECK_EQ(in_shape->size(), 0)
+        << in_shape->size();
+    CHECK(source->source_shape_ != nullptr);
+    out_shape->clear();
+    out_shape->push_back((*(source->source_shape_))(env));
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new SimpleSourceOpProp();
+    ptr->source = source;
+    ptr->name = name;
+    ptr->env = env;
+    return ptr;
+  }
+
+  Operator* CreateOperator(Context ctx) const override {
+    size_t dev_mask = ctx.dev_mask();
+    SimpleSourceOperator *op = new SimpleSourceOperator();
+    CHECK(dev_mask < source->fsource_.size() && source->fsource_[dev_mask] != nullptr);
+    op->forward = source->fsource_[dev_mask];
+    op->env = this->env;
+    return op;
+  }
+
+  std::vector<std::string> ListArguments() const override {
+    return {};
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    out_type->clear();
+    out_type->push_back(mshadow::kFloat32);
+    return true;
+  }
+};
+
+void SimpleOpRegEntryImpl::RegisterSourceSymbolic() {
+  // register the operator
+  auto op_factory = [this]() {
+    SimpleSourceOpProp *prop = new SimpleSourceOpProp();
+    prop->name = this->symbol_name_;
+    prop->source = this;
+    return prop;
+  };
+  OpReg()
+      .set_body(op_factory);
+}
+
 //-------------------------------------
 // unary function Implementation
 //-------------------------------------
@@ -457,99 +747,6 @@ struct SimpleUnaryOperator : public Operator {
   }
 };  // class SimpleUnaryOperator
 
-struct SimpleOpScalarParam :
-      public dmlc::Parameter<SimpleOpScalarParam> {
-  float scalar;
-  DMLC_DECLARE_PARAMETER(SimpleOpScalarParam) {
-    DMLC_DECLARE_FIELD(scalar)
-        .describe("scalar value.");
-  }
-};
-
-DMLC_REGISTER_PARAMETER(SimpleOpScalarParam);
-
-class SimpleOpPropBase : public OperatorProperty {
- public:
-  std::string name;
-  EnvArguments env;
-  SimpleOpRegEntryImpl* source;
-
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    if (source->enable_kwargs_) {
-      env.kwargs = kwargs;
-    } else if (source->enable_scalar_) {
-      SimpleOpScalarParam param;
-      param.Init(kwargs);
-      env.scalar = param.scalar;
-    } else {
-      CHECK_EQ(kwargs.size(), 0)
-          << "Operator " << source->symbol_name_ << " donot accept any keyword arguments";
-    }
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    if (source->enable_kwargs_) {
-      return std::map<std::string, std::string>(
-          env.kwargs.begin(), env.kwargs.end());
-    } else if (source->enable_scalar_) {
-      SimpleOpScalarParam param;
-      param.scalar = env.scalar;
-      return param.__DICT__();
-    } else {
-      return std::map<std::string, std::string>();
-    }
-  }
-
-  std::vector<ResourceRequest> ForwardResource(
-      const std::vector<TShape> &in_shape) const override {
-    return source->resource_requests_;
-  }
-
-  std::vector<ResourceRequest> BackwardResource(
-      const std::vector<TShape> &in_shape) const override {
-    return source->resource_requests_;
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_LE(in_type->size(), this->ListArguments().size());
-    int dtype = -1;
-    // reduce dtype to a common one.
-    for (unsigned i = 0; i < in_type->size(); ++i) {
-      if (dtype == -1) {
-        dtype = in_type->at(i);
-      } else {
-        CHECK(in_type->at(i) == -1 ||
-              in_type->at(i) == dtype) <<
-          "Non-uniform input data type. Expected " << dtype << "got " << in_type->at(i);
-      }
-    }
-
-    if (dtype == -1) {
-      LOG(FATAL) << "At least one input type needs to be specified.";
-      return false;
-    }
-
-    int n_in = this->ListArguments().size();
-    in_type->clear();
-    for (int i = 0; i < n_in; ++i) in_type->push_back(dtype);
-
-    int n_out = this->ListOutputs().size();
-    out_type->clear();
-    for (int i = 0; i < n_out; ++i) out_type->push_back(dtype);
-
-    int n_aux = this->ListAuxiliaryStates().size();
-    aux_type->clear();
-    for (int i = 0; i < n_aux; ++i) aux_type->push_back(dtype);
-    return true;
-  }
-
-  std::string TypeString() const override {
-    return name;
-  }
-};
-
 class SimpleUnaryOpProp : public SimpleOpPropBase {
  public:
   bool InferShape(std::vector<TShape> *in_shape,
@@ -644,11 +841,9 @@ void SimpleOpRegEntryImpl::RegisterUnarySymbolic() {
   };
   OpReg()
       .set_body(op_factory)
-      .add_argument("lhs", "Symbol", "Left symbolic input to the function")
-      .add_argument("rhs", "Symbol", "Left symbolic input to the function");
+      .add_argument("src", "Symbol", "Left symbolic input to the function");
 }
 
-
 //-------------------------------------
 // binary function Implementation
 //-------------------------------------
@@ -933,7 +1128,7 @@ void SimpleOpRegEntryImpl::RegisterBinarySymbolic() {
   OpReg()
       .set_body(op_factory)
       .add_argument("lhs", "Symbol", "Left symbolic input to the function")
-      .add_argument("rhs", "Symbol", "Left symbolic input to the function");
+      .add_argument("rhs", "Symbol", "Right symbolic input to the function");
 }
 
 }  // namespace op
diff --git a/src/operator/sample_op-inl.h b/src/operator/sample_op-inl.h
new file mode 100644
index 000000000000..41e3c40634ab
--- /dev/null
+++ b/src/operator/sample_op-inl.h
@@ -0,0 +1,112 @@
+/*!
+ *  Copyright (c) 2016 by Contributors
+ * \file sample_op-inl.h
+ * \brief Function definitions of sampling operators.
+ */
+#ifndef MXNET_OPERATOR_SAMPLE_OP_INL_H_
+#define MXNET_OPERATOR_SAMPLE_OP_INL_H_
+
+#include <mxnet/operator_util.h>
+#include "./mshadow_op.h"
+
+#if defined(__CUDACC__)
+#define XPU gpu
+#else
+#define XPU cpu
+#endif
+
+namespace mxnet {
+namespace op {
+
+struct SampleUniformParam : public dmlc::Parameter<SampleUniformParam> {
+  float low;
+  float high;
+  TShape shape;
+  DMLC_DECLARE_PARAMETER(SampleUniformParam) {
+    DMLC_DECLARE_FIELD(low).set_default(0.0f)
+        .describe("The lower bound of distribution");
+    DMLC_DECLARE_FIELD(high).set_default(1.0f)
+        .describe("The upper bound of distribution");
+    DMLC_DECLARE_FIELD(shape)
+        .describe("The shape of the output");
+  }
+};
+
+struct SampleNormalParam : public dmlc::Parameter<SampleNormalParam> {
+  float loc;
+  float scale;
+  TShape shape;
+  DMLC_DECLARE_PARAMETER(SampleNormalParam) {
+    DMLC_DECLARE_FIELD(loc).set_default(0.0f)
+        .describe("Mean of the distribution.");
+    DMLC_DECLARE_FIELD(scale).set_default(1.0f)
+        .describe("Standard deviation of the distribution.");
+    DMLC_DECLARE_FIELD(shape)
+        .describe("The shape of the output");
+  }
+};
+
+template<typename xpu>
+void SampleUniform_(const EnvArguments& env,
+                    TBlob *ret,
+                    OpReqType req,
+                    RunContext ctx) {
+  using namespace mxnet::op;
+  using namespace mshadow::expr;
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  CHECK_EQ(ret->type_flag_, mshadow::kFloat32)
+      << "only support float32 rnd so far";
+  SampleUniformParam param;
+  param.Init(env.kwargs);
+  mshadow::Random<xpu, float> *prnd = env.resource[0].get_random<xpu, float>(s);
+  mshadow::Tensor<xpu, 2, float> tmp = ret->FlatTo2D<xpu, float>(s);
+  prnd->SampleUniform(&tmp, float(param.low), float(param.high));  // NOLINT(*)
+}
+
+template<typename xpu>
+void SampleNormal_(const EnvArguments& env,
+                   TBlob *ret,
+                   OpReqType req,
+                   RunContext ctx) {
+  using namespace mxnet::op;
+  using namespace mshadow::expr;
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  CHECK_EQ(ret->type_flag_, mshadow::kFloat32)
+      << "only support float32 rnd so far";
+  SampleNormalParam param;
+  param.Init(env.kwargs);
+  mshadow::Random<xpu, float> *prnd = env.resource[0].get_random<xpu, float>(s);
+  mshadow::Tensor<xpu, 2, float> tmp = ret->FlatTo2D<xpu, float>(s);
+  prnd->SampleGaussian(&tmp, float(param.loc), float(param.scale));  // NOLINT(*)
+}
+
+template<typename ParamType>
+inline TShape SampleShape(const EnvArguments& env) {
+  ParamType param;
+  param.Init(env.kwargs);
+  return param.shape;
+}
+
+// sample uniform
+MXNET_REGISTER_SIMPLE_OP(_sample_uniform, XPU)
+.set_symbol_op_name("uniform")
+.set_enable_kwargs(true)
+.set_resource_request(ResourceRequest::kRandom)
+.set_function(XPU::kDevMask, SampleUniform_<XPU>)
+.set_shape_function(SampleShape<SampleUniformParam>)
+.describe("Sample a uniform distribution")
+.add_arguments(SampleUniformParam::__FIELDS__());
+
+// sample normal
+MXNET_REGISTER_SIMPLE_OP(_sample_normal, XPU)
+.set_symbol_op_name("normal")
+.set_enable_kwargs(true)
+.set_resource_request(ResourceRequest::kRandom)
+.set_function(XPU::kDevMask, SampleNormal_<XPU>)
+.set_shape_function(SampleShape<SampleNormalParam>)
+.describe("Sample a normal distribution")
+.add_arguments(SampleNormalParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_SAMPLE_OP_INL_H_
diff --git a/src/operator/sample_op.cc b/src/operator/sample_op.cc
new file mode 100644
index 000000000000..7672563648d8
--- /dev/null
+++ b/src/operator/sample_op.cc
@@ -0,0 +1,16 @@
+/*!
+ *  Copyright (c) 2016 by Contributors
+ * \file sample_op.cc
+ * \brief CPU Implementation of sample op
+ */
+// this will be invoked by cc
+#include "./sample_op-inl.h"
+
+namespace mxnet {
+namespace op {
+
+DMLC_REGISTER_PARAMETER(SampleUniformParam);
+DMLC_REGISTER_PARAMETER(SampleNormalParam);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/sample_op.cu b/src/operator/sample_op.cu
new file mode 100644
index 000000000000..acc8541b4e4f
--- /dev/null
+++ b/src/operator/sample_op.cu
@@ -0,0 +1,7 @@
+/*!
+ *  Copyright (c) 2016 by Contributors
+ * \file sample_op.cu
+ * \brief GPU Implementation of sample op
+ */
+// this will be invoked by nvcc and compile GPU version
+#include "./sample_op-inl.h"
diff --git a/tests/python/unittest/test_random.py b/tests/python/unittest/test_random.py
index 10be569e8f76..3ec5029cc0d3 100644
--- a/tests/python/unittest/test_random.py
+++ b/tests/python/unittest/test_random.py
@@ -23,10 +23,42 @@ def check_with_device(device):
         assert abs(np.mean(un1.asnumpy()) - (a+b)/2) < 0.1
 
 
+def check_symbolic_random(dev):
+    a, b = -10, 10
+    mu, sigma = 10, 2
+    shape = (100, 100)
+    X = mx.sym.Variable("X")
+    Y = mx.sym.uniform(low=a, high=b, shape=shape) + X
+    x = mx.nd.zeros(shape, ctx=dev)
+    xgrad = mx.nd.zeros(shape, ctx=dev)
+    yexec = Y.bind(dev, {'X' : x}, {'X': xgrad})
+    mx.random.seed(128)
+    yexec.forward()
+    yexec.backward(yexec.outputs[0])
+    un1 = (yexec.outputs[0] - x).copyto(dev)
+    assert same(xgrad.asnumpy(), un1.asnumpy())
+    mx.random.seed(128)
+    yexec.forward()
+    un2 = (yexec.outputs[0] - x).copyto(dev)
+    assert same(un1.asnumpy(), un2.asnumpy())
+    assert abs(np.mean(un1.asnumpy()) - (a+b)/2) < 0.1
+
+    Y = mx.sym.normal(loc=mu, scale=sigma, shape=shape)
+    yexec = Y.simple_bind(dev)
+    mx.random.seed(128)
+    yexec.forward()
+    ret1 = yexec.outputs[0].copyto(dev)
+    mx.random.seed(128)
+    ret2 = mx.random.normal(mu, sigma, shape)
+    assert same(ret1.asnumpy(), ret2.asnumpy())
+    assert abs(np.mean(ret1.asnumpy()) - mu) < 0.1
+    assert abs(np.std(ret1.asnumpy()) - sigma) < 0.1
+
+
 def test_random():
     check_with_device(mx.cpu())
+    check_symbolic_random(mx.cpu())
 
 
 if __name__ == '__main__':
     test_random()
-

From a717570e3ea49f0761fef3779d88e0186f8dd6d5 Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Tue, 7 Jun 2016 13:04:29 -0700
Subject: [PATCH 055/126] softmax output label shape

---
 src/operator/softmax_output-inl.h | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/operator/softmax_output-inl.h b/src/operator/softmax_output-inl.h
index 546f70010056..6aa9c395a059 100644
--- a/src/operator/softmax_output-inl.h
+++ b/src/operator/softmax_output-inl.h
@@ -215,8 +215,24 @@ class SoftmaxOutputProp : public OperatorProperty {
     // label.shape == data.shape: use probability as label
     if (dshape != (*in_shape)[softmaxout_enum::kLabel]) {
       if (param_.multi_output) {
-        SHAPE_ASSIGN_CHECK(*in_shape, softmaxout_enum::kLabel,
-                           Shape2(dshape[0], dshape.Size()/dshape[0]/dshape[1]));
+        TShape lshape1 = Shape2(dshape[0], dshape.Size()/dshape[0]/dshape[1]);
+        TShape lshape2(dshape.ndim() - 1);
+        lshape2[0] = dshape[0];
+        for (index_t i = 2; i < dshape.ndim(); ++i)
+          lshape2[i-1] = dshape[i];
+        TShape lshape3 = dshape;
+        lshape3[1] = 1;
+        if (in_shape->at(softmaxout_enum::kLabel).ndim() == 0) {
+          in_shape->at(softmaxout_enum::kLabel) = lshape1;
+        } else if (in_shape->at(softmaxout_enum::kLabel) == lshape1) {
+        } else if (in_shape->at(softmaxout_enum::kLabel) == lshape2) {
+        } else if (in_shape->at(softmaxout_enum::kLabel) == lshape3) {
+        } else {
+          std::ostringstream os;
+          os << "Expecting " << lshape1 << " or " << lshape2
+             << ". But got " << in_shape->at(softmaxout_enum::kLabel);
+          throw InferShapeError(os.str(), softmaxout_enum::kLabel);
+        }
       } else {
         TShape label_shape(dshape.ndim() - 1);
         for (index_t i = 0; i + 1 < dshape.ndim(); ++i)

From 9b6d4d21e1eb7011e8b37261f0046f8f6b894ffc Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Fri, 17 Jun 2016 15:30:26 -0700
Subject: [PATCH 056/126] opencv plugin

---
 plugin/opencv/__init__.py |   6 ++
 plugin/opencv/cv_api.cc   | 149 ++++++++++++++++++++++++++++++++
 plugin/opencv/cv_api.h    |  35 ++++++++
 plugin/opencv/opencv.mk   |   4 +
 plugin/opencv/opencv.py   | 173 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 367 insertions(+)
 create mode 100644 plugin/opencv/__init__.py
 create mode 100644 plugin/opencv/cv_api.cc
 create mode 100644 plugin/opencv/cv_api.h
 create mode 100644 plugin/opencv/opencv.mk
 create mode 100644 plugin/opencv/opencv.py

diff --git a/plugin/opencv/__init__.py b/plugin/opencv/__init__.py
new file mode 100644
index 000000000000..072575177e41
--- /dev/null
+++ b/plugin/opencv/__init__.py
@@ -0,0 +1,6 @@
+# coding: utf-8
+# pylint: disable=wildcard-import
+
+"""Opencv plugin for mxnet"""
+from .opencv import *
+
diff --git a/plugin/opencv/cv_api.cc b/plugin/opencv/cv_api.cc
new file mode 100644
index 000000000000..78bec01548d4
--- /dev/null
+++ b/plugin/opencv/cv_api.cc
@@ -0,0 +1,149 @@
+/*!
+ *  Copyright (c) 2016 by Contributors
+ * \file cv_api.cc
+ * \brief C API for opencv
+ * \author Junyuan Xie
+ */
+#include <dmlc/base.h>
+#include <mxnet/base.h>
+#include <mxnet/ndarray.h>
+#include <opencv2/opencv.hpp>
+#include "cv_api.h"
+#include "../../src/c_api/c_api_error.h"
+
+
+using namespace mxnet;
+// http://www.64lines.com/jpeg-width-height
+// Gets the JPEG size from the array of data passed to the function, file reference: http://www.obrador.com/essentialjpeg/headerinfo.htm
+bool get_jpeg_size(const unsigned char* data, mx_uint data_size, mx_uint *width, mx_uint *height) {
+  // Check for valid JPEG image
+  mx_uint i = 0;  // Keeps track of the position within the file
+  if (data[i] == 0xFF && data[i+1] == 0xD8 && data[i+2] == 0xFF && data[i+3] == 0xE0) {
+    i += 4;
+    // Check for valid JPEG header (null terminated JFIF)
+    if (data[i+2] == 'J' && data[i+3] == 'F' && data[i+4] == 'I'
+        && data[i+5] == 'F' && data[i+6] == 0x00) {
+      // Retrieve the block length of the first block since
+      // the first block will not contain the size of file
+      uint16_t block_length = data[i] * 256 + data[i+1];
+      while (i < data_size) {
+        i+=block_length;  // Increase the file index to get to the next block
+        if (i >= data_size) return false;  // Check to protect against segmentation faults
+        if (data[i] != 0xFF) return false;  // Check that we are truly at the start of another block
+        if (data[i+1] == 0xC0) {
+          // 0xFFC0 is the "Start of frame" marker which contains the image size
+          // The structure of the 0xFFC0 block is quite simple
+          // [0xFFC0][ushort length][uchar precision][ushort height][ushort width]
+          *height = data[i+5]*256 + data[i+6];
+          *width = data[i+7]*256 + data[i+8];
+          return true;
+        } else {
+          i+=2;  // Skip the block marker
+          block_length = data[i] * 256 + data[i+1];  // Go to the next block
+        }
+      }
+      return false;  // If this point is reached then no size was found
+    } else {
+      return false;  // Not a valid JFIF string
+    }
+  } else {
+    return false;  // Not a valid SOI header
+  }
+}
+
+bool get_png_size(const unsigned char* data, mx_uint data_size, mx_uint *width, mx_uint *height) {
+  if (data[0] == 0x89 && data[1] == 0x50 && data[2] ==0x4E && data[3] == 0x47) {
+    unsigned char const* p = data + 16;
+    *width = ((p[0]*256 + p[1])*256 + p[2])*256 + p[3];
+    p += 4;
+    *height = ((p[0]*256 + p[1])*256 + p[2])*256 + p[3];
+    return true;
+  } else {
+    return false;
+  }
+}
+
+MXNET_DLL int MXCVImdecode(const unsigned char *img, const mx_uint len,
+                           const int flag, NDArrayHandle *out) {
+  API_BEGIN();
+  mx_uint dims[3];
+  CHECK_GE(flag, 0) << "flag must be 0 (grayscale) or 1 (colored).";
+  dims[2] = flag == 0 ? 1 : 3;
+  if (get_jpeg_size(img, len, dims+1, dims)) {
+  } else if (get_png_size(img, len, dims+1, dims)) {
+  } else {
+    LOG(FATAL) << "Only supports png and jpg.";
+  }
+  NDArray ndout(TShape(dims, dims+3), Context::CPU(), true, mshadow::kUint8);
+  unsigned char *img_cpy = new unsigned char[len];
+  memcpy(img_cpy, img, sizeof(unsigned char)*len);
+  Engine::Get()->PushSync([=](RunContext ctx){
+      ndout.CheckAndAlloc();
+      cv::Mat buf(1, len, CV_8U, img_cpy);
+      cv::Mat dst(dims[0], dims[1], flag == 0 ? CV_8U : CV_8UC3, ndout.data().dptr_);
+      cv::imdecode(buf, flag, &dst);
+      CHECK(!dst.empty());
+      delete[] img_cpy;
+    }, ndout.ctx(), {}, {ndout.var()});
+  NDArray *tmp = new NDArray();
+  *tmp = ndout;
+  *out = tmp;
+  API_END();
+}
+
+
+MXNET_DLL int MXCVResize(NDArrayHandle src, const mx_uint w, const mx_uint h,
+                         const int interpolation, NDArrayHandle *out) {
+  API_BEGIN();
+  NDArray ndsrc = *static_cast<NDArray*>(src);
+  CHECK_EQ(ndsrc.shape().ndim(), 3);
+  CHECK_EQ(ndsrc.ctx(), Context::CPU());
+  CHECK_EQ(ndsrc.dtype(), mshadow::kUint8);
+
+  mx_uint dims[3] = {h, w, ndsrc.shape()[2]};
+  NDArray ndout(TShape(dims, dims+3), Context::CPU(), true, mshadow::kUint8);
+
+  Engine::Get()->PushSync([=](RunContext ctx){
+      ndout.CheckAndAlloc();
+      cv::Mat buf(ndsrc.shape()[0], ndsrc.shape()[1],
+                  dims[2] == 3 ? CV_8UC3 : CV_8U, ndsrc.data().dptr_);
+      cv::Mat dst(h, w, dims[2] == 3 ? CV_8UC3 : CV_8U, ndout.data().dptr_);
+      cv::resize(buf, dst, cv::Size(w, h), 0, 0, interpolation);
+      CHECK(!dst.empty());
+    }, ndout.ctx(), {ndsrc.var()}, {ndout.var()});
+  NDArray *tmp = new NDArray();
+  *tmp = ndout;
+  *out = tmp;
+  API_END();
+}
+
+MXNET_DLL int MXCVcopyMakeBorder(NDArrayHandle src,
+                                 const int top,
+                                 const int bot,
+                                 const int left,
+                                 const int right,
+                                 const int type,
+                                 const double value,
+                                 NDArrayHandle *out) {
+  API_BEGIN();
+  NDArray ndsrc = *static_cast<NDArray*>(src);
+  CHECK_EQ(ndsrc.shape().ndim(), 3);
+  CHECK_EQ(ndsrc.ctx(), Context::CPU());
+  CHECK_EQ(ndsrc.dtype(), mshadow::kUint8);
+
+  int h = ndsrc.shape()[0], w = ndsrc.shape()[1], c = ndsrc.shape()[2];
+  mx_uint dims[3] = {top+h+bot, left+w+right, c};
+  NDArray ndout(TShape(dims, dims+3), Context::CPU(), true, mshadow::kUint8);
+
+  Engine::Get()->PushSync([=](RunContext ctx){
+      ndout.CheckAndAlloc();
+      cv::Mat buf(h, w, c == 3 ? CV_8UC3 : CV_8U, ndsrc.data().dptr_);
+      cv::Mat dst(top+h+bot, left+w+right, c == 3 ? CV_8UC3 : CV_8U, ndout.data().dptr_);
+      cv::copyMakeBorder(buf, dst, top, bot, left, right, type, cv::Scalar(value));
+      CHECK(!dst.empty());
+    }, ndout.ctx(), {ndsrc.var()}, {ndout.var()});
+  NDArray *tmp = new NDArray();
+  *tmp = ndout;
+  *out = tmp;
+  API_END();
+}
diff --git a/plugin/opencv/cv_api.h b/plugin/opencv/cv_api.h
new file mode 100644
index 000000000000..fc224d0e1d05
--- /dev/null
+++ b/plugin/opencv/cv_api.h
@@ -0,0 +1,35 @@
+/*!
+ *  Copyright (c) 2016 by Contributors
+ * \file cv_api.h
+ * \brief C API for opencv
+ * \author Junyuan Xie
+ */
+#ifndef PLUGIN_OPENCV_CV_API_H_
+#define PLUGIN_OPENCV_CV_API_H_
+
+#include <mxnet/c_api.h>
+
+MXNET_DLL int MXCVImdecode(
+  const unsigned char *img,
+  const mx_uint len,
+  const int flag,
+  NDArrayHandle *out);
+
+MXNET_DLL int MXCVResize(
+  NDArrayHandle src,
+  const mx_uint w,
+  const mx_uint h,
+  const int interpolation,
+  NDArrayHandle *out);
+
+MXNET_DLL int MXCVcopyMakeBorder(
+  NDArrayHandle src,
+  const int top,
+  const int bot,
+  const int left,
+  const int right,
+  const int type,
+  const double value,
+  NDArrayHandle *out);
+
+#endif  // PLUGIN_OPENCV_CV_API_H_
diff --git a/plugin/opencv/opencv.mk b/plugin/opencv/opencv.mk
new file mode 100644
index 000000000000..ab1f6ff2ee03
--- /dev/null
+++ b/plugin/opencv/opencv.mk
@@ -0,0 +1,4 @@
+OPENCV_SRC = $(wildcard plugin/opencv/*.cc)
+PLUGIN_OBJ += $(patsubst %.cc, build/%.o, $(OPENCV_SRC))
+OPENCV_CUSRC = $(wildcard plugin/opencv/*.cu)
+PLUGIN_CUOBJ += $(patsubst %.cu, build/%_gpu.o, $(OPENCV_CUSRC))
diff --git a/plugin/opencv/opencv.py b/plugin/opencv/opencv.py
new file mode 100644
index 000000000000..6ee5be13f643
--- /dev/null
+++ b/plugin/opencv/opencv.py
@@ -0,0 +1,173 @@
+# coding: utf-8
+# pylint: disable=too-many-arguments,no-member,invalid-name
+
+"""Opencv plugin for mxnet"""
+import random
+import ctypes
+import cv2
+import mxnet as mx
+from mxnet.base import _LIB
+from mxnet.base import mx_uint, NDArrayHandle, check_call
+
+def imdecode(str_img, flag=1):
+    """Decode image from str buffer.
+    Wrapper for cv2.imdecode that uses mx.nd.NDArray
+
+    Parameters
+    ----------
+    str_img : str
+        str buffer read from image file
+    flag : int
+        same as flag for cv2.imdecode
+    Returns
+    -------
+    img : NDArray
+        decoded image in (height, width, channels)
+        with BGR color channel order
+    """
+    hdl = NDArrayHandle()
+    check_call(_LIB.MXCVImdecode(ctypes.c_char_p(str_img),
+                                 mx_uint(len(str_img)),
+                                 flag, ctypes.byref(hdl)))
+    return mx.nd.NDArray(hdl)
+
+def resize(src, size, interpolation=cv2.INTER_LINEAR):
+    """Resize image to the given size.
+    Wrapper for cv2.resize that uses mx.nd.NDArray
+
+    Parameters
+    ----------
+    src : NDArray
+        image in (height, width, channels)
+    size : tuple
+        target size in (width, height)
+    interpolation : int
+        same as interpolation for cv2.resize
+
+    Returns
+    -------
+    img : NDArray
+        resized image
+    """
+    hdl = NDArrayHandle()
+    check_call(_LIB.MXCVResize(src.handle, mx_uint(size[0]), mx_uint(size[1]),
+                               interpolation, ctypes.byref(hdl)))
+    return mx.nd.NDArray(hdl)
+
+def copyMakeBorder(src, top, bot, left, right, border_type=cv2.BORDER_CONSTANT, value=0):
+    """Pad image border
+    Wrapper for cv2.copyMakeBorder that uses mx.nd.NDArray
+
+    Parameters
+    ----------
+    src : NDArray
+        Image in (height, width, channels).
+        Others are the same with cv2.copyMakeBorder
+
+    Returns
+    -------
+    img : NDArray
+        padded image
+    """
+    hdl = NDArrayHandle()
+    check_call(_LIB.MXCVcopyMakeBorder(src.handle, ctypes.c_int(top), ctypes.c_int(bot),
+                                       ctypes.c_int(left), ctypes.c_int(right),
+                                       ctypes.c_int(border_type), ctypes.c_double(value),
+                                       ctypes.byref(hdl)))
+    return mx.nd.NDArray(hdl)
+
+
+def scale_down(src_size, size):
+    """Scale down crop size if it's bigger than image size"""
+    w, h = size
+    sw, sh = src_size
+    if sh < h:
+        w, h = float(w*sh)/h, sh
+    if sw < w:
+        w, h = sw, float(h*sw)/w
+    return int(w), int(h)
+
+def fixed_crop(src, x0, y0, w, h, size=None, interpolation=cv2.INTER_CUBIC):
+    """Crop src at fixed location, and (optionally) resize it to size"""
+    out = mx.nd.crop(src, begin=(y0, x0, 0), end=(y0+h, x0+w, int(src.shape[2])))
+    if size is not None and (w, h) != size:
+        out = resize(out, size, interpolation=interpolation)
+    return out
+
+def random_crop(src, size):
+    """Randomly crop src with size. Upsample result if src is smaller than size"""
+    h, w, _ = src.shape
+    new_w, new_h = scale_down((w, h), size)
+
+    x0 = random.randint(0, w - new_w)
+    y0 = random.randint(0, h - new_h)
+
+    out = fixed_crop(src, x0, y0, new_w, new_h, size)
+    return out, (x0, y0, new_w, new_h)
+
+def color_normalize(src, mean, std):
+    """Normalize src with mean and std"""
+    src -= mean
+    src /= std
+    return src
+
+def random_size_crop(src, size, min_area=0.25, ratio=(3.0/4.0, 4.0/3.0)):
+    """Randomly crop src with size. Randomize area and aspect ratio"""
+    h, w, _ = src.shape
+    area = w*h
+    for _ in range(10):
+        new_area = random.uniform(min_area, 1.0) * area
+        new_ratio = random.uniform(*ratio)
+        new_w = int(new_area*new_ratio)
+        new_h = int(new_area/new_ratio)
+
+        if random.uniform(0., 1.) < 0.5:
+            new_w, new_h = new_h, new_w
+
+        if new_w > w or new_h > h:
+            continue
+
+        x0 = random.randint(0, w - new_w)
+        y0 = random.randint(0, h - new_h)
+
+        out = fixed_crop(src, x0, y0, new_w, new_h, size)
+        return out, (x0, y0, new_w, new_h)
+
+    return random_crop(src, size)
+
+class ImageListIter(mx.io.DataIter):
+    """An example image iterator using opencv plugin"""
+    def __init__(self, root, flist, batch_size, size, mean=None):
+        super(ImageListIter, self).__init__()
+        self.root = root
+        self.list = [line.strip() for line in open(flist).readlines()]
+        self.cur = 0
+        self.batch_size = batch_size
+        self.size = size
+        if mean is not None:
+            self.mean = mx.nd.array(mean)
+        else:
+            self.mean = None
+
+    def reset(self):
+        self.cur = 0
+
+    def next(self):
+        batch = mx.nd.zeros((self.batch_size, self.size[1], self.size[0], 3))
+        i = self.cur
+        for i in range(self.cur, min(len(self.list), self.cur+self.batch_size)):
+            str_img = open(self.root+self.list[i]+'.jpg').read()
+            img = imdecode(str_img, 1)
+            img, _ = random_crop(img, self.size)
+            batch[i - self.cur] = img
+        batch = mx.nd.transpose(batch, axes=(0, 3, 1, 2))
+        ret = mx.io.DataBatch(data=[batch],
+                              label=[],
+                              pad=self.batch_size-(i-self.cur),
+                              index=None)
+        self.cur = i
+        return ret
+
+
+
+

From f10e709e6ea84c91d7a3134e43d0d0c63086452c Mon Sep 17 00:00:00 2001
From: Xingjian Shi <xshiab@ust.hk>
Date: Fri, 24 Jun 2016 09:30:50 +0800
Subject: [PATCH 057/126] Sum with axis full support (#2518)

* Add full support of sum (multiple axes + keepdims)

* Fix style

* Fix lint
---
 mshadow                                   |   2 +-
 python/mxnet/ndarray.py                   | 119 --------
 python/mxnet/symbol.py                    |  74 -----
 src/operator/broadcast_reduce_op-inl.h    | 331 ++++++++++------------
 src/operator/broadcast_reduce_op_common.h |  25 +-
 tests/python/unittest/test_ndarray.py     |   8 +-
 tests/python/unittest/test_operator.py    |  19 +-
 7 files changed, 197 insertions(+), 381 deletions(-)

diff --git a/mshadow b/mshadow
index 948abff7d748..e41ae71f7096 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit 948abff7d748ae4f83c6ce194ec0174db4858e9a
+Subproject commit e41ae71f7096f4b3592c30786328f95ad0eb6dd0
diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py
index e8ef02bfdb8c..f68fbe08e023 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray.py
@@ -835,125 +835,6 @@ def ones(shape, ctx=None, dtype=mx_real_t):
     arr[:] = 1.0
     return arr
 
-# pylint: disable=too-many-locals, invalid-name, no-member, protected-access, undefined-variable
-# pylint: disable=too-many-branches
-def _reduce(arr, axis=None, keepdims=False, typ='sum'):
-    """ Reduce the array along given axises. The semantic strictly follows numpy's document.
-
-    Parameters
-    ----------
-    arr : Array
-        the array to be reduced
-    axis : int or list(int), optional
-        along which axis to do reduction
-    keepdims : bool
-        whether the reduced axis should be kept in the final shape
-
-    Returns
-    -------
-    out: Array
-        The reduced NDArray.
-    """
-    if typ == 'sum':
-        reduce_func = sum_axis
-    elif typ == 'max':
-        reduce_func = max_axis
-    elif typ == 'min':
-        reduce_func = min_axis
-    else:
-        raise TypeError('typ=\'%s\' is not supported.' % typ)
-    ndim = len(arr.shape)
-    if axis is None:
-        axis = list(range(ndim))
-    elif isinstance(axis, int):
-        axis = [axis]
-    elif isinstance(axis, tuple) or isinstance(axis, list):
-        axis = list(axis)
-    else:
-        raise TypeError('\'%s\' object is not supported as axis.' % type(axis).__name__)
-
-    if list(range(ndim)) == axis:
-        ret = reduce_func(arr, axis=-1, keepdims=keepdims)
-        if not keepdims:
-            return ret.asnumpy()[0]
-        else:
-            return ret
-    for i in axis:
-        if not isinstance(i, int):
-            raise TypeError('\'%s\' object cannot be interpreted as an integer' % type(i).__name__)
-    axis = sorted([x if x >= 0 else x + ndim for x in axis])
-    for i in axis:
-        if i < 0 or ndim <= i:
-            raise ValueError('\'axis\' entry is out of bounds')
-    if len(set(axis)) != len(axis):
-        raise ValueError('duplicate value in \'axis\'')
-    assert(len(axis) != 0)
-    ret = arr
-    for i in reversed(axis):
-        ret = reduce_func(ret, axis=i, keepdims=keepdims)
-    return ret
-# pylint: enable=too-many-locals, invalid-name, no-member, protected-access, undefined-variable
-# pylint: enable=too-many-branches
-
-def sum(arr, axis=None, keepdims=False):
-    """ Sum the array along given axises. The semantic strictly follows numpy's document.
-
-    Parameters
-    ----------
-    arr : Array
-        the array to be reduced
-    axis : int or list(int), optional
-        along which axis to do reduction
-    keepdims : bool
-        whether the reduced axis should be kept in the final shape
-
-    Returns
-    -------
-    out: Array
-        The reduced NDArray.
-    """
-    return _reduce(arr=arr, axis=axis, keepdims=keepdims, typ='sum')
-
-def max(arr, axis=None, keepdims=False):
-    """ Take the maximum of the array along given axises.
-    The semantic strictly follows numpy's document.
-
-    Parameters
-    ----------
-    arr : Array
-        the array to be reduced
-    axis : int or list(int), optional
-        along which axis to do reduction
-    keepdims : bool
-        whether the reduced axis should be kept in the final shape
-
-    Returns
-    -------
-    out: Array
-        The reduced NDArray.
-    """
-    return _reduce(arr=arr, axis=axis, keepdims=keepdims, typ='max')
-
-def min(arr, axis=None, keepdims=False):
-    """ Take the minimum of the array along given axises.
-    The semantic strictly follows numpy's document.
-
-    Parameters
-    ----------
-    arr : Array
-        the array to be reduced
-    axis : int or list(int), optional
-        along which axis to do reduction
-    keepdims : bool
-        whether the reduced axis should be kept in the final shape
-
-    Returns
-    -------
-    out: Array
-        The reduced NDArray.
-    """
-    return _reduce(arr=arr, axis=axis, keepdims=keepdims, typ='min')
-
 def full(shape, val, ctx=None):
     """Create a new NDArray filled with given value, with specified shape.
 
diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py
index 2577b8f65c30..fb0c3a5875f1 100644
--- a/python/mxnet/symbol.py
+++ b/python/mxnet/symbol.py
@@ -1129,80 +1129,6 @@ def pow(base, exp):
         raise TypeError('types (%s, %s) not supported' % (str(type(base)), str(type(exp))))
 
 
-# pylint: disable= undefined-variable, too-many-branches
-def _reduce(data, axis=None, keepdims=False, name=None, typ='sum'):
-    """ Reduce the array along given axis. The semantic strictly follows numpy's document.
-
-    Parameters
-    ----------
-    data : Symbol
-        the array to be reduced
-    axis : int or list(int), optional
-        along which axis to do reduction
-    keepdims : bool
-        whether the reduced axis should be kept in the final shape
-
-    Returns
-    -------
-    out: Symbol
-        Symbol represents the reduced Array.
-    """
-    if 'sum' == typ:
-        reduce_func = sum_axis
-    else:
-        raise TypeError('typ=\'%s\' is not supported.' % typ)
-    if axis is None:
-        ret = reduce_func(data, axis=-1, keepdims=keepdims, name=name)
-        return ret
-    elif isinstance(axis, int):
-        axis = [axis]
-    elif isinstance(axis, tuple) or isinstance(axis, list):
-        axis = list(axis)
-    else:
-        raise TypeError('\'%s\' object is not supported as axis.' % type(axis).__name__)
-
-    for i in axis:
-        if not isinstance(i, int):
-            raise TypeError('\'%s\' object cannot be interpreted as an integer' % type(i).__name__)
-    axis = sorted(axis)
-    for i in axis:
-        if i < 0:
-            raise ValueError('\'axis\' entry is out of bounds')
-    if len(set(axis)) != len(axis):
-        raise ValueError('duplicate value in \'axis\'')
-    assert (len(axis) != 0)
-    ret = data
-    for (i, ele) in enumerate(reversed(axis)):
-        if i == (len(axis) - 1):
-            ret = reduce_func(ret, axis=ele, keepdims=keepdims, name=name)
-        else:
-            ret = reduce_func(ret, axis=ele, keepdims=keepdims)
-    return ret
-# pylint: enable= undefined-variable, too-many-branches
-
-
-def sum(data, axis=None, keepdims=False, name=None):
-    """ Calculate the sum of the array along given axis.
-    The semantic strictly follows numpy's document.
-
-    Parameters
-    ----------
-    data : Symbol
-        the array to be reduced
-    axis : int or list(int), optional
-        along which axis to do reduction
-    keepdims : bool
-        whether the reduced axis should be kept in the final shape
-
-    Returns
-    -------
-    out: Symbol
-        Symbol represents the reduced Array.
-    """
-    return _reduce(data=data, axis=axis, keepdims=keepdims, name=name, typ='sum')
-
-
-
 # pylint: disable=no-member
 # pylint: disable=redefined-builtin
 def maximum(left, right):
diff --git a/src/operator/broadcast_reduce_op-inl.h b/src/operator/broadcast_reduce_op-inl.h
index ba6d08320053..e9c0bd31fbe4 100644
--- a/src/operator/broadcast_reduce_op-inl.h
+++ b/src/operator/broadcast_reduce_op-inl.h
@@ -9,6 +9,7 @@
 #include <mxnet/operator_util.h>
 #include <vector>
 #include "./mshadow_op.h"
+#include "./broadcast_reduce_op_common.h"
 
 #if defined(__CUDACC__)
 #define XPU gpu
@@ -21,10 +22,11 @@ namespace op {
 
 struct ReduceAxisParam : public dmlc::Parameter<ReduceAxisParam> {
   bool keepdims;
-  int axis;
+  TShape axis;
   DMLC_DECLARE_PARAMETER(ReduceAxisParam) {
-    DMLC_DECLARE_FIELD(axis).set_default(-1).set_lower_bound(-1)
-      .describe("The axis to perform the reduction. axis=-1 means to reduce all dimensions");
+    DMLC_DECLARE_FIELD(axis).set_default(TShape())
+      .describe("Same as Numpy. The axes to perform the reduction."
+                "If left empty, a global reduction will be performed.");
     DMLC_DECLARE_FIELD(keepdims).set_default(false)
       .describe("Same as Numpy. If keepdims is set to true, "
       "the axis which is reduced is left in the result as dimension with size one.");
@@ -32,13 +34,13 @@ struct ReduceAxisParam : public dmlc::Parameter<ReduceAxisParam> {
 };
 
 struct BroadcastAxisParam : public dmlc::Parameter<BroadcastAxisParam> {
-  int axis;
-  int size;
+  TShape axis;
+  TShape size;
   DMLC_DECLARE_PARAMETER(BroadcastAxisParam) {
-    DMLC_DECLARE_FIELD(axis).set_default(0).set_lower_bound(0)
-      .describe("The target axis of broadcasting.");
-    DMLC_DECLARE_FIELD(size).set_default(0).set_lower_bound(1)
-      .describe("Size of the broadcasting axis.");
+    DMLC_DECLARE_FIELD(axis).set_default(TShape())
+      .describe("The axes to perform the broadcasting.");
+    DMLC_DECLARE_FIELD(size).set_default(TShape())
+      .describe("Target sizes of the broadcasting axes.");
   }
 };
 
@@ -46,26 +48,24 @@ inline TShape ReduceAxisShape(const TShape& ishape,
   const EnvArguments& env) {
   ReduceAxisParam param;
   param.Init(env.kwargs);
-  CHECK(param.axis < static_cast<int>(ishape.ndim()) || -1 == param.axis) <<
-    "axis must be smaller than the source ndim or equal to -1! Received axis=" <<
-    param.axis << ", src_ndim=" << ishape.ndim();
-  if (param.axis == -1 || (1 == ishape.ndim())) {
-    if (param.keepdims) {
-      return TShape(ishape.ndim());
-    } else {
-      return TShape(1);
+  std::vector<index_t> axes = ParseAxes_(param.axis, ishape.ndim());
+  if (axes.size() == 0) {
+    for (index_t i = 0; i < ishape.ndim(); ++i) {
+      axes.push_back(i);
     }
   }
   std::vector<mshadow::index_t> shape;
   for (index_t i = 0; i < ishape.ndim(); ++i) {
-    if (static_cast<int>(i) == param.axis) {
-      if (param.keepdims) {
-        shape.push_back(1);
-      }
-    } else {
+    if (!std::binary_search(axes.begin(), axes.end(), i)) {
       shape.push_back(ishape[i]);
+    } else if (param.keepdims) {
+      shape.push_back(1);
     }
   }
+  // We need to treat the global reduction case specially to avoid an empty output TShape.
+  if (shape.size() == 0) {
+    shape.push_back(1);
+  }
   return TShape(shape.begin(), shape.end());
 }
 
@@ -73,20 +73,15 @@ inline TShape BroadcastAxisShape(const TShape& ishape,
   const EnvArguments& env) {
   BroadcastAxisParam param;
   param.Init(env.kwargs);
-  CHECK(param.axis < static_cast<int>(ishape.ndim())) <<
-    "axis must be smaller than the source ndim" << param.axis << ", src_ndim=" << ishape.ndim();
-  CHECK_EQ(ishape[param.axis], 1) <<
-    "Size of the broadcasting axis in the source must be 1, axis=" << param.axis
-    << ", size=" << ishape[param.axis];
-  std::vector<mshadow::index_t> shape;
-  for (index_t i = 0; i < ishape.ndim(); ++i) {
-    if (static_cast<int>(i) != param.axis) {
-      shape.push_back(ishape[i]);
-    } else {
-      shape.push_back(param.size);
-    }
+  CHECK_EQ(param.axis.ndim(), param.size.ndim());
+  TShape ret = ishape;
+  for (index_t i = 0; i < param.axis.ndim(); i++) {  // validate each axis before indexing
+    CHECK_LT(param.axis[i], ishape.ndim()) << "axis out of range, axis=" << param.axis;
+    CHECK_EQ(ishape[param.axis[i]], 1) << "Size of the broadcasting axis in the source "
+      "must be 1, axis=" << param.axis << ", size=" << param.size;
+    ret[param.axis[i]] = param.size[i];
+  }
-  return TShape(shape.begin(), shape.end());
+  return ret;
 }
 
 // return a shape of scalar
@@ -184,75 +179,38 @@ inline TShape ReduceChannelShape(const TShape& ishape,
 }
 
 // Reduce the given axis
-template<typename xpu, typename Reducer, bool get_mask>
+template<typename xpu, typename Reducer>
 void ReduceAxisImpl_(const TBlob &src,
                      const EnvArguments& env,
                      TBlob *ret,
                      OpReqType req,
                      RunContext ctx,
-                     int axis,
-                     bool keepdims) {
+                     TShape axes) {
+  using namespace mshadow;
   using namespace mshadow::expr;
-  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  Stream<xpu> *s = ctx.get_stream<xpu>();
   CHECK_EQ(src.type_flag_, ret->type_flag_);
-  if (-1 == axis) {
-    // Reduce all dimensions if axis == -1
-    MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, {
-      mshadow::Tensor<xpu, 2, DType> in =
-        src.get_with_shape<xpu, 2, DType>(mshadow::Shape2(src.shape_.Size(), 1), s);
-      mshadow::Tensor<xpu, 1, DType> out =
-        ret->get_with_shape<xpu, 1, DType>(mshadow::Shape1(ret->shape_.Size()), s);
-      ASSIGN_DISPATCH(out, req, (reduce_except_dim<1, Reducer>(in)));
-    });
-    return;
-  }
-  int trailing = 1;
-  int leading = 1;
-  for (int i = 0; i < src.shape_.ndim(); ++i) {
-    if (i < axis) {
-      leading *= src.shape_[i];
-    } else if (i > axis) {
-      trailing *= src.shape_[i];
-    }
-  }
-  if (get_mask) {
-    // If get_mask is on, we have to use the slower `reduce_with_axis`
-    // since reduce_except_dim does not support the flag.
+  bool is_contiguous_axes;
+  index_t reducing_size;
+  CheckContiguousAxes_(&is_contiguous_axes, &reducing_size, axes, src.shape_);
+  if (is_contiguous_axes) {
     MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, {
-      mshadow::Tensor<xpu, 3, DType> in =
-        src.get_with_shape<xpu, 3, DType>(mshadow::Shape3(leading, src.shape_[axis], trailing), s);
-      mshadow::Tensor<xpu, 2, DType> out =
-        ret->get_with_shape<xpu, 2, DType>(mshadow::Shape2(leading, trailing), s);
-      CHECK(req != kAddTo) << "AddTo is not supported for `get_mask = true`";
-      ASSIGN_DISPATCH(out, req, (reduce_with_axis<Reducer, true>(in, 1)));
-    });
-    return;
-  }
-  if (1 == leading) {
-    MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, {
-      mshadow::Tensor<xpu, 2, DType> in =
-        src.get_with_shape<xpu, 2, DType>(mshadow::Shape2(src.shape_[axis], trailing), s);
-      mshadow::Tensor<xpu, 1, DType> out =
-        ret->get_with_shape<xpu, 1, DType>(mshadow::Shape1(trailing), s);
-      ASSIGN_DISPATCH(out, req, (reduce_except_dim<1, Reducer>(in)));
-    });
-  } else if (1 == trailing) {
-    MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, {
-      mshadow::Tensor<xpu, 2, DType> in =
-        src.get_with_shape<xpu, 2, DType>(mshadow::Shape2(leading, src.shape_[axis]), s);
-      mshadow::Tensor<xpu, 1, DType> out =
-        ret->get_with_shape<xpu, 1, DType>(mshadow::Shape1(leading), s);
-      ASSIGN_DISPATCH(out, req, (reduce_except_dim<1, Reducer>(in.T())));
+      Tensor<xpu, 3, DType> in = src.FlatTo3D<xpu, DType>(axes[0], axes[axes.ndim() - 1], s);
+      Tensor<xpu, 1, DType> out =
+        ret->get_with_shape<xpu, 1, DType>(mshadow::Shape1(ret->Size()), s);
+      ReduceAxesAssign<Reducer>(out, req, TShape(1), in);
     });
   } else {
+    Shape<MXNET_SPECIAL_MAX_NDIM> padded_shape_;
+    for (index_t i = 0; i < MXNET_SPECIAL_MAX_NDIM; ++i) {
+      padded_shape_[i] = (i < src.ndim()) ? src.shape_[i] : 1;
+    }
     MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, {
-      mshadow::Tensor<xpu, 3, DType> in =
-        src.get_with_shape<xpu, 3, DType>(mshadow::Shape3(leading, src.shape_[axis], trailing), s);
-      mshadow::Tensor<xpu, 1, DType> out =
-        ret->get_with_shape<xpu, 1, DType>(mshadow::Shape1(leading * trailing), s);
-      ASSIGN_DISPATCH(out, req,
-        (reduce_except_dim<1, Reducer>(reshape(swapaxis<1, 0>(in),
-        mshadow::Shape2(src.shape_[axis], leading * trailing)))));
+      Tensor<xpu, MXNET_SPECIAL_MAX_NDIM, DType> in =
+        src.get_with_shape<xpu, MXNET_SPECIAL_MAX_NDIM, DType>(padded_shape_, s);
+      Tensor<xpu, 1, DType> out =
+        ret->get_with_shape<xpu, 1, DType>(mshadow::Shape1(ret->Size()), s);
+      ReduceAxesAssign<Reducer>(out, req, axes, in);
     });
   }
 }
@@ -264,41 +222,46 @@ void BroadcastAxisImpl_(const TBlob &src,
   TBlob *ret,
   OpReqType req,
   RunContext ctx,
-  int axis,
-  int bsize,
-  bool keepdims) {
+  const TShape &axes,
+  const TShape &bsizes) {
+  using namespace mshadow;
   using namespace mshadow::expr;
-  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  if (axis == -1) {
-    MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
-      mshadow::Tensor<xpu, 1, DType> in =
-        src.get_with_shape<xpu, 1, DType>(mshadow::Shape1(src.shape_.Size()), s);
-      mshadow::Tensor<xpu, 2, DType> out = ret->FlatTo2D<xpu, DType>(s);
-      ASSIGN_DISPATCH(out, req,
-        broadcast_scalar(in, out.shape_));
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  CHECK_EQ(src.type_flag_, ret->type_flag_);
+  bool is_contiguous_axes;
+  index_t broadcasting_size;
+  CheckContiguousAxes_(&is_contiguous_axes, &broadcasting_size, axes, ret->shape_);
+  if (is_contiguous_axes) {
+    MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, {
+      Tensor<xpu, 3, DType> out = ret->FlatTo3D<xpu, DType>(axes[0], axes[axes.ndim() - 1], s);
+      Tensor<xpu, 3, DType> in =
+        src.get_with_shape<xpu, 3, DType>(Shape3(out.shape_[0], 1, out.shape_[2]), s);
+      ASSIGN_DISPATCH(out, req, broadcast_keepdim(in, 1, broadcasting_size));
     });
-    return;
-  }
-  int trailing = 1;
-  int leading = 1;
-  for (int i = 0; i < ret->shape_.ndim(); ++i) {
-    if (i < axis) {
-      leading *= ret->shape_[i];
-    } else if (i > axis) {
-      trailing *= ret->shape_[i];
+  } else {
+    CHECK(ret->ndim() <= MXNET_SPECIAL_MAX_NDIM) << "non-contiguous axis supports ndim up to "
+                                                 << MXNET_SPECIAL_MAX_NDIM;
+    Shape<MXNET_SPECIAL_MAX_NDIM> padded_src_shape_;
+    Shape<MXNET_SPECIAL_MAX_NDIM> padded_ret_shape_;
+    for (index_t i = 0; i < MXNET_SPECIAL_MAX_NDIM; ++i) {
+      padded_ret_shape_[i] = (i < ret->ndim()) ? ret->shape_[i] : 1;
+    }
+    padded_src_shape_ = padded_ret_shape_;
+    for (index_t i = 0; i < axes.ndim(); ++i) {
+      padded_src_shape_[axes[i]] = 1;
     }
+    MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, {
+      Tensor<xpu, MXNET_SPECIAL_MAX_NDIM, DType> in =
+        src.get_with_shape<xpu, MXNET_SPECIAL_MAX_NDIM, DType>(padded_src_shape_, s);
+      Tensor<xpu, MXNET_SPECIAL_MAX_NDIM, DType> out =
+        ret->get_with_shape<xpu, MXNET_SPECIAL_MAX_NDIM, DType>(padded_ret_shape_, s);
+      ASSIGN_DISPATCH(out, req, broadcast_multi_axes(in, axes, bsizes));
+    });
   }
-  MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
-    mshadow::Tensor<xpu, 2, DType> in =
-    src.get_with_shape<xpu, 2, DType>(mshadow::Shape2(leading, trailing), s);
-    mshadow::Tensor<xpu, 3, DType> out =
-      ret->get_with_shape<xpu, 3, DType>(mshadow::Shape3(leading, bsize, trailing), s);
-    ASSIGN_DISPATCH(out, req, broadcast_with_axis(in, 0, bsize));
-  });
 }
 
 // Forward pass of reduce over the given axis
-template<typename xpu, typename Reducer, bool get_mask>
+template<typename xpu, typename Reducer>
 void ReduceAxis(const TBlob &src,
   const EnvArguments& env,
   TBlob *ret,
@@ -307,10 +270,14 @@ void ReduceAxis(const TBlob &src,
   using namespace mshadow::expr;
   ReduceAxisParam param;
   param.Init(env.kwargs);
-  CHECK(param.axis < src.shape_.ndim() || -1 == param.axis) <<
-    "axis must be smaller than the source ndim or equals to -1!"
-    " Received axis=" << param.axis << ", src_ndim=" << src.shape_.ndim();
-  ReduceAxisImpl_<xpu, Reducer, get_mask>(src, env, ret, req, ctx, param.axis, param.keepdims);
+  std::vector<index_t> axes = ParseAxes_(param.axis, src.ndim());
+  if (axes.size() == 0) {
+    for (index_t i = 0; i < src.ndim(); i++) {
+      axes.push_back(i);
+    }
+  }
+  ReduceAxisImpl_<xpu, Reducer>(src, env, ret, req, ctx,
+                                TShape(axes.begin(), axes.end()));
 }
 
 // Backward pass of reduce over the given axis
@@ -324,17 +291,18 @@ void SumAxisGrad_(const OutputGrad& out_grad,
   using namespace mshadow::expr;
   ReduceAxisParam param;
   param.Init(env.kwargs);
-  CHECK(param.axis < in_grad->shape_.ndim() || param.axis == -1) <<
-    "axis must be smaller than the input grad ndim or equals to -1."
-    " Received axis=" << param.axis << ", igrad_ndim=" << in_grad->shape_.ndim();
-  CHECK_EQ(in_grad->type_flag_, out_grad.data.type_flag_)
-    << "Unary function only support input/output with the same type";
-  if (-1 == param.axis) {
-    BroadcastAxisImpl_<xpu>(out_grad.data, env, in_grad, req, ctx, param.axis, 0, param.keepdims);
-  } else {
-    BroadcastAxisImpl_<xpu>(out_grad.data, env, in_grad, req, ctx, param.axis,
-      in_grad->shape_[param.axis], param.keepdims);
+  std::vector<index_t> axes = ParseAxes_(param.axis, in_grad->ndim());
+  if (axes.size() == 0) {
+    for (index_t i = 0; i < in_grad->ndim(); i++) {
+      axes.push_back(i);
+    }
+  }
+  std::vector<size_t> bsizes;
+  for (std::vector<index_t>::iterator it = axes.begin(); it != axes.end(); ++it) {
+    bsizes.push_back(in_grad->shape_[*it]);
   }
+  BroadcastAxisImpl_<xpu>(out_grad.data, env, in_grad, req, ctx,
+                          TShape(axes.begin(), axes.end()), TShape(bsizes.begin(), bsizes.end()));
 }
 
 // Forward pass of broadcast over the given axis
@@ -347,13 +315,13 @@ void BroadcastAxis(const TBlob &src,
   using namespace mshadow::expr;
   BroadcastAxisParam param;
   param.Init(env.kwargs);
-  CHECK(param.axis < src.shape_.ndim()) <<
-    "axis must be smaller than the source ndim" << param.axis <<
-    ", src_ndim=" << src.shape_.ndim();
-  CHECK_EQ(src.shape_[param.axis], 1) <<
-    "Size of the broadcasting axis in the source must be 1, "
-    "axis=" << param.axis << ", size=" << src.shape_[param.axis];
-  BroadcastAxisImpl_<xpu>(src, env, ret, req, ctx, param.axis, param.size, true);
+  std::vector<index_t> axes = ParseAxes_(param.axis, src.ndim());
+  std::vector<size_t> bsizes;
+  for (std::vector<index_t>::iterator it = axes.begin(); it != axes.end(); ++it) {
+    bsizes.push_back(ret->shape_[*it]);
+  }
+  BroadcastAxisImpl_<xpu>(src, env, ret, req, ctx,
+                          TShape(axes.begin(), axes.end()), TShape(bsizes.begin(), bsizes.end()));
 }
 
 // Backward pass of broadcast over the given axis
@@ -367,72 +335,83 @@ void BroadcastAxisGrad_(const OutputGrad& out_grad,
   using namespace mshadow::expr;
   BroadcastAxisParam param;
   param.Init(env.kwargs);
-  CHECK(param.axis < in_grad->shape_.ndim()) <<
-    "axis must be smaller than the source ndim" << param.axis <<
-    ", src_ndim=" << in_grad->shape_.ndim();
-  CHECK_EQ(in_grad->shape_[param.axis], 1) <<
-    "Size of the broadcasting axis in the source must be 1, "
-    "axis=" << param.axis << ", size=" << in_grad->shape_[param.axis];
-  CHECK_EQ(in_grad->type_flag_, out_grad.data.type_flag_)
-    << "Unary function only support input/output with the same type";
-  ReduceAxisImpl_<xpu, mshadow::red::sum, false>(out_grad.data, env, in_grad, req, ctx,
-                                                 param.axis, true);
+  std::vector<index_t> axes = ParseAxes_(param.axis, in_grad->ndim());
+  ReduceAxisImpl_<xpu, mshadow::red::sum>(out_grad.data, env, in_grad, req, ctx,
+                                                 TShape(axes.begin(), axes.end()));
 }
 
 
-
 // L2 norm
 MXNET_REGISTER_SIMPLE_OP(norm, XPU)
 .set_function(XPU::kDevMask, L2Norm<XPU>, kNoInplace, kNotRegisterSymbolic)
 .set_shape_function(ScalarShape)
 .describe("Take L2 norm of the src."
           "The result will be ndarray of shape (1,) on the same device.");
+
 // Max
 MXNET_REGISTER_SIMPLE_OP(max, XPU)
-.set_function(XPU::kDevMask, Reduce<XPU, mshadow::red::maximum>, kNoInplace, kNotRegisterSymbolic)
-.set_shape_function(ScalarShape)
-.describe("(Deprecated! Use max_axis instead.) Take max of the src."
-          "The result will be ndarray of shape (1,) on the same device.");
+.set_enable_kwargs(true)
+.set_function(XPU::kDevMask, ReduceAxis<XPU, mshadow::red::maximum>,
+kNoInplace, kNotRegisterSymbolic)
+.set_shape_function(ReduceAxisShape)
+.describe("Take max of the src in the given axis. Params: `axis` and `keepdims`."
+          " axis: tuple or integer of axes to reduce, global reduce will be performed if not set."
+          " keepdims: the same meaning as Numpy.");
+
 // Min
 MXNET_REGISTER_SIMPLE_OP(min, XPU)
-.set_function(XPU::kDevMask, Reduce<XPU, mshadow::red::minimum>, kNoInplace, kNotRegisterSymbolic)
-.set_shape_function(ScalarShape)
-.describe("(Deprecated! Use min_axis instead.) Take min of the src."
-          "The result will be ndarray of shape (1,) on the same device.");
+.set_enable_kwargs(true)
+.set_function(XPU::kDevMask, ReduceAxis<XPU, mshadow::red::minimum>,
+kNoInplace, kNotRegisterSymbolic)
+.set_shape_function(ReduceAxisShape)
+.describe("Take min of the src in the given axis. Params: `axis` and `keepdims`."
+          " axis: tuple or integer of axes to reduce, global reduce will be performed if not set."
+          " keepdims: the same meaning as Numpy.");
+
 // Sum
 MXNET_REGISTER_SIMPLE_OP(sum, XPU)
-.set_function(XPU::kDevMask, Reduce<XPU, mshadow::red::sum>, kNoInplace, kRegisterSymbolic)
-.set_shape_function(ScalarShape)
-.set_gradient(XPU::kDevMask, SumBackward_<XPU>, kNoInplace)
-.describe("(Deprecated! Use sum_axis instead.) Take sum of the src."
-          "The result will be ndarray of shape (1,) on the same device.");
+.set_enable_kwargs(true)
+.set_function(XPU::kDevMask, ReduceAxis<XPU, mshadow::red::sum>,
+kNoInplace, kRegisterSymbolic)
+.set_shape_function(ReduceAxisShape)
+.set_gradient(XPU::kDevMask, SumAxisGrad_<XPU>, kNoInplace)
+.describe("Take sum of the src in the given axis. Params: `axis` and `keepdims`."
+          " axis: tuple or integer of axes to reduce, global reduce will be performed if not set."
+          " keepdims: the same meaning as Numpy.");
+
 // max_axis
 MXNET_REGISTER_SIMPLE_OP(max_axis, XPU)
 .set_enable_kwargs(true)
-.set_function(XPU::kDevMask, ReduceAxis<XPU, mshadow::red::maximum, false>,
+.set_function(XPU::kDevMask, ReduceAxis<XPU, mshadow::red::maximum>,
               kNoInplace, kNotRegisterSymbolic)
 .set_shape_function(ReduceAxisShape)
-.describe("Take max of the src in the given axis. axis=-1 means to reduce all the dimensions."
-"The keepdims option has the same meaning as Numpy.");
+.describe("(Deprecated! Use max instead!)"
+          " Take max of the src in the given axis. Params: `axis` and `keepdims`."
+          " axis: tuple or integer of axes to reduce, global reduce will be performed if not set."
+          " keepdims: the same meaning as Numpy.");
 
 // min_axis
 MXNET_REGISTER_SIMPLE_OP(min_axis, XPU)
 .set_enable_kwargs(true)
-.set_function(XPU::kDevMask, ReduceAxis<XPU, mshadow::red::minimum, false>,
+.set_function(XPU::kDevMask, ReduceAxis<XPU, mshadow::red::minimum>,
               kNoInplace, kNotRegisterSymbolic)
 .set_shape_function(ReduceAxisShape)
-.describe("Take min of the src in the given axis. axis=-1 means to reduce all the dimensions."
-"The keepdims option has the same meaning as Numpy.");
+.describe("(Deprecated! Use min instead!)"
+          " Take min of the src in the given axis. Params: `axis` and `keepdims`."
+          " axis: tuple or integer of axes to reduce, global reduce will be performed if not set."
+          " keepdims: the same meaning as Numpy.");
 
 // sum_axis
 MXNET_REGISTER_SIMPLE_OP(sum_axis, XPU)
 .set_enable_kwargs(true)
-.set_function(XPU::kDevMask, ReduceAxis<XPU, mshadow::red::sum, false>,
+.set_function(XPU::kDevMask, ReduceAxis<XPU, mshadow::red::sum>,
               kNoInplace, kRegisterSymbolic)
 .set_shape_function(ReduceAxisShape)
 .set_gradient(XPU::kDevMask, SumAxisGrad_<XPU>, kNoInplace)
-.describe("Take sum of the src in the given axis. axis=-1 means to reduce all the dimensions."
-"The keepdims option has the same meaning as Numpy.");
+.describe("(Deprecated! Use sum instead!)"
+          " Take sum of the src in the given axis. Params: `axis` and `keepdims`."
+          " axis: tuple or integer of axes to reduce, global reduce will be performed if not set."
+          " keepdims: the same meaning as Numpy.");
 
 // argmax channel
 MXNET_REGISTER_SIMPLE_OP(argmax_channel, XPU)
@@ -450,7 +429,7 @@ MXNET_REGISTER_SIMPLE_OP(broadcast_axis, XPU)
 .set_shape_function(BroadcastAxisShape)
 .set_gradient(XPU::kDevMask, BroadcastAxisGrad_<XPU>, kNoInplace)
 .describe("Broadcast data in the given axis to the given size. "
-"The original size of the broadcasting axis must be 1.");
+          "The original size of the broadcasting axis must be 1.");
 
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/broadcast_reduce_op_common.h b/src/operator/broadcast_reduce_op_common.h
index 37ad8adbc12d..179935d7a882 100644
--- a/src/operator/broadcast_reduce_op_common.h
+++ b/src/operator/broadcast_reduce_op_common.h
@@ -10,9 +10,30 @@
 #include <mxnet/operator.h>
 #include <mxnet/operator_util.h>
 #include <vector>
+#include <set>
 
 namespace mxnet {
 namespace op {
+/*!
+* \brief Validate the given axes and return them sorted in ascending order.
+*        Fails a CHECK if an axis is not smaller than max_ndim or appears twice.
+* \param param_axis the input axes
+* \param max_ndim the ndim of the source, every axis must be smaller than it
+* \return the validated axes as a sorted vector
+*/
+inline std::vector<index_t> ParseAxes_(const TShape& param_axis, index_t max_ndim) {
+  // std::set keeps its keys ordered, so iterating it yields the axes sorted.
+  std::set<index_t> axes_set_;
+  for (index_t i = 0; i < param_axis.ndim(); i++) {
+    CHECK(param_axis[i] < max_ndim) << "axes must be within the range, ndim of the source="
+      << max_ndim << ", axis=" << param_axis;
+    CHECK(axes_set_.find(param_axis[i]) == axes_set_.end())
+      << "Duplicate value in 'axis', received:" << param_axis;
+    axes_set_.insert(param_axis[i]);
+  }
+  // Copy the keys out in the set's iteration order.
+  return std::vector<index_t>(axes_set_.begin(), axes_set_.end());
+}
 
 /*!
 * \brief Check if the axes are continuous + get reducing size. E.g (1, 3) -> false, (1,2,3) -> true
@@ -68,7 +89,7 @@ inline TShape GetBroadcastingAxes_(const mshadow::TShape &src_shape,
 */
 template<typename Reducer, typename xpu, typename SrcExp, typename DType>
 void ReduceAxesAssign(mshadow::Tensor<xpu, 1, DType> out, const OpReqType req,
-  const SrcExp &src_, const TShape &axes) {
+  const TShape &axes, const SrcExp &src_) {
   using namespace mshadow;
   using namespace mshadow::expr;
   static const int dimsrc = ExpInfo<SrcExp>::kDim;
@@ -158,7 +179,7 @@ void ReduceToAssign(mshadow::Tensor<xpu, 1, DType> out, const OpReqType req,
   Shape<dimsrc> src_shape = ShapeCheck<dimsrc, SrcExp>::Check(src_);
   TShape axes = GetBroadcastingAxes_(target_shape,
     TShape(src_shape.shape_, src_shape.shape_ + dimsrc));
-  ReduceAxesAssign<Reducer>(out, req, src_, axes);
+  ReduceAxesAssign<Reducer>(out, req, axes, src_);
 }
 }  // namespace op
 }  // namespace mxnet
diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py
index b0273f288091..45537ff7540b 100644
--- a/tests/python/unittest/test_ndarray.py
+++ b/tests/python/unittest/test_ndarray.py
@@ -34,7 +34,7 @@ def check_with_uniform(uf, arg_shapes, dim=None, npuf=None, rmin=-10, type_list=
             out2 = uf(*numpy_arg).astype(dtype)
         else:
             out2 = npuf(*numpy_arg).astype(dtype)
-            
+
         assert out1.shape == out2.shape
         if isinstance(out1, mx.nd.NDArray):
             out1 = out1.asnumpy()
@@ -223,10 +223,12 @@ def test_reduce_inner(numpy_reduce_func, nd_reduce_func):
                 axes = tuple(axes)
             numpy_ret = numpy_reduce_func(dat, axis=axes, keepdims=keepdims)
 
-            ndarray_ret = nd_reduce_func(arr=mx.nd.array(dat), axis=axes, keepdims=keepdims)
+            ndarray_ret = nd_reduce_func(mx.nd.array(dat), axis=axes, keepdims=keepdims)
             if type(ndarray_ret) is mx.ndarray.NDArray:
                 ndarray_ret = ndarray_ret.asnumpy()
-            assert ndarray_ret.shape == numpy_ret.shape
+            assert (ndarray_ret.shape == numpy_ret.shape) or \
+                   (ndarray_ret.shape == (1,) and numpy_ret.shape == ()), "nd:%s, numpy:%s" \
+                                                         %(ndarray_ret.shape, numpy_ret.shape)
             err = np.square(ndarray_ret - numpy_ret).mean()
             assert err < 1E-4
     test_reduce_inner(lambda data, axis, keepdims:_np_reduce(data, axis, keepdims, np.sum),
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 73e58d57465f..775a5c9de993 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -980,11 +980,16 @@ def test_reduce_inner(numpy_reduce_func, numpy_reduce_grad_func, mx_reduce_sym):
                     axes.append(axis)
             if 0 == len(axes):
                 axes = None
+            elif 1 == len(axes):
+                axes = axes[0]
             else:
                 axes = tuple(axes)
             keepdims = np.random.randint(0, 2)
             a = mx.symbol.Variable('a')
-            b = mx_reduce_sym(a, axis=axes, keepdims=keepdims)
+            if axes is None:
+                b = mx_reduce_sym(a, keepdims=keepdims)
+            else:
+                b = mx_reduce_sym(a, axis=axes, keepdims=keepdims)
             dat_npy = np.random.rand(*shape)
             sum_groundtruth = np.array(numpy_reduce_func(dat_npy, axis=axes, keepdims=keepdims))
             if sum_groundtruth.shape == ():
@@ -1014,10 +1019,11 @@ def test_broadcast_axis():
             # Generate random data that has ndim between 1-7 and all the shape dims between 1-10
             ndim = np.random.randint(1, 8)
             target_shape = np.random.randint(1, 11, size=(ndim,))
-            axis = np.random.randint(0, ndim)
+            axis = tuple(set(np.random.randint(0, ndim, np.random.randint(1, ndim + 1))))
             shape = target_shape.copy()
-            size = shape[axis]
-            shape[axis] = 1
+            size = tuple([shape[ele] for ele in axis])
+            for ele in axis:
+                shape[ele] = 1
             a = mx.symbol.Variable('a')
             b = mx.symbol.broadcast_axis(a, axis=axis, size=size)
             dat_npy = np.random.rand(*shape)
@@ -1033,8 +1039,9 @@ def test_broadcast_axis():
             err_forward = np.square(net.outputs[0].asnumpy() - groundtruth).mean()
             assert err_forward < 1E-8
             net.backward(out_grads=mx.nd.array(outgrad_npy))
-            err_backward = np.square(grad_nd.asnumpy() - grad_groundtruth).mean()
-            assert err_backward < 1E-8
+            err_backward = np.square(grad_nd.asnumpy() - grad_groundtruth).sum()\
+                           /np.prod(target_shape)
+            assert err_backward < 1E-6
     test_broadcast_axis()
 
 

From 107f879b9f8ecd9927612aea29cb8f6064ca6366 Mon Sep 17 00:00:00 2001
From: Yan Li <godricly_li@126.com>
Date: Fri, 24 Jun 2016 11:13:26 +0800
Subject: [PATCH 058/126] [RFC] Embedding DTypes (#1) (#2516)

* [RFC] Embedding DTypes

* [RFC] [DTypes] Embedding python test
---
 src/operator/embedding-inl.h          | 46 +++++++++++++++++++++------
 src/operator/embedding.cc             | 18 ++++++++---
 src/operator/embedding.cu             |  8 +++--
 tests/python/gpu/test_operator_gpu.py | 27 ++++++++++++----
 4 files changed, 77 insertions(+), 22 deletions(-)

diff --git a/src/operator/embedding-inl.h b/src/operator/embedding-inl.h
index 8956a92357d1..f9765daaee3d 100644
--- a/src/operator/embedding-inl.h
+++ b/src/operator/embedding-inl.h
@@ -36,7 +36,7 @@ struct EmbeddingParam: public dmlc::Parameter<EmbeddingParam> {
 };
 
 
-template<typename xpu>
+template<typename xpu, typename DType>
 class EmbeddingOp : public Operator {
  public:
   explicit EmbeddingOp(EmbeddingParam p) {
@@ -62,10 +62,10 @@ class EmbeddingOp : public Operator {
     const TShape& oshape = out_data[embedding::kOut].shape_;
 
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 1> data = in_data[embedding::kData].get_with_shape<xpu, 1, real_t>(
+    Tensor<xpu, 1, DType> data = in_data[embedding::kData].get_with_shape<xpu, 1, DType>(
          Shape1(ishape.ProdShape(0, ishape.ndim())), s);
-    Tensor<xpu, 2> wmat = in_data[embedding::kWeight].get<xpu, 2, real_t>(s);
-    Tensor<xpu, 2> out = out_data[embedding::kOut].get_with_shape<xpu, 2, real_t>(
+    Tensor<xpu, 2, DType> wmat = in_data[embedding::kWeight].get<xpu, 2, DType>(s);
+    Tensor<xpu, 2, DType> out = out_data[embedding::kOut].get_with_shape<xpu, 2, DType>(
          Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s);
     out = take(data, wmat);
   }
@@ -89,13 +89,13 @@ class EmbeddingOp : public Operator {
     const TShape& oshape = out_grad[embedding::kOut].shape_;
 
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 1> data = in_data[embedding::kData].get_with_shape<xpu, 1, real_t>(
+    Tensor<xpu, 1, DType> data = in_data[embedding::kData].get_with_shape<xpu, 1, DType>(
          Shape1(ishape.ProdShape(0, ishape.ndim())), s);
-    Tensor<xpu, 2> grad_out = out_grad[embedding::kOut].get_with_shape<xpu, 2, real_t>(
+    Tensor<xpu, 2, DType> grad_out = out_grad[embedding::kOut].get_with_shape<xpu, 2, DType>(
          Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s);
-    Tensor<xpu, 2> grad_in = in_grad[embedding::kWeight].get<xpu, 2, real_t>(s);
+    Tensor<xpu, 2, DType> grad_in = in_grad[embedding::kWeight].get<xpu, 2, DType>(s);
     if (req[embedding::kWeight] == kWriteTo) {
-      grad_in = 0.0f;
+      grad_in = scalar<DType>(0.0f);
       AddTakeGrad(grad_in, data, grad_out);
     } else if (req[embedding::kWeight] == kAddTo) {
       AddTakeGrad(grad_in, data, grad_out);
@@ -109,7 +109,7 @@ class EmbeddingOp : public Operator {
 };  // class EmbeddingOp
 
 template<typename xpu>
-Operator* CreateOp(EmbeddingParam param);
+Operator* CreateOp(EmbeddingParam param, int dtype);
 
 #if DMLC_USE_CXX11
 class EmbeddingProp : public OperatorProperty {
@@ -146,6 +146,26 @@ class EmbeddingProp : public OperatorProperty {
     return true;
   }
 
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_GE(in_type->size(), 1);
+    int dtype = (*in_type)[0];
+    CHECK_NE(dtype, -1) << "First input must have specified type";
+    for (index_t i = 0; i < in_type->size(); ++i) {
+      if ((*in_type)[i] == -1) {
+        (*in_type)[i] = dtype;
+      } else {
+        CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. "
+                                       << "Expected " << dtype << " v.s. given "
+                                       << (*in_type)[i] << " at " << ListArguments()[i];
+      }
+    }
+    out_type->clear();
+    out_type->push_back(dtype);
+    return true;
+  }
+
   OperatorProperty* Copy() const override {
     auto sym = new EmbeddingProp();
     sym->param_ = this->param_;
@@ -163,7 +183,13 @@ class EmbeddingProp : public OperatorProperty {
     return {out_grad[embedding::kOut], in_data[embedding::kData]};
   }
 
-  Operator* CreateOperator(Context ctx) const override;
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented.";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
 
  private:
   EmbeddingParam param_;
diff --git a/src/operator/embedding.cc b/src/operator/embedding.cc
index c1b7f59edc47..f26b025657fe 100644
--- a/src/operator/embedding.cc
+++ b/src/operator/embedding.cc
@@ -9,12 +9,22 @@
 namespace mxnet {
 namespace op {
 template<>
-Operator* CreateOp<cpu>(EmbeddingParam param) {
-  return new EmbeddingOp<cpu>(param);
+Operator* CreateOp<cpu>(EmbeddingParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new EmbeddingOp<cpu, DType>(param);
+  });
+  return op;
 }
 
-Operator* EmbeddingProp::CreateOperator(Context ctx) const {
-  DO_BIND_DISPATCH(CreateOp, param_);
+// DO_BIND_DISPATCH comes from operator_common.h
+Operator *EmbeddingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                     std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferType(in_type, &out_type, &aux_type));
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
 }
 
 DMLC_REGISTER_PARAMETER(EmbeddingParam);
diff --git a/src/operator/embedding.cu b/src/operator/embedding.cu
index 7c326ec7f806..4f1d8f8e45ee 100644
--- a/src/operator/embedding.cu
+++ b/src/operator/embedding.cu
@@ -9,8 +9,12 @@
 namespace mxnet {
 namespace op {
 template<>
-Operator* CreateOp<gpu>(EmbeddingParam param) {
-  return new EmbeddingOp<gpu>(param);
+Operator* CreateOp<gpu>(EmbeddingParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new EmbeddingOp<gpu, DType>(param);
+  });
+  return op;
 }
 }  // namespace op
 }  // namespace mxnet
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index afa1f0f8f3b3..e0502b04b147 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -1,4 +1,4 @@
-import sys
+﻿import sys
 import os
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.insert(0, os.path.join(curr_path, '../unittest'))
@@ -8,20 +8,23 @@
 from numpy.testing import assert_allclose
 import time
 
-def check_consistency(sym, ctx_list, scale=1.0):
+def check_consistency(sym, ctx_list, scale=1.0, grad_req='write'):
     tol = {np.dtype(np.float16): 1e-1,
            np.dtype(np.float32): 1e-3,
            np.dtype(np.float64): 1e-5,
            np.dtype(np.uint8): 0,
            np.dtype(np.int32): 0}
     assert(len(ctx_list) > 1)
-    exe_list = [sym.simple_bind(grad_req='write', **ctx) for ctx in ctx_list]
+    exe_list = [sym.simple_bind(grad_req=grad_req, **ctx) for ctx in ctx_list]
     for exe in exe_list:
         assert(len(exe.outputs) == 1)
         assert(len(exe.arg_arrays) == len(exe_list[0].arg_arrays))
         assert(len(exe.grad_arrays) == len(exe_list[0].grad_arrays))
 
     init = [np.random.normal(size=arr.shape, scale=scale) for arr in exe_list[0].arg_arrays]
+    if sym.name == 'embedding':
+        init[0] = np.random.randint(low=0, high=10, size=exe_list[0].arg_arrays[0].shape)
+
     for exe in exe_list:
         for arr, iarr in zip(exe.arg_arrays, init):
             arr[:] = iarr.astype(arr.dtype)
@@ -32,7 +35,8 @@ def check_consistency(sym, ctx_list, scale=1.0):
         exe.backward(exe.outputs[0])
 
     outputs = [exe.outputs[0].asnumpy() for exe in exe_list]
-    grads = [[grad.asnumpy() for grad in exe.grad_arrays] for exe in exe_list]
+    # lazy solution handling None grad
+    grads = [[grad.asnumpy() if grad is not None else np.zeros(1) for grad in exe.grad_arrays] for exe in exe_list]
     dtypes = [arr.dtype for arr in outputs]
     max_idx = np.argmax(dtypes)
 
@@ -46,8 +50,8 @@ def check_consistency(sym, ctx_list, scale=1.0):
             except Exception, e:
                 print e
 
-def check_speed(sym, ctx, scale=1.0, N=100):
-    exe = sym.simple_bind(grad_req='write', **ctx)
+def check_speed(sym, ctx, scale=1.0, N=100, grad_req='write'):
+    exe = sym.simple_bind(grad_req=grad_req, **ctx)
     init = [np.random.normal(size=arr.shape, scale=scale) for arr in exe.arg_arrays]
     for arr, iarr in zip(exe.arg_arrays, init):
         arr[:] = iarr.astype(arr.dtype)
@@ -166,6 +170,16 @@ def test_activation_with_type():
                 {'ctx': mx.cpu(0), 'act_data': (2, 2, 10, 10), 'type_dict': {'act_data': np.float16}}]
     check_consistency(sym, ctx_list)
 
+def test_embedding_with_type():
+    sym = mx.sym.Embedding(name='embedding', input_dim=10, output_dim=20)
+    ctx_list = [{'ctx': mx.gpu(0), 'embedding_data': (2, 10), 'type_dict': {'embedding_data': np.float64}},
+                {'ctx': mx.gpu(0), 'embedding_data': (2, 10), 'type_dict': {'embedding_data': np.float32}},
+                {'ctx': mx.gpu(0), 'embedding_data': (2, 10), 'type_dict': {'embedding_data': np.float16}},
+                {'ctx': mx.cpu(0), 'embedding_data': (2, 10), 'type_dict': {'embedding_data': np.float64}},
+                {'ctx': mx.cpu(0), 'embedding_data': (2, 10), 'type_dict': {'embedding_data': np.float32}},
+                {'ctx': mx.cpu(0), 'embedding_data': (2, 10), 'type_dict': {'embedding_data': np.float16}}]
+    check_consistency(sym, ctx_list, grad_req={'embedding_data': 'null','embedding_weight': 'write'})
+
 if __name__ == '__main__':
     test_convolution_with_type()
     test_deconvolution_with_type()
@@ -177,6 +191,7 @@ def test_activation_with_type():
     test_swapaxis_with_type()
     test_fullyconnected_with_type()
     test_activation_with_type()
+    test_embedding_with_type()
     #test_softmax_with_shape((3,4), mx.gpu())
     #test_multi_softmax_with_shape((3,4,5), mx.gpu())
 

From caee4b46c2496bc6647fe20229da7422d7ae3054 Mon Sep 17 00:00:00 2001
From: Yuqi Li <ziyeqinghan@gmail.com>
Date: Fri, 24 Jun 2016 11:14:20 +0800
Subject: [PATCH 059/126] Add GRU in R (#2491)

* add GRU model in R

* add documentations for gru model in R
---
 R-package/NAMESPACE                  |   3 +
 R-package/R/gru.R                    | 355 +++++++++++++++++++++++++++
 R-package/man/mx.gru.Rd              |  66 +++++
 R-package/man/mx.gru.forward.Rd      |  25 ++
 R-package/man/mx.gru.inference.Rd    |  45 ++++
 R-package/vignettes/CharRnnModel.Rmd |   5 +-
 docs/packages/r/CharRnnModel.md      |   5 +-
 7 files changed, 500 insertions(+), 4 deletions(-)
 create mode 100644 R-package/R/gru.R
 create mode 100644 R-package/man/mx.gru.Rd
 create mode 100644 R-package/man/mx.gru.forward.Rd
 create mode 100644 R-package/man/mx.gru.inference.Rd

diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index 956ce9767d3a..ad95fe050ef3 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -26,6 +26,9 @@ export(mx.exec.update.arg.arrays)
 export(mx.exec.update.aux.arrays)
 export(mx.exec.update.grad.arrays)
 export(mx.gpu)
+export(mx.gru)
+export(mx.gru.forward)
+export(mx.gru.inference)
 export(mx.init.Xavier)
 export(mx.init.create)
 export(mx.init.normal)
diff --git a/R-package/R/gru.R b/R-package/R/gru.R
new file mode 100644
index 000000000000..d2ffd9a414c2
--- /dev/null
+++ b/R-package/R/gru.R
@@ -0,0 +1,355 @@
+# gru cell symbol
+gru <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout=0) {
+    if (dropout > 0)
+        indata <- mx.symbol.Dropout(data=indata, p=dropout)
+    i2h <- mx.symbol.FullyConnected(data=indata,
+                                    weight=param$gates.i2h.weight,
+                                    bias=param$gates.i2h.bias,
+                                    num.hidden=num.hidden * 2,
+                                    name=paste0("t", seqidx, ".l", layeridx, ".gates.i2h"))
+    h2h <- mx.symbol.FullyConnected(data=prev.state$h,
+                                    weight=param$gates.h2h.weight,
+                                    bias=param$gates.h2h.bias,
+                                    num.hidden=num.hidden * 2,
+                                    name=paste0("t", seqidx, ".l", layeridx, ".gates.h2h"))
+    gates <- i2h + h2h
+    slice.gates <- mx.symbol.SliceChannel(gates, num.outputs=2,
+                                          name=paste0("t", seqidx, ".l", layeridx, ".slice"))
+    update.gate <- mx.symbol.Activation(slice.gates[[1]], act.type="sigmoid")
+    reset.gate <- mx.symbol.Activation(slice.gates[[2]], act.type="sigmoid")
+
+    htrans.i2h <- mx.symbol.FullyConnected(data=indata,
+                                           weight=param$trans.i2h.weight,
+                                           bias=param$trans.i2h.bias,
+                                           num.hidden=num.hidden,
+                                           name=paste0("t", seqidx, ".l", layeridx, ".trans.i2h"))
+    h.after.reset <- prev.state$h * reset.gate
+    htrans.h2h <- mx.symbol.FullyConnected(data=h.after.reset,
+                                           weight=param$trans.h2h.weight,
+                                           bias=param$trans.h2h.bias,
+                                           num.hidden=num.hidden,
+                                           name=paste0("t", seqidx, ".l", layeridx, ".trans.h2h"))
+    h.trans <- htrans.i2h + htrans.h2h
+    h.trans.active <- mx.symbol.Activation(h.trans, act.type="tanh")
+    next.h <- prev.state$h + update.gate * (h.trans.active - prev.state$h)
+    return (list(h=next.h))
+}
+
+# unrolled gru network
+gru.unroll <- function(num.gru.layer, seq.len, input.size,
+                       num.hidden, num.embed, num.label, dropout=0) {
+    embed.weight <- mx.symbol.Variable("embed.weight")
+    cls.weight <- mx.symbol.Variable("cls.weight")
+    cls.bias <- mx.symbol.Variable("cls.bias")
+    param.cells <- lapply(1:num.gru.layer, function(i) {
+        cell <- list(gates.i2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.i2h.weight")),
+                     gates.i2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.i2h.bias")),
+                     gates.h2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.h2h.weight")),
+                     gates.h2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.h2h.bias")),
+                     trans.i2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.i2h.weight")),
+                     trans.i2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.i2h.bias")),
+                     trans.h2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.h2h.weight")),
+                     trans.h2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.h2h.bias")))
+        return (cell)
+    })
+    last.states <- lapply(1:num.gru.layer, function(i) {
+        state <- list(h=mx.symbol.Variable(paste0("l", i, ".init.h")))
+        return (state)
+    })
+
+    # embedding layer
+    label <- mx.symbol.Variable("label")
+    data <- mx.symbol.Variable("data")
+    embed <- mx.symbol.Embedding(data=data, input.dim=input.size,
+                                 weight=embed.weight, output.dim=num.embed, name='embed')
+    wordvec <- mx.symbol.SliceChannel(data=embed, num.outputs=seq.len, squeeze.axis=1)
+
+    last.hidden <- list()
+    for (seqidx in 1:seq.len) {
+        hidden <- wordvec[[seqidx]]
+        # stack GRU
+        for (i in 1:num.gru.layer) {
+            dp <- ifelse(i==1, 0, dropout)
+            next.state <- gru(num.hidden, indata=hidden,
+                              prev.state=last.states[[i]],
+                              param=param.cells[[i]],
+                              seqidx=seqidx, layeridx=i, 
+                              dropout=dp)
+            hidden <- next.state$h
+            last.states[[i]] <- next.state
+        }
+        # decoder
+        if (dropout > 0)
+            hidden <- mx.symbol.Dropout(data=hidden, p=dropout)
+        last.hidden <- c(last.hidden, hidden)
+    }
+    last.hidden$dim <- 0
+    last.hidden$num.args <- seq.len
+    concat <-mxnet:::mx.varg.symbol.Concat(last.hidden)
+    fc <- mx.symbol.FullyConnected(data=concat,
+                                   weight=cls.weight,
+                                   bias=cls.bias,
+                                   num.hidden=num.label)
+
+    label <- mx.symbol.transpose(data=label)
+    label <- mx.symbol.Reshape(data=label, target.shape=c(0))
+
+    loss.all <- mx.symbol.SoftmaxOutput(data=fc, label=label, name="sm")
+    return (loss.all)
+}
+
+# Single-step GRU symbol for inference: groups the softmax output with every layer's last hidden state (seq.len is accepted but not used here).
+gru.inference.symbol <- function(num.gru.layer, seq.len, input.size,
+                                 num.hidden, num.embed, num.label, dropout=0) {
+    seqidx <- 1  # only one time step is built; the index is used for node naming in gru()
+    embed.weight <- mx.symbol.Variable("embed.weight")
+    cls.weight <- mx.symbol.Variable("cls.weight")
+    cls.bias <- mx.symbol.Variable("cls.bias")
+
+    param.cells <- lapply(1:num.gru.layer, function(i) {
+        cell <- list(gates.i2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.i2h.weight")),
+                     gates.i2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.i2h.bias")),
+                     gates.h2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.h2h.weight")),
+                     gates.h2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.h2h.bias")),
+                     trans.i2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.i2h.weight")),
+                     trans.i2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.i2h.bias")),
+                     trans.h2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.h2h.weight")),
+                     trans.h2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.h2h.bias")))
+        return (cell)
+    })
+    last.states <- lapply(1:num.gru.layer, function(i) {
+        state <- list(h=mx.symbol.Variable(paste0("l", i, ".init.h")))
+        return (state)
+    })
+
+    # embedding layer (dotted argument names, matching gru.unroll in this file)
+    data <- mx.symbol.Variable("data")
+    hidden <- mx.symbol.Embedding(data=data, input.dim=input.size,
+                                  weight=embed.weight, output.dim=num.embed, name="embed")
+
+    # stack GRU layers: each layer's output h feeds the next layer; dropout is skipped for layer 1
+    for (i in 1:num.gru.layer) {
+        dp <- ifelse(i==1, 0, dropout)
+        next.state <- gru(num.hidden, indata=hidden,
+                          prev.state=last.states[[i]],
+                          param=param.cells[[i]],
+                          seqidx=seqidx, layeridx=i,
+                          dropout=dp)
+        hidden <- next.state$h
+        last.states[[i]] <- next.state
+    }
+    # decoder: optional dropout, then a fully connected layer named 'pred' and softmax named 'sm'
+    if (dropout > 0)
+        hidden <- mx.symbol.Dropout(data=hidden, p=dropout)
+
+    fc <- mx.symbol.FullyConnected(data=hidden, num.hidden=num.label,
+                                   weight=cls.weight, bias=cls.bias, name='pred')
+    sm <- mx.symbol.SoftmaxOutput(data=fc, name='sm')
+    unpack.h <- lapply(1:num.gru.layer, function(i) {
+        state <- last.states[[i]]
+        state.h <- mx.symbol.BlockGrad(state$h, name=paste0("l", i, ".last.h"))  # exposed as "l<i>.last.h" outputs
+        return (state.h)
+    })
+
+    list.all <- c(sm, unpack.h)
+    return (mx.symbol.Group(list.all))
+}
+
+#' Training GRU Unrolled Model
+#'
+#' @param train.data mx.io.DataIter or list(data=R.array, label=R.array)
+#'      The Training set.
+#' @param eval.data mx.io.DataIter or list(data=R.array, label=R.array), optional
+#'      The validation set used for validation evaluation during the progress.
+#' @param num.gru.layer integer
+#'      The number of the layer of gru.
+#' @param seq.len integer
+#'      The length of the input sequence.
+#' @param num.hidden integer
+#'      The number of hidden nodes.
+#' @param num.embed integer
+#'      The output dim of embedding.
+#' @param num.label  integer
+#'      The number of labels.
+#' @param batch.size integer
+#'      The batch size used for R array training.
+#' @param input.size integer
+#'       The input dim of one-hot encoding of embedding
+#' @param ctx mx.context, optional
+#'      The device used to perform training.
+#' @param num.round integer, default=10
+#'      The number of iterations over training data to train the model.
+#' @param update.period integer, default=1
+#'      The number of iterations to update parameters during training period.
+#' @param initializer initializer object. default=mx.init.uniform(0.01)
+#'      The initialization scheme for parameters.
+#' @param dropout float, default=0
+#'      A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.
+#' @param optimizer string, default="sgd"
+#'      The optimization method.
+#' @param ... other parameters passed on to the internal \code{train.rnn} training routine.
+#' @return model A trained gru unrolled model.
+#'
+#' @export
+mx.gru <- function( train.data, eval.data=NULL,
+                    num.gru.layer, seq.len,
+                    num.hidden, num.embed, num.label,
+                    batch.size, input.size,
+                    ctx=mx.ctx.default(),
+                    num.round=10, update.period=1,
+                    initializer=mx.init.uniform(0.01),
+                    dropout=0, optimizer='sgd',
+                    ...) {
+    # check data and change data into iterator
+    train.data <- check.data(train.data, batch.size, TRUE)
+    eval.data <- check.data(eval.data, batch.size, FALSE)
+
+    # get unrolled gru symbol
+    rnn.sym <- gru.unroll( num.gru.layer=num.gru.layer,
+                           num.hidden=num.hidden,
+                           seq.len=seq.len,
+                           input.size=input.size,
+                           num.embed=num.embed,
+                           num.label=num.label,
+                           dropout=dropout)
+
+    init.states.name <- lapply(1:num.gru.layer, function(i) {
+        state.h <- paste0("l", i, ".init.h")
+        return (state.h)
+    })
+
+    # set up gru model
+    model <- setup.rnn.model(rnn.sym=rnn.sym,
+                             ctx=ctx,
+                             num.rnn.layer=num.gru.layer,
+                             seq.len=seq.len,
+                             num.hidden=num.hidden,
+                             num.embed=num.embed,
+                             num.label=num.label,
+                             batch.size=batch.size,
+                             input.size=input.size,
+                             init.states.name=init.states.name,
+                             initializer=initializer,
+                             dropout=dropout)
+
+    # train gru model
+    model <- train.rnn( model, train.data, eval.data,
+                        num.round=num.round,
+                        update.period=update.period,
+                        ctx=ctx,
+                        init.states.name=init.states.name,
+                        ...)
+    # change model into MXFeedForwardModel
+    model <- list(symbol=model$symbol, arg.params=model$rnn.exec$ref.arg.arrays, aux.params=model$rnn.exec$ref.aux.arrays)
+    return(structure(model, class="MXFeedForwardModel"))
+}
+
+#' Create a GRU Inference Model
+#'
+#' @param num.gru.layer integer
+#'      The number of the layer of gru.
+#' @param input.size integer
+#'       The input dim of one-hot encoding of embedding
+#' @param num.hidden integer
+#'      The number of hidden nodes.
+#' @param num.embed integer
+#'      The output dim of embedding.
+#' @param num.label  integer
+#'      The number of labels.
+#' @param batch.size integer, default=1
+#'      The batch size used for R array training.
+#' @param arg.params list
+#'      Model parameter, list of name to NDArray of net's weights.
+#' @param ctx mx.context, optional
+#'      The device used to run the inference model.
+#' @param dropout float, default=0
+#'      A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.
+#' @return model list(rnn.exec=integer, symbol=mxnet symbol, num.rnn.layer=integer, num.hidden=integer, seq.len=integer, batch.size=integer, num.embed=integer) 
+#'      A gru inference model.
+#'
+#' @export
+mx.gru.inference <- function(num.gru.layer,
+                             input.size,
+                             num.hidden,
+                             num.embed,
+                             num.label,
+                             batch.size=1,
+                             arg.params,
+                             ctx=mx.cpu(),
+                             dropout=0.) {
+    sym <- gru.inference.symbol(num.gru.layer=num.gru.layer,
+                                 input.size=input.size,
+                                 num.hidden=num.hidden,
+                                 num.embed=num.embed,
+                                 num.label=num.label,
+                                 dropout=dropout)
+
+    init.states.name <- lapply(1:num.gru.layer, function(i) {
+        state.h <- paste0("l", i, ".init.h")
+        return (state.h)
+    })
+
+    seq.len <- 1
+    # set up gru model
+    model <- setup.rnn.model(rnn.sym=sym,
+                             ctx=ctx,
+                             num.rnn.layer=num.gru.layer,
+                             seq.len=seq.len,
+                             num.hidden=num.hidden,
+                             num.embed=num.embed,
+                             num.label=num.label,
+                             batch.size=batch.size,
+                             input.size=input.size,
+                             init.states.name=init.states.name,
+                             initializer=mx.init.uniform(0.01),
+                             dropout=dropout)
+    arg.names <- names(model$rnn.exec$ref.arg.arrays)
+    for (k in names(arg.params)) {
+        if ((k %in% arg.names) && is.param.name(k) ) {
+            rnn.input <- list()
+            rnn.input[[k]] <- arg.params[[k]]
+            mx.exec.update.arg.arrays(model$rnn.exec, rnn.input, match.name=TRUE)
+        }
+    }
+    init.states <- list()
+    for (i in 1:num.gru.layer) {
+        init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
+    }
+    mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
+
+    return (model)
+}
+
+#' Using forward function to predict in gru inference model
+#'
+#' @param model gru model
+#'      A gru inference model
+#' @param input.data array.matrix
+#'      The input data for forward function
+#' @param new.seq boolean, default=FALSE
+#'      Whether the input is the start of a new sequence
+#'
+#' @return result A list(prob=prob, model=model) containing the result probability of each label and the model.
+#'
+#' @export
+mx.gru.forward <- function(model, input.data, new.seq=FALSE) {
+    if (new.seq == TRUE) {
+        init.states <- list()
+        for (i in 1:model$num.rnn.layer) {
+            init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
+        }
+        mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
+    }
+    dim(input.data) <- c(model$batch.size)
+    data <- list(data=mx.nd.array(input.data))
+    mx.exec.update.arg.arrays(model$rnn.exec, data, match.name=TRUE)
+    mx.exec.forward(model$rnn.exec, is.train=FALSE)
+    init.states <- list()
+    for (i in 1:model$num.rnn.layer) {
+        init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.h_output")]]
+    }
+    mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
+    prob <- model$rnn.exec$ref.outputs[["sm_output"]]
+    return (list(prob=prob, model=model))
+}
+
diff --git a/R-package/man/mx.gru.Rd b/R-package/man/mx.gru.Rd
new file mode 100644
index 000000000000..deca62cfa85a
--- /dev/null
+++ b/R-package/man/mx.gru.Rd
@@ -0,0 +1,66 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gru.R
+\name{mx.gru}
+\alias{mx.gru}
+\title{Training GRU Unrolled Model}
+\usage{
+mx.gru(train.data, eval.data = NULL, num.gru.layer, seq.len, num.hidden,
+  num.embed, num.label, batch.size, input.size, ctx = mx.ctx.default(),
+  num.round = 10, update.period = 1, initializer = mx.init.uniform(0.01),
+  dropout = 0, optimizer = "sgd", ...)
+}
+\arguments{
+\item{train.data}{mx.io.DataIter or list(data=R.array, label=R.array)
+The Training set.}
+
+\item{eval.data}{mx.io.DataIter or list(data=R.array, label=R.array), optional
+The validation set used for validation evaluation during the progress.}
+
+\item{num.gru.layer}{integer
+The number of the layer of gru.}
+
+\item{seq.len}{integer
+The length of the input sequence.}
+
+\item{num.hidden}{integer
+The number of hidden nodes.}
+
+\item{num.embed}{integer
+The output dim of embedding.}
+
+\item{num.label}{integer
+The number of labels.}
+
+\item{batch.size}{integer
+The batch size used for R array training.}
+
+\item{input.size}{integer
+The input dim of one-hot encoding of embedding}
+
+\item{ctx}{mx.context, optional
+The device used to perform training.}
+
+\item{num.round}{integer, default=10
+The number of iterations over training data to train the model.}
+
+\item{update.period}{integer, default=1
+The number of iterations to update parameters during training period.}
+
+\item{initializer}{initializer object. default=mx.init.uniform(0.01)
+The initialization scheme for parameters.}
+
+\item{dropout}{float, default=0
+A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.}
+
+\item{optimizer}{string, default="sgd"
+The optimization method.}
+
+\item{...}{other parameters passed on to the internal \code{train.rnn} training routine.}
+}
+\value{
+model A trained gru unrolled model.
+}
+\description{
+Training GRU Unrolled Model
+}
+
diff --git a/R-package/man/mx.gru.forward.Rd b/R-package/man/mx.gru.forward.Rd
new file mode 100644
index 000000000000..cedc27bd85a4
--- /dev/null
+++ b/R-package/man/mx.gru.forward.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gru.R
+\name{mx.gru.forward}
+\alias{mx.gru.forward}
+\title{Using forward function to predict in gru inference model}
+\usage{
+mx.gru.forward(model, input.data, new.seq = FALSE)
+}
+\arguments{
+\item{model}{gru model
+A gru inference model}
+
+\item{input.data}{array.matrix
+The input data for forward function}
+
+\item{new.seq}{boolean, default=FALSE
+Whether the input is the start of a new sequence}
+}
+\value{
+result A list(prob=prob, model=model) containing the result probability of each label and the model.
+}
+\description{
+Using forward function to predict in gru inference model
+}
+
diff --git a/R-package/man/mx.gru.inference.Rd b/R-package/man/mx.gru.inference.Rd
new file mode 100644
index 000000000000..85c66ed8a781
--- /dev/null
+++ b/R-package/man/mx.gru.inference.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gru.R
+\name{mx.gru.inference}
+\alias{mx.gru.inference}
+\title{Create a GRU Inference Model}
+\usage{
+mx.gru.inference(num.gru.layer, input.size, num.hidden, num.embed, num.label,
+  batch.size = 1, arg.params, ctx = mx.cpu(), dropout = 0)
+}
+\arguments{
+\item{num.gru.layer}{integer
+The number of the layer of gru.}
+
+\item{input.size}{integer
+The input dim of one-hot encoding of embedding}
+
+\item{num.hidden}{integer
+The number of hidden nodes.}
+
+\item{num.embed}{integer
+The output dim of embedding.}
+
+\item{num.label}{integer
+The number of labels.}
+
+\item{batch.size}{integer, default=1
+The batch size used for R array training.}
+
+\item{arg.params}{list
+Model parameter, list of name to NDArray of net's weights.}
+
+\item{ctx}{mx.context, optional
+The device used to run the inference model.}
+
+\item{dropout}{float, default=0
+A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.}
+}
+\value{
+model list(rnn.exec=integer, symbol=mxnet symbol, num.rnn.layer=integer, num.hidden=integer, seq.len=integer, batch.size=integer, num.embed=integer) 
+     A gru inference model.
+}
+\description{
+Create a GRU Inference Model
+}
+
diff --git a/R-package/vignettes/CharRnnModel.Rmd b/R-package/vignettes/CharRnnModel.Rmd
index 1dd30ff12b4d..2cb4b00ec1ac 100644
--- a/R-package/vignettes/CharRnnModel.Rmd
+++ b/R-package/vignettes/CharRnnModel.Rmd
@@ -248,5 +248,6 @@ Settled asing lately sistering sounted to their hight
 
 Other RNN models
 ----------------
-In `mxnet`, other RNN models like custom RNN is also provided.
-- For **custom RNN model**, you can replace `mx.lstm` with `mx.rnn` to train rnn model. Also, you can replace `mx.lstm.inference` and `mx.lstm.forward` with `mx.rnn.inference` and `mx.rnn.forward` to inference from rnn model and get forward result from the inference model.
\ No newline at end of file
+In `mxnet`, other RNN models such as custom RNN and GRU are also provided.
+- For **custom RNN model**, you can replace `mx.lstm` with `mx.rnn` to train rnn model. Also, you can replace `mx.lstm.inference` and `mx.lstm.forward` with `mx.rnn.inference` and `mx.rnn.forward` to inference from rnn model and get forward result from the inference model.
+- For **GRU model**, you can replace `mx.lstm` with `mx.gru` to train a GRU model. Also, you can replace `mx.lstm.inference` and `mx.lstm.forward` with `mx.gru.inference` and `mx.gru.forward` to run inference with the GRU model and get the forward result from the inference model.
\ No newline at end of file
diff --git a/docs/packages/r/CharRnnModel.md b/docs/packages/r/CharRnnModel.md
index 4623386ff66c..201301c7981b 100644
--- a/docs/packages/r/CharRnnModel.md
+++ b/docs/packages/r/CharRnnModel.md
@@ -310,5 +310,6 @@ Settled asing lately sistering sounted to their hight
 
 Other RNN models
 ----------------
-In `mxnet`, other RNN models like custom RNN is also provided.
-- For **custom RNN model**, you can replace `mx.lstm` with `mx.rnn` to train rnn model. Also, you can replace `mx.lstm.inference` and `mx.lstm.forward` with `mx.rnn.inference` and `mx.rnn.forward` to inference from rnn model and get forward result from the inference model.
\ No newline at end of file
+In `mxnet`, other RNN models such as custom RNN and GRU are also provided.
+- For **custom RNN model**, you can replace `mx.lstm` with `mx.rnn` to train rnn model. Also, you can replace `mx.lstm.inference` and `mx.lstm.forward` with `mx.rnn.inference` and `mx.rnn.forward` to inference from rnn model and get forward result from the inference model.
+- For **GRU model**, you can replace `mx.lstm` with `mx.gru` to train a GRU model. Also, you can replace `mx.lstm.inference` and `mx.lstm.forward` with `mx.gru.inference` and `mx.gru.forward` to run inference with the GRU model and get the forward result from the inference model.

From d388008ef6e5ce701cd7e9dc129b5f0ef27313df Mon Sep 17 00:00:00 2001
From: Yizhi Liu <javelinjs@gmail.com>
Date: Thu, 23 Jun 2016 00:31:54 +0800
Subject: [PATCH 060/126] [scala] disable kvstore for single device

---
 Makefile                                         | 16 ++++++++--------
 .../examples/src/main/resources/log4j.properties |  2 +-
 .../examples/imclassification/ModelTrain.scala   | 13 ++++++++++---
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/Makefile b/Makefile
index 43ff9beec9cd..2ed5a51cda69 100644
--- a/Makefile
+++ b/Makefile
@@ -236,26 +236,26 @@ rpkg:	roxygen
 scalapkg:
 	(cd $(ROOTDIR)/scala-package; \
 		mvn clean package -P$(SCALA_PKG_PROFILE) -Dcxx="$(CXX)" \
-											-Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
-											-Dlddeps="$(LIB_DEP)")
+			-Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
+			-Dlddeps="$(LIB_DEP)")
 
 scalatest:
 	(cd $(ROOTDIR)/scala-package; \
 		mvn verify -P$(SCALA_PKG_PROFILE) -Dcxx="$(CXX)" \
-							 -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
-							 -Dlddeps="$(LIB_DEP)" $(SCALA_TEST_ARGS))
+			-Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
+			-Dlddeps="$(LIB_DEP)" $(SCALA_TEST_ARGS))
 
 scalainstall:
 	(cd $(ROOTDIR)/scala-package; \
 		mvn install -P$(SCALA_PKG_PROFILE) -DskipTests -Dcxx="$(CXX)" \
-							  -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
-								-Dlddeps="$(LIB_DEP)")
+			-Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
+			-Dlddeps="$(LIB_DEP)")
 
 scaladeploy:
 	(cd $(ROOTDIR)/scala-package; \
 		mvn deploy -Prelease,$(SCALA_PKG_PROFILE) -DskipTests -Dcxx="$(CXX)" \
-							 -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
-							 -Dlddeps="$(LIB_DEP)")
+			-Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
+			-Dlddeps="$(LIB_DEP)")
 
 jnilint:
 	python2 dmlc-core/scripts/lint.py mxnet-jnicpp cpp scala-package/native/src
diff --git a/scala-package/examples/src/main/resources/log4j.properties b/scala-package/examples/src/main/resources/log4j.properties
index 7f5be5f70b89..cb92f4c5250a 100644
--- a/scala-package/examples/src/main/resources/log4j.properties
+++ b/scala-package/examples/src/main/resources/log4j.properties
@@ -1,5 +1,5 @@
 # for development debugging
-log4j.rootLogger = warn, stdout
+log4j.rootLogger = info, stdout
 
 log4j.appender.stdout = org.apache.log4j.ConsoleAppender
 log4j.appender.stdout.Target = System.out
diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/imclassification/ModelTrain.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/imclassification/ModelTrain.scala
index d2605a152b4a..97deaf3123b2 100644
--- a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/imclassification/ModelTrain.scala
+++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/imclassification/ModelTrain.scala
@@ -15,8 +15,7 @@ object ModelTrain {
           lr: Float = 0.1f, lrFactor: Float = 1f, lrFactorEpoch: Float = 1f,
           clipGradient: Float = 0f, monitorSize: Int = -1): Unit = {
     // kvstore
-    // TODO: if local mode and no gpu is used, set kv = null
-    val kv = KVStore.create(kvStore)
+    var kv = KVStore.create(kvStore)
 
     // load model
     val modelPrefixWithRank =
@@ -62,6 +61,12 @@ object ModelTrain {
         lrScheduler = lrScheduler, clipGradient = clipGradient,
         momentum = 0.9f, wd = 0.00001f)
 
+    // disable kvstore for single device
+    if (kv.`type`.contains("local") && (devs.length == 1 || devs(0).deviceType != "gpu")) {
+      kv.dispose()
+      kv = null
+    }
+
     val model = new FeedForward(ctx = devs,
                                 symbol = network,
                                 numEpoch = numEpochs,
@@ -80,7 +85,9 @@ object ModelTrain {
               kvStore = kv,
               batchEndCallback = new Speedometer(batchSize, 50),
               epochEndCallback = checkpoint)
-    kv.dispose()
+    if (kv != null) {
+      kv.dispose()
+    }
   }
   // scalastyle:on parameterNum
 }

From 5daba1a05503d39f47a7f4cc12d4ad53c748880c Mon Sep 17 00:00:00 2001
From: Yizhi Liu <javelinjs@gmail.com>
Date: Fri, 24 Jun 2016 00:41:55 +0800
Subject: [PATCH 061/126] [scala] fix memory leak

---
 .../core/src/main/scala/ml/dmlc/mxnet/EvalMetric.scala |  1 +
 .../core/src/main/scala/ml/dmlc/mxnet/Executor.scala   |  4 +++-
 .../core/src/main/scala/ml/dmlc/mxnet/NDArray.scala    | 10 +++++-----
 .../core/src/main/scala/ml/dmlc/mxnet/Optimizer.scala  | 10 +++++++++-
 .../src/main/scala/ml/dmlc/mxnet/io/MXDataIter.scala   |  6 ++++--
 5 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/EvalMetric.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/EvalMetric.scala
index 4dcc52e57efa..35aa2eef6ada 100644
--- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/EvalMetric.scala
+++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/EvalMetric.scala
@@ -59,6 +59,7 @@ class Accuracy extends EvalMetric("accuracy") {
         }
       }
       this.numInst += predLabel.shape(0)
+      predLabel.dispose()
     }
   }
 }
diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Executor.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Executor.scala
index 523093c975f4..7507b3584faa 100644
--- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Executor.scala
+++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Executor.scala
@@ -74,7 +74,9 @@ object Executor {
                                       targets: Seq[Array[(Int, Int, NDArray)]]): Unit = {
     for ((src, dTargets) <- data zip targets) {
       for ((start, end, dst) <- dTargets) {
-        src.slice(start, end).copyTo(dst)
+        val sliced = src.slice(start, end)
+        sliced.copyTo(dst)
+        sliced.dispose()
       }
     }
   }
diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/NDArray.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/NDArray.scala
index 7f1948b45d31..cb8d11746a12 100644
--- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/NDArray.scala
+++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/NDArray.scala
@@ -55,12 +55,12 @@ object NDArray {
         if (output == null) {
           require(acceptEmptyMutate, s"argument out is required to call $funcName")
           output = new NDArray(newEmptyHandle())
+          addDependency(Array(lhs, rhs), Array(output))
         }
         checkCall(_LIB.mxFuncInvoke(handle,
           Array(lhs.handle, rhs.handle),
           Array[MXFloat](),
           Array(output.handle)))
-        addDependency(Array(lhs, rhs), Array(output))
       case _ => throw new IllegalArgumentException(s"call $funcName as binary function")
     }
     output
@@ -76,12 +76,12 @@ object NDArray {
         if (output == null) {
           require(acceptEmptyMutate, s"argument out is required to call $funcName")
           output = new NDArray(newEmptyHandle())
+          addDependency(Array(src), Array(output))
         }
         checkCall(_LIB.mxFuncInvoke(handle,
           Array(src.handle),
           Array[MXFloat](),
           Array(output.handle)))
-        addDependency(Array(src), Array(output))
       case _ => throw new IllegalArgumentException(s"call $funcName as unary function")
     }
     output
@@ -109,17 +109,17 @@ object NDArray {
                                   scalarRange: Range) =>
         require(mutateVars == null || nMutateVars == mutateVars.length,
           s"expect $nMutateVars in $funcName")
+        val useVars = useVarsRange.map(args(_).asInstanceOf[NDArray]).toArray
+        val scalarVars = scalarRange.map(args(_).asInstanceOf[MXFloat]).toArray
         if (mutateVars == null) {
           require(acceptEmptyMutate, s"argument out is required to call $funcName")
           mutateVars = Array.fill[NDArray](nMutateVars)(new NDArray(newEmptyHandle()))
+          addDependency(useVars, mutateVars)
         }
-        val useVars = useVarsRange.map(args(_).asInstanceOf[NDArray]).toArray
-        val scalarVars = scalarRange.map(args(_).asInstanceOf[MXFloat]).toArray
         checkCall(_LIB.mxFuncInvoke(handle,
           useVars.map(_.handle),
           scalarVars,
           mutateVars.map(_.handle).array))
-        addDependency(useVars, mutateVars)
       case _ => throw new IllegalArgumentException(s"call $funcName as generic function")
     }
     mutateVars
diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Optimizer.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Optimizer.scala
index 7b456b0b4bb2..7c233b1c8988 100644
--- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Optimizer.scala
+++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Optimizer.scala
@@ -7,11 +7,19 @@ object Optimizer {
     new MXKVStoreUpdater {
       val states = new scala.collection.mutable.HashMap[Int, AnyRef]
       override def update(index: Int, grad: NDArray, weight: NDArray): Unit = {
-        val state = states.getOrElseUpdate(index, optimizer.createState(index, weight))
+        val state =
+          if (states.contains(index)) {
+            states.get(index).get
+          } else {
+            val newState = optimizer.createState(index, weight)
+            states.put(index, newState)
+            newState
+          }
         optimizer.update(index, weight, grad, state)
       }
       override def dispose(): Unit = {
         states.values.foreach(optimizer.disposeState)
+        states.clear()
       }
     }
   }
diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/MXDataIter.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/MXDataIter.scala
index 41e9ef1cf9b4..929630065926 100644
--- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/MXDataIter.scala
+++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/MXDataIter.scala
@@ -29,9 +29,11 @@ class MXDataIter private[mxnet](private[mxnet] val handle: DataIterHandle,
       iterNext()
       val data = currentBatch.data(0)
       val label = currentBatch.label(0)
-      reset()
       // properties
-      (Map(dataName -> data.shape), Map(labelName -> label.shape), data.shape(0))
+      val res = (Map(dataName -> data.shape), Map(labelName -> label.shape), data.shape(0))
+      currentBatch.dispose()
+      reset()
+      res
     } else {
       (null, null, 0)
     }

From d6540eb13ff7093f8d818014253a41a5ff54a208 Mon Sep 17 00:00:00 2001
From: jennyzhang0215 <jennyzhang0215@gmail.com>
Date: Fri, 24 Jun 2016 18:12:39 +0800
Subject: [PATCH 062/126] add cmake command for Mac OS and edit related file

---
 docs/how_to/build.md    | 12 ++++++++++++
 python/mxnet/libinfo.py |  3 ++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/docs/how_to/build.md b/docs/how_to/build.md
index 7e9c2e873bbc..862b1116e005 100644
--- a/docs/how_to/build.md
+++ b/docs/how_to/build.md
@@ -80,6 +80,18 @@ git clone --recursive https://github.com/dmlc/mxnet
 cd mxnet; cp make/osx.mk ./config.mk; make -j$(sysctl -n hw.ncpu)
 ```
 
+Alternatively, generate an Xcode project with CMake:
+
+```bash
+mkdir build; cd build
+cmake -G Xcode -DCMAKE_BUILD_TYPE=Release -DCMAKE_CONFIGURATION_TYPES="Release" -DUSE_OPENMP="OFF" -DUSE_CUDNN="OFF" -DUSE_CUDA="OFF" -DBLAS=MKL ..
+```
+
+Then open `mxnet.xcodeproj` in Xcode and change two flags in `Build Settings` before building:
+(1) Link-Time Optimization = Yes
+(2) Optimization Level = Fastest [-O3]
+
+
 Troubleshooting:
 
 Some of the users might meet the link error `ld: library not found for -lgomp`, indicating that the GNU implementation of OpenMP is not in the library path of operating system.
diff --git a/python/mxnet/libinfo.py b/python/mxnet/libinfo.py
index 6cf82d8ae3ff..54b49c4290b7 100644
--- a/python/mxnet/libinfo.py
+++ b/python/mxnet/libinfo.py
@@ -14,7 +14,8 @@ def find_lib_path():
     """
     curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
     api_path = os.path.join(curr_path, '../../lib/')
-    dll_path = [curr_path, api_path]
+    cmake_build_path = os.path.join(curr_path, '../../build/Release/')
+    dll_path = [curr_path, api_path, cmake_build_path]
     if os.name == 'nt':
         vs_configuration = 'Release'
         if platform.architecture()[0] == '64bit':

From b5313c17be89871ce7596aa9a70642fb0fb7431d Mon Sep 17 00:00:00 2001
From: Xingjian Shi <xshiab@ust.hk>
Date: Sat, 25 Jun 2016 03:14:44 +0800
Subject: [PATCH 063/126] Implement broadcast_to in the C++ side (#2525)

Revise other Simple OPs for the doc problem.

Revise python test
---
 python/mxnet/ndarray.py                |  16 +--
 src/operator/broadcast_reduce_op-inl.h | 167 ++++++++++++++++---------
 src/operator/broadcast_reduce_op.cc    |   1 +
 src/operator/matrix_op-inl.h           |  16 ++-
 tests/python/unittest/test_operator.py |  52 ++++----
 5 files changed, 150 insertions(+), 102 deletions(-)

diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py
index f68fbe08e023..e26adbfbc45f 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray.py
@@ -314,21 +314,7 @@ def broadcast_to(self, shape):
         shape : the shape to broadcast
             the broadcast shape
         """
-        cur_shape = self.shape
-        err_str = 'operands could not be broadcast together with remapped shapes'\
-                '[original->remapped]: {} and requested shape {}'.format(cur_shape, shape)
-        if len(shape) < len(cur_shape):
-            raise ValueError(err_str)
-        cur_shape = (1,) * (len(shape) - len(cur_shape)) + cur_shape
-        cur_shape_arr = np.array(cur_shape)
-        shape = np.array(shape)
-        broadcasting_axes = np.nonzero(cur_shape_arr != shape)
-        if (cur_shape_arr[broadcasting_axes] != 1).any():
-            raise ValueError(err_str)
-        ret = self.reshape(tuple(cur_shape_arr))
-        for axis in broadcasting_axes[0]:
-            ret = broadcast_axis(ret, axis=axis, size=shape[axis])
-        return ret
+        return broadcast_to(self, shape=tuple(shape))
     # pylint: enable= undefined-variable
 
     def wait_to_read(self):
diff --git a/src/operator/broadcast_reduce_op-inl.h b/src/operator/broadcast_reduce_op-inl.h
index e9c0bd31fbe4..2457948c2d7f 100644
--- a/src/operator/broadcast_reduce_op-inl.h
+++ b/src/operator/broadcast_reduce_op-inl.h
@@ -44,6 +44,17 @@ struct BroadcastAxisParam : public dmlc::Parameter<BroadcastAxisParam> {
   }
 };
 
+struct BroadcastToParam : public dmlc::Parameter<BroadcastToParam> {
+  TShape shape;
+  DMLC_DECLARE_PARAMETER(BroadcastToParam) {
+    DMLC_DECLARE_FIELD(shape).set_default(TShape())
+      .describe("The shape of the desired array."
+                " We can set the dim to zero if it's same as the original."
+                " E.g `A = broadcast_to(B, shape=(10, 0, 0))` "
+                "has the same meaning as `A = broadcast_axis(B, axis=0, size=10)`.");
+  }
+};
+
 inline TShape ReduceAxisShape(const TShape& ishape,
   const EnvArguments& env) {
   ReduceAxisParam param;
@@ -84,6 +95,23 @@ inline TShape BroadcastAxisShape(const TShape& ishape,
   return ret;
 }
 
+inline TShape BroadcastToShape(const TShape& ishape,
+  const EnvArguments& env) {
+  BroadcastToParam param;
+  param.Init(env.kwargs);
+  CHECK_EQ(param.shape.ndim(), ishape.ndim());
+  TShape ret = ishape;
+  for (index_t i = 0; i < param.shape.ndim(); i++) {
+    if (param.shape[i] > 0 && (param.shape[i] != ishape[i])) {
+      CHECK_EQ(ishape[i], 1) <<
+        "Size of the broadcasting axis in the source must be 1, src_shape=" << ishape
+        << ", broadcast_to=" << param.shape;
+      ret[i] = param.shape[i];
+    }
+  }
+  return ret;
+}
+
 // return a shape of scalar
 inline TShape ScalarShape(const TShape& ishape,
                           const EnvArguments& env) {
@@ -108,42 +136,6 @@ void L2Norm(const TBlob &src,
   });
 }
 
-template<typename xpu, typename Reducer>
-void Reduce(const TBlob &src,
-            const EnvArguments& env,
-            TBlob *ret,
-            OpReqType req,
-            RunContext ctx) {
-  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  CHECK_EQ(src.type_flag_, ret->type_flag_);
-  MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, {
-    mshadow::Tensor<xpu, 1, DType> out = ret->get<xpu, 1, DType>(s);
-    mshadow::Tensor<xpu, 2, DType> in =
-      src.get_with_shape<xpu, 2, DType>(mshadow::Shape2(1, src.shape_.Size()), s);
-    ASSIGN_DISPATCH(out, req, (mshadow::expr::reduce_except_dim<0, Reducer>(in)));
-  });
-}
-
-// backward function that takes input value of the op
-template<typename xpu>
-void SumBackward_(const OutputGrad& scale,
-                  const EnvArguments& env,
-                  TBlob *in_grad,
-                  OpReqType req,
-                  RunContext ctx) {
-  using namespace mxnet::op;
-  using namespace mshadow::expr;
-  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  CHECK_EQ(in_grad->type_flag_, scale.data.type_flag_)
-    << "Unary function only support input/output with the same type";
-  MSHADOW_REAL_TYPE_SWITCH(in_grad->type_flag_, DType, {
-      mshadow::Tensor<xpu, 1, DType> mscale = scale.data.get<xpu, 1, DType>(s);
-      mshadow::Tensor<xpu, 2, DType> igrad = in_grad->FlatTo2D<xpu, DType>(s);
-      ASSIGN_DISPATCH(igrad, req,
-                      broadcast_scalar(mscale, igrad.shape_));
-  });
-}
-
 template<typename xpu, typename Reducer>
 void ReduceChannel(const TBlob &src,
                    const EnvArguments& env,
@@ -190,6 +182,15 @@ void ReduceAxisImpl_(const TBlob &src,
   using namespace mshadow::expr;
   Stream<xpu> *s = ctx.get_stream<xpu>();
   CHECK_EQ(src.type_flag_, ret->type_flag_);
+  // If the axes is empty, we just need to give an identity mapping.
+  if (axes.ndim() == 0) {
+    MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, {
+      Tensor<xpu, 2, DType> in = src.FlatTo2D<xpu, DType>(s);
+      Tensor<xpu, 2, DType> out = ret->FlatTo2D<xpu, DType>(s);
+      ASSIGN_DISPATCH(out, req, F<mshadow_op::identity>(in));
+    });
+    return;
+  }
   bool is_contiguous_axes;
   index_t reducing_size;
   CheckContiguousAxes_(&is_contiguous_axes, &reducing_size, axes, src.shape_);
@@ -228,6 +229,15 @@ void BroadcastAxisImpl_(const TBlob &src,
   using namespace mshadow::expr;
   Stream<xpu> *s = ctx.get_stream<xpu>();
   CHECK_EQ(src.type_flag_, ret->type_flag_);
+  // If the axes is empty, we just need to give an identity mapping.
+  if (axes.ndim() == 0) {
+    MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, {
+      Tensor<xpu, 2, DType> in = src.FlatTo2D<xpu, DType>(s);
+      Tensor<xpu, 2, DType> out = ret->FlatTo2D<xpu, DType>(s);
+      ASSIGN_DISPATCH(out, req, F<mshadow_op::identity>(in));
+    });
+    return;
+  }
   bool is_contiguous_axes;
   index_t broadcasting_size;
   CheckContiguousAxes_(&is_contiguous_axes, &broadcasting_size, axes, ret->shape_);
@@ -337,7 +347,46 @@ void BroadcastAxisGrad_(const OutputGrad& out_grad,
   param.Init(env.kwargs);
   std::vector<index_t> axes = ParseAxes_(param.axis, in_grad->ndim());
   ReduceAxisImpl_<xpu, mshadow::red::sum>(out_grad.data, env, in_grad, req, ctx,
-                                                 TShape(axes.begin(), axes.end()));
+                                          TShape(axes.begin(), axes.end()));
+}
+
+// Forward pass of broadcast_to
+template<typename xpu>
+void BroadcastTo(const TBlob &src,
+  const EnvArguments& env,
+  TBlob *ret,
+  OpReqType req,
+  RunContext ctx) {
+  using namespace mshadow::expr;
+  std::vector<index_t> axes;
+  std::vector<size_t> bsizes;
+  for (index_t i = 0; i < src.shape_.ndim(); ++i) {
+    if (src.shape_[i] != ret->shape_[i]) {
+      axes.push_back(i);
+      bsizes.push_back(ret->shape_[i]);
+    }
+  }
+  BroadcastAxisImpl_<xpu>(src, env, ret, req, ctx,
+                          TShape(axes.begin(), axes.end()), TShape(bsizes.begin(), bsizes.end()));
+}
+
+// Backward pass of broadcast_to
+template<typename xpu>
+void BroadcastToGrad_(const OutputGrad& out_grad,
+  const EnvArguments& env,
+  TBlob *in_grad,
+  OpReqType req,
+  RunContext ctx) {
+  using namespace mxnet::op;
+  using namespace mshadow::expr;
+  std::vector<index_t> axes;
+  for (index_t i = 0; i < in_grad->shape_.ndim(); ++i) {
+    if (out_grad.data.shape_[i] != in_grad->shape_[i]) {
+      axes.push_back(i);
+    }
+  }
+  ReduceAxisImpl_<xpu, mshadow::red::sum>(out_grad.data, env, in_grad, req, ctx,
+                                          TShape(axes.begin(), axes.end()));
 }
 
 
@@ -354,9 +403,8 @@ MXNET_REGISTER_SIMPLE_OP(max, XPU)
 .set_function(XPU::kDevMask, ReduceAxis<XPU, mshadow::red::maximum>,
 kNoInplace, kNotRegisterSymbolic)
 .set_shape_function(ReduceAxisShape)
-.describe("Take max of the src in the given axis. Params: `axis` and `keepdims`."
-          " axis: tuple or integer of axes to reduce, global reduce will be performed if not set."
-          " keepdims: the same meaning as Numpy.");
+.describe("Take max of the src in the given axis and returns a NDArray. Follows numpy semantics.")
+.add_arguments(ReduceAxisParam::__FIELDS__());
 
 // Min
 MXNET_REGISTER_SIMPLE_OP(min, XPU)
@@ -364,9 +412,8 @@ MXNET_REGISTER_SIMPLE_OP(min, XPU)
 .set_function(XPU::kDevMask, ReduceAxis<XPU, mshadow::red::minimum>,
 kNoInplace, kNotRegisterSymbolic)
 .set_shape_function(ReduceAxisShape)
-.describe("Take min of the src in the given axis. Params: `axis` and `keepdims`."
-          " axis: tuple or integer of axes to reduce, global reduce will be performed if not set."
-          " keepdims: the same meaning as Numpy.");
+.describe("Take min of the src in the given axis and returns a NDArray. Follows numpy semantics.")
+.add_arguments(ReduceAxisParam::__FIELDS__());
 
 // Sum
 MXNET_REGISTER_SIMPLE_OP(sum, XPU)
@@ -375,9 +422,8 @@ MXNET_REGISTER_SIMPLE_OP(sum, XPU)
 kNoInplace, kRegisterSymbolic)
 .set_shape_function(ReduceAxisShape)
 .set_gradient(XPU::kDevMask, SumAxisGrad_<XPU>, kNoInplace)
-.describe("Take sum of the src in the given axis. Params: `axis` and `keepdims`."
-          " axis: tuple or integer of axes to reduce, global reduce will be performed if not set."
-          " keepdims: the same meaning as Numpy.");
+.describe("Take sum of the src in the given axis and returns a NDArray. Follows numpy semantics.")
+.add_arguments(ReduceAxisParam::__FIELDS__());
 
 // max_axis
 MXNET_REGISTER_SIMPLE_OP(max_axis, XPU)
@@ -386,9 +432,8 @@ MXNET_REGISTER_SIMPLE_OP(max_axis, XPU)
               kNoInplace, kNotRegisterSymbolic)
 .set_shape_function(ReduceAxisShape)
 .describe("(Depreciated! Use max instead!)"
-          " Take max of the src in the given axis. Params: `axis` and `keepdims`."
-          " axis: tuple or integer of axes to reduce, global reduce will be performed if not set."
-          " keepdims: the same meaning as Numpy.");
+          " Take max of the src in the given axis and returns a NDArray. Follows numpy semantics.")
+.add_arguments(ReduceAxisParam::__FIELDS__());
 
 // min_axis
 MXNET_REGISTER_SIMPLE_OP(min_axis, XPU)
@@ -397,9 +442,8 @@ MXNET_REGISTER_SIMPLE_OP(min_axis, XPU)
               kNoInplace, kNotRegisterSymbolic)
 .set_shape_function(ReduceAxisShape)
 .describe("(Depreciated! Use min instead!)"
-          " Take min of the src in the given axis. Params: `axis` and `keepdims`."
-          " axis: tuple or integer of axes to reduce, global reduce will be performed if not set."
-          " keepdims: the same meaning as Numpy.");
+          " Take min of the src in the given axis and returns a NDArray. Follows numpy semantics.")
+.add_arguments(ReduceAxisParam::__FIELDS__());
 
 // sum_axis
 MXNET_REGISTER_SIMPLE_OP(sum_axis, XPU)
@@ -409,9 +453,8 @@ MXNET_REGISTER_SIMPLE_OP(sum_axis, XPU)
 .set_shape_function(ReduceAxisShape)
 .set_gradient(XPU::kDevMask, SumAxisGrad_<XPU>, kNoInplace)
 .describe("(Depreciated! Use sum instead!)"
-          " Take sum of the src in the given axis. Params: `axis` and `keepdims`."
-          " axis: tuple or integer of axes to reduce, global reduce will be performed if not set."
-          " keepdims: the same meaning as Numpy.");
+          " Take sum of the src in the given axis and returns a NDArray. Follows numpy semantics.")
+.add_arguments(ReduceAxisParam::__FIELDS__());
 
 // argmax channel
 MXNET_REGISTER_SIMPLE_OP(argmax_channel, XPU)
@@ -429,7 +472,19 @@ MXNET_REGISTER_SIMPLE_OP(broadcast_axis, XPU)
 .set_shape_function(BroadcastAxisShape)
 .set_gradient(XPU::kDevMask, BroadcastAxisGrad_<XPU>, kNoInplace)
 .describe("Broadcast data in the given axis to the given size. "
-          "The original size of the broadcasting axis must be 1.");
+          "The original size of the broadcasting axis must be 1.")
+.add_arguments(BroadcastAxisParam::__FIELDS__());
+
+// broadcast_to
+MXNET_REGISTER_SIMPLE_OP(broadcast_to, XPU)
+.set_enable_kwargs(true)
+.set_function(XPU::kDevMask, BroadcastTo<XPU>,
+kNoInplace, kRegisterSymbolic)
+.set_shape_function(BroadcastToShape)
+.set_gradient(XPU::kDevMask, BroadcastToGrad_<XPU>, kNoInplace)
+.describe("Broadcast data to the target shape. "
+          "The original size of the broadcasting axis must be 1.")
+.add_arguments(BroadcastToParam::__FIELDS__());
 
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/broadcast_reduce_op.cc b/src/operator/broadcast_reduce_op.cc
index bd604ca2ad1e..5c731f0284bd 100644
--- a/src/operator/broadcast_reduce_op.cc
+++ b/src/operator/broadcast_reduce_op.cc
@@ -10,6 +10,7 @@ namespace op {
 
 DMLC_REGISTER_PARAMETER(ReduceAxisParam);
 DMLC_REGISTER_PARAMETER(BroadcastAxisParam);
+DMLC_REGISTER_PARAMETER(BroadcastToParam);
 
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/matrix_op-inl.h b/src/operator/matrix_op-inl.h
index 24dda9580e62..1195c504ed8a 100644
--- a/src/operator/matrix_op-inl.h
+++ b/src/operator/matrix_op-inl.h
@@ -555,7 +555,8 @@ MXNET_REGISTER_SIMPLE_OP(transpose, XPU)
 .set_function(XPU::kDevMask, Transpose<XPU>, kNoInplace, kRegisterSymbolic)
 .set_shape_function(TransposeShape)
 .set_gradient(XPU::kDevMask, TransposeGrad<XPU>, kNoInplace)
-.describe("Transpose the input matrix and return a new one");
+.describe("Transpose the input matrix and return a new one")
+.add_arguments(TransposeParam::__FIELDS__());
 
 // expand_dim
 MXNET_REGISTER_SIMPLE_OP(expand_dims, XPU)
@@ -563,14 +564,16 @@ MXNET_REGISTER_SIMPLE_OP(expand_dims, XPU)
 .set_function(XPU::kDevMask, ReshapeImpl<XPU>, kInplaceInOut)
 .set_shape_function(ExpandDimShape)
 .set_gradient(XPU::kDevMask, ReshapeGrad_<XPU>, kInplaceOutIn)
-.describe("Expand the shape of array by inserting a new axis.");
+.describe("Expand the shape of array by inserting a new axis.")
+.add_arguments(ExpandDimParam::__FIELDS__());
 
 // crop
 MXNET_REGISTER_SIMPLE_OP(crop, XPU)
 .set_enable_kwargs(true)
 .set_function(XPU::kDevMask, Crop<XPU>, kNoInplace, kNotRegisterSymbolic)
 .set_shape_function(CropShape)
-.describe("Crop the input matrix and return a new one");
+.describe("Crop the input matrix and return a new one")
+.add_arguments(SimpleCropParam::__FIELDS__());
 
 // slice_axis
 MXNET_REGISTER_SIMPLE_OP(slice_axis, XPU)
@@ -579,15 +582,16 @@ MXNET_REGISTER_SIMPLE_OP(slice_axis, XPU)
               kNoInplace, kRegisterSymbolic)
 .set_gradient(XPU::kDevMask, SliceGrad_<XPU>, kNoInplace)
 .set_shape_function(SliceShape)
-.describe("Slice the input along certain axis and return a sliced array.");
+.describe("Slice the input along certain axis and return a sliced array.")
+.add_arguments(SliceParam::__FIELDS__());
 
 // flip
 MXNET_REGISTER_SIMPLE_OP(flip, XPU)
 .set_enable_kwargs(true)
 .set_function(XPU::kDevMask, Flip<XPU>, kNoInplace, kNotRegisterSymbolic)
 .set_shape_function(FlipShape)
-.describe("Flip the input matrix along axis and return a new one");
-
+.describe("Flip the input matrix along axis and return a new one")
+.add_arguments(FlipParam::__FIELDS__());
 
 // dot
 MXNET_REGISTER_SIMPLE_OP(dot, XPU)
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 775a5c9de993..d0833edb96c9 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -1002,11 +1002,11 @@ def test_reduce_inner(numpy_reduce_func, numpy_reduce_grad_func, mx_reduce_sym):
                          args_grad={'a': grad_nd})
             net.forward(is_train=True)
 
-            err_forward = np.square(net.outputs[0].asnumpy() - sum_groundtruth).sum()/np.prod(shape)
-            assert err_forward < 1E-6
+            err_forward = reldiff(net.outputs[0].asnumpy(), sum_groundtruth)
+            assert err_forward < 1E-4
             net.backward(out_grads=mx.nd.array(outgrad_npy))
-            err_backward = np.square(grad_nd.asnumpy() - grad_groundtruth).sum()
-            assert err_backward < 1E-6
+            err_backward = reldiff(grad_nd.asnumpy(), grad_groundtruth)
+            assert err_backward < 1E-4
     test_reduce_inner(lambda data, axis, keepdims:_np_reduce(data, axis, keepdims, np.sum),
                       lambda outgrad, data, axis, keepdims:
                         outgrad.reshape(_np_reduce(data, axis, 1, np.sum).shape),
@@ -1014,36 +1014,38 @@ def test_reduce_inner(numpy_reduce_func, numpy_reduce_grad_func, mx_reduce_sym):
 
 def test_broadcast():
     sample_num = 200
-    def test_broadcast_axis():
-        for i in range(sample_num):
-            # Generate random data that has ndim between 1-7 and all the shape dims between 1-10
-            ndim = np.random.randint(1, 8)
-            target_shape = np.random.randint(1, 11, size=(ndim,))
-            axis = tuple(set(np.random.randint(0, ndim, np.random.randint(1, ndim + 1))))
-            shape = target_shape.copy()
-            size = tuple([shape[ele] for ele in axis])
-            for ele in axis:
-                shape[ele] = 1
-            a = mx.symbol.Variable('a')
-            b = mx.symbol.broadcast_axis(a, axis=axis, size=size)
+    for i in range(sample_num):
+        # Generate random data that has ndim between 1-7 and all the shape dims between 1-10
+        ndim = np.random.randint(1, 8)
+        target_shape = np.random.randint(1, 11, size=(ndim,))
+        axis = tuple(set(np.random.randint(0, ndim, np.random.randint(1, ndim + 1))))
+        shape = target_shape.copy()
+        size = tuple([shape[ele] for ele in axis])
+        for ele in axis:
+            shape[ele] = 1
+        a = mx.symbol.Variable('a')
+        sym_bcast_axis = mx.symbol.broadcast_axis(a, axis=axis, size=size)
+        sym_bcast_to = mx.symbol.broadcast_to(a, shape=tuple(target_shape))
+        def test_broadcasting_ele(sym_bcast):
             dat_npy = np.random.rand(*shape)
             groundtruth = dat_npy
             grad_nd = mx.nd.empty(shape)
             outgrad_npy = np.random.rand(*target_shape)
             grad_groundtruth = _np_reduce(outgrad_npy, axis=axis, keepdims=True,
                                           numpy_reduce_func=np.sum)
-            net = b.bind(mx.cpu(), args={'a': mx.nd.array(dat_npy)},
-                         args_grad={'a': grad_nd})
+            net = sym_bcast.bind(mx.cpu(), args={'a': mx.nd.array(dat_npy)},
+                                                 args_grad={'a': grad_nd})
+            net_bcast_to = sym_bcast_to.bind(mx.cpu(), args={'a': mx.nd.array(dat_npy)},
+                                             args_grad={'a': grad_nd})
             net.forward(is_train=True)
             assert (net.outputs[0].shape == target_shape).all()
-            err_forward = np.square(net.outputs[0].asnumpy() - groundtruth).mean()
-            assert err_forward < 1E-8
+            err_forward = reldiff(net.outputs[0].asnumpy(), groundtruth)
+            assert err_forward < 1E-4
             net.backward(out_grads=mx.nd.array(outgrad_npy))
-            err_backward = np.square(grad_nd.asnumpy() - grad_groundtruth).sum()\
-                           /np.prod(target_shape)
-            assert err_backward < 1E-6
-    test_broadcast_axis()
-
+            err_backward = reldiff(grad_nd.asnumpy(), grad_groundtruth)
+            assert err_backward < 1E-4
+        test_broadcasting_ele(sym_bcast_axis)
+        test_broadcasting_ele(sym_bcast_to)
 
 def test_transpose():
     for ndim in range(1, 6):

From 90f28b468bfb6138377bc80f998701e9e606132a Mon Sep 17 00:00:00 2001
From: "Qiang Kou (KK)" <qkou@umail.iu.edu>
Date: Fri, 24 Jun 2016 13:00:16 -0700
Subject: [PATCH 064/126] (1) export more C++ functions; (2) R docs for CUDA
 (#2520)

---
 R-package/R/mxnet_generated.R | 429 +++++++++++++++++++++++++++++-----
 docs/how_to/build.md          |  31 +++
 2 files changed, 403 insertions(+), 57 deletions(-)

diff --git a/R-package/R/mxnet_generated.R b/R-package/R/mxnet_generated.R
index 08393769a368..d8e32ad58ea9 100644
--- a/R-package/R/mxnet_generated.R
+++ b/R-package/R/mxnet_generated.R
@@ -22,6 +22,64 @@ NULL
 #' @name mx.nd.argmax.channel
 NULL
 
+#' Broadcast data in the given axis to the given size. The original size of the broadcasting axis must be 1.
+#' 
+#' @param src  NDArray
+#'     Source input to the function
+#' @return out The result mx.ndarray
+#' 
+#' @export
+#' @name mx.nd.broadcast.axis
+NULL
+
+#' lhs divide rhs with broadcast
+#' 
+#' @param lhs  NDArray
+#'     Left operand  to the function
+#' @param rhs  NDArray
+#'     Right operand to the function
+#' @return out The result mx.ndarray
+#' 
+#' @export
+#' @name mx.nd.broadcast.div
+NULL
+
+#' lhs minus rhs with broadcast
+#' 
+#' @param lhs  NDArray
+#'     Left operand  to the function
+#' @param rhs  NDArray
+#'     Right operand to the function
+#' @return out The result mx.ndarray
+#' 
+#' @export
+#' @name mx.nd.broadcast.minus
+NULL
+
+#' lhs multiply rhs with broadcast
+#' 
+#' @param lhs  NDArray
+#'     Left operand  to the function
+#' @param rhs  NDArray
+#'     Right operand to the function
+#' @return out The result mx.ndarray
+#' 
+#' @export
+#' @name mx.nd.broadcast.mul
+NULL
+
+#' lhs add rhs with broadcast
+#' 
+#' @param lhs  NDArray
+#'     Left operand  to the function
+#' @param rhs  NDArray
+#'     Right operand to the function
+#' @return out The result mx.ndarray
+#' 
+#' @export
+#' @name mx.nd.broadcast.plus
+NULL
+
 #' Take ceil value of the src
 #' 
 #' @param src  NDArray
@@ -68,6 +126,16 @@ NULL
 #' @name mx.nd.cos
 NULL
 
+#' Crop the input matrix and return a new one
+#' 
+#' @param src  NDArray
+#'     Source input to the function
+#' @return out The result mx.ndarray
+#' 
+#' @export
+#' @name mx.nd.crop
+NULL
+
 #' Calculate dot product of two matrices or two vectors
 #' 
 #' @param lhs  NDArray
@@ -90,6 +158,16 @@ NULL
 #' @name mx.nd.exp
 NULL
 
+#' Expand the shape of array by inserting a new axis.
+#' 
+#' @param src  NDArray
+#'     Source input to the function
+#' @return out The result mx.ndarray
+#' 
+#' @export
+#' @name mx.nd.expand.dims
+NULL
+
 #' Fill one element of each line(row for python, column for R/Julia) in lhs according to index indicated by rhs and values indicated by mhs. This function assume rhs uses 0-based index.
 #' 
 #' @param lhs  NDArray
@@ -104,6 +182,16 @@ NULL
 #' @name mx.nd.fill.element.0index
 NULL
 
+#' Flip the input matrix along axis and return a new one
+#' 
+#' @param src  NDArray
+#'     Source input to the function
+#' @return out The result mx.ndarray
+#' 
+#' @export
+#' @name mx.nd.flip
+NULL
+
 #' Take floor value of the src
 #' 
 #' @param src  NDArray
@@ -124,7 +212,7 @@ NULL
 #' @name mx.nd.log
 NULL
 
-#' Take max of the src.The result will be ndarray of shape (1,) on the same device.
+#' Take max of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy.
 #' 
 #' @param src  NDArray
 #'     Source input to the function
@@ -134,7 +222,17 @@ NULL
 #' @name mx.nd.max
 NULL
 
-#' Take min of the src.The result will be ndarray of shape (1,) on the same device.
+#' (Deprecated! Use max instead!) Take max of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy.
+#' 
+#' @param src  NDArray
+#'     Source input to the function
+#' @return out The result mx.ndarray
+#' 
+#' @export
+#' @name mx.nd.max.axis
+NULL
+
+#' Take min of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy.
 #' 
 #' @param src  NDArray
 #'     Source input to the function
@@ -144,6 +242,16 @@ NULL
 #' @name mx.nd.min
 NULL
 
+#' (Deprecated! Use min instead!) Take min of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy.
+#' 
+#' @param src  NDArray
+#'     Source input to the function
+#' @return out The result mx.ndarray
+#' 
+#' @export
+#' @name mx.nd.min.axis
+NULL
+
 #' Take L2 norm of the src.The result will be ndarray of shape (1,) on the same device.
 #' 
 #' @param src  NDArray
@@ -194,6 +302,26 @@ NULL
 #' @name mx.nd.sin
 NULL
 
+#' Slice the input along certain axis and return a sliced array.
+#' 
+#' @param src  NDArray
+#'     Source input to the function
+#' @return out The result mx.ndarray
+#' 
+#' @export
+#' @name mx.nd.slice.axis
+NULL
+
+#' Calculate Smooth L1 Loss(lhs, scalar)
+#' 
+#' @param src  NDArray
+#'     Source input to the function
+#' @return out The result mx.ndarray
+#' 
+#' @export
+#' @name mx.nd.smooth.l1
+NULL
+
 #' Calculate cross_entropy(lhs, one_hot(rhs))
 #' 
 #' @param lhs  NDArray
@@ -226,7 +354,7 @@ NULL
 #' @name mx.nd.square
 NULL
 
-#' Take sum of the src.The result will be ndarray of shape (1,) on the same device.
+#' Take sum of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy.
 #' 
 #' @param src  NDArray
 #'     Source input to the function
@@ -236,6 +364,16 @@ NULL
 #' @name mx.nd.sum
 NULL
 
+#' (Deprecated! Use sum instead!) Take sum of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy.
+#' 
+#' @param src  NDArray
+#'     Source input to the function
+#' @return out The result mx.ndarray
+#' 
+#' @export
+#' @name mx.nd.sum.axis
+NULL
+
 #' Transpose the input matrix and return a new one
 #' 
 #' @param src  NDArray
@@ -331,6 +469,8 @@ mx.io.CSVIter <- function(...) {
 #'     Dataset Param: Shape of each instance generated by the DataIter.
 #' @param inter.method  int, optional, default='1'
 #'     Augmentation Param: 0-NN 1-bilinear 2-cubic 3-area 4-lanczos4 9-auto 10-rand.
+#' @param pad  int, optional, default='0'
+#'     Augmentation Param: Padding size.
 #' @param mirror  boolean, optional, default=False
 #'     Augmentation Param: Whether to mirror the image.
 #' @param rand.mirror  boolean, optional, default=False
@@ -505,6 +645,19 @@ mx.symbol.Crop <- function(...) {
   mx.varg.symbol.Crop(list(...))
 }
 
+#' Custom operator implemented in frontend.
+#' 
+#' @param op.type  string
+#'     Type of custom operator. Must be registered first.
+#' @param name  string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#' 
+#' @export
+mx.symbol.Custom <- function(...) {
+  mx.varg.symbol.Custom(list(...))
+}
+
 #' Apply deconvolution to input then add a bias.
 #' 
 #' @param data  Symbol
@@ -518,7 +671,11 @@ mx.symbol.Crop <- function(...) {
 #' @param stride  Shape(tuple), optional, default=(1,1)
 #'     deconvolution stride: (y, x)
 #' @param pad  Shape(tuple), optional, default=(0,0)
-#'     pad for deconvolution: (y, x)
+#'     pad for deconvolution: (y, x), a good number is : (kernel-1)/2, if target_shape set, pad will be ignored and will be computed automatically
+#' @param adj  Shape(tuple), optional, default=(0,0)
+#'     adjustment for output shape: (y, x), if target_shape set, adj will be ignored and will be computed automatically
+#' @param target.shape  Shape(tuple), optional, default=(0,0)
+#'     output shape with target shape: (y, x)
 #' @param num.filter  int (non-negative), required
 #'     deconvolution filter(channel) number
 #' @param num.group  int (non-negative), optional, default=1
@@ -586,7 +743,7 @@ mx.symbol.Embedding <- function(...) {
 #' Flatten input
 #' 
 #' @param data  Symbol
-#'     Input data to  flatten.
+#'     Input data to flatten.
 #' @param name  string, optional
 #'     Name of the resulting symbol.
 #' @return out The result mx.symbol
@@ -745,10 +902,27 @@ mx.symbol.MAERegressionOutput <- function(...) {
   mx.varg.symbol.MAERegressionOutput(list(...))
 }
 
+#' Get output from a symbol and pass 1 gradient back. This is used as a terminal loss if unary and binary operators are used to compose a loss with no declaration of backward dependency
+#' 
+#' @param data  Symbol
+#'     Input data.
+#' @param grad.scale  float, optional, default=1
+#'     gradient scale as a supplement to unary and binary operators
+#' @param name  string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#' 
+#' @export
+mx.symbol.MakeLoss <- function(...) {
+  mx.varg.symbol.MakeLoss(list(...))
+}
+
 #' Perform spatial pooling on inputs.
 #' 
 #' @param data  Symbol
 #'     Input data to the pooling operator.
+#' @param global.pool  boolean, optional, default=False
+#'     Ignore kernel size, do global pooling based on current input feature map. This is useful for input with different shape
 #' @param kernel  Shape(tuple), required
 #'     pooling kernel size: (y, x)
 #' @param pool.type  {'avg', 'max', 'sum'}, required
@@ -766,14 +940,16 @@ mx.symbol.Pooling <- function(...) {
   mx.varg.symbol.Pooling(list(...))
 }
 
-#' Resize regions of interest in an input plane to a fixed size by MAX pooling.
+#' Performs region-of-interest pooling on inputs. Resize bounding box coordinates by spatial_scale and crop input feature maps accordingly. The cropped feature maps are pooled by max pooling to a fixed size output indicated by pooled_size. batch_size will change to the number of region bounding boxes after ROIPooling
 #' 
-#' @param data  Symbol[]
-#'     [input tensor, regions of interest]
+#' @param data  Symbol
+#'     Input data to the pooling operator, a 4D Feature maps
+#' @param rois  Symbol
+#'     Bounding box coordinates, a 2D array of [[batch_index, x1, y1, x2, y2]]. (x1, y1) and (x2, y2) are top left and down right corners of designated region of interest. batch_index indicates the index of corresponding image in the input data
 #' @param pooled.size  Shape(tuple), required
-#'     target size: (h, w)
+#'     fix pooled size: (h, w)
 #' @param spatial.scale  float, required
-#'     Ratio of input plane height (or w) to raw image height (or w).
+#'     Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal of total stride in convolutional layers
 #' @param name  string, optional
 #'     Name of the resulting symbol.
 #' @return out The result mx.symbol
@@ -786,11 +962,13 @@ mx.symbol.ROIPooling <- function(...) {
 #' Reshape input to target shape
 #' 
 #' @param data  Symbol
-#'     Input data to  reshape.
-#' @param target.shape  Shape(tuple), required
-#'     Target new shape. One and only one dim can be 0, in which case it will be inferred from the rest of dims
+#'     Input data to reshape.
+#' @param target.shape  Shape(tuple), optional, default=(0,0)
+#'     (Deprecated! Use shape instead.) Target new shape. One and only one dim can be 0, in which case it will be inferred from the rest of dims
 #' @param keep.highest  boolean, optional, default=False
-#'     Whether keep the highest dim unchanged.If set to yes, than the first dim in target_shape is ignored,and always fixed as input
+#'     (Deprecated! Use shape instead.) Whether to keep the highest dim unchanged. If set to yes, then the first dim in target_shape is ignored, and always fixed as input
+#' @param shape  , optional, default=()
+#'     Target new shape. If the dim is same, set it to 0. If the dim is set to be -1, it will be inferred from the rest of dims. One and only one dim can be -1
 #' @param name  string, optional
 #'     Name of the resulting symbol.
 #' @return out The result mx.symbol
@@ -829,6 +1007,8 @@ mx.symbol.SliceChannel <- function(...) {
 #'     If set to true, for a (n,k,x_1,..,x_n) dimensional input tensor, softmax will generate n*x_1*...*x_n output, each has k classes
 #' @param use.ignore  boolean, optional, default=False
 #'     If set to true, the ignore_label value will not contribute to the backward gradient
+#' @param normalization  {'batch', 'null', 'valid'},optional, default='null'
+#'     If set to null, op will do nothing on output gradient. If set to batch, op will normalize gradient by dividing by the batch size. If set to valid, op will normalize gradient by dividing by the number of samples not ignored
 #' @param name  string, optional
 #'     Name of the resulting symbol.
 #' @return out The result mx.symbol
@@ -858,7 +1038,7 @@ mx.symbol.SoftmaxActivation <- function(...) {
 #' @param data  Symbol
 #'     Input data to softmax.
 #' @param label  Symbol
-#'     Label data.
+#'     Label data, can also be probability value with same shape as data
 #' @param grad.scale  float, optional, default=1
 #'     Scale the gradient by a float factor
 #' @param ignore.label  float, optional, default=-1
@@ -867,6 +1047,8 @@ mx.symbol.SoftmaxActivation <- function(...) {
 #'     If set to true, for a (n,k,x_1,..,x_n) dimensional input tensor, softmax will generate n*x_1*...*x_n output, each has k classes
 #' @param use.ignore  boolean, optional, default=False
 #'     If set to true, the ignore_label value will not contribute to the backward gradient
+#' @param normalization  {'batch', 'null', 'valid'},optional, default='null'
+#'     If set to null, op will do nothing on output gradient. If set to batch, op will normalize gradient by dividing by the batch size. If set to valid, op will normalize gradient by dividing by the number of samples not ignored
 #' @param name  string, optional
 #'     Name of the resulting symbol.
 #' @return out The result mx.symbol
@@ -907,6 +1089,8 @@ mx.symbol.SwapAxis <- function(...) {
 #'     How to handle multiple input. concat means concatenate upsampled images along the channel dimension. sum means add all images together, only available for nearest neighbor upsampling.
 #' @param num.args  int, required
 #'     Number of inputs to be upsampled. For nearest neighbor upsampling, this can be 1-N; the size of output will be(scale*h_0,scale*w_0) and all other inputs will be upsampled to thesame size. For bilinear upsampling this must be 2; 1 input and 1 weight.
+#' @param workspace  long (non-negative), optional, default=512
+#'     Tmp workspace for deconvolution (MB)
 #' @param name  string, optional
 #'     Name of the resulting symbol.
 #' @return out The result mx.symbol
@@ -918,24 +1102,93 @@ mx.symbol.UpSampling <- function(...) {
 
 #' Take absolute value of the src
 #' 
+#' @param src  Symbol
+#'     Left symbolic input to the function
+#' @param name  string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#' 
+#' @export
+mx.symbol.abs <- function(...) {
+  mx.varg.symbol.abs(list(...))
+}
+
+#' Broadcast data in the given axis to the given size. The original size of the broadcasting axis must be 1.
+#' 
+#' @param src  Symbol
+#'     Left symbolic input to the function
+#' @param name  string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#' 
+#' @export
+mx.symbol.broadcast_axis <- function(...) {
+  mx.varg.symbol.broadcast_axis(list(...))
+}
+
+#' lhs divide rhs with broadcast
+#' 
 #' @param lhs  Symbol
 #'     Left symbolic input to the function
 #' @param rhs  Symbol
+#'     Right symbolic input to the function
+#' @param name  string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#' 
+#' @export
+mx.symbol.broadcast_div <- function(...) {
+  mx.varg.symbol.broadcast_div(list(...))
+}
+
+#' lhs minus rhs with broadcast
+#' 
+#' @param lhs  Symbol
 #'     Left symbolic input to the function
+#' @param rhs  Symbol
+#'     Right symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
 #' @return out The result mx.symbol
 #' 
 #' @export
-mx.symbol.abs <- function(...) {
-  mx.varg.symbol.abs(list(...))
+mx.symbol.broadcast_minus <- function(...) {
+  mx.varg.symbol.broadcast_minus(list(...))
 }
 
-#' Take ceil value of the src
+#' lhs multiply rhs with broadcast
+#' 
+#' @param lhs  Symbol
+#'     Left symbolic input to the function
+#' @param rhs  Symbol
+#'     Right symbolic input to the function
+#' @param name  string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#' 
+#' @export
+mx.symbol.broadcast_mul <- function(...) {
+  mx.varg.symbol.broadcast_mul(list(...))
+}
+
+#' lhs add rhs with broadcast
 #' 
 #' @param lhs  Symbol
 #'     Left symbolic input to the function
 #' @param rhs  Symbol
+#'     Right symbolic input to the function
+#' @param name  string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#' 
+#' @export
+mx.symbol.broadcast_plus <- function(...) {
+  mx.varg.symbol.broadcast_plus(list(...))
+}
+
+#' Take ceil value of the src
+#' 
+#' @param src  Symbol
 #'     Left symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
@@ -948,9 +1201,7 @@ mx.symbol.ceil <- function(...) {
 
 #' Take cos of the src
 #' 
-#' @param lhs  Symbol
-#'     Left symbolic input to the function
-#' @param rhs  Symbol
+#' @param src  Symbol
 #'     Left symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
@@ -966,7 +1217,7 @@ mx.symbol.cos <- function(...) {
 #' @param lhs  Symbol
 #'     Left symbolic input to the function
 #' @param rhs  Symbol
-#'     Left symbolic input to the function
+#'     Right symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
 #' @return out The result mx.symbol
@@ -978,9 +1229,7 @@ mx.symbol.dot <- function(...) {
 
 #' Take exp of the src
 #' 
-#' @param lhs  Symbol
-#'     Left symbolic input to the function
-#' @param rhs  Symbol
+#' @param src  Symbol
 #'     Left symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
@@ -991,11 +1240,22 @@ mx.symbol.exp <- function(...) {
   mx.varg.symbol.exp(list(...))
 }
 
-#' Take floor value of the src
+#' Expand the shape of array by inserting a new axis.
 #' 
-#' @param lhs  Symbol
+#' @param src  Symbol
 #'     Left symbolic input to the function
-#' @param rhs  Symbol
+#' @param name  string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#' 
+#' @export
+mx.symbol.expand_dims <- function(...) {
+  mx.varg.symbol.expand_dims(list(...))
+}
+
+#' Take floor value of the src
+#' 
+#' @param src  Symbol
 #'     Left symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
@@ -1008,9 +1268,7 @@ mx.symbol.floor <- function(...) {
 
 #' Take log of the src
 #' 
-#' @param lhs  Symbol
-#'     Left symbolic input to the function
-#' @param rhs  Symbol
+#' @param src  Symbol
 #'     Left symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
@@ -1021,11 +1279,26 @@ mx.symbol.log <- function(...) {
   mx.varg.symbol.log(list(...))
 }
 
+#' Sample a normal distribution
+#' 
+#' @param loc  float, optional, default=0
+#'     Mean of the distribution.
+#' @param scale  float, optional, default=1
+#'     Standard deviation of the distribution.
+#' @param shape  Shape(tuple), required
+#'     The shape of the output
+#' @param name  string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#' 
+#' @export
+mx.symbol.normal <- function(...) {
+  mx.varg.symbol.normal(list(...))
+}
+
 #' Take round value of the src
 #' 
-#' @param lhs  Symbol
-#'     Left symbolic input to the function
-#' @param rhs  Symbol
+#' @param src  Symbol
 #'     Left symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
@@ -1038,9 +1311,7 @@ mx.symbol.round <- function(...) {
 
 #' Take rsqrt of the src
 #' 
-#' @param lhs  Symbol
-#'     Left symbolic input to the function
-#' @param rhs  Symbol
+#' @param src  Symbol
 #'     Left symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
@@ -1053,9 +1324,7 @@ mx.symbol.rsqrt <- function(...) {
 
 #' Take sign value of the src
 #' 
-#' @param lhs  Symbol
-#'     Left symbolic input to the function
-#' @param rhs  Symbol
+#' @param src  Symbol
 #'     Left symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
@@ -1068,9 +1337,7 @@ mx.symbol.sign <- function(...) {
 
 #' Take sin of the src
 #' 
-#' @param lhs  Symbol
-#'     Left symbolic input to the function
-#' @param rhs  Symbol
+#' @param src  Symbol
 #'     Left symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
@@ -1081,12 +1348,38 @@ mx.symbol.sin <- function(...) {
   mx.varg.symbol.sin(list(...))
 }
 
+#' Slice the input along certain axis and return a sliced array.
+#' 
+#' @param src  Symbol
+#'     Left symbolic input to the function
+#' @param name  string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#' 
+#' @export
+mx.symbol.slice_axis <- function(...) {
+  mx.varg.symbol.slice_axis(list(...))
+}
+
+#' Calculate Smooth L1 Loss(lhs, scalar)
+#' 
+#' @param src  Symbol
+#'     Left symbolic input to the function
+#' @param name  string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#' 
+#' @export
+mx.symbol.smooth_l1 <- function(...) {
+  mx.varg.symbol.smooth_l1(list(...))
+}
+
 #' Calculate cross_entropy(lhs, one_hot(rhs))
 #' 
 #' @param lhs  Symbol
 #'     Left symbolic input to the function
 #' @param rhs  Symbol
-#'     Left symbolic input to the function
+#'     Right symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
 #' @return out The result mx.symbol
@@ -1098,9 +1391,7 @@ mx.symbol.softmax_cross_entropy <- function(...) {
 
 #' Take sqrt of the src
 #' 
-#' @param lhs  Symbol
-#'     Left symbolic input to the function
-#' @param rhs  Symbol
+#' @param src  Symbol
 #'     Left symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
@@ -1113,9 +1404,7 @@ mx.symbol.sqrt <- function(...) {
 
 #' Take square of the src
 #' 
-#' @param lhs  Symbol
-#'     Left symbolic input to the function
-#' @param rhs  Symbol
+#' @param src  Symbol
 #'     Left symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
@@ -1126,11 +1415,9 @@ mx.symbol.square <- function(...) {
   mx.varg.symbol.square(list(...))
 }
 
-#' Take sum of the src.The result will be ndarray of shape (1,) on the same device.
+#' Take sum of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy.
 #' 
-#' @param lhs  Symbol
-#'     Left symbolic input to the function
-#' @param rhs  Symbol
+#' @param src  Symbol
 #'     Left symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
@@ -1141,11 +1428,22 @@ mx.symbol.sum <- function(...) {
   mx.varg.symbol.sum(list(...))
 }
 
-#' Transpose the input matrix and return a new one
+#' (Deprecated! Use sum instead!) Take sum of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy.
 #' 
-#' @param lhs  Symbol
+#' @param src  Symbol
 #'     Left symbolic input to the function
-#' @param rhs  Symbol
+#' @param name  string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#' 
+#' @export
+mx.symbol.sum_axis <- function(...) {
+  mx.varg.symbol.sum_axis(list(...))
+}
+
+#' Transpose the input matrix and return a new one
+#' 
+#' @param src  Symbol
 #'     Left symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
@@ -1155,3 +1453,20 @@ mx.symbol.sum <- function(...) {
 mx.symbol.transpose <- function(...) {
   mx.varg.symbol.transpose(list(...))
 }
+
+#' Sample a uniform distribution
+#' 
+#' @param low  float, optional, default=0
+#'     The lower bound of distribution
+#' @param high  float, optional, default=1
+#'     The upper bound of distribution
+#' @param shape  Shape(tuple), required
+#'     The shape of the output
+#' @param name  string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#' 
+#' @export
+mx.symbol.uniform <- function(...) {
+  mx.varg.symbol.uniform(list(...))
+}
diff --git a/docs/how_to/build.md b/docs/how_to/build.md
index 862b1116e005..4ef8ac5420b7 100644
--- a/docs/how_to/build.md
+++ b/docs/how_to/build.md
@@ -243,6 +243,37 @@ Now you should have the R package as a tar.gz file and you can install it as a n
 R CMD INSTALL mxnet_0.5.tar.gz
 ```
 
+If you can't load `mxnet` after enabling CUDA during the installation, please add the following lines to `$RHOME/etc/ldpaths`. You can find your `$RHOME` by using `R.home()` inside R.
+
+```bash
+export CUDA_HOME=/usr/local/cuda 
+export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+```
+
+To install the package with GPU support on Windows without building the package from scratch, follow the steps below. Note that you need a couple of programs installed already:  
+- You'll need the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit). This depends on Visual Studio, and a free compatible version would be [Visual Studio Community 2013](https://www.visualstudio.com/en-us/news/vs2013-community-vs.aspx). For instructions and compatibility checks, read http://docs.nvidia.com/cuda/cuda-getting-started-guide-for-microsoft-windows/ .
+
+- You will also need to register as a developer at nvidia and download CUDNN V3, https://developer.nvidia.com/cudnn . 
+
+
+1. Download the mxnet package as a ZIP from the Github repository https://github.com/dmlc/mxnet and unpack it. You will be editing the `/mxnet/R-package` folder.
+
+2. Download the most recent GPU-enabled package from the [Releases tab](https://github.com/dmlc/mxnet/releases). Unzip this file so you have a folder `/nocudnn`. Note that this file and the folder you'll save it in will be used for future reference and not directly for installing the package. Only some files will be copied from it into the `R-package` folder.
+
+(Note: you now have 2 folders we're working with, possibly in different locations, that we'll reference with `R-package/` and `nocudnn/`.)
+
+3. Download CUDNN V3 from https://developer.nvidia.com/cudnn. Unpack the .zip file and you'll see 3 folders, `/bin`, `/include`, `/lib`. Copy and replace these 3 folders into `nocudnn/3rdparty/cudnn/`, or unpack the .zip file there directly.
+
+4. Create the folder `R-package/inst/libs/x64`. We only support 64-bit operating systems for now, so you need the x64 folder.
+
+5. Put dll files in `R-package/inst/libs/x64`. 
+
+The first dll file you need is `nocudnn/lib/libmxnet.dll`. The other dll files you need are the ones in all 4 subfolders of `nocudnn/3rdparty/`, for the `cudnn` and `openblas` you'll need to look in the `/bin` folders. There should be 11 dll files now in `R-package/inst/libs/x64`.
+
+6. Copy the folder `nocudnn/include/` to `R-package/inst/`. So now you should have a folder `R-package/inst/include/` with 3 subfolders.
+
+7. Run `R CMD INSTALL --no-multiarch R-package`. Make sure that R is added to your PATH in Environment Variables. Running the command `Where R` in Command Prompt should return the location.
+
 Note on Library Build:
 
 We isolate the library build with Rcpp end to maximize the portability

From 4ad8c9b60335ae04d5bed2fcf82096f7b47cd5c8 Mon Sep 17 00:00:00 2001
From: Yizhi Liu <javelinjs@gmail.com>
Date: Sun, 26 Jun 2016 12:25:16 +0800
Subject: [PATCH 065/126] [scala] change NDArray._random_uniform to
 NDArray._sample_uniform. support kwargs for GenericNDArrayFunctio (#2529)

---
 .../main/scala/ml/dmlc/mxnet/LibInfo.scala    |  7 +++
 .../main/scala/ml/dmlc/mxnet/NDArray.scala    | 61 ++++++++++++++-----
 .../src/main/scala/ml/dmlc/mxnet/Random.scala | 10 +--
 .../main/native/ml_dmlc_mxnet_native_c_api.cc | 48 +++++++++++++++
 4 files changed, 107 insertions(+), 19 deletions(-)

diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/LibInfo.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/LibInfo.scala
index e11d85fe7cb7..98ce1953243d 100644
--- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/LibInfo.scala
+++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/LibInfo.scala
@@ -39,6 +39,13 @@ class LibInfo {
                            useVars: Array[NDArrayHandle],
                            scalarArgs: Array[MXFloat],
                            mutateVars: Array[NDArrayHandle]): Int
+  @native def mxFuncInvokeEx(function: FunctionHandle,
+                             useVars: Array[NDArrayHandle],
+                             scalarArgs: Array[MXFloat],
+                             mutateVars: Array[NDArrayHandle],
+                             numParams: Int,
+                             paramKeys: Array[Array[Byte]],
+                             paramVals: Array[Array[Byte]]): Int
   @native def mxNDArrayGetShape(handle: NDArrayHandle,
                                 ndim: MXUintRef,
                                 data: ArrayBuffer[Int]): Int
diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/NDArray.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/NDArray.scala
index cb8d11746a12..6807d6a72f31 100644
--- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/NDArray.scala
+++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/NDArray.scala
@@ -91,14 +91,28 @@ object NDArray {
    * Invoke this function by passing in parameters
    *
    * @param args Positional arguments of input scalars and NDArray
-   * @param out NDArray or tuple of NDArray, optional
+   * @param kwargs Key-value arguments for functions, e.g.,
+   *            out: NDArray or tuple of NDArray, optional
    *            Output NDArray, used to hold the output result.
    * @return The result NDArray(tuple) of result of computation.
    */
   def invokeGenericFunc(funcName: String,
-                        args: Array[Any],
-                        out: Array[NDArray] = null): Array[NDArray] = {
-    var mutateVars = out
+                        args: Array[Any] = null,
+                        kwargs: Map[String, Any] = null): Array[NDArray] = {
+    var mutateVars: Array[NDArray] = null
+    val realKwargs =
+      if (kwargs != null && kwargs.contains("out")) {
+        val out = kwargs("out")
+        mutateVars =
+          if (out.isInstanceOf[NDArray]) {
+            Array(kwargs("out").asInstanceOf[NDArray])
+          } else {
+            kwargs("out").asInstanceOf[Array[NDArray]]
+          }
+        kwargs - "out"
+      } else {
+        kwargs
+      }
     val function = functions(funcName)
     require(function != null, s"invalid function name $funcName")
     function match {
@@ -116,10 +130,21 @@ object NDArray {
           mutateVars = Array.fill[NDArray](nMutateVars)(new NDArray(newEmptyHandle()))
           addDependency(useVars, mutateVars)
         }
-        checkCall(_LIB.mxFuncInvoke(handle,
+        val (numKwargs: Int,
+              kwargKeys: Option[Array[Array[Byte]]],
+              kwargVals: Option[Array[Array[Byte]]]) =
+          if (realKwargs == null) {
+            (0, None, None)
+          } else {
+            (realKwargs.size,
+              Some(realKwargs.keys.map(_.getBytes("ASCII") ++ Array(0.toByte)).toArray),
+              Some(realKwargs.values.map(_.toString.getBytes("ASCII") ++ Array(0.toByte)).toArray))
+          }
+        checkCall(_LIB.mxFuncInvokeEx(handle,
           useVars.map(_.handle),
           scalarVars,
-          mutateVars.map(_.handle).array))
+          mutateVars.map(_.handle).array,
+          numKwargs, kwargKeys.orNull, kwargVals.orNull))
       case _ => throw new IllegalArgumentException(s"call $funcName as generic function")
     }
     mutateVars
@@ -464,11 +489,15 @@ object NDArray {
   }
 
   def randomUniform(low: Float, high: Float, out: NDArray): NDArray = {
-    NDArray.invokeGenericFunc("_random_uniform", Array(low, high), Array(out))(0)
+    require(out != null)
+    NDArray.invokeGenericFunc("_sample_uniform", kwargs = Map[String, Any](
+      "low" -> low, "high" -> high, "shape" -> out.shape, "out" -> out))(0)
   }
 
-  def randomGaussian(mean: Float, stdvar: Float, out: NDArray): NDArray = {
-    NDArray.invokeGenericFunc("_random_gaussian", Array(mean, stdvar), Array(out))(0)
+  def randomGaussian(loc: Float, scale: Float, out: NDArray): NDArray = {
+    require(out != null)
+    NDArray.invokeGenericFunc("_sample_normal", kwargs = Map[String, Any](
+      "loc" -> loc, "scale" -> scale, "shape" -> out.shape, "out" -> out))(0)
   }
 
   /**
@@ -743,7 +772,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
    */
   def set(value: Float): NDArray = {
     require(writable, "trying to assign to a readonly NDArray")
-    NDArray.invokeGenericFunc("_set_value", Array[Any](value), out = Array(this))
+    NDArray.invokeGenericFunc("_set_value", Array[Any](value), Map[String, Any]("out" -> this))
     this
   }
 
@@ -777,7 +806,8 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     if (!writable) {
       throw new IllegalArgumentException("trying to add to a readonly NDArray")
     }
-    NDArray.invokeGenericFunc("_plus_scalar", Array[Any](this, other), out = Array(this))
+    NDArray.invokeGenericFunc("_plus_scalar", Array[Any](this, other),
+      Map[String, Any]("out" -> this))
     this
   }
 
@@ -800,7 +830,8 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     if (!writable) {
       throw new IllegalArgumentException("trying to subtract from a readonly NDArray")
     }
-    NDArray.invokeGenericFunc("_minus_scalar", Array[Any](this, other), out = Array(this))
+    NDArray.invokeGenericFunc("_minus_scalar", Array[Any](this, other),
+      Map[String, Any]("out" -> this))
     this
   }
 
@@ -827,7 +858,8 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     if (!writable) {
       throw new IllegalArgumentException("trying to multiply to a readonly NDArray")
     }
-    NDArray.invokeGenericFunc("_mul_scalar", Array[Any](this, other), out = Array(this))
+    NDArray.invokeGenericFunc("_mul_scalar", Array[Any](this, other),
+      Map[String, Any]("out" -> this))
     this
   }
 
@@ -850,7 +882,8 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     if (!writable) {
       throw new IllegalArgumentException("trying to divide from a readonly NDArray")
     }
-    NDArray.invokeGenericFunc("_div_scalar", Array[Any](this, other), out = Array(this))
+    NDArray.invokeGenericFunc("_div_scalar", Array[Any](this, other),
+      Map[String, Any]("out" -> this))
     this
   }
 
diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Random.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Random.scala
index e1279e095dfa..c66dd32cc6a8 100644
--- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Random.scala
+++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Random.scala
@@ -37,15 +37,15 @@ object Random {
   /**
    * Generate normal(Gaussian) distribution N(mean, stdvar^^2) with shape.
    *
-   * @param mean The mean of the normal distribution.
-   * @param stdvar The standard deviation of normal distribution.
+   * @param loc The mean of the normal distribution.
+   * @param scale The standard deviation of normal distribution.
    * @param shape Output shape of the NDArray generated.
    * @param ctx Context of output NDArray, will use default context if not specified.
    * @param out Output place holder
    * @return The result NDArray with generated result.
    */
-  def normal(mean: Float,
-             stdvar: Float,
+  def normal(loc: Float,
+             scale: Float,
              shape: Shape = null,
              ctx: Context = null,
              out: NDArray = null): NDArray = {
@@ -56,7 +56,7 @@ object Random {
       require(shape != null, "shape is required when out is not specified")
       outCopy = empty(shape, ctx)
     }
-    randomGaussian(mean, stdvar, outCopy)
+    randomGaussian(loc, scale, outCopy)
   }
 
 
diff --git a/scala-package/native/src/main/native/ml_dmlc_mxnet_native_c_api.cc b/scala-package/native/src/main/native/ml_dmlc_mxnet_native_c_api.cc
index 5e00481ae416..394176487172 100644
--- a/scala-package/native/src/main/native/ml_dmlc_mxnet_native_c_api.cc
+++ b/scala-package/native/src/main/native/ml_dmlc_mxnet_native_c_api.cc
@@ -139,6 +139,54 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxFuncInvoke
   return ret;
 }
 
+JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxFuncInvokeEx  // JNI bridge to MXFuncInvokeEx
+  (JNIEnv *env, jobject obj, jlong funcPtr, jlongArray useVars,
+    jfloatArray scalarArgs, jlongArray mutateVars,
+    jint numParams, jobjectArray paramKeys, jobjectArray paramVals) {  // keys/vals: byte[][]
+  jlong *cUseVars = env->GetLongArrayElements(useVars, NULL);
+  jfloat *cScalarArgs = env->GetFloatArrayElements(scalarArgs, NULL);
+  jlong *cMutateVars = env->GetLongArrayElements(mutateVars, NULL);
+  jbyte **cParamKeys = NULL;
+  jbyte **cParamVals = NULL;
+  if (numParams > 0) {
+    cParamKeys = new jbyte *[numParams];
+    cParamVals = new jbyte *[numParams];
+    for (jint i = 0; i < numParams; i++) {  // jint: same signedness as numParams / jsize index
+      jbyteArray jkey = reinterpret_cast<jbyteArray>(env->GetObjectArrayElement(paramKeys, i));
+      jbyte *cParamKey = env->GetByteArrayElements(jkey, NULL);
+      cParamKeys[i] = cParamKey;  // passed on as char*; presumably NUL-terminated - TODO confirm
+      env->DeleteLocalRef(jkey);  // drop the local ref now; it is re-fetched for the release
+      jbyteArray jval = reinterpret_cast<jbyteArray>(env->GetObjectArrayElement(paramVals, i));
+      jbyte *cParamVal = env->GetByteArrayElements(jval, NULL);
+      cParamVals[i] = cParamVal;
+      env->DeleteLocalRef(jval);
+    }
+  }
+  int ret = MXFuncInvokeEx(reinterpret_cast<FunctionHandle>(funcPtr),
+                           reinterpret_cast<NDArrayHandle *>(cUseVars),
+                           reinterpret_cast<mx_float *>(cScalarArgs),
+                           reinterpret_cast<NDArrayHandle *>(cMutateVars),
+                           static_cast<int>(numParams),
+                           reinterpret_cast<char **>(cParamKeys),
+                           reinterpret_cast<char **>(cParamVals));
+  env->ReleaseLongArrayElements(useVars, cUseVars, JNI_ABORT);  // inputs only: no copy-back
+  env->ReleaseFloatArrayElements(scalarArgs, cScalarArgs, JNI_ABORT);
+  env->ReleaseLongArrayElements(mutateVars, cMutateVars, JNI_ABORT);  // handle values unchanged
+  if (numParams > 0) {
+    for (jint i = 0; i < numParams; i++) {
+      jbyteArray jkey = reinterpret_cast<jbyteArray>(env->GetObjectArrayElement(paramKeys, i));
+      env->ReleaseByteArrayElements(jkey, cParamKeys[i], JNI_ABORT);
+      env->DeleteLocalRef(jkey);
+      jbyteArray jval = reinterpret_cast<jbyteArray>(env->GetObjectArrayElement(paramVals, i));
+      env->ReleaseByteArrayElements(jval, cParamVals[i], JNI_ABORT);
+      env->DeleteLocalRef(jval);
+    }
+    delete[] cParamKeys;
+    delete[] cParamVals;
+  }
+  return ret;  // status code from MXFuncInvokeEx, returned to the JVM caller unchanged
+}
+
 JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxNDArraySaveRawBytes
   (JNIEnv *env, jobject obj, jlong ndArrayPtr, jobject dataBuf) {
   size_t length;

From f857c1ebfbf1e1d6fd289e8a5f24bd8bb37e00b5 Mon Sep 17 00:00:00 2001
From: Xingjian Shi <xshiab@ust.hk>
Date: Mon, 27 Jun 2016 00:36:35 +0800
Subject: [PATCH 066/126] Add broadcast_power (#2534)

Fix lint
---
 .../elementwise_binary_broadcast_op-inl.h     | 99 +++++++++++++++++++
 tests/python/unittest/test_operator.py        |  9 +-
 2 files changed, 106 insertions(+), 2 deletions(-)

diff --git a/src/operator/elementwise_binary_broadcast_op-inl.h b/src/operator/elementwise_binary_broadcast_op-inl.h
index 5cf73257b3d9..5abdafa6e85c 100644
--- a/src/operator/elementwise_binary_broadcast_op-inl.h
+++ b/src/operator/elementwise_binary_broadcast_op-inl.h
@@ -414,6 +414,98 @@ void BroadcastDivBackward_(const OutputGrad& out_grad,
   });
 }
 
+template<typename xpu>
+void BroadcastPowerBackward_(const OutputGrad& out_grad,
+  const Input0& lhs,
+  const Input1& rhs,
+  const EnvArguments& env,
+  TBlob* lhs_grad,
+  TBlob* rhs_grad,
+  OpReqType req_lhs_grad,
+  OpReqType req_rhs_grad,
+  RunContext ctx) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  if (!IsBroadcastNeeded_(lhs_grad->shape_, rhs_grad->shape_)) {
+    MSHADOW_TYPE_SWITCH(lhs_grad->type_flag_, DType, {
+      mshadow::Tensor<xpu, 2, DType> mout_grad = out_grad.data.FlatTo2D<xpu, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> mlhs_data = lhs.data.FlatTo2D<xpu, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> mrhs_data = rhs.data.FlatTo2D<xpu, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> mlhs_grad = lhs_grad->FlatTo2D<xpu, DType>(s);
+      mshadow::Tensor<xpu, 2, DType> mrhs_grad = rhs_grad->FlatTo2D<xpu, DType>(s);
+      CHECK_NE(req_rhs_grad, kWriteInplace);
+      ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad,
+        F<mshadow_op::log>(mlhs_data) *
+        F<mshadow_op::power>(mlhs_data, mrhs_data) * mout_grad);
+      ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad,
+        mrhs_data *
+        F<mshadow_op::power>(mlhs_data, mrhs_data - scalar<DType>(1)) *
+        mout_grad);
+    });
+    return;
+  }
+  bool do_opt;
+  TShape lhs_new_shape_, rhs_new_shape_, out_new_shape_;
+  InferBroadcastNewShapes_(&do_opt, &lhs_new_shape_, &rhs_new_shape_, &out_new_shape_,
+    lhs_grad->shape_, rhs_grad->shape_, out_grad.data.shape_);
+  MSHADOW_REAL_TYPE_SWITCH(lhs_grad->type_flag_, DType, {
+    if (do_opt) {
+      Shape<3> lhs_new_shape, rhs_new_shape, out_new_shape;
+      for (index_t i = 0; i < 3; i++) {
+        lhs_new_shape[i] = lhs_new_shape_[i];
+        rhs_new_shape[i] = rhs_new_shape_[i];
+        out_new_shape[i] = out_new_shape_[i];
+      }
+      mshadow::Tensor<xpu, 3, DType> mout_grad =
+        out_grad.data.get_with_shape<xpu, 3, DType>(out_new_shape, s);
+      mshadow::Tensor<xpu, 3, DType> mlhs_data =
+        lhs.data.get_with_shape<xpu, 3, DType>(lhs_new_shape, s);
+      mshadow::Tensor<xpu, 3, DType> mrhs_data =
+        rhs.data.get_with_shape<xpu, 3, DType>(rhs_new_shape, s);
+      mshadow::Tensor<xpu, 1, DType> mlhs_grad =
+        lhs_grad->get_with_shape<xpu, 1, DType>(Shape1(lhs_grad->Size()), s);
+      mshadow::Tensor<xpu, 1, DType> mrhs_grad =
+        rhs_grad->get_with_shape<xpu, 1, DType>(Shape1(rhs_grad->Size()), s);
+      ReduceToAssign<red::sum>(mrhs_grad, req_rhs_grad, rhs_new_shape_,
+        F<mshadow_op::log>(broadcast_to(mlhs_data, out_new_shape_)) *
+        F<mshadow_op::power>(broadcast_to(mlhs_data, out_new_shape_),
+                             broadcast_to(mrhs_data, out_new_shape_)) * mout_grad);
+      ReduceToAssign<red::sum>(mlhs_grad, req_lhs_grad, lhs_new_shape_,
+        broadcast_to(mrhs_data, out_new_shape_) *
+        F<mshadow_op::power>(broadcast_to(mlhs_data, out_new_shape_),
+                             broadcast_to(mrhs_data, out_new_shape_) - scalar<DType>(1)) *
+        mout_grad);
+    } else {
+      Shape<MXNET_SPECIAL_MAX_NDIM> lhs_new_shape, rhs_new_shape, out_new_shape;
+      for (index_t i = 0; i < MXNET_SPECIAL_MAX_NDIM; i++) {
+        lhs_new_shape[i] = lhs_new_shape_[i];
+        rhs_new_shape[i] = rhs_new_shape_[i];
+        out_new_shape[i] = out_new_shape_[i];
+      }
+      mshadow::Tensor<xpu, MXNET_SPECIAL_MAX_NDIM, DType> mout_grad =
+        out_grad.data.get_with_shape<xpu, MXNET_SPECIAL_MAX_NDIM, DType>(out_new_shape, s);
+      mshadow::Tensor<xpu, MXNET_SPECIAL_MAX_NDIM, DType> mlhs_data =
+        lhs.data.get_with_shape<xpu, MXNET_SPECIAL_MAX_NDIM, DType>(lhs_new_shape, s);
+      mshadow::Tensor<xpu, MXNET_SPECIAL_MAX_NDIM, DType> mrhs_data =
+        rhs.data.get_with_shape<xpu, MXNET_SPECIAL_MAX_NDIM, DType>(rhs_new_shape, s);
+      mshadow::Tensor<xpu, 1, DType> mlhs_grad =
+        lhs_grad->get_with_shape<xpu, 1, DType>(Shape1(lhs_grad->Size()), s);
+      mshadow::Tensor<xpu, 1, DType> mrhs_grad =
+        rhs_grad->get_with_shape<xpu, 1, DType>(Shape1(rhs_grad->Size()), s);
+      ReduceToAssign<red::sum>(mrhs_grad, req_rhs_grad, rhs_new_shape_,
+        F<mshadow_op::log>(broadcast_to(mlhs_data, out_new_shape_)) *
+        F<mshadow_op::power>(broadcast_to(mlhs_data, out_new_shape_),
+        broadcast_to(mrhs_data, out_new_shape_)) * mout_grad);
+      ReduceToAssign<red::sum>(mlhs_grad, req_lhs_grad, lhs_new_shape_,
+        broadcast_to(mrhs_data, out_new_shape_) *
+        F<mshadow_op::power>(broadcast_to(mlhs_data, out_new_shape_),
+        broadcast_to(mrhs_data, out_new_shape_) - scalar<DType>(1)) *
+        mout_grad);
+    }
+  });
+}
+
 
 MXNET_REGISTER_SIMPLE_OP(broadcast_plus, XPU)
 .set_shape_function(BinaryBroadcastShape_)
@@ -445,6 +537,13 @@ MXNET_REGISTER_SIMPLE_OP(broadcast_div, XPU)
 .set_gradient(XPU::kDevMask, BroadcastDivBackward_<XPU>, kNoInplace)
 .describe("lhs divide rhs with broadcast");
 
+MXNET_REGISTER_SIMPLE_OP(broadcast_power, XPU)
+.set_shape_function(BinaryBroadcastShape_)
+.set_function(XPU::kDevMask, BinaryBroadcastForward_<
+              XPU, mshadow_op::power>, kNoInplace, kRegisterSymbolic)
+.set_gradient(XPU::kDevMask, BroadcastPowerBackward_<XPU>, kNoInplace)
+.describe("lhs power rhs with broadcast");
+
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_ELEMENTWISE_BINARY_BROADCAST_OP_INL_H_
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index d0833edb96c9..5ef79cfb6314 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -848,10 +848,17 @@ def test_bdiv(a, b):
         _check_broadcast_op_forward(c, lambda a, b: a / b)
         _check_broadcast_op_backward(c, lambda g_out, a, b: (g_out / b, - g_out * a / (b * b)))
 
+    def test_bpow(a, b):
+        c = mx.sym.broadcast_power(a, b)
+        _check_broadcast_op_forward(c, lambda a, b: a ** b)
+        _check_broadcast_op_backward(c, lambda g_out, a, b: (g_out * a **(b - 1) * b,
+                                                             g_out * a ** b * np.log(a)))
+
     test_bplus(a, b)
     test_bminus(a, b)
     test_bmul(a, b)
     test_bdiv(a, b)
+    test_bpow(a, b)
 
 def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), verbose=False):
     # Input for spike response
@@ -1035,8 +1042,6 @@ def test_broadcasting_ele(sym_bcast):
                                           numpy_reduce_func=np.sum)
             net = sym_bcast.bind(mx.cpu(), args={'a': mx.nd.array(dat_npy)},
                                                  args_grad={'a': grad_nd})
-            net_bcast_to = sym_bcast_to.bind(mx.cpu(), args={'a': mx.nd.array(dat_npy)},
-                                             args_grad={'a': grad_nd})
             net.forward(is_train=True)
             assert (net.outputs[0].shape == target_shape).all()
             err_forward = reldiff(net.outputs[0].asnumpy(), groundtruth)

From 37789909a32901cdb80e27cc20257d7c90f5de6f Mon Sep 17 00:00:00 2001
From: Ldpe2G <liangdepeng@gmail.com>
Date: Sat, 25 Jun 2016 09:39:36 +0800
Subject: [PATCH 067/126] finish PrefetchIterator

---
 .../ml/dmlc/mxnet/io/PrefetchingIter.scala    | 118 ++++++++++++++++--
 .../test/scala/ml/dmlc/mxnet/IOSuite.scala    |  35 +++++-
 2 files changed, 142 insertions(+), 11 deletions(-)

diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/PrefetchingIter.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/PrefetchingIter.scala
index 5ae522658581..ecbbab304184 100644
--- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/PrefetchingIter.scala
+++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/PrefetchingIter.scala
@@ -2,12 +2,14 @@ package ml.dmlc.mxnet.io
 
 import ml.dmlc.mxnet.{DataBatch, DataIter, NDArray, Shape}
 import org.slf4j.LoggerFactory
+import java.util.concurrent.Semaphore
 
 /**
  * TODO
  * Base class for prefetching iterators. Takes one or more DataIters
  * and combine them with prefetching.
  *
+ * @author Depeng Liang
  *
  * @param iters list of DataIters
  * @param dataNames
@@ -18,46 +20,142 @@ class PrefetchingIter(val iters: IndexedSeq[DataIter],
                       val labelNames: Map[String, String] = null) extends DataIter {
   private val logger = LoggerFactory.getLogger(classOf[PrefetchingIter])
 
+  require(iters.length > 0, "Iters length must be greater than 0")
+
+  private val _provideData: Map[String, Shape] = {
+    if (dataNames == null) {
+      iters.map(_.provideData).foldLeft(Map[String, Shape]()) { (acc, elem) =>
+        acc ++ elem
+      }
+    } else {
+      iters.map(_.provideData).map(m => m.map(t => (dataNames(t._1), t._2)))
+             .foldLeft(Map[String, Shape]()) { (acc, elem) =>
+        acc ++ elem
+      }
+    }
+  }
+
+  private val _provideLabel: Map[String, Shape] = {
+    if (labelNames == null) {
+      iters.map(_.provideLabel).foldLeft(Map[String, Shape]()) { (acc, elem) =>
+        acc ++ elem
+      }
+    } else {
+      iters.map(_.provideLabel).map(m => m.map(t => (labelNames(t._1), t._2)))
+             .foldLeft(Map[String, Shape]()) { (acc, elem) =>
+        acc ++ elem
+      }
+    }
+  }
+
+  private val _batchSize: Int = this._provideData.toList(0)._2(0)
+  private val dataReady: IndexedSeq[Semaphore] =
+                                        (0 until iters.length).map(i => new Semaphore(0))
+  private val dataTaken: IndexedSeq[Semaphore] =
+                                        (0 until iters.length).map(i => new Semaphore(1))
+
+  private var started: Boolean = true
   private var currentBatch: DataBatch = null
-  private var nextBatch: DataBatch = null
+  private var nextBatch: Array[DataBatch] = (0 until iters.length).map { i =>
+    new DataBatch(null, null, null, 0)
+  }.toArray
+
+  // thread entry
+  def prefetchFunc(i: Int): Runnable = new Runnable {
+    override def run(): Unit = {
+      while (started) {
+        dataTaken(i).acquire()
+        if (started) {
+          try {
+            nextBatch(i) = iters(i).next()
+          } catch {
+            case ex: NoSuchElementException => nextBatch(i) = null
+          }
+          dataReady(i).release()
+        }
+      }
+    }
+  }
+
+  private val prefetchThreads =
+    for (i <- 0 until iters.length) yield new Thread(prefetchFunc(i))
+  prefetchThreads.foreach(_.start())
+
+  override def next(): DataBatch = currentBatch
 
   /**
    * reset the iterator
    */
-  override def reset(): Unit = ???
+  override def reset(): Unit = {
+    for (e <- dataReady) e.acquire()
+    for (i <- iters) i.reset()
+    for (e <- dataTaken) e.release()
+  }
+
+  override def batchSize: Int = this._batchSize
 
   /**
    * get data of current batch
    * @return the data of current batch
    */
-  override def getData(): IndexedSeq[NDArray] = ???
+  override def getData(): IndexedSeq[NDArray] = currentBatch.data
 
   /**
    * Get label of current batch
    * @return the label of current batch
    */
-  override def getLabel(): IndexedSeq[NDArray] = ???
+  override def getLabel(): IndexedSeq[NDArray] = currentBatch.label
 
   /**
    * the index of current batch
    * @return
    */
-  override def getIndex(): IndexedSeq[Long] = ???
+  override def getIndex(): IndexedSeq[Long] = currentBatch.index
 
   // The name and shape of label provided by this iterator
-  override def provideLabel: Map[String, Shape] = ???
+  override def provideLabel: Map[String, Shape] = this._provideLabel
 
   /**
    * get the number of padding examples
    * in current batch
    * @return number of padding examples in current batch
    */
-  override def getPad(): Int = ???
+  override def getPad(): Int = this.currentBatch.pad
 
   // The name and shape of data provided by this iterator
-  override def provideData: Map[String, Shape] = ???
+  override def provideData: Map[String, Shape] = this._provideData
 
-  override def hasNext: Boolean = ???
+  override def hasNext: Boolean = {
+    for (e <- dataReady) e.acquire()
+    if (nextBatch(0) == null) {
+      for (i <- nextBatch) {
+        assert(i == null, "Number of entry mismatches between iterators")
+      }
+      for (e <- dataReady) e.release()
+      false
+    } else {
+      for (batch <- nextBatch) {
+        assert(batch.pad == nextBatch(0).pad,
+            "Number of entry mismatches between iterators")
+      }
+      val datas = for (batch <- nextBatch) yield batch.data
+      val labels = for (batch <- nextBatch) yield batch.label
+      currentBatch = new DataBatch(datas.toIndexedSeq.flatten,
+                                      labels.toIndexedSeq.flatten,
+                                      nextBatch(0).index,
+                                      nextBatch(0).pad)
+      for (e <- dataTaken) e.release()
+      true
+    }
+  }
 
-  override def batchSize: Int = ???
+  /**
+   * Stop all its internal prefetching threads.
+   * The object shall never be used after it is disposed.
+   */
+  def dispose(): Unit = {
+    started = false
+    for (e <- dataTaken) e.release()
+    for (t <- prefetchThreads) t.join()
+  }
 }
diff --git a/scala-package/core/src/test/scala/ml/dmlc/mxnet/IOSuite.scala b/scala-package/core/src/test/scala/ml/dmlc/mxnet/IOSuite.scala
index 38d8adf930e1..7ccfd04e8472 100644
--- a/scala-package/core/src/test/scala/ml/dmlc/mxnet/IOSuite.scala
+++ b/scala-package/core/src/test/scala/ml/dmlc/mxnet/IOSuite.scala
@@ -1,6 +1,6 @@
 package ml.dmlc.mxnet
 
-import ml.dmlc.mxnet.io.{NDArrayIter, ResizeIter}
+import ml.dmlc.mxnet.io.{NDArrayIter, ResizeIter, PrefetchingIter}
 import org.scalatest.{BeforeAndAfterAll, FunSuite}
 import scala.sys.process._
 
@@ -150,6 +150,39 @@ class IOSuite extends FunSuite with BeforeAndAfterAll {
     assert(batchCount === nBatch)
   }
 
+  test("test PrefetchIter") {
+    // get data
+    "./scripts/get_mnist_data.sh" !
+
+    val params = Map(
+      "image" -> "data/train-images-idx3-ubyte",
+      "label" -> "data/train-labels-idx1-ubyte",
+      "data_shape" -> "(784,)",
+      "batch_size" -> "100",
+      "shuffle" -> "1",
+      "flat" -> "1",
+      "silent" -> "0",
+      "seed" -> "10"
+    )
+
+    val mnistIter = IO.MNISTIter(params)
+    val mnistIter2 = IO.MNISTIter(params)
+    var prefetchIter = new PrefetchingIter(IndexedSeq(mnistIter, mnistIter2))
+
+    while(prefetchIter.hasNext) {
+      prefetchIter.next()
+    }
+
+    prefetchIter.reset()
+    while(prefetchIter.hasNext) {
+      prefetchIter.next()
+    }
+
+    prefetchIter.dispose()
+
+    assert(true)
+  }
+
   test("test NDArrayIter") {
     val shape0 = Shape(Array(1000, 2, 2))
     val data = IndexedSeq(NDArray.ones(shape0), NDArray.zeros(shape0))

From 7ec93d21acfcc1eeafa59d721664965f048638d1 Mon Sep 17 00:00:00 2001
From: Ldpe2G <liangdepeng@gmail.com>
Date: Sun, 26 Jun 2016 09:46:51 +0800
Subject: [PATCH 068/126] add some check to prefetchIter

---
 .../ml/dmlc/mxnet/io/PrefetchingIter.scala    | 10 ++--
 .../test/scala/ml/dmlc/mxnet/IOSuite.scala    | 48 +++++++++++++++----
 2 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/PrefetchingIter.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/PrefetchingIter.scala
index ecbbab304184..17266eed93db 100644
--- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/PrefetchingIter.scala
+++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/PrefetchingIter.scala
@@ -16,8 +16,8 @@ import java.util.concurrent.Semaphore
  * @param labelNames
  */
 class PrefetchingIter(val iters: IndexedSeq[DataIter],
-                      val dataNames: Map[String, String] = null,
-                      val labelNames: Map[String, String] = null) extends DataIter {
+                      val dataNames: IndexedSeq[Map[String, String]] = null,
+                      val labelNames: IndexedSeq[Map[String, String]] = null) extends DataIter {
   private val logger = LoggerFactory.getLogger(classOf[PrefetchingIter])
 
   require(iters.length > 0, "Iters length must be greater than 0")
@@ -28,7 +28,8 @@ class PrefetchingIter(val iters: IndexedSeq[DataIter],
         acc ++ elem
       }
     } else {
-      iters.map(_.provideData).map(m => m.map(t => (dataNames(t._1), t._2)))
+      iters.zipWithIndex.map(tu => (tu._1.provideData, tu._2))
+             .map(m => m._1.map(t => (dataNames(m._2)(t._1), t._2)))
              .foldLeft(Map[String, Shape]()) { (acc, elem) =>
         acc ++ elem
       }
@@ -41,7 +42,8 @@ class PrefetchingIter(val iters: IndexedSeq[DataIter],
         acc ++ elem
       }
     } else {
-      iters.map(_.provideLabel).map(m => m.map(t => (labelNames(t._1), t._2)))
+      iters.zipWithIndex.map(tu => (tu._1.provideLabel, tu._2))
+             .map(m => m._1.map(t => (labelNames(m._2)(t._1), t._2)))
              .foldLeft(Map[String, Shape]()) { (acc, elem) =>
         acc ++ elem
       }
diff --git a/scala-package/core/src/test/scala/ml/dmlc/mxnet/IOSuite.scala b/scala-package/core/src/test/scala/ml/dmlc/mxnet/IOSuite.scala
index 7ccfd04e8472..045be628e38d 100644
--- a/scala-package/core/src/test/scala/ml/dmlc/mxnet/IOSuite.scala
+++ b/scala-package/core/src/test/scala/ml/dmlc/mxnet/IOSuite.scala
@@ -154,7 +154,7 @@ class IOSuite extends FunSuite with BeforeAndAfterAll {
     // get data
     "./scripts/get_mnist_data.sh" !
 
-    val params = Map(
+     val params = Map(
       "image" -> "data/train-images-idx3-ubyte",
       "label" -> "data/train-labels-idx1-ubyte",
       "data_shape" -> "(784,)",
@@ -165,22 +165,52 @@ class IOSuite extends FunSuite with BeforeAndAfterAll {
       "seed" -> "10"
     )
 
-    val mnistIter = IO.MNISTIter(params)
-    val mnistIter2 = IO.MNISTIter(params)
-    var prefetchIter = new PrefetchingIter(IndexedSeq(mnistIter, mnistIter2))
+    val mnistPack1 = IO.MNISTPack(params)
+    val mnistPack2 = IO.MNISTPack(params)
+
+    val nBatch = 600
+    var batchCount = 0
+
+    val mnistIter1 = mnistPack1.iterator
+    val mnistIter2 = mnistPack2.iterator
 
+    var prefetchIter = new PrefetchingIter(
+        IndexedSeq(mnistIter1, mnistIter2),
+        IndexedSeq(Map("data" -> "data1"), Map("data" -> "data2")),
+        IndexedSeq(Map("label" -> "label1"), Map("label" -> "label2"))
+    )
+
+    // test loop
     while(prefetchIter.hasNext) {
       prefetchIter.next()
+      batchCount += 1
     }
+    assert(nBatch === batchCount)
 
+    // test provideData
+    val provideData = prefetchIter.provideData
+    val provideLabel = prefetchIter.provideLabel
+    assert(provideData("data1") === Shape(100, 784))
+    assert(provideData("data2") === Shape(100, 784))
+    assert(provideLabel("label1") === Shape(100))
+    assert(provideLabel("label2") === Shape(100))
+
+    // test reset
     prefetchIter.reset()
-    while(prefetchIter.hasNext) {
-      prefetchIter.next()
-    }
+    prefetchIter.next()
+    val label0 = prefetchIter.getLabel().head.toArray
+    val data0 = prefetchIter.getData().head.toArray
+    prefetchIter.next()
+    prefetchIter.next()
+    prefetchIter.next()
+    prefetchIter.reset()
+    prefetchIter.next()
+    val label1 = prefetchIter.getLabel().head.toArray
+    val data1 = prefetchIter.getData().head.toArray
+    assert(label0 === label1)
+    assert(data0 === data1)
 
     prefetchIter.dispose()
-
-    assert(true)
   }
 
   test("test NDArrayIter") {

From 409565665832ef76ca54f44d8e4eb6b8a8456cbb Mon Sep 17 00:00:00 2001
From: Ldpe2G <liangdepeng@gmail.com>
Date: Sun, 26 Jun 2016 10:30:13 +0800
Subject: [PATCH 069/126] fix code style

---
 scala-package/core/src/test/scala/ml/dmlc/mxnet/IOSuite.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scala-package/core/src/test/scala/ml/dmlc/mxnet/IOSuite.scala b/scala-package/core/src/test/scala/ml/dmlc/mxnet/IOSuite.scala
index 045be628e38d..5fefc0704ba4 100644
--- a/scala-package/core/src/test/scala/ml/dmlc/mxnet/IOSuite.scala
+++ b/scala-package/core/src/test/scala/ml/dmlc/mxnet/IOSuite.scala
@@ -154,7 +154,7 @@ class IOSuite extends FunSuite with BeforeAndAfterAll {
     // get data
     "./scripts/get_mnist_data.sh" !
 
-     val params = Map(
+    val params = Map(
       "image" -> "data/train-images-idx3-ubyte",
       "label" -> "data/train-labels-idx1-ubyte",
       "data_shape" -> "(784,)",

From 03bd1d7cbdb2cdf419fc65dc3d6b47f2eaf66098 Mon Sep 17 00:00:00 2001
From: Ldpe2G <liangdepeng@gmail.com>
Date: Mon, 27 Jun 2016 09:14:53 +0800
Subject: [PATCH 070/126] fix some problems

---
 .../src/main/scala/ml/dmlc/mxnet/io/PrefetchingIter.scala    | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/PrefetchingIter.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/PrefetchingIter.scala
index 17266eed93db..8aa821daf628 100644
--- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/PrefetchingIter.scala
+++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/PrefetchingIter.scala
@@ -5,7 +5,6 @@ import org.slf4j.LoggerFactory
 import java.util.concurrent.Semaphore
 
 /**
- * TODO
  * Base class for prefetching iterators. Takes one or more DataIters
  * and combine them with prefetching.
  *
@@ -56,7 +55,7 @@ class PrefetchingIter(val iters: IndexedSeq[DataIter],
   private val dataTaken: IndexedSeq[Semaphore] =
                                         (0 until iters.length).map(i => new Semaphore(1))
 
-  private var started: Boolean = true
+  @volatile private var started: Boolean = true
   private var currentBatch: DataBatch = null
   private var nextBatch: Array[DataBatch] = (0 until iters.length).map { i =>
     new DataBatch(null, null, null, 0)
@@ -73,8 +72,8 @@ class PrefetchingIter(val iters: IndexedSeq[DataIter],
           } catch {
             case ex: NoSuchElementException => nextBatch(i) = null
           }
-          dataReady(i).release()
         }
+        dataReady(i).release()
       }
     }
   }

From 0ad34c9bada6eb3a5e11c8b56b7e36ddaa214bfd Mon Sep 17 00:00:00 2001
From: Eric Junyuan Xie <piiswrong@users.noreply.github.com>
Date: Mon, 27 Jun 2016 19:29:26 -0700
Subject: [PATCH 071/126] cudnn auto tuning (#2538)

* cudnn convolution auto tune

* move tune

* fix

* lint

* lint

* turn on tune by default

* remove logging
---
 src/operator/convolution-inl.h       |  14 +-
 src/operator/convolution.cc          |  12 +-
 src/operator/convolution.cu          |   8 +-
 src/operator/cudnn_convolution-inl.h | 122 +++++++++++-------
 src/operator/cudnn_convolution.cc    | 185 +++++++++++++++++++++++++++
 src/operator/custom.cc               |   2 +-
 src/symbol/graph_executor.cc         |  11 +-
 src/symbol/graph_executor.h          |   7 +-
 8 files changed, 301 insertions(+), 60 deletions(-)
 create mode 100644 src/operator/cudnn_convolution.cc

diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h
index 74801d55a557..46e99d4f514d 100644
--- a/src/operator/convolution-inl.h
+++ b/src/operator/convolution-inl.h
@@ -25,6 +25,7 @@ namespace conv {
 enum ConvolutionOpInputs {kData, kWeight, kBias};
 enum ConvolutionOpOutputs {kOut};
 enum ConvolutionOpResource {kTempSpace};
+enum ConvolutionOpCudnnTune {kOff, kLimited, kFastest};
 }
 
 struct ConvolutionParam : public dmlc::Parameter<ConvolutionParam> {
@@ -36,6 +37,7 @@ struct ConvolutionParam : public dmlc::Parameter<ConvolutionParam> {
   uint32_t num_group;
   uint64_t workspace;
   bool no_bias;
+  int cudnn_tune;
   DMLC_DECLARE_PARAMETER(ConvolutionParam) {
     int shape[] = {1, 1};
     DMLC_DECLARE_FIELD(kernel).describe("convolution kernel size: (y, x)");
@@ -56,6 +58,13 @@ struct ConvolutionParam : public dmlc::Parameter<ConvolutionParam> {
     .describe("Tmp workspace for convolution (MB).");
     DMLC_DECLARE_FIELD(no_bias).set_default(false)
     .describe("Whether to disable bias parameter.");
+    DMLC_DECLARE_FIELD(cudnn_tune)
+    .add_enum("off", conv::kOff)
+    .add_enum("limited_workspace", conv::kLimited)
+    .add_enum("fastest", conv::kFastest)
+    .set_default(conv::kLimited)  // tuning is enabled unless the user selects "off"
+    .describe("Whether to find convolution algo by running performance test. "
+              "Leads to higher startup time but may give better speed");
   }
 };
 
@@ -289,7 +298,10 @@ class ConvolutionOp : public Operator {
 };  // class ConvolutionOp
 
 template<typename xpu>
-Operator* CreateOp(ConvolutionParam param, int dtype);
+Operator* CreateOp(ConvolutionParam param, int dtype,
+                   std::vector<TShape> *in_shape,
+                   std::vector<TShape> *out_shape,
+                   Context ctx);
 
 #if DMLC_USE_CXX11
 class ConvolutionProp : public OperatorProperty {
diff --git a/src/operator/convolution.cc b/src/operator/convolution.cc
index f575020f9a89..28fc2e2d0257 100644
--- a/src/operator/convolution.cc
+++ b/src/operator/convolution.cc
@@ -10,7 +10,10 @@
 namespace mxnet {
 namespace op {
 template<>
-Operator* CreateOp<cpu>(ConvolutionParam param, int dtype) {
+Operator* CreateOp<cpu>(ConvolutionParam param, int dtype,
+                        std::vector<TShape> *in_shape,
+                        std::vector<TShape> *out_shape,
+                        Context ctx) {
   Operator *op = NULL;
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
     op = new ConvolutionOp<cpu, DType>(param);
@@ -19,13 +22,14 @@ Operator* CreateOp<cpu>(ConvolutionParam param, int dtype) {
 }
 
 // DO_BIND_DISPATCH comes from operator_common.h
-Operator *ConvolutionProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                     std::vector<int> *in_type) const {
+Operator *ConvolutionProp::CreateOperatorEx(Context ctx,
+                                            std::vector<TShape> *in_shape,
+                                            std::vector<int> *in_type) const {
   std::vector<TShape> out_shape, aux_shape;
   std::vector<int> out_type, aux_type;
   CHECK(InferType(in_type, &out_type, &aux_type));
   CHECK(InferShape(in_shape, &out_shape, &aux_shape));
-  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
+  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx);
 }
 
 DMLC_REGISTER_PARAMETER(ConvolutionParam);
diff --git a/src/operator/convolution.cu b/src/operator/convolution.cu
index 1a77fff616b6..6f5904658e20 100644
--- a/src/operator/convolution.cu
+++ b/src/operator/convolution.cu
@@ -6,6 +6,7 @@
 */
 
 #include "./convolution-inl.h"
+#include <vector>
 #if MXNET_USE_CUDNN == 1
 #include "./cudnn_convolution-inl.h"
 #endif  // MXNET_USE_CUDNN
@@ -13,12 +14,15 @@
 namespace mxnet {
 namespace op {
 template<>
-Operator* CreateOp<gpu>(ConvolutionParam param, int dtype) {
+Operator* CreateOp<gpu>(ConvolutionParam param, int dtype,
+                        std::vector<TShape> *in_shape,
+                        std::vector<TShape> *out_shape,
+                        Context ctx) {
   Operator *op = NULL;
 #if MXNET_USE_CUDNN == 1
   if (param.dilate[0] == 1 && param.dilate[1] == 1) {
     MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-      op = new CuDNNConvolutionOp<DType>(param);
+      op = new CuDNNConvolutionOp<DType>(param, in_shape, out_shape, ctx);
     })
   } else {
     MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
diff --git a/src/operator/cudnn_convolution-inl.h b/src/operator/cudnn_convolution-inl.h
index a7f321aeb4d4..c307c8336ee4 100644
--- a/src/operator/cudnn_convolution-inl.h
+++ b/src/operator/cudnn_convolution-inl.h
@@ -12,17 +12,38 @@
 #include "./convolution-inl.h"
 namespace mxnet {
 namespace op {
-#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1
+#if MXNET_USE_CUDNN == 1
+// Choose cuDNN forward / backward-data / backward-filter algorithms for a convolution;
+// results are written through the out-pointers (defined in cudnn_convolution.cc).
+void TuneCudnnConvolution(ConvolutionParam param,
+                          std::vector<TShape> *in_shape,
+                          std::vector<TShape> *out_shape,
+                          Context ctx, cudnnDataType_t dtype,
+                          cudnnConvolutionFwdAlgo_t *algo,
+                          cudnnConvolutionBwdDataAlgo_t *back_algo,
+                          cudnnConvolutionBwdFilterAlgo_t *back_algo_w,
+                          size_t *forward_workspace_byte, size_t *backward_workspace_byte);
+
 template<typename DType>
 class CuDNNConvolutionOp : public Operator {
  public:
-  explicit CuDNNConvolutionOp(ConvolutionParam param) {
+  explicit CuDNNConvolutionOp(ConvolutionParam param,
+                              std::vector<TShape> *in_shape,
+                              std::vector<TShape> *out_shape,
+                              Context ctx) {
+    using namespace mshadow;
     this->param_ = param;
     // convert MB to words
     param_.workspace = (param_.workspace << 20) / sizeof(DType);
     init_cudnn_ = false;
     // TODO(xxx): fp16
     dtype_ = mshadow::DataType<DType>::kCudnnFlag;
+
+    if (param.cudnn_tune != conv::kOff) {
+      TuneCudnnConvolution(param, in_shape, out_shape, ctx, dtype_,
+                           &algo_, &back_algo_, &back_algo_w_,
+                           &forward_workspace_byte_, &backward_workspace_byte_);
+    }
   }
 
   ~CuDNNConvolutionOp() {
@@ -278,53 +299,56 @@ class CuDNNConvolutionOp : public Operator {
                                             1,
                                             1), CUDNN_STATUS_SUCCESS);
       }
-      CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
-      CHECK_EQ(cudnnGetConvolutionForwardAlgorithm(s->dnn_handle_,
-               in_desc_,
-               filter_desc_,
-               conv_desc_,
-               out_desc_,
-               CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-               workspace_byte,
-               &algo_), CUDNN_STATUS_SUCCESS);
-      CHECK_EQ(cudnnGetConvolutionBackwardFilterAlgorithm(s->dnn_handle_,
-               in_desc_,
-               out_desc_,
-               conv_desc_,
-               filter_desc_,
-               CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-               workspace_byte,
-               &back_algo_w_), CUDNN_STATUS_SUCCESS);
-      CHECK_EQ(cudnnGetConvolutionBackwardDataAlgorithm(s->dnn_handle_,
-               filter_desc_,
-               out_desc_,
-               conv_desc_,
-               in_desc_,
-               CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-               workspace_byte,
-               &back_algo_), CUDNN_STATUS_SUCCESS);
-      CHECK_EQ(cudnnGetConvolutionBackwardDataWorkspaceSize(s->dnn_handle_,
-               filter_desc_,
-               out_desc_,
-               conv_desc_,
-               in_desc_,
-               back_algo_,
-               &back_size), CUDNN_STATUS_SUCCESS);
-      CHECK_EQ(cudnnGetConvolutionBackwardFilterWorkspaceSize(s->dnn_handle_,
-               in_desc_,
-               out_desc_,
-               conv_desc_,
-               filter_desc_,
-               back_algo_w_,
-               &back_size_w), CUDNN_STATUS_SUCCESS);
-      backward_workspace_byte_ = std::max(back_size, back_size_w);
-      CHECK_EQ(cudnnGetConvolutionForwardWorkspaceSize(s->dnn_handle_,
-               in_desc_,
-               filter_desc_,
-               conv_desc_,
-               out_desc_,
-               algo_,
-               &forward_workspace_byte_), CUDNN_STATUS_SUCCESS);
+
+      if (!param_.cudnn_tune) {
+        CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
+        CHECK_EQ(cudnnGetConvolutionForwardAlgorithm(s->dnn_handle_,
+                 in_desc_,
+                 filter_desc_,
+                 conv_desc_,
+                 out_desc_,
+                 CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+                 workspace_byte,
+                 &algo_), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnGetConvolutionBackwardFilterAlgorithm(s->dnn_handle_,
+                 in_desc_,
+                 out_desc_,
+                 conv_desc_,
+                 filter_desc_,
+                 CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+                 workspace_byte,
+                 &back_algo_w_), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnGetConvolutionBackwardDataAlgorithm(s->dnn_handle_,
+                 filter_desc_,
+                 out_desc_,
+                 conv_desc_,
+                 in_desc_,
+                 CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+                 workspace_byte,
+                 &back_algo_), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnGetConvolutionBackwardDataWorkspaceSize(s->dnn_handle_,
+                 filter_desc_,
+                 out_desc_,
+                 conv_desc_,
+                 in_desc_,
+                 back_algo_,
+                 &back_size), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnGetConvolutionBackwardFilterWorkspaceSize(s->dnn_handle_,
+                 in_desc_,
+                 out_desc_,
+                 conv_desc_,
+                 filter_desc_,
+                 back_algo_w_,
+                 &back_size_w), CUDNN_STATUS_SUCCESS);
+        backward_workspace_byte_ = std::max(back_size, back_size_w);
+        CHECK_EQ(cudnnGetConvolutionForwardWorkspaceSize(s->dnn_handle_,
+                 in_desc_,
+                 filter_desc_,
+                 conv_desc_,
+                 out_desc_,
+                 algo_,
+                 &forward_workspace_byte_), CUDNN_STATUS_SUCCESS);
+      }
       forward_workspace_ = forward_workspace_byte_ / sizeof(DType) + 1;
       backward_workspace_ = backward_workspace_byte_ / sizeof(DType) + 1;
     }
diff --git a/src/operator/cudnn_convolution.cc b/src/operator/cudnn_convolution.cc
new file mode 100644
index 000000000000..1e72afe6af96
--- /dev/null
+++ b/src/operator/cudnn_convolution.cc
@@ -0,0 +1,185 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file cudnn_convolution.cc
+ * \brief
+ * \author Junyuan Xie
+*/
+#include "./cudnn_convolution-inl.h"
+#include <mxnet/base.h>
+#include <mxnet/ndarray.h>
+
+namespace mxnet {
+namespace op {
+#if MXNET_USE_CUDNN == 1
+void TuneCudnnConvolution(ConvolutionParam param,
+                          std::vector<TShape> *in_shape,
+                          std::vector<TShape> *out_shape,
+                          Context ctx,
+                          cudnnDataType_t dtype,
+                          cudnnConvolutionFwdAlgo_t *algo,
+                          cudnnConvolutionBwdDataAlgo_t *back_algo,
+                          cudnnConvolutionBwdFilterAlgo_t *back_algo_w,
+                          size_t *forward_workspace_byte,
+                          size_t *backward_workspace_byte) {
+  using namespace mshadow;
+  // Find cuDNN forward, backward-filter and backward-data algorithms for the
+  // convolution described by param and the shapes; results go to the out-pointers.
+  size_t expected = param.no_bias ? 2 : 3;
+#if CUDNN_MAJOR == 5
+  cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW;
+#endif
+  CHECK_EQ(in_shape->size(), expected);
+  CHECK_EQ(out_shape->size(), 1);
+  TShape &x_shape = (*in_shape)[conv::kData];
+  TShape &y_shape = (*out_shape)[conv::kOut];
+
+  // param.workspace is given in MB; convert the limit to bytes
+  size_t workspace_byte = param.workspace << 20;
+  cudnnTensorDescriptor_t in_desc;
+  cudnnTensorDescriptor_t out_desc;
+  cudnnTensorDescriptor_t bias_desc;  // NOTE(review): not passed to any cudnnFind* call below
+  cudnnFilterDescriptor_t filter_desc;
+  cudnnConvolutionDescriptor_t conv_desc;
+  CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc), CUDNN_STATUS_SUCCESS);
+  CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc), CUDNN_STATUS_SUCCESS);
+  CHECK_EQ(cudnnCreateTensorDescriptor(&bias_desc), CUDNN_STATUS_SUCCESS);
+  CHECK_EQ(cudnnCreateFilterDescriptor(&filter_desc), CUDNN_STATUS_SUCCESS);
+  CHECK_EQ(cudnnCreateConvolutionDescriptor(&conv_desc), CUDNN_STATUS_SUCCESS);
+#if CUDNN_MAJOR == 5
+  CHECK_EQ(cudnnSetFilter4dDescriptor(filter_desc,
+                                      dtype,
+                                      format,
+                                      param.num_filter / param.num_group,
+                                      x_shape[1] / param.num_group,
+                                      param.kernel[0],
+                                      param.kernel[1]), CUDNN_STATUS_SUCCESS);
+#else
+  CHECK_EQ(cudnnSetFilter4dDescriptor(filter_desc,
+                                      dtype,
+                                      param.num_filter / param.num_group,
+                                      x_shape[1] / param.num_group,
+                                      param.kernel[0],
+                                      param.kernel[1]), CUDNN_STATUS_SUCCESS);
+#endif
+  CHECK_EQ(cudnnSetConvolution2dDescriptor(conv_desc,
+                                           param.pad[0],
+                                           param.pad[1],
+                                           param.stride[0],
+                                           param.stride[1],
+                                           1,
+                                           1,
+                                           CUDNN_CROSS_CORRELATION), CUDNN_STATUS_SUCCESS);
+  CHECK_EQ(cudnnSetTensor4dDescriptorEx(in_desc,
+                                        dtype,
+                                        x_shape[0],
+                                        x_shape[1] / param.num_group,
+                                        x_shape[2],
+                                        x_shape[3],
+                                        x_shape[1] * x_shape[2] * x_shape[3],
+                                        x_shape[2] * x_shape[3],
+                                        x_shape[3],
+                                        1), CUDNN_STATUS_SUCCESS);
+  CHECK_EQ(cudnnSetTensor4dDescriptorEx(out_desc,
+                                        dtype,
+                                        y_shape[0],
+                                        y_shape[1] / param.num_group,
+                                        y_shape[2],
+                                        y_shape[3],
+                                        y_shape[1] * y_shape[2] * y_shape[3],
+                                        y_shape[2] * y_shape[3],
+                                        y_shape[3],
+                                        1), CUDNN_STATUS_SUCCESS);
+  if (!param.no_bias) {
+    TShape bias_shape = (*in_shape)[conv::kBias];
+    CHECK_EQ(cudnnSetTensor4dDescriptor(bias_desc,
+                                        CUDNN_TENSOR_NCHW,
+                                        dtype,
+                                        1,
+                                        bias_shape[0] / param.num_group,
+                                        1,
+                                        1), CUDNN_STATUS_SUCCESS);
+  }
+  // run the search as an engine task on ctx; var is used to wait for it below
+  Engine::VarHandle var = Engine::Get()->NewVariable();
+  Engine::Get()->PushSync([=](RunContext rctx) {
+    Stream<gpu> *s = rctx.get_stream<gpu>();
+    CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
+    const int kMaxAlgos = 10;
+    int nalgo = kMaxAlgos;
+    int i;
+    // forward: take the first result that succeeded (and fits the limit if kLimited)
+    cudnnConvolutionFwdAlgoPerf_t fwd_algo[kMaxAlgos];
+    CHECK_EQ(cudnnFindConvolutionForwardAlgorithm(s->dnn_handle_,
+             in_desc,
+             filter_desc,
+             conv_desc,
+             out_desc,
+             kMaxAlgos,
+             &nalgo,
+             fwd_algo), CUDNN_STATUS_SUCCESS);
+    i = 0;
+    while (i < nalgo
+           && (fwd_algo[i].status != CUDNN_STATUS_SUCCESS
+           || (param.cudnn_tune == conv::kLimited
+           && fwd_algo[i].memory > workspace_byte))) ++i;
+    if (i == nalgo) {
+      LOG(FATAL) << "Failed to find an convolution algorithm.";
+    } else {
+      *forward_workspace_byte = fwd_algo[i].memory;
+      *algo = fwd_algo[i].algo;
+    }
+    // backward filter: same selection rule as forward
+    cudnnConvolutionBwdFilterAlgoPerf_t bwd_filter_algo[kMaxAlgos];
+    CHECK_EQ(cudnnFindConvolutionBackwardFilterAlgorithm(s->dnn_handle_,
+             in_desc,
+             out_desc,
+             conv_desc,
+             filter_desc,
+             kMaxAlgos,
+             &nalgo,
+             bwd_filter_algo), CUDNN_STATUS_SUCCESS);
+    i = 0;
+    while (i < nalgo
+           && (bwd_filter_algo[i].status != CUDNN_STATUS_SUCCESS
+           || (param.cudnn_tune == conv::kLimited
+           && bwd_filter_algo[i].memory > workspace_byte))) ++i;
+    if (i == nalgo) {
+      LOG(FATAL) << "Failed to find an convolution algorithm.";
+    } else {
+      *backward_workspace_byte = bwd_filter_algo[i].memory;
+      *back_algo_w = bwd_filter_algo[i].algo;
+    }
+    // backward data: same rule; the backward workspace is the larger of the two
+    cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo[kMaxAlgos];
+    CHECK_EQ(cudnnFindConvolutionBackwardDataAlgorithm(s->dnn_handle_,
+             filter_desc,
+             out_desc,
+             conv_desc,
+             in_desc,
+             kMaxAlgos,
+             &nalgo,
+             bwd_data_algo), CUDNN_STATUS_SUCCESS);
+    i = 0;
+    while (i < nalgo
+           && (bwd_data_algo[i].status != CUDNN_STATUS_SUCCESS
+           || (param.cudnn_tune == conv::kLimited
+           && bwd_data_algo[i].memory > workspace_byte))) ++i;
+    if (i == nalgo) {
+      LOG(FATAL) << "Failed to find an convolution algorithm.";
+    } else {
+      *backward_workspace_byte = std::max(*backward_workspace_byte, bwd_data_algo[i].memory);
+      *back_algo = bwd_data_algo[i].algo;
+    }
+  }, ctx, {}, {var});
+  Engine::Get()->WaitForVar(var);
+  Engine::Get()->DeleteVariable([](RunContext s) {}, ctx, var);
+  // the descriptors are not needed after the search; release them
+  CHECK_EQ(cudnnDestroyTensorDescriptor(in_desc), CUDNN_STATUS_SUCCESS);
+  CHECK_EQ(cudnnDestroyTensorDescriptor(out_desc), CUDNN_STATUS_SUCCESS);
+  CHECK_EQ(cudnnDestroyTensorDescriptor(bias_desc), CUDNN_STATUS_SUCCESS);
+  CHECK_EQ(cudnnDestroyFilterDescriptor(filter_desc), CUDNN_STATUS_SUCCESS);
+  CHECK_EQ(cudnnDestroyConvolutionDescriptor(conv_desc), CUDNN_STATUS_SUCCESS);
+}
+#endif  // CUDNN
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/custom.cc b/src/operator/custom.cc
index b09bae006b82..09ab894044df 100644
--- a/src/operator/custom.cc
+++ b/src/operator/custom.cc
@@ -1,6 +1,6 @@
 /*!
  * Copyright (c) 2015 by Contributors
- * \file ndarray_op.cc
+ * \file custom.cc
  * \brief
  * \author Junyuan Xie
 */
diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc
index d54972b27433..d9117963e589 100644
--- a/src/symbol/graph_executor.cc
+++ b/src/symbol/graph_executor.cc
@@ -791,7 +791,7 @@ void GraphExecutor::InitResources() {
   }
 }
 
-void GraphExecutor::InitOpNodes() {
+void GraphExecutor::InitOperators() {
   for (size_t i = 0; i < topo_order_.size(); ++i) {
     uint32_t nid = topo_order_[i];
     if (!op_nodes_[nid].activated) continue;
@@ -811,6 +811,15 @@ void GraphExecutor::InitOpNodes() {
           graph_.nodes[graph_.nodes[nid].backward_source_id].op.get(),
           op_nodes_[graph_.nodes[nid].backward_source_id].op));
     }
+  }
+}
+
+void GraphExecutor::InitCachedOps() {
+  for (size_t i = 0; i < topo_order_.size(); ++i) {
+    uint32_t nid = topo_order_[i];
+    if (!op_nodes_[nid].activated) continue;
+    if (graph_.nodes[nid].is_variable()) continue;
+    OpNode& op_node = op_nodes_[nid];
     bool allow_cache = true;
     for (StaticGraph::DataEntry e : graph_.nodes[nid].inputs) {
       DataEntryInfo& info = op_nodes_[e.source_id].outputs[e.index];
diff --git a/src/symbol/graph_executor.h b/src/symbol/graph_executor.h
index 50d3a289e9e1..2ac755d8734e 100644
--- a/src/symbol/graph_executor.h
+++ b/src/symbol/graph_executor.h
@@ -64,9 +64,10 @@ class GraphExecutor : public Executor {
                     in_args, arg_grad_store, grad_req_type,
                     need_backward);
     this->InitDataEntryInfo(in_args, arg_grad_store, grad_req_type, aux_states);
+    this->InitOperators();
     this->InitDataEntryMemory();
     this->InitResources();
-    this->InitOpNodes();
+    this->InitCachedOps();
   }
 
  protected:
@@ -218,7 +219,9 @@ class GraphExecutor : public Executor {
   // initialize the internal resources for each op
   void InitResources();
   // initialize OpNode data structure
-  void InitOpNodes();
+  void InitOperators();
+  // initialize OpNode data structure
+  void InitCachedOps();
   // assign context to the graph, this will mutate the graph.
   void AssignContext(const Context default_ctx,
                      const std::map<std::string, Context>& ctx_map,

From 6272e76f11c3aaca6d4d5ce241e44a4a7e712a0f Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Mon, 27 Jun 2016 19:57:14 -0700
Subject: [PATCH 072/126] KV: enable aggregate on GPU for distributed kv
 (#2545)

---
 src/kvstore/kvstore.cc       |  8 +++--
 src/kvstore/kvstore_device.h | 58 +++++++++++++++++++++++++++++-------
 src/kvstore/kvstore_dist.h   | 21 ++++++-------
 src/kvstore/kvstore_local.h  | 24 ++++++++++-----
 4 files changed, 80 insertions(+), 31 deletions(-)

diff --git a/src/kvstore/kvstore.cc b/src/kvstore/kvstore.cc
index 0de025ba9a35..95000fccae29 100644
--- a/src/kvstore/kvstore.cc
+++ b/src/kvstore/kvstore.cc
@@ -23,14 +23,16 @@ KVStore* KVStore::Create(const char *type_name) {
       tname == "local_allreduce_cpu") {
     kv =  new kvstore::KVStoreLocal();
   } else if (tname == "device" ||
+             tname == "local_update_device" ||
              tname == "local_allreduce_device") {
-    tname = "local_allreduce_device";
-    kv = new kvstore::KVStoreDevice();
+    kv = new kvstore::KVStoreDevice(true);
   } else if (tname == "dist_async" ||
              tname == "dist_sync" ||
+             tname == "dist_sync_device" ||
              tname == "dist") {
 #if MXNET_USE_DIST_KVSTORE
-    kv = new kvstore::KVStoreDist();
+    kv = new kvstore::KVStoreDist(
+        tname.find("device") != std::string::npos);
     if (tname == "dist_sync" &&
         kv->IsWorkerNode() &&
         kv->get_rank() == 0) {
diff --git a/src/kvstore/kvstore_device.h b/src/kvstore/kvstore_device.h
index 2667df523272..82c04f9ec337 100644
--- a/src/kvstore/kvstore_device.h
+++ b/src/kvstore/kvstore_device.h
@@ -21,6 +21,10 @@ namespace kvstore {
  * \brief Device implementation of KVStore that do reduction on GPU reduction.
  */
 class KVStoreDevice : public KVStoreLocal {
+ public:
+  // device_mode: when false, the overrides in this class fall back to KVStoreLocal.
+  explicit KVStoreDevice(bool device_mode) : device_mode_(device_mode) {}
+
  protected:
   using KeyShape = std::pair<int, TShape>;
   void Init(const std::vector<int>& keys,
@@ -57,27 +61,27 @@ class KVStoreDevice : public KVStoreLocal {
         }
       }
 
-      tm_buf.merged = NDArray(s, tm_buf.ctx);
+      tm_buf.merged = NDArray(s, Context::CPUPinned(tm_buf.ctx.dev_id));
+      tm_buf.merged_device = NDArray(s, tm_buf.ctx);
       ctx_info[tm_buf.ctx.dev_id].second += s.Size();
     }
   }
 
   const NDArray& MergePushValue(
       int key, const std::vector<NDArray>& val, int priority) override {
-    if (updater_ != nullptr) {
-      // fall back to CPU based update if updater presents
+    if (!device_mode_) {
       return KVStoreLocal::MergePushValue(key, val, priority);
     }
-
-    if (merge_buf_.empty()) {
+    if (!buf_initialized_) {
       InitMergeBuffers(val);
+      buf_initialized_ = true;
     }
 
     auto& buf = merge_buf_[key];
     std::vector<NDArray> reduce(val.size());
-    CHECK(!buf.merged.is_none());
-    CopyFromTo(val[0], &(buf.merged), priority);
-    reduce[0] = buf.merged;
+    CHECK(!buf.merged_device.is_none());
+    CopyFromTo(val[0], &(buf.merged_device), priority);
+    reduce[0] = buf.merged_device;
 
     for (size_t i = 1; i < val.size(); ++i) {
       NDArray *copy_buf = buf.AllocCopyBuf(
@@ -85,11 +89,45 @@ class KVStoreDevice : public KVStoreLocal {
       CopyFromTo(val[i], copy_buf, priority);
       reduce[i] = *copy_buf;
     }
-    ElementwiseSum(reduce, &buf.merged);
-    return buf.merged;
+    ElementwiseSum(reduce, &buf.merged_device);
+
+    if (updater_ != nullptr) {
+      CopyFromTo(buf.merged_device, &(buf.merged));
+      return buf.merged;
+    } else {
+      return buf.merged_device;
+    }
+  }
+
+  // Scatter a pulled value: in device mode, stage src into the per-key device
+  // buffer once, then fan out from that buffer to every destination in vals.
+  void ScatterPullValue(int key,
+                        const NDArray& src,
+                        const std::vector<NDArray*>& vals,
+                        int priority) override {
+    if (!device_mode_) {
+      KVStoreLocal::ScatterPullValue(key, src, vals, priority);
+      return;
+    }
+    auto it = merge_buf_.find(key);
+    if (it == merge_buf_.end() || it->second.merged_device.is_none()) {
+      // no device buffer for this key: copy src straight to each destination
+      for (auto* vptr : vals) {
+        CopyFromTo(src, vptr, priority);
+      }
+      return;
+    }
+    NDArray& staged = it->second.merged_device;
+    // forward priority to the staging copy too, like the fan-out copies below
+    CopyFromTo(src, &staged, priority);
+    for (auto* vptr : vals) {
+      CopyFromTo(staged, vptr, priority);
+    }
+  }
 
  private:
+  bool device_mode_;
+  bool buf_initialized_{false};
   std::vector<KeyShape> sorted_key_shape_;
 };
 }  // namespace kvstore
diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h
index 1fa3cd7fd322..2705effe0104 100644
--- a/src/kvstore/kvstore_dist.h
+++ b/src/kvstore/kvstore_dist.h
@@ -7,7 +7,7 @@
 #define MXNET_KVSTORE_KVSTORE_DIST_H_
 #include <string>
 #include <vector>
-#include "./kvstore_local.h"
+#include "./kvstore_device.h"
 #include "mxnet/engine.h"
 #include "ps/ps.h"
 #include "./kvstore_dist_server.h"
@@ -25,9 +25,11 @@ namespace kvstore {
  * it's the server node's job to control the data consistency among all
  * workers. see details on \ref ServerHandle::Start
  */
-class KVStoreDist : public KVStoreLocal {
+class KVStoreDist : public KVStoreDevice {
  public:
-  KVStoreDist() : ps_worker_(nullptr), server_(nullptr) {
+  explicit KVStoreDist(bool device_mode)
+      : KVStoreDevice(device_mode),
+        ps_worker_(nullptr), server_(nullptr) {
     if (IsWorkerNode()) {
       ps_worker_ = new ps::KVWorker<real_t>(0);
       ps::Start("mxnet\0");
@@ -113,11 +115,11 @@ class KVStoreDist : public KVStoreLocal {
       if (buf.is_none()) {
         buf = NDArray(vals[0]->shape(), pinned_ctx_);
       }
-      real_t* data = static_cast<real_t*>(buf.data().dptr_);
-      size_t size = buf.shape().Size();
 
-      auto pull_from_servers = [this, key, data, size](
+      auto pull_from_servers = [this, key, buf] (
           RunContext rctx, Engine::CallbackOnComplete cb) {
+        real_t* data = static_cast<real_t*>(buf.data().dptr_);
+        size_t size = buf.shape().Size();
         // convert to ps keys
         PSKV& pskv = EncodeKey(key, size);
 
@@ -134,10 +136,7 @@ class KVStoreDist : public KVStoreLocal {
           {buf.var()},
           FnProperty::kNormal, priority);
 
-      // copy data from buffer to vals
-      for (auto v : vals) {
-        CopyFromTo(buf, v);
-      }
+      ScatterPullValue(key, buf, vals, priority);
     }
   }
 
@@ -268,6 +267,8 @@ class KVStoreDist : public KVStoreLocal {
     return pskv;
   }
 
+  // whether use device distributed local sync.
+  bool device_mode_;
   /**
    * \brief for worker to push and pull data
    */
diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h
index e897f6437256..3e6ab7b5b3b0 100644
--- a/src/kvstore/kvstore_local.h
+++ b/src/kvstore/kvstore_local.h
@@ -68,15 +68,11 @@ class KVStoreLocal : public KVStore {
       if (updater_ != nullptr || it == merge_buf_.end()) {
         auto it = local_.find(key);
         CHECK(it != local_.end()) << "key " << key << " has not been inited";
-        const NDArray& src = it->second;
-        for (auto* vptr : grouped_vals[i]) {
-          CopyFromTo(src, vptr, priority);
-        }
+        ScatterPullValue(
+            key, it->second, grouped_vals[i], priority);
       } else {
-        auto& src = it->second.merged;
-        for (auto* vptr : grouped_vals[i]) {
-          CopyFromTo(src, vptr, priority);
-        }
+        ScatterPullValue(
+            key, it->second.merged, grouped_vals[i], priority);
       }
     }
   }
@@ -88,6 +84,8 @@ class KVStoreLocal : public KVStore {
     Context ctx;
     // the merged value
     NDArray merged;
+    // the merged value on device
+    NDArray merged_device;
     /// \brief the cpu buffer for gpu data
     std::vector<NDArray> copy_buf;
     // allocate copy buffer, if it has not been allocated
@@ -169,6 +167,16 @@ class KVStoreLocal : public KVStore {
     return buf.merged;
   }
 
+  // Hook: copy src into every destination in vals; subclasses may override.
+  virtual void ScatterPullValue(int key,
+                                const NDArray& src,
+                                const std::vector<NDArray*>& vals,
+                                int priority) {
+    for (NDArray* dst : vals) {
+      CopyFromTo(src, dst, priority);
+    }
+  }
+
   /// \brief buffer for merging push value
   std::unordered_map<int, BufferEntry> merge_buf_;
   // pinned context

From 474f74b4b465db673b2fa09024a5c895b05a05f0 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Tue, 28 Jun 2016 15:10:32 -0700
Subject: [PATCH 073/126] [EXEC] More generic segment training api (#2551)

---
 python/mxnet/module/module.py |   3 +-
 src/symbol/graph_executor.cc  | 128 ++++++++++++++++++++++------------
 src/symbol/graph_executor.h   |  20 +++++-
 3 files changed, 102 insertions(+), 49 deletions(-)

diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py
index e327a6b4c9f9..e2b96dd5b370 100644
--- a/python/mxnet/module/module.py
+++ b/python/mxnet/module/module.py
@@ -176,7 +176,8 @@ def _impl(name, arr, cache):
                     if cache_arr is not arr:
                         cache_arr.copyto(arr)
                 else:
-                    assert allow_missing
+                    if not allow_missing:
+                        raise RuntimeError("%s is not presented" % name)
                     if initializer != None:
                         initializer(name, arr)
             else:
diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc
index d9117963e589..acd9c0743250 100644
--- a/src/symbol/graph_executor.cc
+++ b/src/symbol/graph_executor.cc
@@ -289,9 +289,9 @@ GraphExecutor::GetOpExecEntry(uint32_t nid) {
 
 GraphExecutor::~GraphExecutor() {
   Engine::Get()->WaitForAll();
-  for (auto &kv : cached_seg_opr_) {
-    if (kv.second != nullptr) {
-      Engine::Get()->DeleteOperator(kv.second);
+  for (auto item : cached_seg_opr_) {
+    if (item.opr != nullptr) {
+      Engine::Get()->DeleteOperator(item.opr);
     }
   }
   // need to delete the operators before delete the NDArray they referenced.
@@ -839,38 +839,76 @@ void GraphExecutor::InitCachedOps() {
   }
 }
 
-void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) {
-  // heurestic, only enable bulk on forward only
-  bool bulk_exec = prefer_bulk_execution_ && !monitor_callback_
-      && topo_start == 0 && num_forward_nodes_ == topo_order_.size();
+void GraphExecutor::InitOpSegs() {
+  // heurestic to enable bulk execution.
+  cached_seg_opr_.clear();
+  CachedSegOpr p;
+  p.opr = nullptr;
+  cached_seg_opr_.resize(topo_order_.size(), p);
 
-  if (bulk_exec) {
-    // encode things into a key
-    size_t key = topo_start * op_nodes_.size() + topo_end;
-    if (cached_seg_opr_.count(key) == 0) {
-      cached_seg_opr_[key] = this->CreateCachedOpr(topo_start, topo_end);
-      if (cached_seg_opr_.at(key) != nullptr) {
-        LOG(INFO) << "Created bulk execution on segment ["
-                  << topo_start << ", " << topo_end << ")";
+  if (!prefer_bulk_execution_) return;
+  if (monitor_callback_) return;
+  if (num_forward_nodes_ == topo_order_.size()) {
+    cached_seg_opr_[0] = this->CreateCachedSegOpr(0, topo_order_.size());
+    return;
+  }
+  int num_cseg = 0;
+  // normal procedure
+  for (size_t i = 0; i < topo_order_.size(); ++i) {
+    size_t j = i;
+    int hit_count = 0;
+    for (; j < topo_order_.size(); ++j) {
+      if (j == num_forward_nodes_) break;
+      uint32_t nid = topo_order_[j];
+      const OpNode& op_node = op_nodes_[nid];
+      const StaticGraph::Node& gnode = graph_.nodes[nid];
+      if (!op_node.activated) continue;
+      if (graph_.nodes[nid].is_variable()) continue;
+      if (op_node.op->exec_type() != Operator::kSync) break;
+      bool hit = false, tobind = false;
+
+      for (const DataEntryInfo& out : op_node.outputs) {
+        if (out.type == kBindByExternal) hit = true;
       }
-    }
-    auto cached_op = cached_seg_opr_.at(key);
-    if (cached_op != nullptr) {
-      Context* pctx = nullptr;
-      for (size_t i = topo_start; i < topo_end; ++i) {
-        uint32_t nid = topo_order_[i];
-        if (!op_nodes_[nid].activated) continue;
-        if (graph_.nodes[nid].is_variable()) continue;
-        OpNode& opnode = op_nodes_[nid];
-        opnode.op_ctx.is_train = is_train;
-        pctx = &(opnode.ctx);
+      const size_t ninput = gnode.inputs.size() - gnode.addto_index.size();
+      for (size_t i = 0; i < ninput; ++i) {
+        const StaticGraph::DataEntry& e = graph_.nodes[nid].inputs[i];
+        const DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index];
+        if (info.type == kBindByExternal) hit = true;
+        if (info.type == kTobeBindByExternal) tobind = true;
       }
-      Engine::Get()->Push(cached_op, *pctx);
-      return;
+      if (hit) ++hit_count;
+      if (tobind) break;
+      // if encounter consecutive 3 blocks containing parameters, use as segment.
+      // this usually means conv-relu-bn
+      const int kHitMaxMagic = 2;
+      if (hit_count > kHitMaxMagic) break;
+    }
+    if (j > i + 1) {
+      cached_seg_opr_[i] = CreateCachedSegOpr(i, j);
+      ++num_cseg;
+      i = j - 1;
     }
   }
+}
 
+void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) {
   for (size_t i = topo_start; i < topo_end; ++i) {
+    uint32_t nid = topo_order_[i];
+    if (!op_nodes_[nid].activated) continue;
+    if (graph_.nodes[nid].is_variable()) continue;
+    OpNode& opnode = op_nodes_[nid];
+    opnode.op_ctx.is_train = is_train;
+  }
+
+  for (size_t i = topo_start; i < topo_end; ++i) {
+    auto seg_op = cached_seg_opr_[i];
+    if (seg_op.opr != nullptr && seg_op.topo_end <= topo_end) {
+      Engine::Get()->Push(seg_op.opr, seg_op.ctx);
+      i = seg_op.topo_end - 1;
+      continue;
+    }
+
     uint32_t nid = topo_order_[i];
     if (!op_nodes_[nid].activated) continue;
     if (graph_.nodes[nid].is_variable()) continue;
@@ -884,7 +922,6 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) {
                  &(opnode.outputs[0].data));
       continue;
     }
-    opnode.op_ctx.is_train = is_train;
     if (opnode.cached_opr != nullptr) {
       Engine::Get()->Push(opnode.cached_opr, opnode.ctx);
     } else {
@@ -988,49 +1025,50 @@ void GraphExecutor::Backward(const std::vector<NDArray> &head_grads) {
   RunOps(true, num_forward_nodes_, topo_order_.size());
 }
 
-Engine::OprHandle GraphExecutor::CreateCachedOpr(size_t topo_start, size_t topo_end) {
+GraphExecutor::CachedSegOpr
+GraphExecutor::CreateCachedSegOpr(size_t topo_start, size_t topo_end) {
   std::vector<Engine::VarHandle> read_vars;
   std::vector<Engine::VarHandle> write_vars;
   Context *pctx = nullptr;
-
+  CachedSegOpr ret;
+  ret.topo_begin = topo_start;
+  ret.topo_end = topo_end;
+  ret.opr = nullptr;
   for (size_t k = topo_start; k < topo_end; ++k) {
     uint32_t nid = topo_order_[k];
     OpNode& op_node = op_nodes_[nid];
+    const StaticGraph::Node& gnode = graph_.nodes[nid];
     if (!op_nodes_[nid].activated) continue;
     if (graph_.nodes[nid].is_variable()) continue;
-    if (op_node.op->exec_type() != Operator::kSync) {
-      return nullptr;
-    }
+    if (op_node.op->exec_type() != Operator::kSync) return ret;
     if (pctx == nullptr) pctx = &(op_node.ctx);
     if (*pctx != op_node.ctx) {
-      return nullptr;
+      return ret;
     }
-    const StaticGraph::Node& gnode = graph_.nodes[nid];
     // AddTO: index is used to store in-place add resources.
     const size_t ninput = gnode.inputs.size() - gnode.addto_index.size();
 
     for (const DataEntryInfo& out : op_node.outputs) {
+      if (out.type == kTobeBindByExternal) return ret;
       write_vars.push_back(out.data.var());
-      if (out.type == kTobeBindByExternal) return nullptr;
     }
 
     for (const DataEntryInfo& aux : op_node.aux_states) {
+      if (aux.type == kTobeBindByExternal) return ret;
       write_vars.push_back(aux.data.var());
-      if (aux.type == kTobeBindByExternal) return nullptr;
     }
-
     for (size_t i = 0; i < ninput; ++i) {
-      const StaticGraph::DataEntry& e = graph_.nodes[nid].inputs[i];
+      const StaticGraph::DataEntry& e = gnode.inputs[i];
       const DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index];
+      if (info.type == kTobeBindByExternal) return ret;
       read_vars.push_back(info.data.var());
-      if (info.type == kTobeBindByExternal) return nullptr;
     }
-
     for (const Resource& r : op_node.op_ctx.requested) {
       write_vars.push_back(r.var);
     }
   }
-  if (pctx == nullptr) return nullptr;
+  if (pctx == nullptr) return ret;
+  ret.ctx = *pctx;
   // deduplication
   std::sort(write_vars.begin(), write_vars.end());
   write_vars.resize(std::unique(write_vars.begin(), write_vars.end()) -
@@ -1048,7 +1086,6 @@ Engine::OprHandle GraphExecutor::CreateCachedOpr(size_t topo_start, size_t topo_
     }
   }
   read_vars.resize(rtop - read_vars.begin());
-
   bool is_gpu = pctx->dev_mask() == gpu::kDevMask;
   auto exec_fun = [this, topo_start, topo_end, is_gpu]
       (RunContext ctx, Engine::CallbackOnComplete on_complete) {
@@ -1105,8 +1142,9 @@ Engine::OprHandle GraphExecutor::CreateCachedOpr(size_t topo_start, size_t topo_
     }
     on_complete();
   };
-  return Engine::Get()->NewOperator(
+  ret.opr =  Engine::Get()->NewOperator(
       exec_fun, read_vars, write_vars, FnProperty::kNormal);
+  return ret;
 }
 
 Executor *Executor::Bind(Symbol symbol,
diff --git a/src/symbol/graph_executor.h b/src/symbol/graph_executor.h
index 2ac755d8734e..a2b5f054721e 100644
--- a/src/symbol/graph_executor.h
+++ b/src/symbol/graph_executor.h
@@ -68,6 +68,7 @@ class GraphExecutor : public Executor {
     this->InitDataEntryMemory();
     this->InitResources();
     this->InitCachedOps();
+    this->InitOpSegs();
   }
 
  protected:
@@ -158,6 +159,17 @@ class GraphExecutor : public Executor {
       }
     }
   };
+  // a cached engine operator that executes a segment [topo_begin, topo_end) of topo_order_
+  struct CachedSegOpr {
+    // device context shared by every operator in the segment
+    Context ctx;
+    // index in topo order of the first node of the segment (inclusive)
+    size_t topo_begin;
+    // index in topo order one past the last node of the segment (exclusive)
+    size_t topo_end;
+    // the cached engine operator; nullptr when the segment could not be created
+    Engine::OprHandle opr;
+  };
   /*!
    * \brief Get input option of a node.
    *  This function is overriden for both Forward and Backward node.
@@ -198,9 +210,9 @@ class GraphExecutor : public Executor {
    * \param topo_start beginning of segment
    * \param topo_end end of segment
    * \return the cached operator.
-   *  Can be nullptr if cached operator cannot be created.
+   *  ret.opr is nullptr if the cached operator cannot be created.
    */
-  Engine::OprHandle CreateCachedOpr(size_t topo_start, size_t topo_end);
+  CachedSegOpr CreateCachedSegOpr(size_t topo_start, size_t topo_end);
   // initialize the internal graph structure
   void InitGraph(const Symbol &symbol,
                  const Context& default_ctx,
@@ -222,6 +234,8 @@ class GraphExecutor : public Executor {
   void InitOperators();
   // initialize OpNode data structure
   void InitCachedOps();
+  // initialize segments of code to run together as a group.
+  void InitOpSegs();
   // assign context to the graph, this will mutate the graph.
   void AssignContext(const Context default_ctx,
                      const std::map<std::string, Context>& ctx_map,
@@ -261,7 +275,7 @@ class GraphExecutor : public Executor {
   // monitor call back
   std::function<void(const char*, void*)> monitor_callback_;
   // cached segment operator
-  std::unordered_map<size_t, Engine::OprHandle> cached_seg_opr_;
+  std::vector<CachedSegOpr> cached_seg_opr_;
 };  // class GraphExecutor
 }  // namespace mxnet
 #endif  // MXNET_SYMBOL_GRAPH_EXECUTOR_H_

From 6c27c3b7388f979b57c4cc2ccd895054ef0fd1c3 Mon Sep 17 00:00:00 2001
From: Eric Junyuan Xie <piiswrong@users.noreply.github.com>
Date: Tue, 28 Jun 2016 18:20:32 -0700
Subject: [PATCH 074/126] update (#2563)

---
 src/operator/batch_norm-inl.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/operator/batch_norm-inl.h b/src/operator/batch_norm-inl.h
index e8c5502d86af..bd76bd044afc 100644
--- a/src/operator/batch_norm-inl.h
+++ b/src/operator/batch_norm-inl.h
@@ -38,7 +38,7 @@ struct BatchNormParam : public dmlc::Parameter<BatchNormParam> {
     DMLC_DECLARE_FIELD(momentum).set_default(0.9f)
     .describe("Momentum for moving average");
     DMLC_DECLARE_FIELD(fix_gamma).set_default(true)
-    .describe("Fix gamma while training");
+    .describe("Fix gamma to 1");
     DMLC_DECLARE_FIELD(use_global_stats).set_default(false)
     .describe("Whether use global moving statistics instead of local batch-norm. "
               "This will force change batch-norm into a scale shift operator.");
@@ -88,6 +88,8 @@ class BatchNormOp : public Operator {
     Tensor<xpu, 1> bias = in_data[batchnorm::kBeta].get<xpu, 1, real_t>(s);
     Tensor<xpu, 1> moving_mean = aux_states[batchnorm::kMovingMean].get<xpu, 1, real_t>(s);
     Tensor<xpu, 1> moving_var = aux_states[batchnorm::kMovingVar].get<xpu, 1, real_t>(s);
+
+    if (param_.fix_gamma) slope = 1.0f;
     // whether use global statistics
     if (ctx.is_train && !param_.use_global_stats) {
       Tensor<xpu, 1> mean = out_data[batchnorm::kMean].get<xpu, 1, real_t>(s);
@@ -190,6 +192,7 @@ class BatchNormOp : public Operator {
                                                                                      data.shape_)) +
                broadcast<1>(gmean, data.shape_) * scale);
       } else {
+        Assign(gslope, req[batchnorm::kGamma], 0.0f);
         Assign(grad_in, req[batchnorm::kData], grad *
                broadcast<1>(1.0f / F<mshadow_op::square_root>(var + param_.eps), data.shape_) +
                broadcast<1>(gvar, data.shape_) * scale * 2.0f * (data - broadcast<1>(mean,
@@ -208,6 +211,7 @@ class BatchNormOp : public Operator {
                broadcast<1>(
                    1.0f / F<mshadow_op::square_root>(moving_var + param_.eps), data.shape_));
       } else {
+        Assign(gslope, req[batchnorm::kGamma], 0.0f);
         Assign(grad_in, req[batchnorm::kData], grad *
                broadcast<1>(
                    1.0f / F<mshadow_op::square_root>(moving_var + param_.eps), data.shape_));

From 6df4df2ac9a38efd6e77fdc0e7c4d5ff67994dc0 Mon Sep 17 00:00:00 2001
From: lancy <14307130246@fudan.edu.cn>
Date: Wed, 29 Jun 2016 09:21:21 +0800
Subject: [PATCH 075/126] Improve caffe converter (#2550)

---
 tools/caffe_converter/convert_symbol.py | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/tools/caffe_converter/convert_symbol.py b/tools/caffe_converter/convert_symbol.py
index b4fea0b0b7c1..a62a25e7e7cf 100644
--- a/tools/caffe_converter/convert_symbol.py
+++ b/tools/caffe_converter/convert_symbol.py
@@ -97,9 +97,14 @@ def proto2script(proto_file):
         if layer[i].type == 'Pooling' or layer[i].type == 17:
             type_string = 'mx.symbol.Pooling'
             param = layer[i].pooling_param
-            param_string = "pad=(%d,%d), kernel=(%d,%d), stride=(%d,%d)" %\
-                (param.pad, param.pad, param.kernel_size,\
-                param.kernel_size, param.stride, param.stride)
+            param_string = ''
+            if param.global_pooling == True:
+                # there must be a param `kernel` in a pooling layer
+                param_string += "global_pool=True, kernel=(1,1)"
+            else:
+                param_string += "pad=(%d,%d), kernel=(%d,%d), stride=(%d,%d)" %\
+                    (param.pad, param.pad, param.kernel_size,\
+                    param.kernel_size, param.stride, param.stride)
             if param.pool == 0:
                 param_string = param_string + ", pool_type='max'"
             elif param.pool == 1:
@@ -129,9 +134,6 @@ def proto2script(proto_file):
             need_flatten[name] = need_flatten[mapping[layer[i].bottom[0]]]
         if layer[i].type == 'Softmax' or layer[i].type == 20:
             type_string = 'mx.symbol.SoftmaxOutput'
-
-            # We only support single output network for now.
-            output_name = name
         if layer[i].type == 'Flatten' or layer[i].type == 8:
             type_string = 'mx.symbol.Flatten'
             need_flatten[name] = False
@@ -140,9 +142,16 @@ def proto2script(proto_file):
         if layer[i].type == 'Concat' or layer[i].type == 3:
             type_string = 'mx.symbol.Concat'
             need_flatten[name] = True
+        if layer[i].type == 'Crop':
+            type_string = 'mx.symbol.Crop'
+            need_flatten[name] = True
+            param_string = 'center_crop=True'
+        if layer[i].type == 'BatchNorm':
+            type_string = 'mx.symbol.BatchNorm'
+            param = layer[i].batch_norm_param
+            param_string = 'use_global_stats=%s' % param.use_global_stats
         if type_string == '':
             raise Exception('Unknown Layer %s!' % layer[i].type)
-
         if type_string != 'split':
             bottom = layer[i].bottom
             if param_string != "":
@@ -163,6 +172,7 @@ def proto2script(proto_file):
                     (name, type_string, name, ','.join([mapping[x] for x in bottom]), param_string)
         for j in range(len(layer[i].top)):
             mapping[layer[i].top[j]] = name
+        output_name = name
     return symbol_string, output_name, input_dim
 
 def proto2symbol(proto_file):

From fd4c560a66ab36c7ab1b496d35c17f6066bdfe5c Mon Sep 17 00:00:00 2001
From: tornadomeet <tornadomeet@users.noreply.github.com>
Date: Thu, 30 Jun 2016 01:22:10 +0800
Subject: [PATCH 076/126] [OP]Spatial Transformer Network (#2547)

* [OP]Spatial Transformer Network

fix lint

fix test

fix test

* fix pylint

use cpu for test_operator.py

fix typo

* fix python3 divide
---
 example/image-classification/train_mnist.py  |  25 +-
 ps-lite                                      |   2 +-
 python/mxnet/initializer.py                  |  10 +
 src/operator/cudnn_spatial_transformer-inl.h | 182 +++++++++++++
 src/operator/spatial_transformer-inl.h       | 264 +++++++++++++++++++
 src/operator/spatial_transformer.cc          | 138 ++++++++++
 src/operator/spatial_transformer.cu          | 160 +++++++++++
 tests/python/unittest/test_operator.py       |  42 +++
 8 files changed, 820 insertions(+), 3 deletions(-)
 create mode 100644 src/operator/cudnn_spatial_transformer-inl.h
 create mode 100644 src/operator/spatial_transformer-inl.h
 create mode 100644 src/operator/spatial_transformer.cc
 create mode 100644 src/operator/spatial_transformer.cu

diff --git a/example/image-classification/train_mnist.py b/example/image-classification/train_mnist.py
index 5b6fa3c05b60..7758d28c7977 100644
--- a/example/image-classification/train_mnist.py
+++ b/example/image-classification/train_mnist.py
@@ -16,6 +16,21 @@ def _download(data_dir):
         os.system("unzip -u mnist.zip; rm mnist.zip")
     os.chdir("..")
 
+def get_loc(data, attr={'lr_mult':'0.01'}):
+    """
+    the localisation network in lenet-stn; it improves accuracy by more than 1%
+    when num-epoch >= 15
+    """
+    loc = mx.symbol.Convolution(data=data, num_filter=30, kernel=(5, 5), stride=(2,2))
+    loc = mx.symbol.Activation(data = loc, act_type='relu')
+    loc = mx.symbol.Pooling(data=loc, kernel=(2, 2), stride=(2, 2), pool_type='max')
+    loc = mx.symbol.Convolution(data=loc, num_filter=60, kernel=(3, 3), stride=(1,1), pad=(1, 1))
+    loc = mx.symbol.Activation(data = loc, act_type='relu')
+    loc = mx.symbol.Pooling(data=loc, global_pool=True, kernel=(2, 2), pool_type='avg')
+    loc = mx.symbol.Flatten(data=loc)
+    loc = mx.symbol.FullyConnected(data=loc, num_hidden=6, name="stn_loc", attr=attr)
+    return loc
+
 def get_mlp():
     """
     multi-layer perceptron
@@ -29,13 +44,16 @@ def get_mlp():
     mlp  = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax')
     return mlp
 
-def get_lenet():
+def get_lenet(add_stn=False):
     """
     LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick
     Haffner. "Gradient-based learning applied to document recognition."
     Proceedings of the IEEE (1998)
     """
     data = mx.symbol.Variable('data')
+    if(add_stn):
+        data = mx.sym.SpatialTransformer(data=data, loc=get_loc(data), target_shape = (28,28),
+                                         transform_type="affine", sampler_type="bilinear")
     # first conv
     conv1 = mx.symbol.Convolution(data=data, kernel=(5,5), num_filter=20)
     tanh1 = mx.symbol.Activation(data=conv1, act_type="tanh")
@@ -88,7 +106,7 @@ def get_iterator_impl(args, kv):
 def parse_args():
     parser = argparse.ArgumentParser(description='train an image classifer on mnist')
     parser.add_argument('--network', type=str, default='mlp',
-                        choices = ['mlp', 'lenet'],
+                        choices = ['mlp', 'lenet', 'lenet-stn'],
                         help = 'the cnn to use')
     parser.add_argument('--data-dir', type=str, default='mnist/',
                         help='the input data directory')
@@ -124,6 +142,9 @@ def parse_args():
     if args.network == 'mlp':
         data_shape = (784, )
         net = get_mlp()
+    elif args.network == 'lenet-stn':
+        data_shape = (1, 28, 28)
+        net = get_lenet(True)
     else:
         data_shape = (1, 28, 28)
         net = get_lenet()
diff --git a/ps-lite b/ps-lite
index 8aff164580f0..708d4daded09 160000
--- a/ps-lite
+++ b/ps-lite
@@ -1 +1 @@
-Subproject commit 8aff164580f0e4ff81ad98038b6ec4ec02452ce8
+Subproject commit 708d4daded09e857faf315c13206671daf5c2928
diff --git a/python/mxnet/initializer.py b/python/mxnet/initializer.py
index 3d59443419e6..d308626a6377 100644
--- a/python/mxnet/initializer.py
+++ b/python/mxnet/initializer.py
@@ -1,4 +1,5 @@
 # coding: utf-8
+# pylint: disable=too-many-branches
 """Initialization helper for mxnet"""
 from __future__ import absolute_import
 
@@ -29,6 +30,10 @@ def __call__(self, name, arr):
             raise TypeError('arr must be NDArray')
         if name.startswith('upsampling'):
             self._init_bilinear(name, arr)
+        elif name.startswith('stn_loc') and name.endswith('weight'):
+            self._init_zero(name, arr)
+        elif name.startswith('stn_loc') and name.endswith('bias'):
+            self._init_loc_bias(name, arr)
         elif name.endswith('bias'):
             self._init_bias(name, arr)
         elif name.endswith('gamma'):
@@ -59,6 +64,11 @@ def _init_bilinear(self, _, arr):
             weight[i] = (1 - abs(x / f - c)) * (1 - abs(y / f - c))
         arr[:] = weight.reshape(shape)
 
+    def _init_loc_bias(self, _, arr):
+        shape = arr.shape
+        assert(shape[0] == 6)  # six affine coefficients expected
+        arr[:] = np.array([1.0, 0, 0, 0, 1.0, 0])  # flattened 2x3 identity: [[1, 0, 0], [0, 1, 0]]
+
     def _init_zero(self, _, arr):
         arr[:] = 0.0
 
diff --git a/src/operator/cudnn_spatial_transformer-inl.h b/src/operator/cudnn_spatial_transformer-inl.h
new file mode 100644
index 000000000000..12e30b603582
--- /dev/null
+++ b/src/operator/cudnn_spatial_transformer-inl.h
@@ -0,0 +1,182 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file cudnn_spatial_transformer-inl.h
+ * \brief
+ * \author Wei Wu
+*/
+#ifndef MXNET_OPERATOR_CUDNN_SPATIAL_TRANSFORMER_INL_H_
+#define MXNET_OPERATOR_CUDNN_SPATIAL_TRANSFORMER_INL_H_
+
+#include <algorithm>
+#include <vector>
+#include "./spatial_transformer-inl.h"
+namespace mxnet {
+namespace op {
+#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5
+template<typename DType>
+class CuDNNSpatialTransformerOp : public Operator {
+ public:
+  explicit CuDNNSpatialTransformerOp(SpatialTransformerParam param) {
+    this->param_ = param;
+    init_cudnn_ = false;
+    dtype_ = mshadow::DataType<DType>::kCudnnFlag;
+    if (param_.sampler_type == st::kBilinear) {
+      sampler_ = CUDNN_SAMPLER_BILINEAR;
+    }
+  }
+
+  ~CuDNNSpatialTransformerOp() {
+    if (init_cudnn_) {
+      CHECK_EQ(cudnnDestroySpatialTransformerDescriptor(st_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnDestroyTensorDescriptor(in_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnDestroyTensorDescriptor(out_desc_), CUDNN_STATUS_SUCCESS);
+    }
+  }
+
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    CHECK_EQ(in_data.size(), 2);
+    CHECK_EQ(out_data.size(), 3);
+    Stream<gpu> *s = ctx.get_stream<gpu>();
+    Tensor<gpu, 4, DType> data = in_data[st::kData].get<gpu, 4, DType>(s);
+    Tensor<gpu, 4, DType> out = out_data[st::kOut].get<gpu, 4, DType>(s);
+    Shape<3> loc_shape = Shape3(data.size(0), 2, 3);
+    Shape<4> grid_shape = Shape4(out.size(0), out.size(2), out.size(3), 2);
+    Tensor<gpu, 3, DType> loc = in_data[st::kLoc].get_with_shape<gpu, 3, DType>(loc_shape, s);
+    Tensor<gpu, 4, DType> grid = out_data[st::kGridSrc]
+                                .get_with_shape<gpu, 4, DType>(grid_shape, s);
+    if (!init_cudnn_) {
+     Init(s, in_data, out_data);
+    }
+    CHECK_EQ(data.CheckContiguous(), true);
+    CHECK_EQ(out.CheckContiguous(), true);
+    typename DataType<DType>::ScaleType alpha = 1.0f;
+    typename DataType<DType>::ScaleType beta = 0.0f;
+    if (param_.transform_type == st::kAffine) {
+      CHECK_EQ(cudnnSpatialTfGridGeneratorForward(s->dnn_handle_,
+                                                  st_desc_,
+                                                  loc.dptr_,
+                                                  grid.dptr_/*output*/), CUDNN_STATUS_SUCCESS);
+    }
+    CHECK_EQ(cudnnSpatialTfSamplerForward(s->dnn_handle_,
+                                          st_desc_,
+                                          &alpha,
+                                          in_desc_,
+                                          data.dptr_,
+                                          grid.dptr_,
+                                          &beta,
+                                          out_desc_,
+                                          out.dptr_/*output*/), CUDNN_STATUS_SUCCESS);
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    CHECK_EQ(in_data.size(), 2);
+    CHECK_EQ(out_data.size(), 3);
+    CHECK_EQ(out_grad.size(), 1);
+    Stream<gpu> *s = ctx.get_stream<gpu>();
+    Tensor<gpu, 4, DType> data = in_data[st::kData].get<gpu, 4, DType>(s);
+    Tensor<gpu, 4, DType> grad = out_grad[st::kOut].get<gpu, 4, DType>(s);
+    Tensor<gpu, 4, DType> ddata = in_grad[st::kData].get<gpu, 4, DType>(s);
+    Shape<3> loc_shape = Shape3(data.size(0), 2, 3);
+    Shape<4> grid_shape = Shape4(grad.size(0), grad.size(2), grad.size(3), 2);
+    Tensor<gpu, 3, DType> dloc = in_grad[st::kLoc].get_with_shape<gpu, 3, DType>(loc_shape, s);
+    Tensor<gpu, 4, DType> grid = out_data[st::kGridSrc]
+                    .get_with_shape<gpu, 4, DType>(grid_shape, s);
+    // do not use out_grad[st::kGridSrc]: dgrid is an intermediate tensor that is not included in
+    // DeclareBackwardDependency; besides, we can reuse grid as the in-place output buffer
+    typename DataType<DType>::ScaleType alpha = 1.0f;
+    typename DataType<DType>::ScaleType beta = 0.0f;
+    typename DataType<DType>::ScaleType alpha_dgrid = 1.0f;
+    typename DataType<DType>::ScaleType beta_dgrid = 0.0f;
+    CHECK_EQ(cudnnSpatialTfSamplerBackward(s->dnn_handle_,
+                                           st_desc_,
+                                           &alpha,
+                                           in_desc_,
+                                           data.dptr_,
+                                           &beta,
+                                           in_desc_/*reuse in_desc_*/,
+                                           ddata.dptr_/*output*/,
+                                           &alpha_dgrid,
+                                           out_desc_/*reuse out_desc_*/,
+                                           grad.dptr_,
+                                           grid.dptr_,
+                                           &beta_dgrid,
+                                           grid.dptr_/*output, reuse grid*/), CUDNN_STATUS_SUCCESS);
+    if (param_.transform_type == st::kAffine) {
+      CHECK_EQ(cudnnSpatialTfGridGeneratorBackward(s->dnn_handle_,
+                                                   st_desc_,
+                                                   grid.dptr_,
+                                                   dloc.dptr_/*out*/), CUDNN_STATUS_SUCCESS);
+    }
+  }
+
+ private:
+  inline void Init(mshadow::Stream<gpu> *s,
+                   const std::vector<TBlob> &in_data,
+                   const std::vector<TBlob> &out_data) {
+    using namespace mshadow;
+    #if CUDNN_MAJOR == 5
+    format_ = CUDNN_TENSOR_NCHW;
+    #endif
+    CHECK_EQ(in_data.size(), 2);
+    CHECK_EQ(out_data.size(), 3);
+    if (!init_cudnn_) {
+      init_cudnn_ = true;
+      Tensor<gpu, 4, DType> data = in_data[st::kData].get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> out = out_data[st::kOut].get<gpu, 4, DType>(s);
+      CHECK_EQ(cudnnCreateSpatialTransformerDescriptor(&st_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_,
+                                          format_,
+                                          dtype_,
+                                          data.size(0),
+                                          data.size(1),
+                                          data.size(2),
+                                          data.size(3)), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_,
+                                          format_,
+                                          dtype_,
+                                          out.size(0),
+                                          out.size(1),
+                                          out.size(2),
+                                          out.size(3)), CUDNN_STATUS_SUCCESS);
+      if (param_.sampler_type == st::kBilinear) {
+        int dim[] = {static_cast<int>(out.size(0)), static_cast<int>(out.size(1)),
+                     static_cast<int>(out.size(2)), static_cast<int>(out.size(3))};
+        CHECK_EQ(cudnnSetSpatialTransformerNdDescriptor(st_desc_,
+                                                        sampler_,
+                                                        dtype_,
+                                                        4,
+                                                        dim) , CUDNN_STATUS_SUCCESS);
+      }
+    }
+  }
+
+  bool init_cudnn_;
+  cudnnDataType_t dtype_;
+  cudnnSpatialTransformerDescriptor_t st_desc_;
+  cudnnTensorDescriptor_t in_desc_;
+  cudnnTensorDescriptor_t out_desc_;
+  cudnnSamplerType_t sampler_;
+  #if CUDNN_MAJOR == 5
+  cudnnTensorFormat_t format_;
+  #endif
+  SpatialTransformerParam param_;
+};
+#endif  // __CUDACC__ && CUDNN
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_CUDNN_SPATIAL_TRANSFORMER_INL_H_
diff --git a/src/operator/spatial_transformer-inl.h b/src/operator/spatial_transformer-inl.h
new file mode 100644
index 000000000000..74d35ffd7b9e
--- /dev/null
+++ b/src/operator/spatial_transformer-inl.h
@@ -0,0 +1,264 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file spatial_transformer-inl.h
+ * \brief
+ *  Reproducing paper: Jaderberg M, Simonyan K, Zisserman A. "Spatial transformer networks"
+ * \author Wei Wu
+*/
+#ifndef MXNET_OPERATOR_SPATIAL_TRANSFORMER_INL_H_
+#define MXNET_OPERATOR_SPATIAL_TRANSFORMER_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "./operator_common.h"
+
+
+namespace mxnet {
+namespace op {
+
+namespace st {
+enum SpatialTransformerOpInputs {kData, kLoc};
+enum SpatialTransformerOpOutputs {kOut, kGridDst, kGridSrc};
+enum SpatialTransformerOpResource {kTempSpace};
+enum SpatialTransformerTransformType {kAffine};
+enum SpatialTransformerSamplerType {kBilinear};
+}
+
+struct SpatialTransformerParam : public dmlc::Parameter<SpatialTransformerParam> {
+  TShape target_shape;
+  int transform_type;
+  int sampler_type;
+  DMLC_DECLARE_PARAMETER(SpatialTransformerParam) {
+    int shape[] = {0, 0};
+    DMLC_DECLARE_FIELD(target_shape).set_default(TShape(shape, shape + 2))
+        .describe("output shape(h, w) of spatial transformer: (y, x)");
+    DMLC_DECLARE_FIELD(transform_type).add_enum("affine", st::kAffine)
+        .describe("transformation type");
+    DMLC_DECLARE_FIELD(sampler_type).add_enum("bilinear", st::kBilinear)
+        .describe("sampling type");
+  }
+};
+
+template<typename xpu, typename DType>
+class SpatialTransformerOp : public Operator {
+ public:
+  explicit SpatialTransformerOp(SpatialTransformerParam p) {
+    this->param_ = p;
+  }
+
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 2);
+    CHECK_EQ(out_data.size(), 3);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> data = in_data[st::kData].get<xpu, 4, DType>(s);
+    Tensor<xpu, 4, DType> out = out_data[st::kOut].get<xpu, 4, DType>(s);
+    Tensor<xpu, 2, DType> grid_dst = out_data[st::kGridDst].get<xpu, 2, DType>(s);
+    Tensor<xpu, 3, DType> grid_src = out_data[st::kGridSrc].get<xpu, 3, DType>(s);
+    Shape<3> loc_shape = Shape3(data.size(0), 2, 3);
+    Tensor<xpu, 3, DType> loc = in_data[st::kLoc].get_with_shape<xpu, 3, DType>(loc_shape, s);
+    Tensor<cpu, 2, DType> workspace =
+          ctx.requested[st::kTempSpace].get_host_space_typed<2, DType>(
+          grid_dst.shape_);
+    for (index_t i = 1; i <= workspace.size(1); i++) {
+      // grid dst coordinate is (x, y, 1)
+      workspace[0][i-1] = -1.0 + (i-1) % param_.target_shape[1] * 2.0 /
+                          (param_.target_shape[1] - 1);
+      workspace[1][i-1] = -1.0 + (i-1) / param_.target_shape[1] * 2.0 /
+                          (param_.target_shape[0] - 1);
+      workspace[2][i-1] = 1.0;
+    }
+    Copy(grid_dst, workspace, grid_dst.stream_);
+    for (index_t batch = 0; batch < data.size(0); batch++) {
+        if (param_.transform_type == st::kAffine) {
+          grid_src[batch] = dot(loc[batch], grid_dst);
+        }
+    }
+    if (param_.sampler_type == st::kBilinear) {
+      BilinearSamplingForward(out, data, grid_src);
+    }
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 2);
+    CHECK_EQ(out_data.size(), 3);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> data = in_data[st::kData].get<xpu, 4, DType>(s);
+    Tensor<xpu, 4, DType> grad = out_grad[st::kOut].get<xpu, 4, DType>(s);
+    Tensor<xpu, 4, DType> gdata = in_grad[st::kData].get<xpu, 4, DType>(s);
+    Tensor<xpu, 2, DType> grid_dst = out_data[st::kGridDst].get<xpu, 2, DType>(s);
+    Tensor<xpu, 3, DType> grid_src = out_data[st::kGridSrc].get<xpu, 3, DType>(s);
+    Shape<3> loc_shape = Shape3(data.size(0), 2, 3);
+    Tensor<xpu, 3, DType> gloc = in_grad[st::kLoc].get_with_shape<xpu, 3, DType>(loc_shape, s);
+    gdata = 0.0;
+    if (param_.sampler_type == st::kBilinear) {
+      BilinearSamplingBackward(gdata, grid_src, grad, data);
+    }
+    for (index_t batch = 0; batch < data.size(0); batch++) {
+        if (param_.transform_type == st::kAffine) {
+          gloc[batch] = dot(grid_src[batch], grid_dst.T());
+        }
+    }
+  }
+
+ private:
+  SpatialTransformerParam param_;
+};  // class SpatialTransformerOp
+
+template<typename xpu>
+Operator* CreateOp(SpatialTransformerParam param, int dtype);
+
+#if DMLC_USE_CXX11
+class SpatialTransformerProp : public OperatorProperty {
+ public:
+  int NumVisibleOutputs() const override {
+    return 1;
+  }
+
+  int NumOutputs() const override {
+    return 3;
+  }
+
+  std::vector<std::string> ListArguments() const override {
+      return {"data", "loc"};
+  }
+
+  std::vector<std::string> ListOutputs() const override {
+    return {"output", "grid_dst", "grid_src"};
+  }
+
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.Init(kwargs);
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  // Infer output shapes from the data/loc input shapes and target_shape.
+  // Outputs: out (N, C, target_h, target_w), grid_dst (3, target_h*target_w),
+  // grid_src (N, 2, target_h*target_w). Returns false while an input shape is empty.
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 2) << "Input:[data, loc]";
+    CHECK_EQ(param_.transform_type, st::kAffine) << "only supports affine transform currently";
+    CHECK_EQ(param_.sampler_type, st::kBilinear) << "only supports bilinear sampling currently";
+    const TShape &dshape = (*in_shape)[st::kData];
+    const TShape &lshape = (*in_shape)[st::kLoc];
+    if (dshape.ndim() ==  0) return false;
+    CHECK_EQ(dshape.ndim(), 4) << "input data should be 4D in batch-num_filter-y-x";
+    if (lshape.ndim() ==  0) return false;
+    CHECK_EQ(lshape.ndim(), 2) << "localisation parameter should be 2D in batch-num_hidden";
+    if (param_.transform_type == st::kAffine) {
+      CHECK_EQ(lshape[1], 6) << "incorrect localisation network shape[1], should be 6";
+    }
+    out_shape->clear();
+    // kOut starts as a copy of the data shape; only the two spatial dims are replaced below.
+    out_shape->push_back(dshape);
+    CHECK_GT(param_.target_shape[0], 0) << "incorrect target_shape: " << param_.target_shape[0];
+    CHECK_GT(param_.target_shape[1], 0) << "incorrect target_shape: " << param_.target_shape[1];
+    (*out_shape)[st::kOut][2] = param_.target_shape[0];
+    (*out_shape)[st::kOut][3] = param_.target_shape[1];
+    out_shape->push_back(Shape2(3, param_.target_shape[0]*param_.target_shape[1]));
+    out_shape->push_back(Shape3(dshape[0], 2, param_.target_shape[0]*param_.target_shape[1]));
+    return true;
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                   std::vector<int> *out_type,
+                   std::vector<int> *aux_type) const override {
+      int dtype = -1;
+      for (size_t i = 0; i < in_type->size(); ++i) {
+        if (dtype == -1) {
+          dtype = in_type->at(i);
+        } else {
+          CHECK(in_type->at(i) == dtype ||
+                in_type->at(i) == -1) <<
+                "Non-uniform data type in SpatialTransformer";
+        }
+      }
+      if (dtype == -1) {
+        LOG(FATAL) << "Not enough information to infer type in SpatialTransformer.";
+        return false;
+      }
+      size_t nin = this->ListArguments().size();
+      in_type->clear();
+      for (size_t i = 0; i < nin; ++i) in_type->push_back(dtype);
+      size_t naux = this->ListAuxiliaryStates().size();
+      aux_type->clear();
+      for (size_t i = 0; i < naux; ++i) aux_type->push_back(dtype);
+      size_t nout = this->ListOutputs().size();
+      out_type->clear();
+      for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype);
+      return true;
+    }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new SpatialTransformerProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "SpatialTransformer";
+  }
+
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return {out_grad[st::kOut],
+            out_data[st::kGridDst],
+            out_data[st::kGridSrc],
+            in_data[st::kData]
+           };
+  }
+
+  std::vector<ResourceRequest> ForwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return {ResourceRequest::kTempSpace};
+  }
+
+  #if CUDNN_MAJOR == 5
+  std::vector<ResourceRequest> BackwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return {ResourceRequest::kTempSpace};
+  }
+  #endif
+
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented.";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
+
+ private:
+  SpatialTransformerParam param_;
+};  // class SpatialTransformerProp
+#endif  // DMLC_USE_CXX11
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_SPATIAL_TRANSFORMER_INL_H_
diff --git a/src/operator/spatial_transformer.cc b/src/operator/spatial_transformer.cc
new file mode 100644
index 000000000000..de1dc733ef57
--- /dev/null
+++ b/src/operator/spatial_transformer.cc
@@ -0,0 +1,138 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file spatial_transformer.cc
+ * \brief CPU bilinear sampling kernels and registration of the SpatialTransformer operator
+ * \author Wei Wu
+*/
+
+#include "./spatial_transformer-inl.h"
+
+namespace mshadow {
+template<typename DType>
+inline void BilinearSamplingForward(const Tensor<cpu, 4, DType> &output,
+                                    const Tensor<cpu, 4, DType> &input,
+                                    const Tensor<cpu, 3, DType> grid_src) {
+  // output(n, c, h, w) = bilinear sample of input(n, c) at grid_src's (x, y), both in [-1, 1]
+  DType *out = output.dptr_;
+  const DType *data = input.dptr_;
+  const DType *grid = grid_src.dptr_;
+  int o_n = output.size(0), o_c = output.size(1), o_h = output.size(2), o_w = output.size(3);
+  int i_c = input.size(1), i_h = input.size(2), i_w = input.size(3);
+  // neighbour strides collapse to 0 on 1-pixel-wide axes so every read stays inside input
+  const int right = i_w > 1 ? 1 : 0, below = i_h > 1 ? i_w : 0;
+  for (index_t n = 0; n < o_n; ++n) {
+    for (index_t c = 0; c < o_c; ++c) {
+      for (index_t h = 0; h < o_h; ++h) {
+        for (index_t w = 0; w < o_w; ++w) {
+          index_t out_index = n * o_c * o_h * o_w + c * o_h * o_w + h * o_w + w;
+          index_t grid_index = n * o_h * o_w * 2 + h * o_w + w;
+          DType y_real = (*(grid + grid_index + o_h * o_w) + 1) * (i_h - 1) / 2;
+          DType x_real = (*(grid + grid_index) + 1) * (i_w - 1) / 2;
+          // clamp the top-left corner so that its bottom/right neighbours are valid pixels
+          index_t top_left_y = std::max(0, std::min(i_h - 2, static_cast<int>(floor(y_real))));
+          index_t top_left_x = std::max(0, std::min(i_w - 2, static_cast<int>(floor(x_real))));
+          DType top_left_y_w = 1.0 - (y_real - top_left_y);
+          DType top_left_x_w = 1.0 - (x_real - top_left_x);
+          const DType *tl = data + (n * i_c + c) * i_h * i_w + top_left_y * i_w + top_left_x;
+          *(out + out_index) = tl[0] * top_left_y_w * top_left_x_w +
+                               tl[right] * top_left_y_w * (1.0 - top_left_x_w) +
+                               tl[below] * (1.0 - top_left_y_w) * top_left_x_w +
+                               tl[below + right] * (1.0 - top_left_y_w) * (1.0 - top_left_x_w);
+        }
+      }
+    }
+  }
+}
+
+template<typename DType>
+inline void BilinearSamplingBackward(const Tensor<cpu, 4, DType> &input_grad,
+                                     const Tensor<cpu, 3, DType> &grid_src_data,
+                                     const Tensor<cpu, 4, DType> &output_grad,
+                                     const Tensor<cpu, 4, DType> &input_data) {
+  // accumulates the input gradient into input_grad; overwrites grid_src_data with its gradient
+  DType *g_input = input_grad.dptr_;
+  DType *grid_src = grid_src_data.dptr_;
+  const DType *grad = output_grad.dptr_;
+  const DType *data = input_data.dptr_;
+  int o_n = output_grad.size(0), o_c = output_grad.size(1),
+      o_h = output_grad.size(2), o_w = output_grad.size(3);
+  int i_c = input_data.size(1), i_h = input_data.size(2), i_w = input_data.size(3);
+  // neighbour strides collapse to 0 on 1-pixel-wide axes so no access leaves the input image
+  const int right = i_w > 1 ? 1 : 0, below = i_h > 1 ? i_w : 0;
+  for (index_t n = 0; n < o_n; ++n) {
+    for (index_t h = 0; h < o_h; ++h) {
+      for (index_t w = 0; w < o_w; ++w) {
+        DType top_left_y_gw = 0.0, top_left_x_gw = 0.0;
+        index_t grid_src_index = n * o_h * o_w * 2 + h * o_w + w;
+        DType y_real = (*(grid_src + grid_src_index + o_h * o_w) + 1) * (i_h - 1) / 2;
+        DType x_real = (*(grid_src + grid_src_index) + 1) * (i_w - 1) / 2;
+        // clamp the top-left corner so that its bottom/right neighbours are valid pixels
+        index_t top_left_y = std::max(0, std::min(i_h - 2, static_cast<int>(floor(y_real))));
+        index_t top_left_x = std::max(0, std::min(i_w - 2, static_cast<int>(floor(x_real))));
+        DType top_left_y_w = 1.0 - (y_real - top_left_y);
+        DType top_left_x_w = 1.0 - (x_real - top_left_x);
+        for (index_t c = 0; c < o_c; ++c) {
+          const DType g = *(grad + n * o_c * o_h * o_w + c * o_h * o_w + h * o_w + w);
+          index_t data_index = n * i_c * i_h * i_w + c * i_h * i_w + top_left_y * i_w + top_left_x;
+          // values of the 4 input pixels surrounding the sampling point
+          DType top_left_v = *(data + data_index);
+          DType top_right_v = *(data + data_index + right);
+          DType bottom_left_v = *(data + data_index + below);
+          DType bottom_right_v = *(data + data_index + below + right);
+          // scatter the output gradient to those 4 pixels using the bilinear weights
+          *(g_input + data_index) += g * top_left_y_w * top_left_x_w;
+          *(g_input + data_index + right) += g * top_left_y_w * (1.0 - top_left_x_w);
+          *(g_input + data_index + below) += g * (1.0 - top_left_y_w) * top_left_x_w;
+          *(g_input + data_index + below + right) += g * (1.0 - top_left_y_w)
+                                                     * (1.0 - top_left_x_w);
+          // gradient of the top-left weights; multiplied by -1 it is the gradient of grid_src
+          top_left_y_gw -= g * (top_right_v - bottom_right_v +
+                           (top_left_v - top_right_v - bottom_left_v + bottom_right_v)
+                           * top_left_x_w);
+          top_left_x_gw -= g * (bottom_left_v - bottom_right_v +
+                           (top_left_v - top_right_v - bottom_left_v + bottom_right_v)
+                           * top_left_y_w);
+        }
+        // rescale from pixel units back to the [-1, 1] grid units and store the grid_src grad
+        *(grid_src + grid_src_index + o_h * o_w) = top_left_y_gw * (i_h - 1) / 2;
+        *(grid_src + grid_src_index) = top_left_x_gw * (i_w - 1) / 2;
+      }
+    }
+  }
+}
+
+}  // namespace mshadow
+
+namespace mxnet {
+namespace op {
+template<>
+Operator* CreateOp<cpu>(SpatialTransformerParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new SpatialTransformerOp<cpu, DType>(param);
+  })
+  return op;
+}
+
+Operator *SpatialTransformerProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                     std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferType(in_type, &out_type, &aux_type));
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
+}
+
+DMLC_REGISTER_PARAMETER(SpatialTransformerParam);
+
+MXNET_REGISTER_OP_PROPERTY(SpatialTransformer, SpatialTransformerProp)
+.add_argument("data", "Symbol", "Input data to the SpatialTransformerOp.")
+.add_argument("loc", "Symbol", "localisation net, the output dim should be 6 when transform_type "
+              "is affine, and the name of loc symbol should preferably start with 'stn_loc' so "
+              "that it is initialized with the identity transform; otherwise you should "
+              "initialize the weight and bias by yourself.")
+.add_arguments(SpatialTransformerParam::__FIELDS__())
+.describe("Apply spatial transformer to input feature map.");
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/spatial_transformer.cu b/src/operator/spatial_transformer.cu
new file mode 100644
index 000000000000..11b4d54bf139
--- /dev/null
+++ b/src/operator/spatial_transformer.cu
@@ -0,0 +1,160 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file spatial_transformer.cu
+ * \brief CUDA bilinear sampling kernels for the SpatialTransformer operator
+ * \author Wei Wu
+*/
+
+#include "./spatial_transformer-inl.h"
+#include <algorithm>
+#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5
+#include "./cudnn_spatial_transformer-inl.h"
+#endif  // MXNET_USE_CUDNN && CUDNN_MAJOR
+
+namespace mshadow {
+template<typename DType>
+__global__ void BilinearSamplingForwardKernel(const int i_c, const int i_h,
+                                              const int i_w, const DType* data,
+                                              const DType* grid, const int o_n,
+                                              const int o_c, const int o_h,
+                                              const int o_w, DType* out) {
+  // neighbour strides collapse to 0 on 1-pixel-wide axes so every read stays inside data
+  const int right = i_w > 1 ? 1 : 0, below = i_h > 1 ? i_w : 0;
+  // each output element is produced by one thread; the loop step is the total thread count
+  for (int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+       index < o_n * o_c * o_h * o_w;
+       index += blockDim.x * gridDim.x * gridDim.y) {
+    // (n, c, h, w) is the element in out
+    int w = index % o_w;
+    int h = (index / o_w) % o_h;
+    int c = (index / o_w / o_h) % o_c;
+    int n = index / o_w / o_h / o_c;
+    index_t out_index = n * o_c * o_h * o_w + c * o_h * o_w + h * o_w + w;
+    index_t grid_index = n * o_h * o_w * 2 + h * o_w + w;
+    DType y_real = (*(grid + grid_index + o_h * o_w) + 1) * (i_h - 1) / 2;
+    DType x_real = (*(grid + grid_index) + 1) * (i_w - 1) / 2;
+    // clamp the top-left corner so that its bottom/right neighbours are valid pixels
+    index_t top_left_y = max(0, min(i_h - 2, static_cast<int>(floor(y_real))));
+    index_t top_left_x = max(0, min(i_w - 2, static_cast<int>(floor(x_real))));
+    DType top_left_y_w = 1.0 - (y_real - top_left_y);
+    DType top_left_x_w = 1.0 - (x_real - top_left_x);
+    const DType *tl = data + (n * i_c + c) * i_h * i_w + top_left_y * i_w + top_left_x;
+    *(out + out_index) = tl[0] * top_left_y_w * top_left_x_w +
+                         tl[right] * top_left_y_w * (1.0 - top_left_x_w) +
+                         tl[below] * (1.0 - top_left_y_w) * top_left_x_w +
+                         tl[below + right] * (1.0 - top_left_y_w) * (1.0 - top_left_x_w);
+  }
+}
+
+template<typename DType>
+__global__ void BilinearSamplingBackwardKernel(const int i_c, const int i_h,
+                                              const int i_w, const DType* grad,
+                                              const DType* data, const int o_n,
+                                              const int o_c, const int o_h,
+                                              const int o_w, DType* g_input,
+                                              DType* grid_src) {
+  // clamped corner + strides that collapse to 0 on 1-pixel axes keep all accesses in bounds
+  const int right = i_w > 1 ? 1 : 0, below = i_h > 1 ? i_w : 0;
+  for (int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
+       index < o_n * o_h * o_w;
+       index += blockDim.x * gridDim.x * gridDim.y) {
+    // (n, h, w) is the location in grad; all o_c channels are handled by the loop below
+    int w = index % o_w;
+    int h = (index / o_w) % o_h;
+    int n = index / o_w / o_h;
+    DType top_left_y_gw = 0.0, top_left_x_gw = 0.0;
+    index_t grid_src_index = n * o_h * o_w * 2 + h * o_w + w;
+    DType y_real = (*(grid_src + grid_src_index + o_h * o_w) + 1) * (i_h - 1) / 2;
+    DType x_real = (*(grid_src + grid_src_index) + 1) * (i_w - 1) / 2;
+    index_t top_left_y = max(0, min(i_h - 2, static_cast<int>(floor(y_real))));
+    index_t top_left_x = max(0, min(i_w - 2, static_cast<int>(floor(x_real))));
+    DType top_left_y_w = 1.0 - (y_real - top_left_y);
+    DType top_left_x_w = 1.0 - (x_real - top_left_x);
+    for (index_t c = 0; c < o_c; ++c) {
+      const DType g = *(grad + n * o_c * o_h * o_w + c * o_h * o_w + h * o_w + w);
+      index_t data_index = n * i_c * i_h * i_w + c * i_h * i_w + top_left_y * i_w + top_left_x;
+      // values of the 4 input pixels surrounding the sampling point
+      DType top_left_v = *(data + data_index);
+      DType top_right_v = *(data + data_index + right);
+      DType bottom_left_v = *(data + data_index + below);
+      DType bottom_right_v = *(data + data_index + below + right);
+      // calc input grad; NOTE(review): non-atomic +=, threads sharing a pixel may race - confirm
+      *(g_input + data_index) += g * top_left_y_w * top_left_x_w;
+      *(g_input + data_index + right) += g * top_left_y_w * (1.0 - top_left_x_w);
+      *(g_input + data_index + below) += g * (1.0 - top_left_y_w) * top_left_x_w;
+      *(g_input + data_index + below + right) += g * (1.0 - top_left_y_w) * (1.0 - top_left_x_w);
+      // gradient of the top-left weights; multiplied by -1 it is the gradient of grid_src
+      top_left_y_gw -= g * (top_right_v - bottom_right_v +
+                       (top_left_v - top_right_v - bottom_left_v + bottom_right_v) * top_left_x_w);
+      top_left_x_gw -= g * (bottom_left_v - bottom_right_v + (top_left_v -
+                       top_right_v - bottom_left_v + bottom_right_v) * top_left_y_w);
+    }
+    // rescale from pixel units back to the [-1, 1] grid units and store the grid_src grad
+    *(grid_src + grid_src_index + o_h * o_w) = top_left_y_gw * (i_h - 1) / 2;
+    *(grid_src + grid_src_index) = top_left_x_gw * (i_w - 1) / 2;
+  }
+}
+
+template<typename DType>
+inline void BilinearSamplingForward(const Tensor<gpu, 4, DType> &output,
+                                    const Tensor<gpu, 4, DType> &input,
+                                    const Tensor<gpu, 3, DType> grid_src) {
+    DType *out = output.dptr_;
+    const DType *data = input.dptr_;
+    const DType *grid = grid_src.dptr_;
+    int o_n = output.size(0), o_c = output.size(1), o_h = output.size(2), o_w = output.size(3);
+    int i_c = input.size(1), i_h = input.size(2), i_w = input.size(3);
+    using namespace cuda;
+    const int max_block = (output.shape_.Size() + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock;
+    dim3 num_blocks(kMaxGridNum, (max_block + kMaxGridNum - 1) / kMaxGridNum);
+    dim3 threads_per_block(kMaxThreadsPerBlock);
+    CheckLaunchParam(num_blocks, threads_per_block, "spatial transformer forward");
+    cudaStream_t stream = Stream<gpu>::GetStream(output.stream_);
+    BilinearSamplingForwardKernel<DType> << <num_blocks, threads_per_block, 0, stream >> >(
+      i_c, i_h, i_w, data, grid, o_n, o_c, o_h, o_w, out);
+}
+
+template<typename DType>
+inline void BilinearSamplingBackward(const Tensor<gpu, 4, DType> &input_grad,
+                                     const Tensor<gpu, 3, DType> &grid_src_data,
+                                     const Tensor<gpu, 4, DType> &output_grad,
+                                     const Tensor<gpu, 4, DType> &input_data) {
+  DType *g_input = input_grad.dptr_;
+  DType *grid_src = grid_src_data.dptr_;
+  const DType *grad = output_grad.dptr_;
+  const DType *data = input_data.dptr_;
+  int o_n = output_grad.size(0), o_c = output_grad.size(1),
+      o_h = output_grad.size(2), o_w = output_grad.size(3);
+  int i_c = input_data.size(1), i_h = input_data.size(2), i_w = input_data.size(3);
+  using namespace cuda;
+  const int max_block = (output_grad.shape_.Size() / o_c + kMaxThreadsPerBlock - 1)
+                        / kMaxThreadsPerBlock;
+  dim3 num_blocks(kMaxGridNum, (max_block + kMaxGridNum - 1) / kMaxGridNum);
+  dim3 threads_per_block(kMaxThreadsPerBlock);
+  CheckLaunchParam(num_blocks, threads_per_block, "spatial transformer backward");
+  cudaStream_t stream = Stream<gpu>::GetStream(input_grad.stream_);
+  BilinearSamplingBackwardKernel<DType> << <num_blocks, threads_per_block, 0, stream >> >(
+    i_c, i_h, i_w, grad, data, o_n, o_c, o_h, o_w, g_input, grid_src);
+}
+
+}  // namespace mshadow
+
+namespace mxnet {
+namespace op {
+template<>
+Operator* CreateOp<gpu>(SpatialTransformerParam param, int dtype) {
+  Operator *op = NULL;
+#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new CuDNNSpatialTransformerOp<DType>(param);
+  })
+#else
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new SpatialTransformerOp<gpu, DType>(param);
+  })
+#endif  // MXNET_USE_CUDNN && CUDNN_MAJOR
+  return op;
+}
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 5ef79cfb6314..cc69f9289e1a 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -1135,6 +1135,47 @@ def test_flip():
             y = mx.nd.flip(x, axis=axis)
             assert_allclose(x.asnumpy()[idx], y.asnumpy())
 
+def test_stn():
+    """Check SpatialTransformer shape inference, forward output and data gradient on CPU."""
+    # the loc net has zero weights and bias [0.5, 0, 0, 0, 0.5, 0], i.e. a fixed centre crop at scale 0.5
+    num_filter = 2  # conv of loc net
+    kernel = (3, 3)  # conv of loc net
+    num_hidden = 6  # fc of loc net
+    for n in [1, 2, 3, 4]:
+        for c in [1, 2, 3, 4]:
+            for h in [5, 9, 13, 17]:  # for a convenient test, the third and fourth input dims should be 4x + 1
+                for w in [5, 9, 13, 17]:
+                    data_shape = (n, c, h, w)
+                    target_shape = (int((data_shape[2]+1)/2), int((data_shape[3]+1)/2))
+                    data = mx.sym.Variable(name="data")
+                    loc = mx.sym.Convolution(data=data, kernel=kernel, pad=(1, 1), num_filter=num_filter, name="loc_conv")
+                    loc = mx.sym.Flatten(data=loc)
+                    loc = mx.sym.FullyConnected(data=loc, num_hidden=num_hidden, name="loc_fc")
+                    stn = mx.sym.SpatialTransformer(data=data, loc=loc, target_shape=target_shape,
+                                                    transform_type="affine", sampler_type="bilinear")
+                    arg_names = stn.list_arguments()
+                    arg_shapes, out_shapes, _ = stn.infer_shape(data=data_shape)
+                    # check shape
+                    assert out_shapes[0] == (data_shape[0], data_shape[1], target_shape[0], target_shape[1])
+                    dev = mx.cpu()
+                    #dev = mx.gpu(0)
+                    args = {}
+                    args['data'] = mx.random.normal(0, 1, data_shape, dev)
+                    args['loc_conv_weight'] = mx.nd.zeros((num_filter, data_shape[1], kernel[0], kernel[1]), ctx=dev)
+                    args['loc_conv_bias'] = mx.nd.zeros((num_filter,), ctx=dev)
+                    args['loc_fc_weight'] = mx.nd.zeros((num_hidden, num_filter*data_shape[2]*data_shape[3]), ctx=dev)
+                    args['loc_fc_bias'] = mx.nd.array([0.5, 0, 0, 0, 0.5, 0], ctx=dev)
+                    grad_grad = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
+                    exe = stn.bind(dev, args=args, args_grad=grad_grad)
+                    exe.forward(is_train=True)
+                    out = exe.outputs[0].asnumpy()
+                    # check forward: output must equal the central crop (1e-4 leaves room for float32 rounding)
+                    assert reldiff(out, args['data'].asnumpy()[:, :, h//4:h-h//4, w//4:w-w//4]) < 1e-4
+                    out_grad = mx.nd.ones(out.shape, ctx=dev)
+                    exe.backward([out_grad])
+                    # check backward: every cropped input pixel must receive the gradient of one output pixel
+                    assert reldiff(out_grad.asnumpy(), grad_grad[arg_names.index('data')].asnumpy()[:, :, h//4:h-h//4, w//4:w-w//4]) < 1e-4
+
 if __name__ == '__main__':
     test_expand_dims()
     test_slice_axis()
@@ -1169,3 +1210,4 @@ def test_flip():
     test_reshape()
     test_reduce()
     test_broadcast()
+    test_stn()

From ca800df89851dae7e12787095322c4a8e74cc256 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 30 Jun 2016 18:10:33 -0700
Subject: [PATCH 077/126] [SYMBOL] Fix stackoverflow on big chain destruction
 (#2586)

---
 src/symbol/symbol.cc | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/src/symbol/symbol.cc b/src/symbol/symbol.cc
index e3abf8c0c7e1..01614a47d86a 100644
--- a/src/symbol/symbol.cc
+++ b/src/symbol/symbol.cc
@@ -63,6 +63,39 @@ struct Symbol::Node {
       attr.reset(new std::map<std::string, std::string>(*(other.attr)));
     }
   }
+  ~Node() {
+    if (inputs.size() != 0 || backward_source_node.get() != nullptr) {
+      // Explicit destructor: releasing a long chain of nodes through nested shared_ptr
+      // destructors recurses once per node and can overflow the stack. Instead run an
+      // iterative DFS that parks every node that is about to be deleted in to_delete.
+      std::vector<std::shared_ptr<Symbol::Node> > to_delete;
+      std::vector<Symbol::Node*> stack{this};
+
+      while (!stack.empty()) {
+        Node *n = stack.back();
+        stack.pop_back();
+
+        for (DataEntry& e : n->inputs) {
+          // if this entry holds the only reference, the source node is about to be deleted:
+          // visit it as well and keep it alive in to_delete until its own links are detached
+          if (e.source.unique()) {
+            stack.push_back(e.source.get());
+            to_delete.emplace_back(std::move(e.source));
+          } else {
+            // otherwise resetting the shared_ptr does not trigger the node's destructor
+            e.source.reset();
+          }
+        }
+        if (n->backward_source_node.unique()) {  // same treatment for the backward source
+          stack.push_back(n->backward_source_node.get());
+          to_delete.emplace_back(std::move(n->backward_source_node));
+        } else {
+          n->backward_source_node.reset();
+        }
+        n->inputs.clear();  // every entry was moved-from or reset above
+      }
+    }  // to_delete is destroyed here; the nodes it holds no longer reference other nodes
+  }
   /*! \return Whether the symbol is atomic */
   inline bool is_atomic() const {
     return inputs.size() == 0 && op != nullptr;

From 13bb5c138c19967f4af9aa0d1a901406d4effa9f Mon Sep 17 00:00:00 2001
From: unknown <T2Y@L1473SWK.dsone.3ds.com>
Date: Fri, 1 Jul 2016 20:54:32 -0400
Subject: [PATCH 078/126] Fixed a minor typo in markdown that breaks a link to
 file in github.

---
 docs/how_to/multi_devices.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/how_to/multi_devices.md b/docs/how_to/multi_devices.md
index a6611ad971bb..95ad8f8ba120 100644
--- a/docs/how_to/multi_devices.md
+++ b/docs/how_to/multi_devices.md
@@ -140,7 +140,7 @@ start a job by using `ssh`, `mpi`, `sge`, or `yarn`.
 
 Assume we are at the directory `mxnet/example/image-classification`.  and want
 to train mnist with lenet by using
-[train_mnist.py](https://github.com/dmlc/mxnet/blob/master/example/image-classification/train_mnist.py]).
+[train_mnist.py](https://github.com/dmlc/mxnet/blob/master/example/image-classification/train_mnist.py).
 On a single machine  we can run by
 
 ```bash

From 97fa49f5767421bbc717d61f40c6bdfd3c601872 Mon Sep 17 00:00:00 2001
From: Yushu Gao <shuokay@gmail.com>
Date: Sun, 3 Jul 2016 00:14:36 +0800
Subject: [PATCH 079/126] fixed typo in warpctc install doc which caused
 compile error (#2552)

---
 example/warpctc/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example/warpctc/README.md b/example/warpctc/README.md
index c34c2b4f55af..a07d1146a6d8 100644
--- a/example/warpctc/README.md
+++ b/example/warpctc/README.md
@@ -19,7 +19,7 @@ Baidu-warpctc is a CTC implement by Baidu which support GPU. CTC can be used wit
 
 ```
   comment out following lines in make/config.mk
-  WARPCTC_PATH = $(HOME)/warpctc
+  WARPCTC_PATH = $(HOME)/warp-ctc
   MXNET_PLUGINS += plugin/warpctc/warpctc.mk
   
   rebuild mxnet by

From 2a9ca98e3ac12fb48848a4f88c29260f66f250b4 Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Tue, 28 Jun 2016 18:00:22 -0700
Subject: [PATCH 080/126] softmax shape fix

---
 src/operator/softmax_output-inl.h | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/src/operator/softmax_output-inl.h b/src/operator/softmax_output-inl.h
index 6aa9c395a059..cacc77720f08 100644
--- a/src/operator/softmax_output-inl.h
+++ b/src/operator/softmax_output-inl.h
@@ -82,8 +82,13 @@ class SoftmaxOutputOp : public Operator {
           out_data[softmaxout_enum::kOut].get_with_shape<xpu, 3, DType>(s3, s);
       Softmax(out, data);
     } else {
-      Tensor<xpu, 2, DType> data = in_data[softmaxout_enum::kData].FlatTo2D<xpu, DType>(s);
-      Tensor<xpu, 2, DType> out = out_data[softmaxout_enum::kOut].FlatTo2D<xpu, DType>(s);
+      int n = in_data[softmaxout_enum::kData].size(0);
+      int k = in_data[softmaxout_enum::kData].Size()/n;
+      Shape<2> s2 = Shape2(n, k);
+      Tensor<xpu, 2, DType> data =
+          in_data[softmaxout_enum::kData].get_with_shape<xpu, 2, DType>(s2, s);
+      Tensor<xpu, 2, DType> out =
+          out_data[softmaxout_enum::kOut].get_with_shape<xpu, 2, DType>(s2, s);
       Softmax(out, data);
     }
   }
@@ -114,7 +119,9 @@ class SoftmaxOutputOp : public Operator {
       int n = out_data[softmaxout_enum::kOut].size(0);
       int k = out_data[softmaxout_enum::kOut].size(1);
       Shape<3> s3 = Shape3(n, k, static_cast<int>(out_data[softmaxout_enum::kOut].Size()/n/k));
-      Tensor<xpu, 2, DType> label = in_data[softmaxout_enum::kLabel].FlatTo2D<xpu, DType>(s);
+      Shape<2> s2 = Shape2(s3[0], s3[2]);
+      Tensor<xpu, 2, DType> label =
+          in_data[softmaxout_enum::kLabel].get_with_shape<xpu, 2, DType>(s2, s);
       Tensor<xpu, 3, DType> out =
           out_data[softmaxout_enum::kOut].get_with_shape<xpu, 3, DType>(s3, s);
       Tensor<xpu, 3, DType> grad =
@@ -149,11 +156,14 @@ class SoftmaxOutputOp : public Operator {
                     (param_.normalization == softmaxout_enum::kValid ? 1 : s3[2]) /
                     valid_cnt);
     } else {
-      const TShape& label_shape = in_data[softmaxout_enum::kLabel].shape_;
+      int n = out_data[softmaxout_enum::kOut].size(0);
+      Shape<2> s2 = Shape2(n, out_data[softmaxout_enum::kOut].Size()/n);
       Tensor<xpu, 1, DType> label = in_data[softmaxout_enum::kLabel].get_with_shape<xpu, 1, DType>(
-          Shape1(label_shape.ProdShape(0, label_shape.ndim())), s);
-      Tensor<xpu, 2, DType> out = out_data[softmaxout_enum::kOut].FlatTo2D<xpu, DType>(s);
-      Tensor<xpu, 2, DType> grad = in_grad[softmaxout_enum::kData].FlatTo2D<xpu, DType>(s);
+          Shape1(in_data[softmaxout_enum::kLabel].Size()), s);
+      Tensor<xpu, 2, DType> out =
+          out_data[softmaxout_enum::kOut].get_with_shape<xpu, 2, DType>(s2, s);
+      Tensor<xpu, 2, DType> grad =
+          in_grad[softmaxout_enum::kData].get_with_shape<xpu, 2, DType>(s2, s);
       index_t valid_cnt = label.shape_.Size();
       if (param_.use_ignore) {
         SoftmaxGrad(grad, out, label, static_cast<DType>(param_.ignore_label));

From b40ec35779067d8c3900a69243e2f3221868fe04 Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Thu, 30 Jun 2016 17:02:58 -0700
Subject: [PATCH 081/126] module monitor

---
 python/mxnet/module/base_module.py    | 14 ++++++++++++--
 python/mxnet/module/executor_group.py |  5 +++++
 python/mxnet/module/module.py         |  6 +++++-
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/python/mxnet/module/base_module.py b/python/mxnet/module/base_module.py
index 4465d49437f9..4a203868b109 100644
--- a/python/mxnet/module/base_module.py
+++ b/python/mxnet/module/base_module.py
@@ -276,7 +276,7 @@ def fit(self, train_data, eval_data=None, eval_metric='acc',
             eval_batch_end_callback=None, initializer=Uniform(0.01),
             arg_params=None, aux_params=None, allow_missing=False,
             force_rebind=False, force_init=False, begin_epoch=0, num_epoch=None,
-            validation_metric=None):
+            validation_metric=None, monitor=None):
         """Train the module parameters.
 
         Parameters
@@ -327,11 +327,12 @@ def fit(self, train_data, eval_data=None, eval_metric='acc',
         num_epoch : int
             Number of epochs to run training.
         """
-
         assert num_epoch is not None, 'please specify number of epochs'
 
         self.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label,
                   for_training=True, force_rebind=force_rebind)
+        if monitor is not None:
+            self.install_monitor(monitor)
         self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params,
                          allow_missing=allow_missing, force_init=force_init)
         self.init_optimizer(kvstore=kvstore, optimizer=optimizer,
@@ -349,10 +350,15 @@ def fit(self, train_data, eval_data=None, eval_metric='acc',
             tic = time.time()
             eval_metric.reset()
             for nbatch, data_batch in enumerate(train_data):
+                if monitor is not None:
+                    monitor.tic()
                 self.forward_backward(data_batch)
                 self.update()
                 self.update_metric(eval_metric, data_batch.label)
 
+                if monitor is not None:
+                    monitor.toc_print()
+
                 if batch_end_callback is not None:
                     batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                      eval_metric=eval_metric,
@@ -505,6 +511,10 @@ def load_params(self, fname):
                 raise ValueError("Invalid param file " + fname)
         self.set_params(arg_params, aux_params)
 
+    def install_monitor(self, mon):
+        """Install monitor on all executors"""
+        raise NotImplementedError()
+
     ################################################################################
     # Computations
     ################################################################################
diff --git a/python/mxnet/module/executor_group.py b/python/mxnet/module/executor_group.py
index f51d94df4799..094553cfc77d 100644
--- a/python/mxnet/module/executor_group.py
+++ b/python/mxnet/module/executor_group.py
@@ -424,3 +424,8 @@ def _sliced_shape(self, shapes, i):
         """
         return [(k, tuple([self.slices[i].stop-self.slices[i].start] + list(v[1:])))
                 for k, v in shapes]
+
+    def install_monitor(self, mon):
+        """Install monitor on all executors"""
+        for exe in self.execs:
+            mon.install(exe)
diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py
index e2b96dd5b370..99efea3a2e5a 100644
--- a/python/mxnet/module/module.py
+++ b/python/mxnet/module/module.py
@@ -256,7 +256,6 @@ def bind(self, data_shapes, label_shapes=None, for_training=True,
                                                      label_shapes, self._param_names,
                                                      for_training, inputs_need_grad,
                                                      shared_group, logger=self.logger)
-
         if shared_module is not None:
             self.params_initialized = True
             self._arg_params = shared_module._arg_params
@@ -451,3 +450,8 @@ def _sync_params_from_devices(self):
         latest parameters from `self._arg_params` and `self._aux_params`.
         """
         self._exec_group.get_params(self._arg_params, self._aux_params)
+
+    def install_monitor(self, mon):
+        """ Install monitor on all executors """
+        assert self.binded
+        self._exec_group.install_monitor(mon)

From d7bdcc1bacda9101aa8135e289debb63c409b7c2 Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Sun, 3 Jul 2016 17:31:52 -0700
Subject: [PATCH 082/126] fix batch norm

---
 src/operator/batch_norm-inl.h          | 43 +++++++++-----------------
 src/operator/cudnn_batch_norm-inl.h    |  3 +-
 tests/python/unittest/test_operator.py |  6 ----
 3 files changed, 15 insertions(+), 37 deletions(-)

diff --git a/src/operator/batch_norm-inl.h b/src/operator/batch_norm-inl.h
index bd76bd044afc..bbe231d755d1 100644
--- a/src/operator/batch_norm-inl.h
+++ b/src/operator/batch_norm-inl.h
@@ -38,7 +38,7 @@ struct BatchNormParam : public dmlc::Parameter<BatchNormParam> {
     DMLC_DECLARE_FIELD(momentum).set_default(0.9f)
     .describe("Momentum for moving average");
     DMLC_DECLARE_FIELD(fix_gamma).set_default(true)
-    .describe("Fix gamma to 1");
+    .describe("Fix gamma while training");
     DMLC_DECLARE_FIELD(use_global_stats).set_default(false)
     .describe("Whether use global moving statistics instead of local batch-norm. "
               "This will force change batch-norm into a scale shift operator.");
@@ -89,7 +89,6 @@ class BatchNormOp : public Operator {
     Tensor<xpu, 1> moving_mean = aux_states[batchnorm::kMovingMean].get<xpu, 1, real_t>(s);
     Tensor<xpu, 1> moving_var = aux_states[batchnorm::kMovingVar].get<xpu, 1, real_t>(s);
 
-    if (param_.fix_gamma) slope = 1.0f;
     // whether use global statistics
     if (ctx.is_train && !param_.use_global_stats) {
       Tensor<xpu, 1> mean = out_data[batchnorm::kMean].get<xpu, 1, real_t>(s);
@@ -100,16 +99,10 @@ class BatchNormOp : public Operator {
       mean = scale * sumall_except_dim<1>(data);
       var = scale * sumall_except_dim<1>(F<mshadow_op::square>(
           data - broadcast<1>(mean, data.shape_)));
-      if (param_.fix_gamma) {
-        Assign(out, req[batchnorm::kOut], (data - broadcast<1>(mean, data.shape_)) /
-               F<mshadow_op::square_root>(broadcast<1>(var + param_.eps, data.shape_)) +
-               broadcast<1>(bias, out.shape_));
-      } else {
-        Assign(out, req[batchnorm::kOut], broadcast<1>(slope, out.shape_) *
-               (data - broadcast<1>(mean, data.shape_)) /
-               F<mshadow_op::square_root>(broadcast<1>(var + param_.eps, data.shape_)) +
-               broadcast<1>(bias, out.shape_));
-      }
+      Assign(out, req[batchnorm::kOut], broadcast<1>(slope, out.shape_) *
+             (data - broadcast<1>(mean, data.shape_)) /
+             F<mshadow_op::square_root>(broadcast<1>(var + param_.eps, data.shape_)) +
+             broadcast<1>(bias, out.shape_));
     } else {
       Assign(out, req[batchnorm::kOut], broadcast<1>(slope /
                                           F<mshadow_op::square_root>(moving_var + param_.eps),
@@ -185,20 +178,15 @@ class BatchNormOp : public Operator {
                sumall_except_dim<1>(
                    grad * (data - broadcast<1>(mean, data.shape_)) /
                    F<mshadow_op::square_root>(broadcast<1>(var + param_.eps, data.shape_))));
-        Assign(grad_in, req[batchnorm::kData],
-               (grad * broadcast<1>(slope, data.shape_)) *
-               broadcast<1>(1.0f / F<mshadow_op::square_root>(var + param_.eps), data.shape_) +
-               broadcast<1>(gvar, data.shape_) * scale * 2.0f * (data - broadcast<1>(mean,
-                                                                                     data.shape_)) +
-               broadcast<1>(gmean, data.shape_) * scale);
       } else {
         Assign(gslope, req[batchnorm::kGamma], 0.0f);
-        Assign(grad_in, req[batchnorm::kData], grad *
-               broadcast<1>(1.0f / F<mshadow_op::square_root>(var + param_.eps), data.shape_) +
-               broadcast<1>(gvar, data.shape_) * scale * 2.0f * (data - broadcast<1>(mean,
-                                                                                     data.shape_)) +
-               broadcast<1>(gmean, data.shape_) * scale);
       }
+      Assign(grad_in, req[batchnorm::kData],
+             (grad * broadcast<1>(slope, data.shape_)) *
+             broadcast<1>(1.0f / F<mshadow_op::square_root>(var + param_.eps), data.shape_) +
+             broadcast<1>(gvar, data.shape_) * scale * 2.0f * (data - broadcast<1>(mean,
+                                                                                   data.shape_)) +
+             broadcast<1>(gmean, data.shape_) * scale);
       Assign(gbias, req[batchnorm::kBeta], sumall_except_dim<1>(grad));
     } else {
       // use global statistics with freeze moving mean and var.
@@ -207,15 +195,12 @@ class BatchNormOp : public Operator {
                sumall_except_dim<1>(
                    grad * (data - broadcast<1>(moving_mean, data.shape_)) /
                    F<mshadow_op::square_root>(broadcast<1>(moving_var + param_.eps, data.shape_))));
-        Assign(grad_in, req[batchnorm::kData], (grad * broadcast<1>(slope, data.shape_)) *
-               broadcast<1>(
-                   1.0f / F<mshadow_op::square_root>(moving_var + param_.eps), data.shape_));
       } else {
         Assign(gslope, req[batchnorm::kGamma], 0.0f);
-        Assign(grad_in, req[batchnorm::kData], grad *
-               broadcast<1>(
-                   1.0f / F<mshadow_op::square_root>(moving_var + param_.eps), data.shape_));
       }
+      Assign(grad_in, req[batchnorm::kData], (grad * broadcast<1>(slope, data.shape_)) *
+             broadcast<1>(
+                 1.0f / F<mshadow_op::square_root>(moving_var + param_.eps), data.shape_));
     }
   }
 
diff --git a/src/operator/cudnn_batch_norm-inl.h b/src/operator/cudnn_batch_norm-inl.h
index b98b4e85df52..c58baad7a703 100644
--- a/src/operator/cudnn_batch_norm-inl.h
+++ b/src/operator/cudnn_batch_norm-inl.h
@@ -89,7 +89,6 @@ class CuDNNBatchNormOp : public Operator {
     Tensor<gpu, 4> x = in_data[cudnnbatchnorm::kData].get_with_shape<gpu, 4, real_t>(shape_, s);
     Tensor<gpu, 1> gamma =
       in_data[cudnnbatchnorm::kGamma].get_with_shape<gpu, 1, real_t>(Shape1(shape_[1]), s);
-    if (param_.fix_gamma) gamma = 1.0f;
     Tensor<gpu, 1> beta =
       in_data[cudnnbatchnorm::kBeta].get_with_shape<gpu, 1, real_t>(Shape1(shape_[1]), s);
     Tensor<gpu, 4> y = out_data[cudnnbatchnorm::kOut].get_with_shape<gpu, 4, real_t>(shape_, s);
@@ -212,7 +211,7 @@ class CuDNNBatchNormOp : public Operator {
                                              save_mean.dptr_,
                                              save_inv_var.dptr_), CUDNN_STATUS_SUCCESS);
 #endif
-    if (param_.fix_gamma) dgamma = 0;
+    if (param_.fix_gamma) dgamma = 0.f;
   }
 
  private:
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index cc69f9289e1a..6a9714017154 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -730,12 +730,6 @@ def test_batchnorm_training():
 
         check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-3, check_eps=5e-2)
 
-        # Gamma needs to be fixed at one when fix_gamma is true,
-        gamma = np.ones(s)
-
-        test = mx.symbol.BatchNorm(data, fix_gamma=True)
-        check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-3, check_eps=5e-2)
-
 def test_convolution_grouping():
     num_filter = 4
     num_group = 2

From d5abf21f58dbe8b86a06babc58166dfb266d82b2 Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Sun, 3 Jul 2016 18:13:08 -0700
Subject: [PATCH 083/126] monitor for all modules

---
 python/mxnet/module/base_module.py       | 2 +-
 python/mxnet/module/bucketing_module.py  | 6 ++++++
 python/mxnet/module/python_module.py     | 4 ++++
 python/mxnet/module/sequential_module.py | 6 ++++++
 4 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/python/mxnet/module/base_module.py b/python/mxnet/module/base_module.py
index 4a203868b109..d73070284fe4 100644
--- a/python/mxnet/module/base_module.py
+++ b/python/mxnet/module/base_module.py
@@ -1,4 +1,4 @@
-# pylint: disable=too-many-arguments, too-many-locals, too-many-public-methods
+# pylint: disable=too-many-arguments, too-many-locals, too-many-public-methods, too-many-branches
 """`BaseModule` defines an API for modules."""
 
 import logging
diff --git a/python/mxnet/module/bucketing_module.py b/python/mxnet/module/bucketing_module.py
index 7ab039ea45d4..0d040dea4881 100644
--- a/python/mxnet/module/bucketing_module.py
+++ b/python/mxnet/module/bucketing_module.py
@@ -325,3 +325,9 @@ def symbol(self):
         """The symbol of the current bucket being used."""
         assert self.binded
         return self._curr_module.symbol
+
+    def install_monitor(self, mon):
+        """Install `mon` on the module of every bucket currently in `self._buckets`."""
+        assert self.binded
+        for mod in self._buckets.values():
+            mod.install_monitor(mon)
diff --git a/python/mxnet/module/python_module.py b/python/mxnet/module/python_module.py
index 09866d8948e4..ab9b952010a7 100644
--- a/python/mxnet/module/python_module.py
+++ b/python/mxnet/module/python_module.py
@@ -326,3 +326,7 @@ def get_input_grads(self, merge_multi_context=True):
         """
         assert merge_multi_context == True
         return [self._scores_grad]
+
+    def install_monitor(self, mon):
+        """Monitors are not supported here; this always raises `NotImplementedError`."""
+        raise NotImplementedError()
diff --git a/python/mxnet/module/sequential_module.py b/python/mxnet/module/sequential_module.py
index 75c499dfafc7..3e9ac3d49855 100644
--- a/python/mxnet/module/sequential_module.py
+++ b/python/mxnet/module/sequential_module.py
@@ -383,3 +383,9 @@ def update_metric(self, eval_metric, labels):
             if meta.has_key(SequentialModule.META_TAKE_LABELS) and \
                     meta[SequentialModule.META_TAKE_LABELS]:
                 module.update_metric(eval_metric, labels)
+
+    def install_monitor(self, mon):
+        """Install `mon` on every module in `self._modules`; requires a bound module."""
+        assert self.binded
+        for module in self._modules:
+            module.install_monitor(mon)

From c06dfbdfe965e3e561c3114327afee43eb723f34 Mon Sep 17 00:00:00 2001
From: sxjscience <xshiab@ust.hk>
Date: Mon, 4 Jul 2016 17:55:44 +0800
Subject: [PATCH 084/126] 1. Support numpy.ndarray as input for exe.forward()
 2. Add batch_dot

Update MShadow

Revert seed
---
 mshadow                                |  2 +-
 python/mxnet/executor.py               | 13 ++++-
 src/operator/matrix_op-inl.h           | 77 +++++++++++++++++++++++++-
 tests/python/unittest/test_operator.py | 66 ++++++++++++++++++++--
 4 files changed, 147 insertions(+), 11 deletions(-)

diff --git a/mshadow b/mshadow
index e41ae71f7096..0186f06e3c1f 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit e41ae71f7096f4b3592c30786328f95ad0eb6dd0
+Subproject commit 0186f06e3c1ffd0777775fedd670d82052317674
diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py
index 13a5c3099864..32c6ec1748a4 100644
--- a/python/mxnet/executor.py
+++ b/python/mxnet/executor.py
@@ -98,19 +98,26 @@ def forward(self, is_train=False, **kwargs):
         >>> # doing forward by not specifying things, but copy to the executor before hand
         >>> mydata.copyto(texec.arg_dict['data'])
         >>> texec.forward(is_train=True)
+        >>> # doing forward by specifying data and get outputs
+        >>> outputs = texec.forward(is_train=True, data=mydata)
+        >>> print(outputs[0].asnumpy())
         """
         if len(kwargs) != 0:
             arg_dict = self.arg_dict
             for name, array in kwargs.items():
-                if not isinstance(array, NDArray):
-                    raise ValueError('only accept keyword argument of NDArrays')
+                if not isinstance(array, (NDArray, np.ndarray)):
+                    raise ValueError('only accept keyword argument of NDArrays and numpy.ndarray')
                 if name not in arg_dict:
                     raise TypeError('Unknown argument %s' % name)
-                array.copyto(arg_dict[name])
+                if arg_dict[name].shape != array.shape:
+                    raise ValueError('Shape not match! Argument %s, need: %s, received: %s'
+                                     %(name, str(arg_dict[name].shape), str(array.shape)))
+                arg_dict[name][:] = array
 
         check_call(_LIB.MXExecutorForward(
             self.handle,
             ctypes.c_int(int(is_train))))
+        return self.outputs
 
     def backward(self, out_grads=None):
         """Do backward pass to get the gradient of arguments.
diff --git a/src/operator/matrix_op-inl.h b/src/operator/matrix_op-inl.h
index 1195c504ed8a..b21c6ab88e47 100644
--- a/src/operator/matrix_op-inl.h
+++ b/src/operator/matrix_op-inl.h
@@ -264,7 +264,6 @@ void DotBackward_(const OutputGrad& out_grad,
   }
 }
 
-
 inline TShape DotShape(const TShape& lshape,
                        const TShape& rshape,
                        const EnvArguments& env) {
@@ -283,6 +282,74 @@ inline TShape DotShape(const TShape& lshape,
   }
 }
 
+/*! \brief batch_dot forward: per-batch matrix product of two 3D float32 blobs into ret. */
+template<typename xpu>
+void BatchDotForward_(const TBlob& lhs,
+                      const TBlob& rhs,
+                      const EnvArguments& env,
+                      TBlob *ret, OpReqType req,
+                      RunContext ctx) {
+  using namespace mshadow::expr;
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  CHECK_EQ(ret->type_flag_, lhs.type_flag_)
+      << "Binary function only support input/output with the same type";
+  CHECK_EQ(ret->type_flag_, rhs.type_flag_)
+      << "Binary function only support input/output with the same type";
+  CHECK_EQ(ret->type_flag_, mshadow::kFloat32)
+      << "batch_dot only support 32 bit float so far";
+
+  if (lhs.shape_.ndim() == 3 && rhs.shape_.ndim() == 3) {
+    mshadow::Tensor<xpu, 3, real_t> out = ret->get<xpu, 3, real_t>(s);
+    ASSIGN_DISPATCH(out, req, (batch_dot<false, false>(lhs.get<xpu, 3, real_t>(s),
+                                                       rhs.get<xpu, 3, real_t>(s))));
+  } else {
+    LOG(FATAL) << "not reached";
+  }
+}
+
+template<typename xpu>
+void BatchDotBackward_(const OutputGrad& out_grad,
+                         const Input0& lhs,
+                         const Input1& rhs,
+                         const EnvArguments& env,
+                         TBlob* lhs_grad,
+                         TBlob* rhs_grad,
+                         OpReqType req_lhs_grad,
+                         OpReqType req_rhs_grad,
+                         RunContext ctx) {
+  using namespace mshadow::expr;
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  CHECK_NE(req_rhs_grad, kWriteInplace);
+  CHECK_NE(req_lhs_grad, kWriteInplace);
+  // Per batch: rhs_grad = lhs^T * out_grad and lhs_grad = out_grad * rhs^T (3D inputs only).
+  if (lhs.data.shape_.ndim() == 3 && rhs.data.shape_.ndim() == 3) {
+    mshadow::Tensor<xpu, 3, real_t> mout_grad = out_grad.data.get<xpu, 3, real_t>(s);
+    mshadow::Tensor<xpu, 3, real_t> mlhs_data = lhs.data.get<xpu, 3, real_t>(s);
+    mshadow::Tensor<xpu, 3, real_t> mrhs_data = rhs.data.get<xpu, 3, real_t>(s);
+    mshadow::Tensor<xpu, 3, real_t> mlhs_grad = lhs_grad->get<xpu, 3, real_t>(s);
+    mshadow::Tensor<xpu, 3, real_t> mrhs_grad = rhs_grad->get<xpu, 3, real_t>(s);
+    ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, (batch_dot<true, false>(mlhs_data, mout_grad)));
+    ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, (batch_dot<false, true>(mout_grad, mrhs_data)));
+  } else {
+    LOG(FATAL) << "not reached";
+  }
+}
+
+/*! \brief Shape inference for batch_dot: (B, M, K) x (B, K, N) -> (B, M, N). */
+inline TShape BatchDotShape(const TShape& lshape, const TShape& rshape,
+                            const EnvArguments& env) {
+  if (lshape.ndim() == 3 && rshape.ndim() == 3) {
+    CHECK(lshape[0] == rshape[0] && lshape[2] == rshape[1])
+      << "batch_dot shape error: " << lshape << " X " << rshape;
+    size_t target_shape[] = {lshape[0], lshape[1], rshape[2]};
+    return TShape(target_shape, target_shape + 3);
+  } else {
+    LOG(FATAL) << "batch_dot currently only support 3D dot 3D array: "
+               << lshape << " v.s. " << rshape;
+    return TShape();
+  }
+}
+
 
 struct SimpleCropParam : public dmlc::Parameter<SimpleCropParam> {
   TShape begin, end;
@@ -599,6 +666,14 @@ MXNET_REGISTER_SIMPLE_OP(dot, XPU)
 .set_shape_function(DotShape)
 .set_gradient(XPU::kDevMask, DotBackward_<XPU>, kNoInplace)
 .describe("Calculate dot product of two matrices or two vectors");
+
+// batch_dot: per-batch matrix product of two 3D arrays
+MXNET_REGISTER_SIMPLE_OP(batch_dot, XPU)
+.set_function(XPU::kDevMask, BatchDotForward_<XPU>, kNoInplace, kRegisterSymbolic)
+.set_shape_function(BatchDotShape)
+.set_gradient(XPU::kDevMask, BatchDotBackward_<XPU>, kNoInplace)
+.describe("Calculate batched dot product of two matrices."
+          " (batch, M, K) batch_dot (batch, K, N) --> (batch, M, N)");
 }  // namespace op
 }  // namespace mxnet
 
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 6a9714017154..3809b28f1108 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -761,9 +761,9 @@ def test_convolution_grouping():
         np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3)
 
 def _gen_broadcast_data():
-    # Generate random data that has ndim between 1-7 and all the shape dims between 1-10
+    # Generate random data that has ndim between 1-7 and all the shape dims between 1-5
     ndim = np.random.randint(1, 8)
-    shape = np.random.randint(1, 11, size=(ndim,))
+    shape = np.random.randint(1, 6, size=(ndim,))
     l_same_dim = np.random.randint(0, 5)
     r_same_dim = np.random.randint(0, 5)
     l_axis_flags = np.random.randint(0, 2, size=ndim)
@@ -970,9 +970,9 @@ def test_reduce():
     sample_num = 200
     def test_reduce_inner(numpy_reduce_func, numpy_reduce_grad_func, mx_reduce_sym):
         for i in range(sample_num):
-            # Generate random data that has ndim between 1-7 and all the shape dims between 1-10
+            # Generate random data that has ndim between 1-7 and all the shape dims between 1-5
             ndim = np.random.randint(1, 8)
-            shape = np.random.randint(1, 11, size=(ndim,))
+            shape = np.random.randint(1, 6, size=(ndim,))
             axis_num = np.random.randint(0, ndim, size=1)
             axis_flags = np.random.randint(0, 2, size=ndim)
             axes = []
@@ -1016,9 +1016,9 @@ def test_reduce_inner(numpy_reduce_func, numpy_reduce_grad_func, mx_reduce_sym):
 def test_broadcast():
     sample_num = 200
     for i in range(sample_num):
-        # Generate random data that has ndim between 1-7 and all the shape dims between 1-10
+        # Generate random data that has ndim between 1-7 and all the shape dims between 1-5
         ndim = np.random.randint(1, 8)
-        target_shape = np.random.randint(1, 11, size=(ndim,))
+        target_shape = np.random.randint(1, 6, size=(ndim,))
         axis = tuple(set(np.random.randint(0, ndim, np.random.randint(1, ndim + 1))))
         shape = target_shape.copy()
         size = tuple([shape[ele] for ele in axis])
@@ -1129,6 +1129,7 @@ def test_flip():
             y = mx.nd.flip(x, axis=axis)
             assert_allclose(x.asnumpy()[idx], y.asnumpy())
 
+
 def test_stn():
     import pdb
     np.set_printoptions(threshold=np.nan)
@@ -1170,6 +1171,57 @@ def test_stn():
                     # check backward
                     reldiff(out_grad.asnumpy(), grad_grad[0].asnumpy()[:, :, h//4:h-h//4, w//4:w-w//4]) < 1e-6
 
+
+def test_dot(ctx=mx.cpu()):
+    """Compare mx.sym.dot forward and backward with numpy for every m, k, n in 1..4."""
+    sizes = range(1, 5)
+    for m in sizes:
+        for k in sizes:
+            for n in sizes:
+                lhs_npy = np.random.normal(0, 1, (m, k))
+                rhs_npy = np.random.normal(0, 1, (k, n))
+                head_grad_npy = np.random.normal(0, 1, (m, n))
+                # numpy reference: out = lhs . rhs, d_rhs = lhs^T . g, d_lhs = g . rhs^T
+                expected_out = np.dot(lhs_npy, rhs_npy)
+                expected_rhs_grad = np.dot(lhs_npy.T, head_grad_npy)
+                expected_lhs_grad = np.dot(head_grad_npy, rhs_npy.T)
+                lhs = mx.sym.Variable('a')
+                rhs = mx.sym.Variable('b')
+                net = mx.sym.dot(lhs, rhs)
+                exe = net.simple_bind(ctx=ctx, a=lhs_npy.shape, b=rhs_npy.shape)
+                outputs = exe.forward(is_train=True, a=lhs_npy, b=rhs_npy)
+                assert reldiff(outputs[0].asnumpy(), expected_out) < 1E-5
+                exe.backward(out_grads=[mx.nd.array(head_grad_npy, ctx=exe._ctx)])
+                assert reldiff(exe.grad_dict['a'].asnumpy(), expected_lhs_grad) < 1E-5
+                assert reldiff(exe.grad_dict['b'].asnumpy(), expected_rhs_grad) < 1E-5
+
+
+def test_batch_dot(ctx=mx.cpu()):
+    """Compare mx.sym.batch_dot forward and backward with a per-sample numpy reference."""
+    dims = range(1, 5)
+    for batch_size in dims:
+        for m in dims:
+            for k in dims:
+                for n in dims:
+                    a_npy = np.random.normal(0, 1, (batch_size, m, k))
+                    b_npy = np.random.normal(0, 1, (batch_size, k, n))
+                    ograd_npy = np.random.normal(0, 1, (batch_size, m, n))
+                    # Reference results, one matrix product per sample:
+                    # c = a . b, grad_a = ograd . b^T, grad_b = a^T . ograd
+                    c_npy = np.array([np.dot(x, y) for x, y in zip(a_npy, b_npy)])
+                    agrad_npy = np.array([np.dot(g, y.T) for g, y in zip(ograd_npy, b_npy)])
+                    bgrad_npy = np.array([np.dot(x.T, g) for x, g in zip(a_npy, ograd_npy)])
+                    lhs = mx.sym.Variable('a')
+                    rhs = mx.sym.Variable('b')
+                    net = mx.sym.batch_dot(lhs, rhs)
+                    exe = net.simple_bind(ctx=ctx, a=a_npy.shape, b=b_npy.shape)
+                    outputs = exe.forward(is_train=True, a=a_npy, b=b_npy)
+                    assert reldiff(outputs[0].asnumpy(), c_npy) < 1E-5
+                    exe.backward(out_grads=[mx.nd.array(ograd_npy, ctx=exe._ctx)])
+                    assert reldiff(exe.grad_dict['a'].asnumpy(), agrad_npy) < 1E-5
+                    assert reldiff(exe.grad_dict['b'].asnumpy(), bgrad_npy) < 1E-5
+
+
 if __name__ == '__main__':
     test_expand_dims()
     test_slice_axis()
@@ -1205,3 +1257,5 @@ def test_stn():
     test_reduce()
     test_broadcast()
     test_stn()
+    test_dot()
+    test_batch_dot()

From 7ff4903569a4cfc0d2da4d99ddb6052dbdcee337 Mon Sep 17 00:00:00 2001
From: Andrew Smith <asmith26@users.noreply.github.com>
Date: Tue, 5 Jul 2016 18:34:02 +0100
Subject: [PATCH 085/126] Allowed for CPU usage.

---
 example/image-classification/train_cifar10.py           | 2 +-
 example/image-classification/train_cifar10_mirroring.py | 2 +-
 example/image-classification/train_cifar10_resnet.py    | 2 +-
 example/image-classification/train_imagenet.py          | 2 +-
 example/kaggle-ndsb1/predict_dsb.py                     | 2 +-
 example/kaggle-ndsb1/train_dsb.py                       | 2 +-
 example/module/train_cifar10.py                         | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/example/image-classification/train_cifar10.py b/example/image-classification/train_cifar10.py
index aa5e2e1b571c..8bbd233deeb3 100644
--- a/example/image-classification/train_cifar10.py
+++ b/example/image-classification/train_cifar10.py
@@ -9,7 +9,7 @@
                     help = 'the cnn to use')
 parser.add_argument('--data-dir', type=str, default='cifar10/',
                     help='the input data directory')
-parser.add_argument('--gpus', type=str, default='0',
+parser.add_argument('--gpus', type=str,
                     help='the gpus will be used, e.g "0,1,2,3"')
 parser.add_argument('--num-examples', type=int, default=60000,
                     help='the number of training examples')
diff --git a/example/image-classification/train_cifar10_mirroring.py b/example/image-classification/train_cifar10_mirroring.py
index 81124a2f1776..93f263132a8d 100644
--- a/example/image-classification/train_cifar10_mirroring.py
+++ b/example/image-classification/train_cifar10_mirroring.py
@@ -24,7 +24,7 @@
                     help = 'the cnn to use')
 parser.add_argument('--data-dir', type=str, default='cifar10/',
                     help='the input data directory')
-parser.add_argument('--gpus', type=str, default='0',
+parser.add_argument('--gpus', type=str,
                     help='the gpus will be used, e.g "0,1,2,3"')
 parser.add_argument('--num-examples', type=int, default=60000,
                     help='the number of training examples')
diff --git a/example/image-classification/train_cifar10_resnet.py b/example/image-classification/train_cifar10_resnet.py
index a90acc4b0aed..6357df67fc01 100644
--- a/example/image-classification/train_cifar10_resnet.py
+++ b/example/image-classification/train_cifar10_resnet.py
@@ -49,7 +49,7 @@
 parser = argparse.ArgumentParser(description='train an image classifer on cifar10')
 parser.add_argument('--data-dir', type=str, default='cifar10/',
                     help='the input data directory')
-parser.add_argument('--gpus', type=str, default='0',
+parser.add_argument('--gpus', type=str,
                     help='the gpus will be used, e.g "0,1,2,3"')
 parser.add_argument('--num-examples', type=int, default=50000,
                     help='the number of training examples')
diff --git a/example/image-classification/train_imagenet.py b/example/image-classification/train_imagenet.py
index e53c607bc1a7..394231bd3acc 100644
--- a/example/image-classification/train_imagenet.py
+++ b/example/image-classification/train_imagenet.py
@@ -30,7 +30,7 @@
                     help="load the model on an epoch using the model-prefix")
 parser.add_argument('--batch-size', type=int, default=32,
                     help='the batch size')
-parser.add_argument('--gpus', type=str, default='0',
+parser.add_argument('--gpus', type=str,
                     help='the gpus will be used, e.g "0,1,2,3"')
 parser.add_argument('--kv-store', type=str, default='local',
                     help='the kvstore type')
diff --git a/example/kaggle-ndsb1/predict_dsb.py b/example/kaggle-ndsb1/predict_dsb.py
index 9fd3c71d6bb2..5241730120c4 100644
--- a/example/kaggle-ndsb1/predict_dsb.py
+++ b/example/kaggle-ndsb1/predict_dsb.py
@@ -10,7 +10,7 @@
                     help='the batch size')
 parser.add_argument('--data-dir', type=str, default="data48/",
                     help='the input data directory')
-parser.add_argument('--gpus', type=str, default='0',
+parser.add_argument('--gpus', type=str,
                     help='the gpus will be used, e.g "0,1,2,3"')
 parser.add_argument('--model-prefix', type=str,default= "./models/sample_net-0",
                     help='the prefix of the model to load')
diff --git a/example/kaggle-ndsb1/train_dsb.py b/example/kaggle-ndsb1/train_dsb.py
index eeb57bed0a0e..6c54a0b0270e 100644
--- a/example/kaggle-ndsb1/train_dsb.py
+++ b/example/kaggle-ndsb1/train_dsb.py
@@ -27,7 +27,7 @@
                     help="load the model on an epoch using the model-prefix")
 parser.add_argument('--batch-size', type=int, default=64,
                     help='the batch size')
-parser.add_argument('--gpus', type=str, default='0',
+parser.add_argument('--gpus', type=str,
                     help='the gpus will be used, e.g "0,1,2,3"')
 parser.add_argument('--kv-store', type=str, default='local',
                     help='the kvstore type')
diff --git a/example/module/train_cifar10.py b/example/module/train_cifar10.py
index 804779dbce28..bcf6f8bd0bf7 100644
--- a/example/module/train_cifar10.py
+++ b/example/module/train_cifar10.py
@@ -15,7 +15,7 @@
                     help = 'the cnn to use')
 parser.add_argument('--data-dir', type=str, default=default_data_dir,
                     help='the input data directory')
-parser.add_argument('--gpus', type=str, default='0',
+parser.add_argument('--gpus', type=str,
                     help='the gpus will be used, e.g "0,1,2,3"')
 parser.add_argument('--num-examples', type=int, default=60000,
                     help='the number of training examples')

From 2f089b8d45c1dcaf63498d3e4121ed6616ed1a11 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <vchuravy@users.noreply.github.com>
Date: Wed, 6 Jul 2016 03:02:43 +0900
Subject: [PATCH 086/126] expose GetAtomicSymbolName in header (#2627)

---
 include/mxnet/c_api.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 097e3eb603bd..28bc89406c0b 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -479,6 +479,14 @@ MXNET_DLL int MXFuncInvokeEx(FunctionHandle fun,
  */
 MXNET_DLL int MXSymbolListAtomicSymbolCreators(mx_uint *out_size,
                                                AtomicSymbolCreator **out_array);
+
+/*!
+ * \brief Get the name of an atomic symbol.
+ * \param creator the AtomicSymbolCreator.
+ * \param name Output parameter that receives the name of the creator.
+ */
+MXNET_DLL int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator,
+                                          const char **name);
 /*!
  * \brief Get the detailed information about atomic symbol.
  * \param creator the AtomicSymbolCreator.

From 5c07a91e4a700d872b73b0f00a203128ef48109f Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Tue, 5 Jul 2016 11:29:40 -0700
Subject: [PATCH 087/126] batchnorm fix

---
 src/operator/batch_norm-inl.h         |  2 ++
 src/operator/cudnn_batch_norm-inl.h   |  3 +++
 tests/python/gpu/test_operator_gpu.py | 28 +++++++++++++++++++++++++++
 3 files changed, 33 insertions(+)

diff --git a/src/operator/batch_norm-inl.h b/src/operator/batch_norm-inl.h
index bbe231d755d1..03238b067ea3 100644
--- a/src/operator/batch_norm-inl.h
+++ b/src/operator/batch_norm-inl.h
@@ -89,6 +89,8 @@ class BatchNormOp : public Operator {
     Tensor<xpu, 1> moving_mean = aux_states[batchnorm::kMovingMean].get<xpu, 1, real_t>(s);
     Tensor<xpu, 1> moving_var = aux_states[batchnorm::kMovingVar].get<xpu, 1, real_t>(s);
 
+    if (ctx.is_train && param_.fix_gamma) slope = 1.f;
+
     // whether use global statistics
     if (ctx.is_train && !param_.use_global_stats) {
       Tensor<xpu, 1> mean = out_data[batchnorm::kMean].get<xpu, 1, real_t>(s);
diff --git a/src/operator/cudnn_batch_norm-inl.h b/src/operator/cudnn_batch_norm-inl.h
index c58baad7a703..c4f9afaafeef 100644
--- a/src/operator/cudnn_batch_norm-inl.h
+++ b/src/operator/cudnn_batch_norm-inl.h
@@ -98,6 +98,9 @@ class CuDNNBatchNormOp : public Operator {
       aux_states[cudnnbatchnorm::kMovingInvVar]
       .get_with_shape<gpu, 1, real_t>(Shape1(shape_[1]), s);
     float a = 1.0f, b = 0.0f;
+
+    if (ctx.is_train && param_.fix_gamma) gamma = 1.f;
+
     if (ctx.is_train) {
       Tensor<gpu, 1> save_mean =
         out_data[cudnnbatchnorm::kMean].get_with_shape<gpu, 1, real_t>(Shape1(shape_[1]), s);
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index e0502b04b147..daa60e1779a0 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -50,6 +50,24 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write'):
             except Exception, e:
                 print e
 
+    #forward predict
+    for exe in exe_list:
+        exe.forward(is_train=False)
+
+    outputs = [exe.outputs[0].asnumpy() for exe in exe_list]
+    dtypes = [arr.dtype for arr in outputs]
+    max_idx = np.argmax(dtypes)
+
+    for i, exe in enumerate(exe_list):
+        if i == max_idx:
+            continue
+        for arr1, arr2 in zip([outputs[i]], [outputs[max_idx]]):
+            arr2 = arr2.astype(dtypes[i])
+            try:
+                assert_allclose(arr1, arr2, rtol=tol[dtypes[i]], atol=tol[dtypes[i]])
+            except Exception, e:
+                print e
+
 def check_speed(sym, ctx, scale=1.0, N=100, grad_req='write'):
     exe = sym.simple_bind(grad_req=grad_req, **ctx)
     init = [np.random.normal(size=arr.shape, scale=scale) for arr in exe.arg_arrays]
@@ -68,6 +86,15 @@ def check_speed(sym, ctx, scale=1.0, N=100, grad_req='write'):
         exe.outputs[0].wait_to_read()
     return (time.time() - tic)*1.0/N
 
+def test_batchnorm_with_type():
+    sym = mx.sym.BatchNorm(name='norm', fix_gamma=False)
+    ctx_list = [{'ctx': mx.gpu(0), 'norm_data': (10, 2, 10, 10), 'type_dict': {'norm_data': np.float32}},
+                {'ctx': mx.cpu(0), 'norm_data': (10, 2, 10, 10), 'type_dict': {'norm_data': np.float32}}]
+    check_consistency(sym, ctx_list)
+    # Repeat the GPU/CPU consistency check with fix_gamma enabled.
+    sym = mx.sym.BatchNorm(name='norm', fix_gamma=True)
+    check_consistency(sym, ctx_list)
+
 def test_convolution_with_type():
     sym = mx.sym.Convolution(num_filter=3, kernel=(3,3), name='conv')
     ctx_list = [{'ctx': mx.gpu(0), 'conv_data': (2, 2, 10, 10), 'type_dict': {'conv_data': np.float64}},
@@ -181,6 +208,7 @@ def test_embedding_with_type():
     check_consistency(sym, ctx_list, grad_req={'embedding_data': 'null','embedding_weight': 'write'})
 
 if __name__ == '__main__':
+    test_batchnorm_with_type()
     test_convolution_with_type()
     test_deconvolution_with_type()
     test_upsampling_with_type()

From 144e53d71f06cc53c7d7e0b74a67904db6ddfaa6 Mon Sep 17 00:00:00 2001
From: Daniel Gordon <xkcd@cs.washington.edu>
Date: Tue, 5 Jul 2016 16:46:18 -0700
Subject: [PATCH 088/126] making adam more in-place

---
 python/mxnet/optimizer.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index 18c5da666843..fecd838a775b 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -5,7 +5,7 @@
 from .base import _LIB, check_call
 from .base import c_array, mx_uint, mx_float, c_str
 from .base import OptimizerHandle, OptimizerCreator
-from .ndarray import NDArray, zeros, clip, sqrt
+from .ndarray import NDArray, zeros, clip, sqrt, square
 from .random import normal
 
 
@@ -586,14 +586,17 @@ def update(self, index, weight, grad, state):
         if self.clip_gradient is not None:
             clip(grad, -self.clip_gradient, self.clip_gradient, out=grad)
 
-        mean[:] = self.beta1 * mean + (1. - self.beta1) * grad
-        variance[:] = self.beta2 * variance + (1. - self.beta2) * grad * grad
+        mean *= self.beta1
+        mean += grad * (1. - self.beta1)
+
+        variance *= self.beta2
+        variance += (1 - self.beta2) * square(grad, out=grad)
 
         coef1 = 1. - self.beta1**t
         coef2 = 1. - self.beta2**t
         lr *= math.sqrt(coef2)/coef1
 
-        weight[:] -= lr*mean/(sqrt(variance) + self.epsilon)
+        weight -= lr*mean/(sqrt(variance) + self.epsilon)
 
         wd = self._get_wd(index)
         if wd > 0.:

From 2feb5591b1b1f6448715f9a0d4e7a44e64f720a8 Mon Sep 17 00:00:00 2001
From: xu dong <dsqx71@gmail.com>
Date: Fri, 8 Jul 2016 17:37:29 +0800
Subject: [PATCH 089/126] Add correlation op

---
 src/operator/correlation-inl.h         | 234 ++++++++++
 src/operator/correlation.cc            | 144 ++++++
 src/operator/correlation.cu            | 609 +++++++++++++++++++++++++
 tests/python/unittest/test_operator.py | 198 ++++++++
 4 files changed, 1185 insertions(+)
 create mode 100644 src/operator/correlation-inl.h
 create mode 100644 src/operator/correlation.cc
 create mode 100644 src/operator/correlation.cu

diff --git a/src/operator/correlation-inl.h b/src/operator/correlation-inl.h
new file mode 100644
index 000000000000..e6453fc5a3b1
--- /dev/null
+++ b/src/operator/correlation-inl.h
@@ -0,0 +1,234 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file correlation-inl.h
+ * \brief correlation operator and symbol
+ * \author Xu Dong 
+*/
+#ifndef MXNET_OPERATOR_CORRELATION_INL_H_
+#define MXNET_OPERATOR_CORRELATION_INL_H_
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "./mshadow_op.h"
+#include "./operator_common.h"
+namespace mxnet {
+namespace op {
+//  Named indices into the operator's in_data / out_data vectors, so call sites can
+//  write in_data[Correlation::kData1] instead of a bare integer.
+namespace Correlation {
+enum  CorrelationOpInputs{kData1, kData2};  //  the two inputs that are correlated
+enum  CorrelationOpOutputs{kOut, kTemp1, kTemp2};  //  result, then two temporaries
+}  //  namespace Correlation
+//  User-configurable hyper-parameters of the Correlation operator.
+struct CorrelationParam : public dmlc::Parameter<CorrelationParam> {
+  uint32_t max_displacement;  //  largest offset searched between the two inputs
+  uint32_t kernel_size;       //  side length of the compared patch (must be odd)
+  uint32_t pad_size;          //  padding added on every spatial border
+  uint32_t stride1;           //  step between output positions in data1
+  uint32_t stride2;           //  step between candidate displacements in data2
+  bool is_multiply;           //  true: product, false: absolute difference
+  DMLC_DECLARE_PARAMETER(CorrelationParam) {
+    DMLC_DECLARE_FIELD(kernel_size).set_default(1)
+    .describe("kernel size for Correlation must be an odd number");
+    DMLC_DECLARE_FIELD(max_displacement).set_default(1)
+    .describe("Max displacement of Correlation ");
+    DMLC_DECLARE_FIELD(stride1).set_default(1)
+    .describe("stride1 quantize data1 globally");
+    DMLC_DECLARE_FIELD(stride2).set_default(1)
+    .describe("stride2 quantize data2 within the neighborhood centered around data1");
+    DMLC_DECLARE_FIELD(pad_size).set_default(0).describe("pad for Correlation");
+    DMLC_DECLARE_FIELD(is_multiply).set_default(true)
+    .describe("operation type is either multiplication or subtraction");
+  }
+};
+//  Correlation layer operator; the math lives in CorrelationForward / CorrelationBackward.
+template<typename xpu>
+class CorrelationOp : public Operator {
+ public:
+  explicit CorrelationOp(CorrelationParam param) : param_(param) {}
+  //  Computes the correlation map (out_data[kOut]) and fills the two temporaries.
+  virtual void Forward(const OpContext &ctx, const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req, const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    CHECK_EQ(in_data.size(), 2);
+    CHECK_EQ(out_data.size(), 3);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4> data1 = in_data[Correlation::kData1].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4> data2 = in_data[Correlation::kData2].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4> out   = out_data[Correlation::kOut].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4> tmp1  = out_data[Correlation::kTemp1].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4> tmp2  = out_data[Correlation::kTemp2].get<xpu, 4, real_t>(s);
+    CHECK_EQ(data1.CheckContiguous(), true);
+    CHECK_EQ(data2.CheckContiguous(), true);
+    CHECK_EQ(out.CheckContiguous(), true);
+    CHECK_EQ(tmp1.CheckContiguous(), true);
+    CHECK_EQ(tmp2.CheckContiguous(), true);
+    //  The outputs are cleared first; the CPU forward pass accumulates into `out` with +=.
+    tmp1 = 0.0f;
+    tmp2 = 0.0f;
+    out = 0.0f;
+    SetGeometry(data1.shape_);
+    CorrelationForward(out, data1, data2, tmp1, tmp2, top_channels_, top_height_, top_width_,
+                       param_.pad_size, param_.is_multiply,
+                       param_.max_displacement, param_.kernel_size,
+                       neighborhood_grid_radius_, neighborhood_grid_width_,
+                       kernel_radius_, param_.stride1, param_.stride2);
+  }
+  //  Computes the gradients of data1 / data2 from the output gradient and the temporaries.
+  virtual void Backward(const OpContext &ctx, const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data, const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req, const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4> grad_data1 = in_grad[Correlation::kData1].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4> grad_data2 = in_grad[Correlation::kData2].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4> out_g = out_grad[Correlation::kOut].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4> tmp1 = out_data[Correlation::kTemp1].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4> tmp2 = out_data[Correlation::kTemp2].get<xpu, 4, real_t>(s);
+    CHECK_EQ(grad_data1.CheckContiguous(), true);
+    CHECK_EQ(grad_data2.CheckContiguous(), true);
+    CHECK_EQ(out_g.CheckContiguous(), true);
+    CHECK_EQ(tmp1.CheckContiguous(), true);
+    CHECK_EQ(tmp2.CheckContiguous(), true);
+    //  The CPU backward pass accumulates with +=, so the gradient buffers must start at
+    //  zero unless the caller explicitly asked to add to their current contents.
+    if (req[Correlation::kData1] != kAddTo) grad_data1 = 0.0f;
+    if (req[Correlation::kData2] != kAddTo) grad_data2 = 0.0f;
+    SetGeometry(grad_data1.shape_);
+    CorrelationBackward(out_g, grad_data1, grad_data2, tmp1, tmp2, top_channels_,
+                        top_height_, top_width_, param_.pad_size, param_.is_multiply,
+                        param_.max_displacement, param_.kernel_size, neighborhood_grid_radius_,
+                        neighborhood_grid_width_, kernel_radius_, param_.stride1, param_.stride2,
+                        num, channels, height, width);
+  }
+ private:
+  //  Derives every size the kernels need from the 4D shape of data1 (or of its gradient).
+  void SetGeometry(const mshadow::Shape<4> &dshape) {
+    num = dshape[0];
+    channels = dshape[1];
+    height = dshape[2];
+    width = dshape[3];
+    const int pad = static_cast<int>(param_.pad_size);
+    kernel_radius_ = (param_.kernel_size - 1) / 2;
+    const int border = static_cast<int>(param_.max_displacement + kernel_radius_);
+    const int step = static_cast<int>(param_.stride1);
+    //  Signed math on purpose: `int - uint32_t` wraps around when the border does not fit.
+    const int valid_h = height + 2 * pad - 2 * border;
+    const int valid_w = width + 2 * pad - 2 * border;
+    CHECK(valid_h > 0 && valid_w > 0) << "Neighborhood and kernel don't fit in blob";
+    top_height_ = (valid_h + step - 1) / step;  //  ceil(valid_h / stride1)
+    top_width_ = (valid_w + step - 1) / step;  //  ceil(valid_w / stride1)
+    neighborhood_grid_radius_ = param_.max_displacement / param_.stride2;
+    neighborhood_grid_width_ = neighborhood_grid_radius_ * 2 + 1;
+    top_channels_ = neighborhood_grid_width_ * neighborhood_grid_width_;
+  }
+  CorrelationParam param_;
+  uint32_t kernel_radius_;
+  uint32_t top_width_;
+  uint32_t top_height_;
+  uint32_t neighborhood_grid_radius_;
+  uint32_t neighborhood_grid_width_;
+  uint32_t top_channels_;
+  int  num;
+  int  channels;
+  int  height;
+  int  width;
+};   //  class CorrelationOp
+//  Declare factory function
+template<typename xpu>
+Operator* CreateOp(CorrelationParam param);
+#if DMLC_USE_CXX11
+//  Graph-level description of Correlation: argument / output names, parameter handling,
+//  shape inference and the dependencies of the backward pass.
+class CorrelationProp : public OperatorProperty {
+ public:
+  std::vector<std::string> ListArguments() const override {
+    return {"data1", "data2"};
+  }
+  std::vector<std::string> ListOutputs() const override {
+    return {"output", "tmp1", "tmp2"};
+  }
+  int NumOutputs() const override {
+    return 3;
+  }
+  //  Only "output" is visible; tmp1 / tmp2 are internal and are read again by Backward.
+  int NumVisibleOutputs() const override {
+    return 1;
+  }
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.Init(kwargs);
+  }
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+  //  output: (N, D * D, top_height, top_width) with D = 2 * (max_displacement / stride2) + 1;
+  //  tmp1 / tmp2: (N, H + 2 * pad, W + 2 * pad, C).
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 2) << "Input:[data1, data2]";
+    TShape dshape1 = in_shape->at(Correlation::kData1);
+    TShape dshape2 = in_shape->at(Correlation::kData2);
+    CHECK_EQ(dshape1.ndim(), 4) << "data should be a 4D tensor";
+    CHECK_EQ(dshape2.ndim(), 4) << "data should be a 4D tensor";
+    CHECK_EQ(param_.kernel_size % 2, 1U) << "kernel size for Correlation must be an odd number";
+    CHECK_GT(param_.stride1, 0U) << "stride1 must be positive";
+    CHECK_GT(param_.stride2, 0U) << "stride2 must be positive";
+    //  Everything below is signed: mixing int with uint32_t would wrap a negative extent
+    //  around to a huge value and silently defeat the "does not fit" checks.
+    const int pad = static_cast<int>(param_.pad_size);
+    const int step = static_cast<int>(param_.stride1);
+    const int paddedbottomheight = static_cast<int>(dshape1[2]) + 2 * pad;
+    const int paddedbottomwidth = static_cast<int>(dshape1[3]) + 2 * pad;
+    const int kernel_radius = (static_cast<int>(param_.kernel_size) - 1) / 2;
+    const int border_size = static_cast<int>(param_.max_displacement) + kernel_radius;
+    const int valid_height = paddedbottomheight - 2 * border_size;
+    const int valid_width = paddedbottomwidth - 2 * border_size;
+    CHECK_GE(valid_width, 1) <<
+    "Correlation cannot be done with current settings.Neighborhood and kernel don't fit in blob";
+    CHECK_GE(valid_height, 1) <<
+    "Correlation cannot be done with current settings.Neighborhood and kernel don't fit in blob";
+    const int top_width = (valid_width + step - 1) / step;    //  ceil(valid_width / stride1)
+    const int top_height = (valid_height + step - 1) / step;  //  ceil(valid_height / stride1)
+    const int neighborhood_grid_radius = static_cast<int>(param_.max_displacement / param_.stride2);
+    const int neighborhood_grid_width = neighborhood_grid_radius * 2 + 1;
+    const int top_channels = neighborhood_grid_width * neighborhood_grid_width;
+    out_shape->clear();
+    out_shape->push_back(Shape4(dshape1[0], top_channels, top_height, top_width));
+    out_shape->push_back(Shape4(dshape1[0], paddedbottomheight, paddedbottomwidth, dshape1[1]));
+    out_shape->push_back(Shape4(dshape1[0], paddedbottomheight, paddedbottomwidth, dshape1[1]));
+    return true;
+  }
+  OperatorProperty* Copy() const override {
+    CorrelationProp* Correlation_sym = new CorrelationProp();
+    Correlation_sym->param_ = this->param_;
+    return Correlation_sym;
+  }
+  std::string TypeString() const override {
+    return "Correlation";
+  }
+  //  Declare dependency and inplace optimization options: Backward reads the output
+  //  gradient and the two temporaries, but neither the inputs nor the visible output.
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return {out_grad[Correlation::kOut],
+            out_data[Correlation::kTemp1], out_data[Correlation::kTemp2]};
+  }
+  Operator* CreateOperator(Context ctx) const override;
+
+ private:
+  CorrelationParam param_;
+};  //  class CorrelationProp
+#endif
+}  //  namespace op
+}  //  namespace mxnet
+#endif  //  MXNET_OPERATOR_CORRELATION_INL_H_
diff --git a/src/operator/correlation.cc b/src/operator/correlation.cc
new file mode 100644
index 000000000000..bcd829fff117
--- /dev/null
+++ b/src/operator/correlation.cc
@@ -0,0 +1,144 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file correlation.cc
+ * \brief correlation op
+ * \author Xu Dong
+*/
+#include "./correlation-inl.h"
+#include "./mshadow_op.h"
+
+namespace mshadow {
+//  Copies original[n][c][y][x] to out[n][y + pad_size][x + pad_size][c]; the border is untouched.
+template<typename Dtype>
+void AddPad(const Tensor<cpu, 4, Dtype> &original, const Tensor<cpu, 4, Dtype> &out,
+            int pad_size) {
+  for (index_t n = 0; n < original.size(0); ++n)
+    for (index_t c = 0; c < original.size(1); ++c)
+      for (index_t y = 0; y < original.size(2); ++y)
+        for (index_t x = 0; x < original.size(3); ++x)
+          out[n][y + pad_size][x + pad_size][c] = original[n][c][y][x];
+}
+//  CPU forward pass: every output element accumulates, over a kernel_size^2 * channels patch,
+//  the product (or absolute difference) of padded data1 and displaced padded data2.
+template<typename Dtype>
+inline void CorrelationForward(const Tensor<cpu, 4, Dtype> &out,
+                               const Tensor<cpu, 4, Dtype> &data1,
+                               const Tensor<cpu, 4, Dtype> &data2,
+                               const Tensor<cpu, 4, Dtype> &tmp1,
+                               const Tensor<cpu, 4, Dtype> &tmp2,
+                               int top_channels_, int top_height_, int top_width_,
+                               int pad_size_, bool is_multiply,
+                               int max_displacement_, int kernel_size_,
+                               int neighborhood_grid_radius_, int neighborhood_grid_width_,
+                               int  kernel_radius_, int stride1_, int stride2_) {
+  const int batch = data1.size(0);
+  const int depth = data1.size(1);
+  const int patch_elems = kernel_size_ * kernel_size_ * depth;
+  AddPad<Dtype>(data1, tmp1, pad_size_);
+  AddPad<Dtype>(data2, tmp2, pad_size_);
+  for (index_t row = 0; row < top_height_; ++row)
+    for (index_t col = 0; col < top_width_; ++col)
+      for (index_t n = 0; n < batch; ++n) {
+        const int x1 = col * stride1_ + max_displacement_;
+        const int y1 = row * stride1_ + max_displacement_;
+        for (index_t d = 0; d < top_channels_; ++d) {
+          //  output channel d encodes the displacement (dx, dy) applied to the data2 patch
+          const int dx = (d % neighborhood_grid_width_ - neighborhood_grid_radius_) * stride2_;
+          const int dy = (d / neighborhood_grid_width_ - neighborhood_grid_radius_) * stride2_;
+          const int x2 = x1 + dx;
+          const int y2 = y1 + dy;
+          Dtype &acc = out[n][d][row][col];
+          for (index_t h = 0; h < kernel_size_; ++h)
+            for (index_t w = 0; w < kernel_size_; ++w)
+              for (index_t c = 0; c < depth; ++c) {
+                if (is_multiply)
+                  acc += tmp1[n][y1 + h][x1 + w][c] * tmp2[n][y2 + h][x2 + w][c];
+                else
+                  acc += fabsf(tmp1[n][y1 + h][x1 + w][c] - tmp2[n][y2 + h][x2 + w][c]);
+              }
+          acc /= patch_elems;
+        }
+      }
+}
+//  CPU backward pass: replays the forward loops and scatters out_grad onto in_grad1 and
+//  in_grad2. Gradients are accumulated with +=; positions that fall into the padding
+//  border have no counterpart in the unpadded gradient tensors and are skipped.
+template<typename Dtype>
+inline void CorrelationBackward(const Tensor<cpu, 4, Dtype> &out_grad,
+                                const Tensor<cpu, 4, Dtype> &in_grad1,
+                                const Tensor<cpu, 4, Dtype> &in_grad2,
+                                const Tensor<cpu, 4, Dtype> &tmp1,
+                                const Tensor<cpu, 4, Dtype> &tmp2,
+                                int top_channels_, int top_height_,
+                                int top_width_, int pad_size_,
+                                bool is_multiply, int max_displacement_,
+                                int kernel_size_, int neighborhood_grid_radius_,
+                                int neighborhood_grid_width_,
+                                int  kernel_radius_, int stride1_,
+                                int stride2_, int num,
+                                int channels, int height, int width) {
+  const float patch_elems = kernel_size_ * kernel_size_ * channels;
+  for (int row = 0; row < top_height_; ++row)
+    for (int col = 0; col < top_width_; ++col)
+      for (int n = 0; n < num; ++n) {
+        const int x1 = col * stride1_ + max_displacement_;
+        const int y1 = row * stride1_ + max_displacement_;
+        for (int d = 0; d < top_channels_; ++d) {
+          const int dx = (d % neighborhood_grid_width_ - neighborhood_grid_radius_) * stride2_;
+          const int dy = (d / neighborhood_grid_width_ - neighborhood_grid_radius_) * stride2_;
+          for (int h = 0; h < kernel_size_; ++h)
+            for (int w = 0; w < kernel_size_; ++w) {
+              //  p*: coordinates in the padded temporaries, g*: in the unpadded gradients
+              const int py1 = y1 + h, px1 = x1 + w;
+              const int py2 = py1 + dy, px2 = px1 + dx;
+              const int gy1 = py1 - pad_size_, gx1 = px1 - pad_size_;
+              const int gy2 = py2 - pad_size_, gx2 = px2 - pad_size_;
+              const bool inside1 = gy1 >= 0 && gx1 >= 0 && gy1 < height && gx1 < width;
+              const bool inside2 = gy2 >= 0 && gx2 >= 0 && gy2 < height && gx2 < width;
+              for (int c = 0; c < channels; ++c) {
+                if (is_multiply) {
+                  if (inside1) {
+                    in_grad1[n][c][gy1][gx1] +=
+                        out_grad[n][d][row][col] * tmp2[n][py2][px2][c] / patch_elems;
+                  }
+                  if (inside2) {
+                    in_grad2[n][c][gy2][gx2] +=
+                        out_grad[n][d][row][col] * tmp1[n][py1][px1][c] / patch_elems;
+                  }
+                } else {
+                  if (inside1) {
+                    const Dtype sign = (tmp1[n][py1][px1][c] >= tmp2[n][py2][px2][c]) ?
+                        Dtype(1.0) : Dtype(-1.0);
+                    in_grad1[n][c][gy1][gx1] += out_grad[n][d][row][col] * sign / patch_elems;
+                  }
+                  if (inside2) {
+                    const Dtype sign = (tmp1[n][py1][px1][c] >= tmp2[n][py2][px2][c]) ?
+                        Dtype(-1.0) : Dtype(1.0);
+                    in_grad2[n][c][gy2][gx2] += out_grad[n][d][row][col] * sign / patch_elems;
+                  }
+                }
+              }
+            }
+        }
+      }
+}
+}  // namespace mshadow
+namespace mxnet {
+namespace op {
+template<>  //  CPU specialization of the CreateOp factory template
+Operator *CreateOp<cpu>(CorrelationParam param) {
+  return new CorrelationOp<cpu>(param);  //  heap-allocated, handed back as a raw pointer
+}
+Operator* CorrelationProp::CreateOperator(Context ctx) const {
+  DO_BIND_DISPATCH(CreateOp, param_);  //  presumably picks CreateOp<xpu> from ctx - confirm
+}
+DMLC_REGISTER_PARAMETER(CorrelationParam);  //  registers the parameter struct with dmlc
+MXNET_REGISTER_OP_PROPERTY(Correlation, CorrelationProp)  //  binds the property to this name
+.describe("Apply correlation to inputs")
+.add_argument("data1", "Symbol", "Input data1 to the correlation.")
+.add_argument("data2", "Symbol", "Input data2 to the correlation.")
+.add_arguments(CorrelationParam::__FIELDS__());  //  adds the CorrelationParam fields
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/correlation.cu b/src/operator/correlation.cu
new file mode 100644
index 000000000000..b26ae04f2d0b
--- /dev/null
+++ b/src/operator/correlation.cu
@@ -0,0 +1,609 @@
+/*!
+ * Copyright [2016] <Contributors>
+ * \file correlation.cu
+ * \brief  Correlation operator
+ * \author Xu Dong
+*/
+#include "./correlation-inl.h"
+#include <mshadow/tensor.h>
+#include <mshadow/cuda/reduce.cuh>
+#include <algorithm>
+#include <vector>
+
+#define ROUND_OFF 50000  //  bias used by the backward kernels for ceil/floor integer division
+#define WARPS_PER_BLOCK 1  //  WARPS_PER_BLOCK * THREADS_PER_WARP sizes CorrelateData's sum[]
+#define THREADS_PER_WARP 32  //  ... and is the channel stride of its per-thread loops
+#define CORRELATION_CUDA_CHECK(condition) \
+  /* do/while(0) scopes `error`, so repeated uses never redefine it */ \
+  do { \
+    cudaError_t error = condition; \
+    CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
+  } while (0)
+#define CUDA_KERNEL_LOOP(i, n) /* i starts at the global thread id, steps by total threads */ \
+for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
+      i < (n); \
+      i += blockDim.x * gridDim.x)
+namespace mshadow {
+namespace cuda {
+// == Correlation Kernel: one block per output position (blockIdx.x, blockIdx.y) and sample
+template <typename Dtype>
+__global__ void CorrelateData(const int nthreads, int num, int topwidth,
+  int topheight, int topchannels, int topcount,
+  int max_displacement, int neighborhood_grid_radius,
+  int neighborhood_grid_width, int kernel_radius, int kernel_size, int stride1, int stride2,
+  int bottomwidth, int bottomheight, int bottomchannels,
+  const Dtype *bottom0, const Dtype *bottom1, Dtype *top) {
+  extern __shared__ char patch_data_char[];
+  Dtype *patch_data = reinterpret_cast<Dtype *>(patch_data_char);
+  //  Upper-left corner of the kernel window in image 1 for this block's output position.
+  //  NOTE(review): assumes blockDim.x == THREADS_PER_WARP * WARPS_PER_BLOCK - confirm at launch
+  int x1 = blockIdx.x * stride1 + max_displacement;
+  int y1 = blockIdx.y * stride1 + max_displacement;
+  int item = blockIdx.z;
+  int ch_off = threadIdx.x;
+  //  Load 3D patch into shared memory; the channels are strided across the threads
+  for (int j = 0; j < kernel_size; j++) {  //  HEIGHT
+    for (int i = 0; i < kernel_size; i++) {  //  WIDTH
+      int ji_off = ((j * kernel_size) + i) * bottomchannels;
+      for (int ch = ch_off; ch < bottomchannels; ch += (THREADS_PER_WARP * WARPS_PER_BLOCK))  {
+          int idx1 = ((item * bottomheight + y1+j) * bottomwidth + x1+i) * bottomchannels + ch;
+          patch_data[ji_off + ch] = bottom0[idx1];
+      }
+    }
+  }
+  __syncthreads();
+  __shared__ Dtype sum[THREADS_PER_WARP * WARPS_PER_BLOCK];
+  //  Compute correlation: one output channel per displacement
+  for (int top_channel = 0; top_channel < topchannels; top_channel++) {
+    sum[ch_off] = 0;
+    int s2o = (top_channel % neighborhood_grid_width - neighborhood_grid_radius) * stride2;
+    int s2p = (top_channel / neighborhood_grid_width - neighborhood_grid_radius) * stride2;
+    //  Window corner in image 2; it only depends on the displacement, so compute it once
+    int x2 = x1 + s2o;
+    int y2 = y1 + s2p;
+    for (int j = 0; j < kernel_size; j++) {  //  HEIGHT
+      for (int i = 0; i < kernel_size; i++) {  //  WIDTH
+        int ji_off = ((j * kernel_size) + i) * bottomchannels;
+        for (int ch = ch_off; ch < bottomchannels; ch += (THREADS_PER_WARP * WARPS_PER_BLOCK)) {
+          int idx2 = ((item * bottomheight + y2 + j) * bottomwidth + x2 + i) * bottomchannels + ch;
+          sum[ch_off] += patch_data[ji_off + ch] * bottom1[idx2];
+        }
+      }
+    }
+    __syncthreads();
+    if (ch_off == 0) {
+        Dtype total_sum = 0;
+        for (int idx = 0; idx < THREADS_PER_WARP * WARPS_PER_BLOCK; idx++) {
+            total_sum += sum[idx];
+        }
+        const int sumelems = kernel_size * kernel_size * bottomchannels;
+        const int index = ((top_channel * topheight + blockIdx.y) * topwidth) + blockIdx.x;
+        top[index + item*topcount] = total_sum / static_cast<float>(sumelems);
+    }  //  Aggregate result of  different threads
+    //  Thread 0 must finish reading sum[] before any thread clears its slot for the next
+    //  channel; without this barrier the code relies on implicit warp lock-step execution.
+    __syncthreads();
+  }
+}
+//  == Correlation Backward Pass Kernel (For data1): gradient of the product correlation
+template <typename Dtype>
+__global__ void CorrelateDataBackward0(const int nthreads, int num, int item,
+  int topwidth, int topheight, int topchannels,
+  int max_displacement, int neighborhood_grid_radius,
+  int neighborhood_grid_width, int kernel_radius, int stride1, int stride2,
+  int bottomwidth, int bottomheight, int pbottomwidth, int pbottomheight,
+  int bottomchannels, int bottomcount, int pad_size,
+  Dtype *bottom0diff, const Dtype *bottom1, const Dtype *topdiff) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int n = index % bottomchannels;  //  channels
+    int l = (index / bottomchannels) % bottomwidth + pad_size;  //  w-pos (padded coords)
+    int m = (index / bottomchannels / bottomwidth) % bottomheight + pad_size;  //  h-pos (padded)
+    //  Get X,Y ranges of the output positions whose kernel window covers (l, m), and clamp
+    //  round_off is a trick to enable integer division with ceil, even for negative numbers
+    //  We use a large offset, for the inner part not to become negative.
+    const int round_off = ROUND_OFF;
+    const int round_off_s1 = stride1 * round_off;
+    //  We add round_off_s1 before the int division and subtract round_off after it,
+    //  to ensure the formula matches ceil behavior:
+    int xmin = (l - 2*kernel_radius - max_displacement + round_off_s1 - 1)\
+     / stride1 + 1 - round_off;  //  ceil (l - 2*kernel_radius - max_displacement) / stride1
+    int ymin = (m - 2*kernel_radius - max_displacement + round_off_s1 - 1)\
+     / stride1 + 1 - round_off;  //  ceil (m - 2*kernel_radius - max_displacement) / stride1
+    //  Same here:
+    int xmax = (l - max_displacement + round_off_s1) / stride1 - round_off;
+    //  floor (l - max_displacement) / stride1
+    int ymax = (m - max_displacement + round_off_s1) / stride1 - round_off;
+    //  floor (m - max_displacement) / stride1
+    Dtype sum = 0;  //  stays 0 when no output position touches (l, m)
+    if (xmax >= 0 && ymax >= 0 && (xmin <= topwidth-1) && (ymin <= topheight-1)) {
+        xmin = max(0, xmin);
+        xmax = min(topwidth-1, xmax);
+        ymin = max(0, ymin);
+        ymax = min(topheight-1, ymax);
+        for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) {
+          for (int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) {
+            //  Get bottom1 data (padded, channel-last) at the displaced position:
+            int s2o = stride2 * o;
+            int s2p = stride2 * p;
+            int idxbot1 = ((item * pbottomheight + (m + s2p)) * pbottomwidth + (l + s2o))\
+             * bottomchannels + n;
+            Dtype bot1tmp = bottom1[idxbot1];  // bottom1[l+s2o,m+s2p,n]
+            //  Index offset for topdiff in following loops:
+            int op = (p+neighborhood_grid_radius) * neighborhood_grid_width\
+             + (o + neighborhood_grid_radius);  //  index [o,p]
+            int idxopoffset = (item * topchannels + op);
+            for (int y = ymin; y <= ymax; y++) {
+              for (int x = xmin; x <= xmax; x++) {
+                int idxtopdiff = (idxopoffset * topheight + y) * topwidth + x;  //  topdiff[x,y,o,p]
+                sum += topdiff[idxtopdiff] * bot1tmp;
+              }
+            }
+          }
+        }
+    }
+    const int sumelems = (kernel_radius * 2 + 1) * (kernel_radius * 2+1) * bottomchannels;
+    const int bot0index = ((n * bottomheight) + (m-pad_size)) * bottomwidth + (l-pad_size);
+    bottom0diff[bot0index + item * bottomcount] = sum / static_cast<float>(sumelems);  //  overwrite
+  }
+}
+// == Correlation Backward Pass Kernel (For Blob 1, i.e. data2), product correlation
+template <typename Dtype>
+__global__ void CorrelateDataBackward1(const int nthreads,
+  int num, int item, int topwidth, int topheight, int topchannels,
+  int max_displacement, int neighborhood_grid_radius,
+  int neighborhood_grid_width, int kernel_radius, int stride1, int stride2,
+  int bottomwidth, int bottomheight, int pbottomwidth, int pbottomheight,
+  int bottomchannels, int bottomcount, int pad_size,
+  const Dtype *bottom0, Dtype *bottom1diff, const Dtype *topdiff) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    //  The flat index is decomposed with the channel varying fastest, then width,
+    //  then height. l and m are shifted by pad_size into padded coordinates so they
+    //  can address bottom0 directly; the shift is undone when bottom1diff is written.
+    int n = index % bottomchannels;  //  channels
+    int l = (index / bottomchannels) % bottomwidth + pad_size;  //  w-pos
+    int m = (index / bottomchannels / bottomwidth) % bottomheight + pad_size;  //  h-pos
+    //  round_off is a trick to enable integer division with ceil, even for negative numbers
+    //  We use a large offset, for the inner part not to become negative.
+    const int round_off = ROUND_OFF;
+    const int round_off_s1 = stride1 * round_off;
+    Dtype sum = 0;
+    for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) {
+      for (int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) {
+        int s2o = stride2 * o;
+        int s2p = stride2 * p;
+        //  Get X,Y ranges and clamp
+        //  We add round_off_s1 before the int division and subtract round_off after it,
+        //  to ensure the formula matches ceil behavior:
+        int xmin = (l - 2*kernel_radius - max_displacement - s2o + round_off_s1 - 1)\
+         / stride1 + 1 - round_off;
+         // ceil (l - 2*kernel_radius - max_displacement - s2o) / stride1
+        int ymin = (m - 2*kernel_radius - max_displacement - s2p + round_off_s1 - 1)\
+         / stride1 + 1 - round_off;
+        // ceil (m - 2*kernel_radius - max_displacement - s2p) / stride1
+        //  Same here:
+        int xmax = (l - max_displacement - s2o + round_off_s1) / stride1 - round_off;
+        //  floor (l - max_displacement - s2o) / stride1
+        int ymax = (m - max_displacement - s2p + round_off_s1) / stride1 - round_off;
+        //  floor (m - max_displacement - s2p) / stride1
+        if (xmax >= 0 && ymax >= 0 && (xmin <= topwidth - 1) && (ymin <= topheight - 1)) {
+            xmin = max(0, xmin);
+            xmax = min(topwidth-1, xmax);
+            ymin = max(0, ymin);
+            ymax = min(topheight-1, ymax);
+            //  Get bottom0 data (padded, channel-last) at the reverse-displaced position:
+            int idxbot0 = ((item * pbottomheight + (m - s2p)) \
+            * pbottomwidth + (l - s2o)) * bottomchannels + n;
+            Dtype bot0tmp = bottom0[idxbot0];  //  bottom0[l-s2o,m-s2p,n]
+            //  Index offset for topdiff in following loops:
+            int op = (p+neighborhood_grid_radius) * \
+            neighborhood_grid_width + (o+neighborhood_grid_radius);  //  index [o,p]
+            int idxOpOffset = (item * topchannels + op);
+            for (int y = ymin; y <= ymax; y++) {
+              for (int x = xmin; x <= xmax; x++) {
+                int idxtopdiff = (idxOpOffset * topheight + y)\
+                 * topwidth + x;  //  topdiff[x,y,o,p]
+                sum += topdiff[idxtopdiff] * bot0tmp;
+              }
+            }
+        }
+      }
+    }
+    const int sumelems = (kernel_radius*2+1)*(kernel_radius*2+1)*bottomchannels;
+    const int bot1index = ((n * bottomheight) + (m - pad_size)) * bottomwidth + (l - pad_size);
+    bottom1diff[bot1index + item * bottomcount] = sum / static_cast<float>(sumelems);  //  overwrite
+  }
+}
+// == Correlation Kernel Subtraction: every loop iteration produces one output element as
+//    the mean absolute difference between a patch of bottom0 and a displaced patch of bottom1
+template <typename Dtype>
+__global__ void CorrelateDataSubtract(const int nthreads, int num, int item,
+  int topwidth, int topheight, int topchannels, int topcount,
+  int max_displacement, int neighborhood_grid_radius,
+  int neighborhood_grid_width, int kernel_radius, int stride1, int stride2,
+  int bottomwidth, int bottomheight, int bottomchannels,
+  const Dtype *bottom0, const Dtype *bottom1, Dtype *top) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    //  Split the flat index into output column, row and displacement channel
+    const int out_x = index % topwidth;
+    const int out_y = (index / topwidth) % topheight;
+    const int out_c = (index / topwidth / topheight) % topchannels;
+    //  Displacement of the image 2 patch encoded by the channel
+    const int shift_x = (out_c % neighborhood_grid_width - neighborhood_grid_radius) * stride2;
+    const int shift_y = (out_c / neighborhood_grid_width - neighborhood_grid_radius) * stride2;
+    //  Kernel centre in image 1, and the displaced centre in image 2
+    const int cx1 = out_x * stride1 + kernel_radius + max_displacement;
+    const int cy1 = out_y * stride1 + kernel_radius + max_displacement;
+    const int cx2 = cx1 + shift_x;
+    const int cy2 = cy1 + shift_y;
+    Dtype acc = 0;
+    for (int dy = -kernel_radius; dy <= kernel_radius; ++dy) {
+      for (int dx = -kernel_radius; dx <= kernel_radius; ++dx) {
+        //  Flat offsets of channel 0 at this patch position (the channel varies fastest)
+        const int base1 =
+            ((item * bottomheight + cy1 + dy) * bottomwidth + cx1 + dx) * bottomchannels;
+        const int base2 =
+            ((item * bottomheight + cy2 + dy) * bottomwidth + cx2 + dx) * bottomchannels;
+        for (int ch = 0; ch < bottomchannels; ++ch) {
+          acc += fabsf(bottom0[base1 + ch] - bottom1[base2 + ch]);
+        }
+      }
+    }
+    //  Normalize by the number of elements in one patch
+    const int patch_elems = (kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * bottomchannels;
+    top[index + item * topcount] = acc / static_cast<float>(patch_elems);
+  }
+}
+//  == Correlation Backward Pass Kernel (For Blob 0), subtract variant: fills bottom0diff
+template <typename Dtype>
+__global__ void CorrelateDataBackward0Subtract(const int nthreads, int num,
+  int item, int topwidth, int topheight, int topchannels,
+  int max_displacement, int neighborhood_grid_radius,
+  int neighborhood_grid_width, int kernel_radius,
+  int stride1, int stride2, int bottomwidth, int bottomheight,
+  int pbottomwidth, int pbottomheight,
+  int bottomchannels, int bottomcount, int pad_size,
+  Dtype *bottom0diff, const Dtype *bottom0, const Dtype *bottom1, const Dtype *topdiff) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int n = index % bottomchannels;  //  channels
+    int l = (index / bottomchannels) % bottomwidth + pad_size;  //  w-pos (padded coordinates)
+    int m = (index / bottomchannels / bottomwidth) % bottomheight + pad_size;  //  h-pos (padded)
+    //  Get X,Y ranges and clamp
+    //  round_off is a trick to enable integer division with ceil, even for negative numbers
+    //  We use a large offset, for the inner part not to become negative.
+    const int round_off = ROUND_OFF;
+    const int round_off_s1 = stride1 * round_off;
+    int idxbot0 = ((item * pbottomheight + m) * pbottomwidth + l)\
+             * bottomchannels + n;
+    //  We add round_off_s1 before the int division and subtract round_off after it,
+    //  to ensure the formula matches ceil behavior:
+    int xmin = (l - 2*kernel_radius - max_displacement + round_off_s1 - 1)\
+     / stride1 + 1 - round_off;  //  ceil (l - 2*kernel_radius - max_displacement) / stride1
+    int ymin = (m - 2*kernel_radius - max_displacement + round_off_s1 - 1)\
+     / stride1 + 1 - round_off;  //  ceil (m - 2*kernel_radius - max_displacement) / stride1
+    //  Same here:
+    int xmax = (l - max_displacement + round_off_s1) / stride1 - round_off;
+    //  floor (l - max_displacement) / stride1
+    int ymax = (m - max_displacement + round_off_s1) / stride1 - round_off;
+    //  floor (m - max_displacement) / stride1
+    Dtype sum = 0;
+    if (xmax >= 0 && ymax >= 0 && (xmin <= topwidth-1) && (ymin <= topheight-1)) {
+        xmin = max(0, xmin);
+        xmax = min(topwidth-1, xmax);
+        ymin = max(0, ymin);
+        ymax = min(topheight-1, ymax);
+        for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) {
+          for (int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) {
+            //  Get bottom1 data at the position displaced by (s2o, s2p):
+            int s2o = stride2 * o;
+            int s2p = stride2 * p;
+            int idxbot1 = ((item * pbottomheight + (m+s2p)) * pbottomwidth\
+             + (l+s2o)) * bottomchannels + n;
+            Dtype bot0tmp = bottom0[idxbot0];
+            Dtype bot1tmp = bottom1[idxbot1];
+            Dtype sign = (bot0tmp >= bot1tmp) ? Dtype(1.0) : Dtype(-1.0);  //  ties count as +1
+            //  Index offset for topdiff in following loops:
+            int op = (p+neighborhood_grid_radius) * neighborhood_grid_width\
+             + (o + neighborhood_grid_radius);  //  index [o,p]
+            int idxopoffset = (item * topchannels + op);
+            for (int y = ymin; y <= ymax; y++) {
+              for (int x = xmin; x <= xmax; x++) {
+                int idxtopdiff = (idxopoffset * topheight + y) * topwidth + x;  //  topdiff[x,y,o,p]
+                sum += topdiff[idxtopdiff] * sign;
+              }
+            }
+          }
+        }
+    }
+    const int sumelems = (kernel_radius * 2 + 1) * (kernel_radius * 2+1) * bottomchannels;
+    const int bot0index = ((n * bottomheight) + (m-pad_size)) * bottomwidth + (l-pad_size);
+    bottom0diff[bot0index + item * bottomcount] = sum / static_cast<float>(sumelems);
+  }
+}
+//  == Correlation Backward Pass Kernel (For Blob 1), subtract variant: fills bottom1diff
+template <typename Dtype>
+__global__ void CorrelateDataBackward1Subtract(const int nthreads, int num,
+  int item, int topwidth, int topheight, int topchannels,
+  int max_displacement, int neighborhood_grid_radius,
+  int neighborhood_grid_width, int kernel_radius,
+  int stride1, int stride2, int bottomwidth, int bottomheight,
+  int pbottomwidth, int pbottomheight, int bottomchannels,
+  int bottomcount, int pad_size, const Dtype *bottom0,
+  const Dtype *bottom1, Dtype *bottom1diff, const Dtype *topdiff) {
+    CUDA_KERNEL_LOOP(index, nthreads) {
+    //  index is decomposed channel-first: n (channel) varies fastest, then l (width),
+    //  then m (height); l and m are shifted by pad_size into padded coordinates.
+    //  n, l, m identify the bottom1 element whose gradient this iteration accumulates.
+    int n = index % bottomchannels;  //  channels
+    int l = (index / bottomchannels) % bottomwidth + pad_size;  //  w-pos
+    int m = (index / bottomchannels / bottomwidth) % bottomheight + pad_size;  //  h-pos
+    //  round_off is a trick to enable integer division with ceil, even for negative numbers
+    //  We use a large offset, for the inner part not to become negative.
+    const int round_off = ROUND_OFF;
+    const int round_off_s1 = stride1 * round_off;
+    Dtype sum = 0;
+    int idxbot1 = ((item * pbottomheight + m) * pbottomwidth + l)\
+             * bottomchannels + n;
+    for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) {
+      for (int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) {
+        int s2o = stride2 * o;
+        int s2p = stride2 * p;
+        //  Get X,Y ranges and clamp
+        //  We add round_off_s1 before the int division and subtract round_off after it,
+        //  to ensure the formula matches ceil behavior:
+        int xmin = (l - 2*kernel_radius - max_displacement - s2o + round_off_s1 - 1)\
+         / stride1 + 1 - round_off;
+         // ceil (l - 2*kernel_radius - max_displacement - s2o) / stride1
+        int ymin = (m - 2*kernel_radius - max_displacement - s2p + round_off_s1 - 1)\
+         / stride1 + 1 - round_off;
+        // ceil (m - 2*kernel_radius - max_displacement - s2p) / stride1
+        //  Same here:
+        int xmax = (l - max_displacement - s2o + round_off_s1) / stride1 - round_off;
+        //  floor (l - max_displacement - s2o) / stride1
+        int ymax = (m - max_displacement - s2p + round_off_s1) / stride1 - round_off;
+        //  floor (m - max_displacement - s2p) / stride1
+        if (xmax >= 0 && ymax >= 0 && (xmin <= topwidth - 1) && (ymin <= topheight - 1)) {
+            xmin = max(0, xmin);
+            xmax = min(topwidth-1, xmax);
+            ymin = max(0, ymin);
+            ymax = min(topheight-1, ymax);
+            //  Get bottom0 data:
+            int idxbot0 = ((item * pbottomheight + (m - s2p)) * pbottomwidth + (l - s2o))\
+             * bottomchannels + n;
+            //  bottom0[l-s2o,m-s2p,n]
+            Dtype bot0tmp = bottom0[idxbot0];
+            Dtype bot1tmp = bottom1[idxbot1];
+            Dtype sign = (bot0tmp >= bot1tmp) ? Dtype(-1.0) : Dtype(1.0);  //  ties count as -1
+            //  Index offset for topdiff in following loops:
+            int op = (p+neighborhood_grid_radius) * \
+            neighborhood_grid_width + (o+neighborhood_grid_radius);  //  index [o,p]
+            int idxOpOffset = (item * topchannels + op);
+            for (int y = ymin; y <= ymax; y++) {
+              for (int x = xmin; x <= xmax; x++) {
+                int idxtopdiff = (idxOpOffset * topheight + y)\
+                 * topwidth + x;  //  topdiff[x,y,o,p]
+                sum += topdiff[idxtopdiff] * sign;
+              }
+            }
+        }
+      }
+    }
+    const int sumelems = (kernel_radius*2+1)*(kernel_radius*2+1)*bottomchannels;
+    const int bot1index = ((n * bottomheight) + (m - pad_size)) * bottomwidth + (l - pad_size);
+    bottom1diff[bot1index + item * bottomcount] = sum / static_cast<float>(sumelems);
+  }
+}
+//  == Forward
+//  == Dimension rearrangement Kernel: copies `in` into the interior of the padded `out`
+template <typename Dtype>
+__global__ void blob_rearrange_kernel2(const Dtype* in, Dtype* out, int num,
+int channels, int width, int height, int widthheight, int padding, int pwidthheight) {
+    //  change shape from [batchsize,channel,y,x] to [batchsize,y,x,channel]
+    int xy = blockIdx.x * blockDim.x + threadIdx.x;  //  flat pixel index inside one channel
+    if (xy >= widthheight )
+        return;
+    int ch = blockIdx.y;
+    int n  = blockIdx.z;
+    Dtype value = in[(n * channels + ch) * widthheight + xy];
+    //  No __syncthreads(): no shared memory is used and some threads have already returned.
+    int xpad  = (xy % width + padding);
+    int ypad  = (xy / width + padding);
+    int xypad = ypad * (width + 2 * padding) + xpad;  //  border cells of out are not written
+    out[(n * pwidthheight + xypad) * channels + ch] = value;
+}
+template <typename Dtype>  //  Host-side launcher for the forward kernels
+void Forward_gpu(
+      const Tensor<gpu, 4, Dtype> &out,
+      const Tensor<gpu, 4, Dtype> &data1,
+      const Tensor<gpu, 4, Dtype> &data2,
+      const Tensor<gpu, 4, Dtype> &tmp1,
+      const Tensor<gpu, 4, Dtype> &tmp2,
+      int top_channels_, int top_height_, int top_width_, int pad_size_,
+      bool is_multiply, int max_displacement_, int kernel_size_,
+      int neighborhood_grid_radius_, int neighborhood_grid_width_,
+      int  kernel_radius_, int stride1_, int stride2_, cudaStream_t stream,
+      cudaStream_t stream_tmp1, cudaStream_t stream_tmp2) {
+    const Dtype *bottom_data1 = data1.dptr_;
+    const Dtype *bottom_data2 = data2.dptr_;
+    Dtype *rbot1 = tmp1.dptr_;  //  receives the rearranged, padded copy of data1
+    Dtype *rbot2 = tmp2.dptr_;  //  receives the rearranged, padded copy of data2
+    Dtype *top = out.dptr_;
+    const int bnum = data1.size(0);
+    const int bchannels = data1.size(1);
+    const int bheight = data1.size(2);
+    const int bwidth = data1.size(3);
+    const int bwidthheight = bwidth * bheight;
+    const int topcount = top_width_ * top_height_ * top_channels_;
+    dim3 threadsPerBlock(THREADS_PER_WARP * WARPS_PER_BLOCK);
+    int threads_per_block = 16;  //  block size of the two rearrange launches only
+    dim3 totalBlocksRearr((bwidthheight - 1) / threads_per_block + 1, bchannels, bnum);
+    const int pwidthheight = (bwidth + 2 * pad_size_) * (bheight + 2 * pad_size_);
+    blob_rearrange_kernel2<Dtype><<<totalBlocksRearr, threads_per_block, 0, stream_tmp1>>>
+    (bottom_data1, rbot1, bnum, bchannels, bwidth, bheight, bwidthheight, pad_size_, pwidthheight);
+    blob_rearrange_kernel2<Dtype><<<totalBlocksRearr, threads_per_block, 0, stream_tmp2>>>
+    (bottom_data2, rbot2, bnum, bchannels, bwidth, bheight, bwidthheight, pad_size_, pwidthheight);
+    const int num = bnum;  //  NOTE(review): kernels above use stream_tmp1/2, below use stream
+    const int channels = bchannels;
+    const int height = bheight + 2 * pad_size_;  //  padded height passed to the kernels
+    const int width = bwidth + 2 * pad_size_;  //  padded width passed to the kernels
+    const int shared_memory_per_block = (kernel_size_ * kernel_size_) * bchannels;  //  in elements
+    if (is_multiply == true) {
+        //  CorrelationLayer (multiply): one launch, grid = (top_width_, top_height_, num)
+        int topThreadCount = topcount;
+        dim3 totalBlocksCorr(top_width_, top_height_, num);
+        CorrelateData<Dtype><<<totalBlocksCorr, threadsPerBlock,
+        shared_memory_per_block * sizeof(Dtype), stream>>>(
+            topThreadCount,
+            num, top_width_, top_height_, top_channels_, topcount,
+            max_displacement_, neighborhood_grid_radius_,
+            neighborhood_grid_width_, kernel_radius_, kernel_size_,
+            stride1_, stride2_,
+            width, height, channels,
+            rbot1, rbot2, top);
+        CORRELATION_CUDA_CHECK(cudaPeekAtLastError());
+    } else {
+        //  CorrelationLayer (subtract): one launch per batch item n
+        for (int n = 0; n < num; n++) {
+            int topThreadCount = topcount;
+            const int gridSize = (topThreadCount + kMaxThreadsPerBlock - 1)\
+             / kMaxThreadsPerBlock;
+            CorrelateDataSubtract<Dtype><<<gridSize, kMaxThreadsPerBlock, 0, stream>>>(
+                topThreadCount,
+                num, n, top_width_, top_height_, top_channels_, topcount,
+                max_displacement_, neighborhood_grid_radius_,
+                neighborhood_grid_width_, kernel_radius_,
+                stride1_, stride2_, width, height, channels, rbot1, rbot2, top);
+         CORRELATION_CUDA_CHECK(cudaPeekAtLastError());
+        }
+    }
+}
+template <typename Dtype>  //  Host-side launcher for the backward kernels
+void Backward_gpu(
+       const Tensor<gpu, 4, Dtype> &out_grad,
+      const Tensor<gpu, 4, Dtype> &in_grad1,
+      const Tensor<gpu, 4, Dtype> &in_grad2,
+      const Tensor<gpu, 4, Dtype> &tmp1,
+      const Tensor<gpu, 4, Dtype> &tmp2,
+      int top_channels_, int top_height_,
+      int top_width_, int pad_size_, bool is_multiply,
+      int max_displacement_, int kernel_size_,
+      int neighborhood_grid_radius_, int neighborhood_grid_width_,
+      int  kernel_radius_, int stride1_, int stride2_,
+      cudaStream_t stream0, cudaStream_t stream1,
+      int num, int channels, int height, int width) {
+    //  Get top diff, compute bottom diff (tmp1/tmp2 are read as the rearranged inputs)
+    const Dtype* top_diff = out_grad.dptr_;
+    Dtype* bottom0_diff = in_grad1.dptr_;
+    Dtype* bottom1_diff = in_grad2.dptr_;
+    const Dtype* rbot1 = tmp1.dptr_;
+    const Dtype* rbot2 = tmp2.dptr_;
+    const int paddedheight = height + 2 * pad_size_;
+    const int paddedwidth = width + 2 * pad_size_;
+    const int bottomcount = channels * height * width;  //  elements per batch item
+    int botThreadCount = bottomcount;
+    const int gridSize = (botThreadCount + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock;
+    //  CorrelationLayerBackward: every kernel below is launched once per batch item n
+    if (is_multiply == true) {
+        //  NOTE(review): the next two locals are never used in this function
+        dim3 totalBlocksBackward0(width, height, channels * num);  //  First dim is fastest
+        const int buffer_size_backw0 = \
+        (static_cast<int>(ceil(static_cast<float>(2 * kernel_radius_)\
+         / static_cast<float>(stride1_))) + 1) * top_channels_;
+        //  == Run kernel Backward 0 (on stream0)
+        for (int n = 0; n < num; n++) {
+        CorrelateDataBackward0<Dtype><<<gridSize, kMaxThreadsPerBlock, 0, stream0>>>(
+            botThreadCount,
+            num, n, top_width_, top_height_, top_channels_,
+            max_displacement_, neighborhood_grid_radius_, neighborhood_grid_width_, kernel_radius_,
+            stride1_, stride2_,
+            width, height, paddedwidth, paddedheight, channels, bottomcount, pad_size_,
+            bottom0_diff, rbot2, top_diff);
+        CORRELATION_CUDA_CHECK(cudaPeekAtLastError());
+        }
+        //  == Run kernel Backward 1 (on stream1)
+        for (int n = 0; n < num; n++) {
+        CorrelateDataBackward1<Dtype><<<gridSize, kMaxThreadsPerBlock, 0, stream1>>>(
+            botThreadCount,
+            num, n, top_width_, top_height_, top_channels_,
+            max_displacement_, neighborhood_grid_radius_, neighborhood_grid_width_, kernel_radius_,
+            stride1_, stride2_,
+            width, height, paddedwidth, paddedheight, channels, bottomcount, pad_size_,
+            rbot1, bottom1_diff, top_diff);
+       CORRELATION_CUDA_CHECK(cudaPeekAtLastError());
+        }
+    } else  {
+        for (int n = 0; n < num; n++) {
+        //  Bottom0 (on stream0); the subtract kernels read both rearranged inputs:
+        CorrelateDataBackward0Subtract<Dtype><<<gridSize, kMaxThreadsPerBlock, 0, stream0>>>(
+            botThreadCount,
+            num, n, top_width_, top_height_, top_channels_,
+            max_displacement_, neighborhood_grid_radius_, neighborhood_grid_width_, kernel_radius_,
+            stride1_, stride2_,
+            width, height, paddedwidth, paddedheight, channels, bottomcount, pad_size_,
+            bottom0_diff, rbot1, rbot2, top_diff);
+        CORRELATION_CUDA_CHECK(cudaPeekAtLastError());
+        }
+        for (int n = 0; n < num; n++) {
+        //  Bottom1 (on stream1):
+        CorrelateDataBackward1Subtract<Dtype><<<gridSize, kMaxThreadsPerBlock, 0, stream1>>>(
+            botThreadCount,
+            num, n, top_width_, top_height_, top_channels_,
+            max_displacement_, neighborhood_grid_radius_, neighborhood_grid_width_, kernel_radius_,
+            stride1_, stride2_,
+            width, height, paddedwidth, paddedheight, channels, bottomcount, pad_size_,
+            rbot1, rbot2, bottom1_diff, top_diff);
+        CORRELATION_CUDA_CHECK(cudaPeekAtLastError());
+        }
+    }
+}
+}  // namespace cuda
+//  Looks up the CUDA streams of out, tmp1 and tmp2, then delegates to cuda::Forward_gpu.
+template<typename Dtype>
+inline void CorrelationForward(const Tensor<gpu, 4, Dtype> &out,
+                               const Tensor<gpu, 4, Dtype> &data1,
+                               const Tensor<gpu, 4, Dtype> &data2,
+                               const Tensor<gpu, 4, Dtype> &tmp1,
+                               const Tensor<gpu, 4, Dtype> &tmp2,
+                               int top_channels_, int top_height_,
+                               int top_width_, int pad_size_, bool is_multiply,
+                               int max_displacement_, int kernel_size_,
+                               int neighborhood_grid_radius_, int neighborhood_grid_width_,
+                               int kernel_radius_, int stride1_, int stride2_) {
+  cudaStream_t out_stream = Stream<gpu>::GetStream(out.stream_);
+  cudaStream_t tmp1_stream = Stream<gpu>::GetStream(tmp1.stream_);
+  cudaStream_t tmp2_stream = Stream<gpu>::GetStream(tmp2.stream_);
+  cuda::Forward_gpu(out, data1, data2, tmp1, tmp2,
+                    top_channels_, top_height_, top_width_, pad_size_, is_multiply,
+                    max_displacement_, kernel_size_, neighborhood_grid_radius_,
+                    neighborhood_grid_width_, kernel_radius_, stride1_, stride2_,
+                    out_stream, tmp1_stream, tmp2_stream);
+}
+
+//  Looks up the CUDA streams of in_grad1 and in_grad2, then delegates to cuda::Backward_gpu.
+template<typename Dtype>
+inline void CorrelationBackward(const Tensor<gpu, 4, Dtype> &out_grad,
+                            const Tensor<gpu, 4, Dtype> &in_grad1,
+                            const Tensor<gpu, 4, Dtype> &in_grad2,
+                            const Tensor<gpu, 4, Dtype> &tmp1,
+                            const Tensor<gpu, 4, Dtype> &tmp2,
+                            int top_channels_, int top_height_,
+                            int top_width_, int pad_size_, bool is_multiply,
+                            int max_displacement_, int kernel_size_,
+                            int neighborhood_grid_radius_, int neighborhood_grid_width_,
+                            int  kernel_radius_, int stride1_,
+                            int stride2_, int num, int channels, int height, int width) {
+  cudaStream_t grad1_stream = Stream<gpu>::GetStream(in_grad1.stream_);
+  cudaStream_t grad2_stream = Stream<gpu>::GetStream(in_grad2.stream_);
+  cuda::Backward_gpu(out_grad, in_grad1, in_grad2, tmp1, tmp2,
+                      top_channels_, top_height_, top_width_, pad_size_, is_multiply,
+                      max_displacement_, kernel_size_,
+                      neighborhood_grid_radius_, neighborhood_grid_width_,
+                      kernel_radius_, stride1_, stride2_,
+                      grad1_stream, grad2_stream, num, channels, height, width);
+}
+}  // namespace mshadow
+namespace mxnet {
+namespace op {
+template<>  //  gpu specialization of CreateOp
+Operator* CreateOp<gpu>(CorrelationParam param) {
+  return new CorrelationOp<gpu>(param);  //  heap-allocated; returned as a raw pointer
+}
+}  // namespace op
+}  // namespace mxnet
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 3809b28f1108..06537778b7de 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -1222,6 +1222,203 @@ def test_batch_dot(ctx=mx.cpu()):
                     assert reldiff(exe.grad_dict['b'].asnumpy(), bgrad_npy) < 1E-5
 
 
+def get_correlation(data1,data2,kernel_size,max_displacement,stride1,stride2,pad_size,is_multiply):
+    """Return an mx.sym.Correlation symbol on variables 'img1' and 'img2' (data1/data2 are not used)."""
+    img1 = mx.sym.Variable('img1')
+    img2 = mx.sym.Variable('img2')
+    return mx.sym.Correlation(data1=img1,data2=img2,kernel_size =kernel_size,max_displacement = max_displacement,
+                              stride1 = stride1,stride2 = stride2,pad_size= pad_size,is_multiply = is_multiply)
+
+def correlation_forward(data1,data2,pad_size,kernel_size,stride1,stride2,max_displacement,is_multiply):
+    """Naive NumPy reference for the Correlation forward pass; returns (out, padded data1, padded data2)."""
+    # compute output's dimension
+    paddedbottomheight = data1.shape[2] + 2 * pad_size
+    paddedbottomwidth = data1.shape[3] + 2 * pad_size
+    kernel_radius = (kernel_size - 1) // 2
+    border_size = max_displacement + kernel_radius
+    top_width = (paddedbottomwidth - border_size * 2) // stride1
+    top_height = (paddedbottomheight - border_size  * 2) // stride1
+    neighborhood_grid_radius = max_displacement // stride2
+    neighborhood_grid_width = neighborhood_grid_radius * 2 + 1
+    top_channels = neighborhood_grid_width * neighborhood_grid_width
+    # zero-filled output and zero-filled padded buffers for both inputs
+    out = np.zeros((data1.shape[0], top_channels, top_height, top_width))
+    tmp1 = np.zeros((data1.shape[0],data1.shape[1],paddedbottomheight, paddedbottomwidth))
+    tmp2 = np.zeros((data1.shape[0],data1.shape[1],paddedbottomheight, paddedbottomwidth))
+    # copy the inputs into the interior of the padded buffers; the border stays zero
+    tmp1[:, :, pad_size:pad_size + data1.shape[2], pad_size:pad_size + data1.shape[3]] = data1[:,:,:,:]
+    tmp2[:, :, pad_size:pad_size + data2.shape[2], pad_size:pad_size + data2.shape[3]] = data2[:,:,:,:]
+    # accumulate one value per (output row i, output column j, batch item, displacement channel)
+    for i in range(top_height):
+        for j in range(top_width):
+            for nbatch in range(data1.shape[0]):
+                
+                # (x1, y1): top-left corner of the patch in padded data1; (i, j): output position
+                x1 = j * stride1 + max_displacement
+                y1 = i * stride1 + max_displacement
+                
+                for top_channel in range(top_channels):
+                    # decode the channel into a horizontal (s2o) and vertical (s2p) displacement
+                    s2o = (top_channel % neighborhood_grid_width - neighborhood_grid_radius) * stride2
+                    s2p = (top_channel // neighborhood_grid_width - neighborhood_grid_radius) * stride2
+                    
+                    # top-left corner of the displaced patch in padded data2
+                    x2 = x1 + s2o
+                    y2 = y1 + s2p
+                    
+                    for h in range(kernel_size):
+                        for w in range(kernel_size):
+                            for channel in range(data1.shape[1]):
+                                if is_multiply:
+                                    out[nbatch, top_channel, i, j] += tmp1[nbatch, channel,y1 + h, x1 + w] * tmp2[nbatch, channel, y2 + h,x2 + w]
+                                else:
+                                    out[nbatch, top_channel, i, j] += abs(tmp1[nbatch, channel, y1 + h, x1 + w] - tmp2[nbatch, channel, y2 + h, x2 + w])
+    out /= float(kernel_size**2*data1.shape[1])  # average over the patch elements
+    return out,tmp1,tmp2
+
+def correlation_backward(out_grad,tmp1,tmp2,data1,data2,pad_size,kernel_size,stride1,stride2,max_displacement,is_multiply):
+    """Naive NumPy reference for the Correlation backward pass; returns the two unpadded input gradients."""
+    # compute output's dimension 
+    paddedbottomheight = data1.shape[2] + 2 * pad_size
+    paddedbottomwidth = data1.shape[3] + 2 * pad_size
+    kernel_radius = (kernel_size - 1) // 2
+    border_size = max_displacement + kernel_radius
+    top_width = (paddedbottomwidth - border_size * 2) // stride1
+    top_height = (paddedbottomheight - border_size  * 2) // stride1
+    neighborhood_grid_radius = max_displacement // stride2
+    neighborhood_grid_width = neighborhood_grid_radius * 2 + 1
+    top_channels = neighborhood_grid_width * neighborhood_grid_width
+    # gradients are accumulated in padded coordinates and cropped on return
+    out = np.zeros((data1.shape[0], top_channels, top_height, top_width))  # NOTE(review): never read below
+    tmp1_grad = np.zeros(tmp1.shape)
+    tmp2_grad = np.zeros(tmp2.shape)
+    # same traversal as the forward pass: every output element scatters into both gradients
+    for i in range(top_height):
+        for j in range(top_width):
+            for nbatch in range(data1.shape[0]):
+                
+                # (x1, y1): top-left corner of the patch in padded data1; (i, j): output position
+                x1 = j * stride1 + max_displacement
+                y1 = i * stride1 + max_displacement
+                
+                for top_channel in range(top_channels):
+                    # decode the channel into a horizontal (s2o) and vertical (s2p) displacement
+                    s2o = (top_channel % neighborhood_grid_width - neighborhood_grid_radius) * stride2
+                    s2p = (top_channel // neighborhood_grid_width - neighborhood_grid_radius) * stride2
+                    
+                    # top-left corner of the displaced patch in padded data2
+                    x2 = x1 + s2o
+                    y2 = y1 + s2p
+                    
+                    for h in range(kernel_size):
+                        for w in range(kernel_size):
+                            for channel in range(data1.shape[1]):
+                                if is_multiply:
+                                    tmp1_grad[nbatch,channel,y1+h,x1+w]+= out_grad[nbatch,top_channel,i,j]*tmp2[nbatch, channel, y2 + h,x2 + w]
+                                    tmp2_grad[nbatch,channel,y2+h,x2+w]+= out_grad[nbatch,top_channel,i,j]*tmp1[nbatch, channel, y1 + h,x1 + w]
+                                else:
+                                    sgn = 1 if (tmp1[nbatch, channel, y1 + h,x1 + w]>=tmp2[nbatch, channel, y2 + h,x2 + w]) else -1
+                                    tmp1_grad[nbatch,channel,y1+h,x1+w]+= out_grad[nbatch,top_channel,i,j]*sgn
+                                    tmp2_grad[nbatch,channel,y2+h,x2+w]+= out_grad[nbatch,top_channel,i,j]*(-sgn)
+    # apply the same normalisation as the forward pass, then crop the padding away
+    tmp1_grad = tmp1_grad / float(kernel_size**2*data1.shape[1])
+    tmp2_grad = tmp2_grad / float(kernel_size**2*data1.shape[1])
+    return tmp1_grad[:,:,pad_size:pad_size+data1.shape[2],pad_size:pad_size+data1.shape[3]],tmp2_grad[:,:,pad_size:pad_size+data1.shape[2],pad_size:pad_size+data1.shape[3]],
+
+def unittest_correlation(data_shape,kernel_size,max_displacement,stride1,stride2,pad_size,is_multiply):
+    
+    img1 = np.random.random(data_shape)
+    img2 = np.random.random(data_shape)
+
+    net1 = get_correlation(img1,img2,kernel_size,max_displacement,stride1,stride2,pad_size,is_multiply)
+    net2 = get_correlation(img1,img2,kernel_size,max_displacement,stride1,stride2,pad_size,is_multiply )
+
+    exe1 = net1.simple_bind(mx.cpu(),img1=img1.shape,img2=img1.shape)
+    exe1.arg_dict['img1'][:] = img1
+    exe1.arg_dict['img2'][:] = img2
+
+    #cpu forward
+    exe1.forward()  
+    # python forward
+    forward_result,tmp1,tmp2 = correlation_forward(img1,img2,pad_size,kernel_size,stride1,stride2,max_displacement,is_multiply)
+
+    # forward error
+    assert np.abs(exe1.outputs[0].asnumpy()-forward_result).mean()<1e-4
+    
+    # out_grad 
+    a = np.ones(forward_result.shape)
+    out_grad1 = mx.nd.array(a,mx.cpu())
+    # cpu backward
+    exe1.backward(out_grads=out_grad1)
+    # python backward
+    grad1,grad2 = correlation_backward(a,tmp1,tmp2,img1,img2,pad_size,kernel_size,stride1,stride2,max_displacement,is_multiply)
+
+    # backward error 
+    assert np.abs(exe1.grad_dict['img1'].asnumpy() - grad1).mean() < 1e-4
+    assert np.abs(exe1.grad_dict['img2'].asnumpy() - grad2).mean() < 1e-4
+
+def test_correlation():
+    """Run unittest_correlation over several shapes, kernel sizes, displacements, strides and both modes."""
+    unittest_correlation((1,3,10,10), kernel_size = 1,max_displacement = 4,stride1 = 1,stride2 = 1,pad_size = 4,is_multiply = False)
+    unittest_correlation((5,1,15,15), kernel_size = 1,max_displacement = 5,stride1 = 1,stride2 = 1,pad_size = 5,is_multiply = False)
+    unittest_correlation((5,1,15,15), kernel_size = 1,max_displacement = 5,stride1 = 1,stride2 = 1,pad_size = 5,is_multiply = True)
+    unittest_correlation((5,1,15,15), kernel_size = 1,max_displacement = 10,stride1 = 1,stride2 = 2,pad_size = 10,is_multiply = True)
+    unittest_correlation((5,1,4,4), kernel_size = 3,max_displacement = 1,stride1 = 1,stride2 = 1,pad_size = 2,is_multiply = True)
+    unittest_correlation((5,1,4,4), kernel_size = 3,max_displacement = 1,stride1 = 2,stride2 = 1,pad_size = 2,is_multiply = True)
+    unittest_correlation((5,1,4,4), kernel_size = 3,max_displacement = 1,stride1 = 2,stride2 = 1,pad_size = 2,is_multiply = False)
+    unittest_correlation((5,1,6,4), kernel_size = 3,max_displacement = 1,stride1 = 2,stride2 = 1,pad_size = 2,is_multiply = False)
+    unittest_correlation((5,1,11,11), kernel_size = 5,max_displacement = 1,stride1 = 1,stride2 = 1,pad_size = 2,is_multiply = False)
+    
+def test_dot(ctx=mx.cpu()):  # checks mx.sym.dot against np.dot for every m, k, n in 1..4
+    for m in range(1, 5):
+        for k in range(1, 5):
+            for n in range(1, 5):
+                a_npy = np.random.normal(0, 1, (m, k))
+                b_npy = np.random.normal(0, 1, (k, n))
+                c_npy = np.empty((m, n))
+                ograd_npy = np.random.normal(0, 1, (m, n))  # random head gradient
+                agrad_npy = np.empty((m, k))
+                bgrad_npy = np.empty((k, n))
+                c_npy[:, :] = np.dot(a_npy[:, :], b_npy[:, :])  # expected forward output
+                bgrad_npy[:, :] = np.dot(a_npy[:, :].T, ograd_npy[:, :])  # expected gradient of b
+                agrad_npy[:, :] = np.dot(ograd_npy[:, :], b_npy[:, :].T)  # expected gradient of a
+                a = mx.sym.Variable('a')
+                b = mx.sym.Variable('b')
+                c = mx.sym.dot(a, b)
+                exe = c.simple_bind(ctx=ctx, a=a_npy.shape, b=b_npy.shape)
+                outputs = exe.forward(is_train=True, a=a_npy, b=b_npy)
+                assert reldiff(outputs[0].asnumpy(), c_npy) < 1E-5
+                exe.backward(out_grads=[mx.nd.array(ograd_npy, ctx=exe._ctx)])
+                assert reldiff(exe.grad_dict['a'].asnumpy(), agrad_npy) < 1E-5
+                assert reldiff(exe.grad_dict['b'].asnumpy(), bgrad_npy) < 1E-5
+
+
+def test_batch_dot(ctx=mx.cpu()):  # checks mx.sym.batch_dot against a per-sample np.dot
+    for batch_size in range(1, 5):
+        for m in range(1, 5):
+            for k in range(1, 5):
+                for n in range(1, 5):
+                    a_npy = np.random.normal(0, 1, (batch_size, m, k))
+                    b_npy = np.random.normal(0, 1, (batch_size, k, n))
+                    c_npy = np.empty((batch_size, m, n))
+                    ograd_npy = np.random.normal(0, 1, (batch_size, m, n))  # random head gradient
+                    agrad_npy = np.empty((batch_size, m, k))
+                    bgrad_npy = np.empty((batch_size, k, n))
+                    for i in range(batch_size):  # expected values, one sample at a time
+                        c_npy[i, :, :] = np.dot(a_npy[i, :, :], b_npy[i, :, :])
+                        bgrad_npy[i, :, :] = np.dot(a_npy[i, :, :].T, ograd_npy[i, :, :])
+                        agrad_npy[i, :, :] = np.dot(ograd_npy[i, :, :], b_npy[i, :, :].T)
+                    a = mx.sym.Variable('a')
+                    b = mx.sym.Variable('b')
+                    c = mx.sym.batch_dot(a, b)
+                    exe = c.simple_bind(ctx=ctx, a=a_npy.shape, b=b_npy.shape)
+                    outputs = exe.forward(is_train=True, a=a_npy, b=b_npy)
+                    assert reldiff(outputs[0].asnumpy(), c_npy) < 1E-5
+                    exe.backward(out_grads=[mx.nd.array(ograd_npy, ctx=exe._ctx)])
+                    assert reldiff(exe.grad_dict['a'].asnumpy(), agrad_npy) < 1E-5
+                    assert reldiff(exe.grad_dict['b'].asnumpy(), bgrad_npy) < 1E-5
+
+
 if __name__ == '__main__':
     test_expand_dims()
     test_slice_axis()
@@ -1259,3 +1456,4 @@ def test_batch_dot(ctx=mx.cpu()):
     test_stn()
     test_dot()
     test_batch_dot()
+    test_correlation()

From acdae67c7b49c9d17753c23e49260c87a7464a4b Mon Sep 17 00:00:00 2001
From: Xiang Liang <xlvector@gmail.com>
Date: Sat, 9 Jul 2016 01:49:22 +0800
Subject: [PATCH 090/126] Warpctc mult label length (#2650)

* multi length label

support multi length label

Update README.md

* reset dmlc core

* fix lint
---
 example/warpctc/README.md    |  5 +++
 example/warpctc/lstm_ocr.py  | 24 +++++++----
 example/warpctc/toy_ctc.py   |  2 +-
 plugin/warpctc/warpctc-inl.h | 80 ++++++++++++++++++++++++++----------
 4 files changed, 81 insertions(+), 30 deletions(-)

diff --git a/example/warpctc/README.md b/example/warpctc/README.md
index a07d1146a6d8..32306aa157bd 100644
--- a/example/warpctc/README.md
+++ b/example/warpctc/README.md
@@ -84,3 +84,8 @@ Following code show detail construction of the net:
     return sm
 ```
   
+## Support multi label length
+
+If your label length is smaller than or equal to b, you should provide labels with length b; for those samples whose label length is smaller than b, append 0 to the label data to pad it to length b.
+
+Here, 0 is reserved for the blank label.
diff --git a/example/warpctc/lstm_ocr.py b/example/warpctc/lstm_ocr.py
index 22247e85d8d7..048572500b85 100644
--- a/example/warpctc/lstm_ocr.py
+++ b/example/warpctc/lstm_ocr.py
@@ -30,16 +30,18 @@ def provide_label(self):
         return [(n, x.shape) for n, x in zip(self.label_names, self.label)]
 
 def gen_rand():
-    num = random.randint(0, 9999)
-    buf = str(num)
-    while len(buf) < 4:
-        buf = "0" + buf
+    buf = ""
+    max_len = random.randint(3,4)
+    for i in range(max_len):
+        buf += str(random.randint(0,9))
     return buf
 
 def get_label(buf):
     ret = np.zeros(4)
-    for i in range(4):
+    for i in range(len(buf)):
         ret[i] = 1 + int(buf[i])
+    if len(buf) == 3:
+        ret[3] = 0
     return ret
 
 class OCRIter(mx.io.DataIter):
@@ -96,7 +98,15 @@ def ctc_label(p):
         if c2 == 0 or c2 == c1:
             continue
         ret.append(c2)
-    return ret        
+    return ret
+
+def remove_blank(l):
+    ret = []
+    for i in range(len(l)):
+        if l[i] == 0:
+            break
+        ret.append(l[i])
+    return ret
 
 def Accuracy(label, pred):
     global BATCH_SIZE
@@ -104,7 +114,7 @@ def Accuracy(label, pred):
     hit = 0.
     total = 0.
     for i in range(BATCH_SIZE):
-        l = label[i]
+        l = remove_blank(label[i])
         p = []
         for k in range(SEQ_LENGTH):
             p.append(np.argmax(pred[k * BATCH_SIZE + i]))
diff --git a/example/warpctc/toy_ctc.py b/example/warpctc/toy_ctc.py
index 1000e09dbd85..2caa11e68399 100644
--- a/example/warpctc/toy_ctc.py
+++ b/example/warpctc/toy_ctc.py
@@ -67,7 +67,7 @@ def __iter__(self):
                 num, img = gen_rand()
                 data.append(img)
                 label.append(get_label(num))
-
+                
             data_all = [mx.nd.array(data)] + self.init_state_arrays
             label_all = [mx.nd.array(label)]
             data_names = ['data'] + init_state_names
diff --git a/plugin/warpctc/warpctc-inl.h b/plugin/warpctc/warpctc-inl.h
index aac8ff9e5568..e4ea4b99059f 100644
--- a/plugin/warpctc/warpctc-inl.h
+++ b/plugin/warpctc/warpctc-inl.h
@@ -80,6 +80,33 @@ class WarpCTCOp : public Operator {
     Softmax(out_tensor, data_tensor);
   }
 
+  std::vector<int> labelLengths(const int * flat_labels, int minibatch,
+                                int size, int blank, int * total_length) {
+    CHECK_EQ(param_.label_length * minibatch, size)
+        << "label size should = label_length * minibatch";
+    std::vector<int> ret(minibatch, 0);
+    for (int i = 0; i < size; i++) {
+      if (flat_labels[i] == blank) {
+        continue;
+      }
+      int b = i / param_.label_length;
+      ret[b]++;
+      (*total_length)++;
+    }
+    return ret;
+  }
+
+  void removeBlank(const int * flat_labels, int * cpu_labels,
+                   int size, int blank) {
+    int k = 0;
+    for (int i = 0; i < size; i++) {
+      if (flat_labels[i] != blank) {
+        cpu_labels[k] = flat_labels[i];
+        k += 1;
+      }
+    }
+  }
+
   virtual void Backward(const OpContext &ctx,
                         const std::vector<TBlob> &out_grad,
                         const std::vector<TBlob> &in_data,
@@ -111,11 +138,37 @@ class WarpCTCOp : public Operator {
     for (int i = 0; i < minibatch; i++) {
       input_lengths.push_back(T);
     }
-    std::vector<int> label_lengths;
-    for (int i = 0; i < minibatch; i++) {
-      label_lengths.push_back(param_.label_length);
+
+#if MXNET_USE_CUDA
+    cudaError_t cuda_status;
+#endif
+    float* activations = static_cast<float*>(data.dptr_);
+    int* flat_labels = static_cast<int*>(label.dptr_);
+    int* cpu_raw_labels = flat_labels;
+    float* grads = static_cast<float*>(in_grad[warpctc_enum::kData].dptr_);
+    if (data.dev_mask_ == gpu::kDevMask) {
+#if MXNET_USE_CUDA
+      cpu_raw_labels = reinterpret_cast<int*>(malloc(sizeof(int) * label.Size()));
+      cuda_status = cudaMemcpyAsync(cpu_raw_labels, flat_labels,
+                                    label.Size()*sizeof(int),
+                                    cudaMemcpyDeviceToHost,
+                                    ctx.get_stream<gpu>()->stream_);
+      CHECK_EQ(cuda_status, cudaSuccess) << "cuda memcpy label error";
+#endif
+    } else if (data.dev_mask_ != cpu::kDevMask) {  // CPU labels are already host-readable
+      LOG(FATAL) << "Unknown device type " << data.dev_mask_;
     }
 
+    int total_label_length = 0;
+    std::vector<int> label_lengths = labelLengths(cpu_raw_labels,
+                                                  minibatch,
+                                                  label.Size(),
+                                                  0, &total_label_length);
+    int* cpu_labels = reinterpret_cast<int*>(
+        malloc(sizeof(int) * total_label_length));
+    removeBlank(cpu_raw_labels, cpu_labels, label.Size(), 0);
+    if (cpu_raw_labels != flat_labels) free(cpu_raw_labels);  // GPU staging copy only
+
     size_t alloc_bytes;
     throw_on_error(get_workspace_size(label_lengths.data(),
                                       input_lengths.data(),
@@ -125,32 +178,14 @@ class WarpCTCOp : public Operator {
                    "Error: get_workspace_size in inf_test");
     void* ctc_workspace;
 
-#if MXNET_USE_CUDA
-    cudaError_t cuda_status;
-#endif
-    float* activations = static_cast<float*>(data.dptr_);
-    int* flat_labels = static_cast<int*>(label.dptr_);
-    int* cpu_labels = flat_labels;
-    float* grads = static_cast<float*>(in_grad[warpctc_enum::kData].dptr_);
-
     if (data.dev_mask_ == cpu::kDevMask) {
       ctc_workspace = malloc(alloc_bytes);
     } else if (data.dev_mask_ == gpu::kDevMask) {
 #if MXNET_USE_CUDA
-      cpu_labels = reinterpret_cast<int*>(malloc(sizeof(int) * label.Size()));
-      cuda_status = cudaMemcpyAsync(cpu_labels, flat_labels,
-                                    label.Size()*sizeof(int),
-                                    cudaMemcpyDeviceToHost,
-                                    ctx.get_stream<gpu>()->stream_);
-      CHECK_EQ(cuda_status, cudaSuccess) << "cuda memcpy label error";
-
       cuda_status = cudaMalloc(&ctc_workspace, alloc_bytes);
       CHECK_EQ(cuda_status, cudaSuccess) << "cuda malloc worksapce fail";
 #endif
-    } else {
-      LOG(FATAL) << "Unknown device type " << data.dev_mask_;
     }
-
     std::vector<float> costs(minibatch);
     throw_on_error(compute_ctc_loss(activations,
                                     grads,
@@ -163,12 +198,14 @@ class WarpCTCOp : public Operator {
                                     ctc_workspace,
                                     info),
                    "Error: compute_ctc_loss");
+    if (data.dev_mask_ == cpu::kDevMask) free(cpu_labels);  // GPU branch frees it below
     if (data.dev_mask_ == cpu::kDevMask) {
       free(ctc_workspace);
     } else if (data.dev_mask_ == gpu::kDevMask) {
 #if MXNET_USE_CUDA
       cuda_status = cudaFree(ctc_workspace);
       CHECK_EQ(cuda_status, cudaSuccess) << "cuda free workspace fail";
+      free(cpu_labels);
 #endif
     }
   }
@@ -207,7 +244,6 @@ class WarpCTCProp : public OperatorProperty {
     if (dshape.ndim() == 0) return false;
     TShape label_shape(dshape.ndim() - 1);
     label_shape[0] = param_.label_length * (dshape[0] / param_.input_length);
-    std::cout << "infer label shape: " << label_shape[0] << std::endl;
     SHAPE_ASSIGN_CHECK(*in_shape, warpctc_enum::kLabel, label_shape);
 
     out_shape->clear();

From 70fe575796947164b390c625ac7b6f6cb2ebcdb6 Mon Sep 17 00:00:00 2001
From: lancy <14307130246@fudan.edu.cn>
Date: Sat, 9 Jul 2016 01:49:46 +0800
Subject: [PATCH 091/126] fix bugs of rcnn example (#2651)

---
 example/rcnn/tools/train_net.py        | 4 ++--
 tools/caffe_converter/convert_model.py | 9 ++++++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/example/rcnn/tools/train_net.py b/example/rcnn/tools/train_net.py
index 4078f833b1fa..0214b11326f4 100644
--- a/example/rcnn/tools/train_net.py
+++ b/example/rcnn/tools/train_net.py
@@ -45,9 +45,9 @@ def train_net(image_set, year, root_path, devkit_path, pretrained, epoch,
     # initialize params
     arg_shape, _, _ = sym.infer_shape(data=(1, 3, 224, 224), rois=(1, 5))
     arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
-    args['cls_score_weight'] = mx.random.normal(mean=0, stdvar=0.01, shape=arg_shape_dict['cls_score_weight'], ctx=ctx)
+    args['cls_score_weight'] = mx.random.normal(loc=0, scale=0.01, shape=arg_shape_dict['cls_score_weight'], ctx=ctx)
     args['cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['cls_score_bias'], ctx=ctx)
-    args['bbox_pred_weight'] = mx.random.normal(mean=0, stdvar=0.001, shape=arg_shape_dict['bbox_pred_weight'], ctx=ctx)
+    args['bbox_pred_weight'] = mx.random.normal(loc=0, scale=0.001, shape=arg_shape_dict['bbox_pred_weight'], ctx=ctx)
     args['bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['bbox_pred_bias'], ctx=ctx)
 
     # train
diff --git a/tools/caffe_converter/convert_model.py b/tools/caffe_converter/convert_model.py
index 113e0b28cf37..891681fb347a 100644
--- a/tools/caffe_converter/convert_model.py
+++ b/tools/caffe_converter/convert_model.py
@@ -63,10 +63,13 @@ def main():
         if layer_type == 'Convolution' or layer_type == 'InnerProduct' or layer_type == 4 or layer_type == 14:
             assert(len(layer_blobs) == 2)
             wmat_dim = []
-            if len(layer_blobs[0].shape.dim) > 0:
-                wmat_dim = layer_blobs[0].shape.dim
+            if getattr(layer_blobs[0].shape, 'dim', None) is not None:
+                if len(layer_blobs[0].shape.dim) > 0:
+                    wmat_dim = layer_blobs[0].shape.dim
+                else:
+                    wmat_dim = [layer_blobs[0].num, layer_blobs[0].channels, layer_blobs[0].height, layer_blobs[0].width]
             else:
-                wmat_dim = [layer_blobs[0].num, layer_blobs[0].channels, layer_blobs[0].height, layer_blobs[0].width]
+                wmat_dim = list(layer_blobs[0].shape)
             wmat = np.array(layer_blobs[0].data).reshape(wmat_dim)
             bias = np.array(layer_blobs[1].data)
             if first_conv:

From 5532511e1c9411494d1891cd0ae1e37993543daf Mon Sep 17 00:00:00 2001
From: Bing Xu <antinucleon@users.noreply.github.com>
Date: Fri, 8 Jul 2016 14:07:45 -0700
Subject: [PATCH 092/126] [OP] 3d conv + 3d pool (#2585)

* cudnn 3d conv & 3d pooling
---
 Makefile                                      |  20 +-
 .../classifyRealImageWithPretrainedModel.Rmd  |   4 +-
 dmlc-core                                     |   2 +-
 docs/how_to/cloud.md                          |   2 +-
 .../r/classifyRealImageWithPretrainedModel.md |   4 +-
 .../symbol_inception-bn-28-small.py           |   2 +-
 .../symbol_inception-bn-full.py               |   2 +-
 .../symbol_inception-bn.py                    |   2 +-
 example/image-classification/train_cifar10.py |   2 +-
 .../train_cifar10_mirroring.py                |   4 +-
 .../train_cifar10_resnet.py                   |   2 +-
 example/image-classification/train_mnist.R    |   6 +-
 example/image-classification/train_mnist.py   |   2 +-
 example/module/train_cifar10.py               |   2 +-
 .../predict-with-pretrained-model.ipynb       |   2 +-
 example/rnn/char-rnn.ipynb                    |   6 +-
 scala-package/core/scripts/get_cifar_data.sh  |   4 +-
 scala-package/core/scripts/get_mnist_data.sh  |   2 +-
 .../scala/ml/dmlc/mxnet/train/ConvSuite.scala |   2 +-
 src/operator/convolution-inl.h                | 119 +++++--
 src/operator/cudnn_convolution-inl.h          | 320 +++++++++++++-----
 src/operator/cudnn_convolution.cc             | 167 ++++++---
 src/operator/cudnn_pooling-inl.h              | 263 +++++++++-----
 src/operator/pooling-inl.h                    |  58 ++--
 tests/python/common/get_data.py               |   4 +-
 tests/travis/run_test.sh                      |   2 +-
 tests/travis/setup.sh                         |   1 -
 27 files changed, 692 insertions(+), 314 deletions(-)

diff --git a/Makefile b/Makefile
index 2ed5a51cda69..ccea1ba0a7b0 100644
--- a/Makefile
+++ b/Makefile
@@ -36,9 +36,9 @@ endif
 CFLAGS += -I$(ROOTDIR)/mshadow/ -I$(ROOTDIR)/dmlc-core/include -fPIC -Iinclude $(MSHADOW_CFLAGS)
 LDFLAGS = -pthread $(MSHADOW_LDFLAGS) $(DMLC_LDFLAGS)
 ifeq ($(DEBUG), 1)
-	NVCCFLAGS = -Xcompiler -std=c++98 -D_FORCE_INLINES -g -G -O0 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
+	NVCCFLAGS = -std=c++11 -Xcompiler -D_FORCE_INLINES -g -G -O0 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
 else
-	NVCCFLAGS = -Xcompiler -std=c++98 -D_FORCE_INLINES -g -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
+	NVCCFLAGS = -std=c++11 -Xcompiler -D_FORCE_INLINES -g -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
 endif
 
 ifndef LINT_LANG
@@ -149,8 +149,8 @@ endif
 
 build/src/%.o: src/%.cc
 	@mkdir -p $(@D)
-	$(CXX) -std=c++0x $(CFLAGS) -MM -MT build/src/$*.o $< >build/src/$*.d
-	$(CXX) -std=c++0x -c $(CFLAGS) -c $< -o $@
+	$(CXX) -std=c++11 $(CFLAGS) -MM -MT build/src/$*.o $< >build/src/$*.d
+	$(CXX) -std=c++11 -c $(CFLAGS) -c $< -o $@
 
 build/src/%_gpu.o: src/%.cu
 	@mkdir -p $(@D)
@@ -159,20 +159,20 @@ build/src/%_gpu.o: src/%.cu
 
 build/plugin/%.o: plugin/%.cc
 	@mkdir -p $(@D)
-	$(CXX) -std=c++0x $(CFLAGS) -MM -MT build/plugin/$*.o $< >build/plugin/$*.d
-	$(CXX) -std=c++0x -c $(CFLAGS) -c $< -o $@
+	$(CXX) -std=c++11 $(CFLAGS) -MM -MT build/plugin/$*.o $< >build/plugin/$*.d
+	$(CXX) -std=c++11 -c $(CFLAGS) -c $< -o $@
 
 # A nvcc bug cause it to generate "generic/xxx.h" dependencies from torch headers.
 # Use CXX to generate dependency instead.
 build/plugin/%_gpu.o: plugin/%.cu
 	@mkdir -p $(@D)
-	$(CXX) -std=c++0x $(CFLAGS) -MM -MT build/plugin/$*_gpu.o $< >build/plugin/$*_gpu.d
+	$(CXX) -std=c++11 $(CFLAGS) -MM -MT build/plugin/$*_gpu.o $< >build/plugin/$*_gpu.d
 	$(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $<
 
 $(EXTRA_OPERATORS)/build/%.o: $(EXTRA_OPERATORS)/%.cc
 	@mkdir -p $(@D)
-	$(CXX) -std=c++0x $(CFLAGS) -Isrc/operator -MM -MT $(EXTRA_OPERATORS)/build/$*.o $< >$(EXTRA_OPERATORS)/build/$*.d
-	$(CXX) -std=c++0x -c $(CFLAGS) -Isrc/operator -c $< -o $@
+	$(CXX) -std=c++11 $(CFLAGS) -Isrc/operator -MM -MT $(EXTRA_OPERATORS)/build/$*.o $< >$(EXTRA_OPERATORS)/build/$*.d
+	$(CXX) -std=c++11 -c $(CFLAGS) -Isrc/operator -c $< -o $@
 
 $(EXTRA_OPERATORS)/build/%_gpu.o: $(EXTRA_OPERATORS)/%.cu
 	@mkdir -p $(@D)
@@ -200,7 +200,7 @@ bin/im2rec: tools/im2rec.cc $(ALL_DEP)
 
 $(BIN) :
 	@mkdir -p $(@D)
-	$(CXX) $(CFLAGS) -std=c++0x  -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS)
+	$(CXX) $(CFLAGS) -std=c++11  -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS)
 
 include tests/cpp/unittest.mk
 
diff --git a/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd b/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd
index ad7c8f5eefad..6b58946eaa31 100644
--- a/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd
+++ b/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd
@@ -7,7 +7,7 @@ algorithm can do is to classify real world images.
 In this example we will show how to use a pretrained Inception-BatchNorm Network to predict the class of
 real world image. The network architecture is decribed in [1].
 
-The pre-trained Inception-BatchNorm network is able to be downloaded from [this link](http://webdocs.cs.ualberta.ca/~bx3/data/Inception.zip)
+The pre-trained Inception-BatchNorm network is able to be downloaded from [this link](http://data.dmlc.ml/mxnet/data/Inception.zip)
 This model gives the recent state-of-art prediction accuracy on image net dataset.
 
 Preface
@@ -69,7 +69,7 @@ preproc.image <- function(im, mean.image) {
   shape <- dim(im)
   short.edge <- min(shape[1:2])
   xx <- floor((shape[1] - short.edge) / 2)
-  yy <- floor((shape[2] - short.edge) / 2) 
+  yy <- floor((shape[2] - short.edge) / 2)
   croped <- crop.borders(im, xx, yy)
   # resize to 224 x 224, needed by input of the model.
   resized <- resize(croped, 224, 224)
diff --git a/dmlc-core b/dmlc-core
index c39001019e44..25e80e916b8b 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit c39001019e443c7a061789bd1180f58ce85fc3e6
+Subproject commit 25e80e916b8bb42e2f027ce352970904389ba76b
diff --git a/docs/how_to/cloud.md b/docs/how_to/cloud.md
index fc00f3872d0e..26cda6ab8cc0 100644
--- a/docs/how_to/cloud.md
+++ b/docs/how_to/cloud.md
@@ -29,7 +29,7 @@ There are several ways to upload local data to S3. One simple way is using
 [s3cmd](http://s3tools.org/s3cmd). For example:
 
 ```bash
-wget http://webdocs.cs.ualberta.ca/~bx3/data/mnist.zip
+wget http://data.dmlc.ml/mxnet/data/mnist.zip
 unzip mnist.zip && s3cmd put t*-ubyte s3://dmlc/mnist/
 ```
 
diff --git a/docs/packages/r/classifyRealImageWithPretrainedModel.md b/docs/packages/r/classifyRealImageWithPretrainedModel.md
index 2d9276c212d2..7bc5fec1a08f 100644
--- a/docs/packages/r/classifyRealImageWithPretrainedModel.md
+++ b/docs/packages/r/classifyRealImageWithPretrainedModel.md
@@ -6,7 +6,7 @@ algorithm can do is to classify real world images.
 In this example we will show how to use a pretrained Inception-BatchNorm Network to predict the class of
 real world image. The network architecture is decribed in [1].
 
-The pre-trained Inception-BatchNorm network is able to be downloaded from [this link](http://webdocs.cs.ualberta.ca/~bx3/data/Inception.zip)
+The pre-trained Inception-BatchNorm network is able to be downloaded from [this link](http://data.dmlc.ml/mxnet/data/Inception.zip)
 This model gives the recent state-of-art prediction accuracy on image net dataset.
 
 Preface
@@ -112,7 +112,7 @@ preproc.image <- function(im, mean.image) {
   shape <- dim(im)
   short.edge <- min(shape[1:2])
   xx <- floor((shape[1] - short.edge) / 2)
-  yy <- floor((shape[2] - short.edge) / 2) 
+  yy <- floor((shape[2] - short.edge) / 2)
   croped <- crop.borders(im, xx, yy)
   # resize to 224 x 224, needed by input of the model.
   resized <- resize(croped, 224, 224)
diff --git a/example/image-classification/symbol_inception-bn-28-small.py b/example/image-classification/symbol_inception-bn-28-small.py
index bc934c377b5a..b5a2afce2c1c 100644
--- a/example/image-classification/symbol_inception-bn-28-small.py
+++ b/example/image-classification/symbol_inception-bn-28-small.py
@@ -17,7 +17,7 @@ def DownsampleFactory(data, ch_3x3, mirror_attr):
     # conv 3x3
     conv = ConvFactory(data=data, kernel=(3, 3), stride=(2, 2), num_filter=ch_3x3, pad=(1, 1), mirror_attr=mirror_attr)
     # pool
-    pool = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type='max', attr=mirror_attr)
+    pool = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(1, 1), pool_type='max', attr=mirror_attr)
     # concat
     concat = mx.symbol.Concat(*[conv, pool])
     return concat
diff --git a/example/image-classification/symbol_inception-bn-full.py b/example/image-classification/symbol_inception-bn-full.py
index 27f6bebd9815..de87cf8ebe42 100644
--- a/example/image-classification/symbol_inception-bn-full.py
+++ b/example/image-classification/symbol_inception-bn-full.py
@@ -37,7 +37,7 @@ def InceptionFactoryB(data, num_3x3red, num_3x3, num_d3x3red, num_d3x3, name):
     cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_double_3x3_0' % name))
     cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_double_3x3_1' % name))
     # pool + proj
-    pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type="max", name=('max_pool_%s_pool' % name))
+    pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(1, 1), pool_type="max", name=('max_pool_%s_pool' % name))
     # concat
     concat = mx.symbol.Concat(*[c3x3, cd3x3, pooling], name='ch_concat_%s_chconcat' % name)
     return concat
diff --git a/example/image-classification/symbol_inception-bn.py b/example/image-classification/symbol_inception-bn.py
index 985ede4a4a19..c3a2fa8d08ae 100644
--- a/example/image-classification/symbol_inception-bn.py
+++ b/example/image-classification/symbol_inception-bn.py
@@ -45,7 +45,7 @@ def InceptionFactoryB(data, num_3x3red, num_3x3, num_d3x3red, num_d3x3, name):
     cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_double_3x3_0' % name))
     cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_double_3x3_1' % name))
     # pool + proj
-    pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type="max", name=('max_pool_%s_pool' % name))
+    pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(1, 1), pool_type="max", name=('max_pool_%s_pool' % name))
     # concat
     concat = mx.symbol.Concat(*[c3x3, cd3x3, pooling], name='ch_concat_%s_chconcat' % name)
     return concat
diff --git a/example/image-classification/train_cifar10.py b/example/image-classification/train_cifar10.py
index 8bbd233deeb3..dc3580cd3181 100644
--- a/example/image-classification/train_cifar10.py
+++ b/example/image-classification/train_cifar10.py
@@ -40,7 +40,7 @@ def _download(data_dir):
     os.chdir(data_dir)
     if (not os.path.exists('train.rec')) or \
        (not os.path.exists('test.rec')) :
-        os.system("wget http://webdocs.cs.ualberta.ca/~bx3/data/cifar10.zip")
+        os.system("wget http://data.dmlc.ml/mxnet/data/cifar10.zip")
         os.system("unzip -u cifar10.zip")
         os.system("mv cifar/* .; rm -rf cifar; rm cifar10.zip")
     os.chdir("..")
diff --git a/example/image-classification/train_cifar10_mirroring.py b/example/image-classification/train_cifar10_mirroring.py
index 93f263132a8d..24ded036bd71 100644
--- a/example/image-classification/train_cifar10_mirroring.py
+++ b/example/image-classification/train_cifar10_mirroring.py
@@ -8,7 +8,7 @@
 # documentation could be expected when this feature is mature.
 #
 # When mirroring is turned on and set properly, we could expect smaller memory
-# consumption with slightly slower computation speed (due to extra forward 
+# consumption with slightly slower computation speed (due to extra forward
 # steps). We are not including a sample running log here, as this test case
 # is only a functionality test. The using of pycuda GPU memory query is also
 # not very good way of measuring the memory usage here.
@@ -55,7 +55,7 @@ def _download(data_dir):
     os.chdir(data_dir)
     if (not os.path.exists('train.rec')) or \
        (not os.path.exists('test.rec')) :
-        os.system("wget http://webdocs.cs.ualberta.ca/~bx3/data/cifar10.zip")
+        os.system("wget http://data.dmlc.ml/mxnet/data/cifar10.zip")
         os.system("unzip -u cifar10.zip")
         os.system("mv cifar/* .; rm -rf cifar; rm cifar10.zip")
     os.chdir("..")
diff --git a/example/image-classification/train_cifar10_resnet.py b/example/image-classification/train_cifar10_resnet.py
index 6357df67fc01..b85ffd35c434 100644
--- a/example/image-classification/train_cifar10_resnet.py
+++ b/example/image-classification/train_cifar10_resnet.py
@@ -79,7 +79,7 @@ def _download(data_dir):
     os.chdir(data_dir)
     if (not os.path.exists('train.rec')) or \
        (not os.path.exists('test.rec')):
-        os.system('wget http://webdocs.cs.ualberta.ca/~bx3/data/cifar10.zip')
+        os.system('wget http://data.dmlc.ml/mxnet/data/cifar10.zip')
         os.system('unzip -u cifar10.zip')
         os.system('mv cifar/* .; rm -rf cifar; rm cifar10.zip')
     os.chdir('..')
diff --git a/example/image-classification/train_mnist.R b/example/image-classification/train_mnist.R
index e4fde087b174..4d80512a8e92 100644
--- a/example/image-classification/train_mnist.R
+++ b/example/image-classification/train_mnist.R
@@ -4,11 +4,11 @@ require(mxnet)
 download_ <- function(data_dir) {
     dir.create(data_dir, showWarnings = FALSE)
     setwd(data_dir)
-    if ((!file.exists('train-images-idx3-ubyte')) || 
+    if ((!file.exists('train-images-idx3-ubyte')) ||
         (!file.exists('train-labels-idx1-ubyte')) ||
         (!file.exists('t10k-images-idx3-ubyte')) ||
         (!file.exists('t10k-labels-idx1-ubyte'))) {
-        download.file(url='http://webdocs.cs.ualberta.ca/~bx3/data/mnist.zip',
+        download.file(url='http://data.dmlc.ml/mxnet/data/mnist.zip',
                       destfile='mnist.zip', method='wget')
         unzip("mnist.zip")
         file.remove("mnist.zip")
@@ -83,7 +83,7 @@ get_iterator <- function(data_shape) {
 }
 
 parse_args <- function() {
-    parser <- ArgumentParser(description='train an image classifer on mnist')  
+    parser <- ArgumentParser(description='train an image classifer on mnist')
     parser$add_argument('--network', type='character', default='mlp',
                         choices = c('mlp', 'lenet'),
                         help = 'the cnn to use')
diff --git a/example/image-classification/train_mnist.py b/example/image-classification/train_mnist.py
index 7758d28c7977..fd7c00cfac25 100644
--- a/example/image-classification/train_mnist.py
+++ b/example/image-classification/train_mnist.py
@@ -12,7 +12,7 @@ def _download(data_dir):
        (not os.path.exists('train-labels-idx1-ubyte')) or \
        (not os.path.exists('t10k-images-idx3-ubyte')) or \
        (not os.path.exists('t10k-labels-idx1-ubyte')):
-        os.system("wget http://webdocs.cs.ualberta.ca/~bx3/data/mnist.zip")
+        os.system("wget http://data.dmlc.ml/mxnet/data/mnist.zip")
         os.system("unzip -u mnist.zip; rm mnist.zip")
     os.chdir("..")
 
diff --git a/example/module/train_cifar10.py b/example/module/train_cifar10.py
index bcf6f8bd0bf7..2603f9cf704f 100644
--- a/example/module/train_cifar10.py
+++ b/example/module/train_cifar10.py
@@ -52,7 +52,7 @@ def _download(data_dir):
     os.chdir(data_dir)
     if (not os.path.exists('train.rec')) or \
        (not os.path.exists('test.rec')) :
-        os.system("wget http://webdocs.cs.ualberta.ca/~bx3/data/cifar10.zip")
+        os.system("wget http://data.dmlc.ml/mxnet/data/cifar10.zip")
         os.system("unzip -u cifar10.zip")
         os.system("mv cifar/* .; rm -rf cifar; rm cifar10.zip")
     os.chdir(cwd)
diff --git a/example/notebooks/predict-with-pretrained-model.ipynb b/example/notebooks/predict-with-pretrained-model.ipynb
index 73ba99071890..f85157dc714f 100644
--- a/example/notebooks/predict-with-pretrained-model.ipynb
+++ b/example/notebooks/predict-with-pretrained-model.ipynb
@@ -16,7 +16,7 @@
     "For network structure, you can visualize it in [Composite Symbol Demo](composite_symbol.ipynb)\n",
     "\n",
     "The pre-trained Inception-BatchNorm network is able to be downloaded from:\n",
-    "[http://webdocs.cs.ualberta.ca/~bx3/data/Inception.zip](http://webdocs.cs.ualberta.ca/~bx3/data/Inception.zip)\n",
+    "[http://data.dmlc.ml/mxnet/data/Inception.zip](http://data.dmlc.ml/mxnet/data/Inception.zip)\n",
     "This model achieves Top-1 Accuracy: 70% and Top-5 Accuracy: 89.9%\n",
     "\n",
     "Note: This network is trained by using very simple augmentation (random flip + random crop). We will release model with a little bit more augmentation (which achieves better validation score)"
diff --git a/example/rnn/char-rnn.ipynb b/example/rnn/char-rnn.ipynb
index cefe60b96995..4ad18815be02 100644
--- a/example/rnn/char-rnn.ipynb
+++ b/example/rnn/char-rnn.ipynb
@@ -43,14 +43,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "<img src=\"http://webdocs.cs.ualberta.ca/~bx3/char-rnn_1.png\">\n"
+    "<img src=\"http://data.dmlc.ml/mxnet/data/char-rnn_1.png\">\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "<img src=\"http://webdocs.cs.ualberta.ca/~bx3/char-rnn_2.png\">"
+    "<img src=\"http://data.dmlc.ml/mxnet/data/char-rnn_2.png\">"
    ]
   },
   {
@@ -145,7 +145,7 @@
    ],
    "source": [
     "import os\n",
-    "data_url = \"http://webdocs.cs.ualberta.ca/~bx3/lab_data.zip\"\n",
+    "data_url = \"http://data.dmlc.ml/mxnet/data/lab_data.zip\"\n",
     "os.system(\"wget %s\" % data_url)\n",
     "os.system(\"unzip -o lab_data.zip\")"
    ]
diff --git a/scala-package/core/scripts/get_cifar_data.sh b/scala-package/core/scripts/get_cifar_data.sh
index 48c4bfde2225..4b59e2c2ad4b 100755
--- a/scala-package/core/scripts/get_cifar_data.sh
+++ b/scala-package/core/scripts/get_cifar_data.sh
@@ -5,7 +5,7 @@ fi
 
 cifar_data_path="./data/cifar10.zip"
 if [ ! -f "$cifar_data_path" ]; then
-  wget http://webdocs.cs.ualberta.ca/~bx3/data/cifar10.zip -P $data_path
+  wget http://data.dmlc.ml/mxnet/data/cifar10.zip -P $data_path
   cd $data_path
   unzip -u cifar10.zip
-fi
\ No newline at end of file
+fi
diff --git a/scala-package/core/scripts/get_mnist_data.sh b/scala-package/core/scripts/get_mnist_data.sh
index e080144f6663..359e399629cc 100755
--- a/scala-package/core/scripts/get_mnist_data.sh
+++ b/scala-package/core/scripts/get_mnist_data.sh
@@ -5,7 +5,7 @@ fi
 
 mnist_data_path="./data/mnist.zip"
 if [ ! -f "$mnist_data_path" ]; then
-  wget http://webdocs.cs.ualberta.ca/~bx3/data/mnist.zip -P $data_path
+  wget http://data.dmlc.ml/mxnet/data/mnist.zip -P $data_path
   cd $data_path
   unzip -u mnist.zip
 fi
diff --git a/scala-package/core/src/test/scala/ml/dmlc/mxnet/train/ConvSuite.scala b/scala-package/core/src/test/scala/ml/dmlc/mxnet/train/ConvSuite.scala
index 0f356bfb5fa4..57c065f2e86b 100644
--- a/scala-package/core/src/test/scala/ml/dmlc/mxnet/train/ConvSuite.scala
+++ b/scala-package/core/src/test/scala/ml/dmlc/mxnet/train/ConvSuite.scala
@@ -91,6 +91,6 @@ class ConvSuite extends FunSuite with BeforeAndAfterAll {
     }
     val acc = numCorrect.toFloat / numInst
     logger.info(s"Final accuracy = $acc")
-    assert(acc > 0.96)
+    assert(acc > 0.95)
   }
 }
diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h
index 46e99d4f514d..59d82465cfb6 100644
--- a/src/operator/convolution-inl.h
+++ b/src/operator/convolution-inl.h
@@ -40,21 +40,21 @@ struct ConvolutionParam : public dmlc::Parameter<ConvolutionParam> {
   int cudnn_tune;
   DMLC_DECLARE_PARAMETER(ConvolutionParam) {
     int shape[] = {1, 1};
-    DMLC_DECLARE_FIELD(kernel).describe("convolution kernel size: (y, x)");
+    DMLC_DECLARE_FIELD(kernel).describe("convolution kernel size: (y, x) or (d, y, x)");
     DMLC_DECLARE_FIELD(stride).set_default(TShape(shape, shape + 2))
-    .describe("convolution stride: (y, x)");
+    .describe("convolution stride: (y, x) or (d, y, x)");
     DMLC_DECLARE_FIELD(dilate).set_default(TShape(shape, shape + 2))
     .describe("convolution dilate: (y, x)");
     shape[0] = shape[1] = 0;
     DMLC_DECLARE_FIELD(pad).set_default(TShape(shape, shape + 2))
-    .describe("pad for convolution: (y, x)");
+    .describe("pad for convolution: (y, x) or (d, y, x)");
     DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000)
     .describe("convolution filter(channel) number");
     DMLC_DECLARE_FIELD(num_group).set_default(1)
     .describe("Number of groups partition. "
               "This option is not supported by CuDNN, you can use SliceChannel to num_group,"
               "apply convolution and concat instead to achieve the same need.");
-    DMLC_DECLARE_FIELD(workspace).set_default(512).set_range(0, 8192)
+    DMLC_DECLARE_FIELD(workspace).set_default(1024).set_range(0, 8192)
     .describe("Tmp workspace for convolution (MB).");
     DMLC_DECLARE_FIELD(no_bias).set_default(false)
     .describe("Whether to disable bias parameter.");
@@ -89,6 +89,9 @@ class ConvolutionOp : public Operator {
     CHECK_EQ(in_data.size(), expected);
     CHECK_EQ(out_data.size(), 1);
     Stream<xpu> *s = ctx.get_stream<xpu>();
+    if (param_.kernel.ndim() > 2) {
+      LOG(FATAL) << "Volume convolution is not implmented in mshadow";
+    }
     Tensor<xpu, 4, DType> data = in_data[conv::kData].get<xpu, 4, DType>(s);
     Shape<3> wmat_shape =
         Shape3(param_.num_group,
@@ -163,6 +166,9 @@ class ConvolutionOp : public Operator {
     using namespace mshadow;
     using namespace mshadow::expr;
     // TODO(bing): check the BLAS Handle, be careful
+    if (param_.kernel.ndim() > 2) {
+      LOG(FATAL) << "Volume convolution is not implmented in mshadow";
+    }
     CHECK_EQ(out_grad.size(), 1);
     size_t expected = param_.no_bias == 0 ? 3 : 2;
     CHECK(in_data.size() == expected && in_grad.size() == expected);
@@ -333,37 +339,85 @@ class ConvolutionProp : public OperatorProperty {
     }
     const TShape &dshape = (*in_shape)[conv::kData];
     if (dshape.ndim() ==  0) return false;
-    CHECK_EQ(dshape.ndim(), 4) \
-        << "Input data should be 4D in batch-num_filter-y-x";
-    SHAPE_ASSIGN_CHECK(*in_shape,
-                       conv::kWeight,
-                       Shape4(param_.num_filter, dshape[1] / param_.num_group,
-                              param_.kernel[0], param_.kernel[1]));
-    if (!param_.no_bias) {
-      SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
-    }
-    out_shape->clear();
-    out_shape->push_back(dshape);
-    const index_t ksize_y = static_cast<index_t>(param_.kernel[0]);
-    const index_t ksize_x = static_cast<index_t>(param_.kernel[1]);
-    CHECK_EQ(dshape[1] % param_.num_group, 0) \
+    if (param_.kernel.ndim() == 2) {
+      // 2d conv
+      CHECK_EQ(dshape.ndim(), 4) \
+          << "Input data should be 4D in batch-num_filter-y-x";
+      SHAPE_ASSIGN_CHECK(*in_shape,
+                         conv::kWeight,
+                         Shape4(param_.num_filter, dshape[1] / param_.num_group,
+                                param_.kernel[0], param_.kernel[1]));
+      if (!param_.no_bias) {
+        SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
+      }
+      out_shape->clear();
+      out_shape->push_back(dshape);
+      const index_t ksize_y = static_cast<index_t>(param_.kernel[0]);
+      const index_t ksize_x = static_cast<index_t>(param_.kernel[1]);
+      CHECK_EQ(dshape[1] % param_.num_group, 0) \
+          << "input num_filter must divide group size";
+      CHECK_EQ(param_.num_filter % param_.num_group, 0) \
+          << "output num_filter must divide group size";
+      CHECK_GT(param_.kernel.Size(), 0) \
+          << "incorrect kernel size: " << param_.kernel;
+      CHECK_GT(param_.stride.Size(), 0) \
+          << "incorrect stride size: " << param_.stride;
+      CHECK_GT(param_.dilate.Size(), 0) \
+          << "incorrect dilate size: " << param_.dilate;
+      CHECK(ksize_x <= dshape[3] && ksize_y <= dshape[2])
+          << "kernel size exceed input";
+      (*out_shape)[conv::kOut][1] = param_.num_filter;
+      (*out_shape)[conv::kOut][2] = (dshape[2] + 2 * param_.pad[0] -
+          (param_.dilate[0] * (ksize_y - 1) + 1)) / param_.stride[0] + 1;
+      (*out_shape)[conv::kOut][3] = (dshape[3] + 2 * param_.pad[1] -
+          (param_.dilate[1] * (ksize_x - 1) + 1)) / param_.stride[1] + 1;
+      return true;
+    } else if (param_.kernel.ndim() == 3) {
+      // 3d conv
+      CHECK_EQ(dshape.ndim(), 5) \
+        << "Input data should be 5D in batch-num_filter-depth-y-x";
+      SHAPE_ASSIGN_CHECK(*in_shape,
+                         conv::kWeight,
+                         Shape5(param_.num_filter, dshape[1] / param_.num_group,
+                                param_.kernel[0], param_.kernel[1], param_.kernel[2]));
+      if (!param_.no_bias) {
+        SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
+      }
+      out_shape->clear();
+      out_shape->push_back(dshape);
+      const index_t ksize_d = static_cast<index_t>(param_.kernel[0]);
+      const index_t ksize_y = static_cast<index_t>(param_.kernel[1]);
+      const index_t ksize_x = static_cast<index_t>(param_.kernel[2]);
+      CHECK_EQ(dshape[1] % param_.num_group, 0) \
         << "input num_filter must divide group size";
-    CHECK_EQ(param_.num_filter % param_.num_group, 0) \
-        << "output num_filter must divide group size";
-    CHECK_GT(param_.kernel.Size(), 0) \
-        << "incorrect kernel size: " << param_.kernel;
-    CHECK_GT(param_.stride.Size(), 0) \
-        << "incorrect stride size: " << param_.stride;
-    CHECK_GT(param_.dilate.Size(), 0) \
-        << "incorrect dilate size: " << param_.dilate;
-    CHECK(ksize_x <= dshape[3] && ksize_y <= dshape[2])
-        << "kernel size exceed input";
-    (*out_shape)[conv::kOut][1] = param_.num_filter;
-    (*out_shape)[conv::kOut][2] = (dshape[2] + 2 * param_.pad[0] -
-        (param_.dilate[0] * (ksize_y - 1) + 1)) / param_.stride[0] + 1;
-    (*out_shape)[conv::kOut][3] = (dshape[3] + 2 * param_.pad[1] -
-        (param_.dilate[1] * (ksize_x - 1) + 1)) / param_.stride[1] + 1;
-    return true;
+      CHECK_EQ(param_.num_filter % param_.num_group, 0) \
+          << "output num_filter must divide group size";
+      CHECK_GT(param_.kernel.Size(), 0) \
+          << "incorrect kernel size: " << param_.kernel;
+      CHECK_GT(param_.stride.Size(), 0) \
+          << "incorrect stride size: " << param_.stride;
+      CHECK_GT(param_.dilate.Size(), 0) \
+          << "incorrect dilate size: " << param_.dilate;
+      // a kernel as large as the input is valid, so depth is checked with <=
+      // exactly like y and x (and like the 2d branch above)
+      CHECK(ksize_d <= dshape[2] && ksize_y <= dshape[3] && ksize_x <= dshape[4])
+          << "kernel size exceed input";
+      if (param_.dilate.Size() != 1) {
+        LOG(INFO) << "Dilate is not supported in 3d convolution";
+      }
+      // dilation is ignored for 3d, hence the fixed factor of 1 below
+      (*out_shape)[conv::kOut][1] = param_.num_filter;
+      (*out_shape)[conv::kOut][2] = (dshape[2] + 2 * param_.pad[0] -
+          (1 * (ksize_d - 1) + 1)) / param_.stride[0] + 1;
+      (*out_shape)[conv::kOut][3] = (dshape[3] + 2 * param_.pad[1] -
+          (1 * (ksize_y - 1) + 1)) / param_.stride[1] + 1;
+      (*out_shape)[conv::kOut][4] = (dshape[4] + 2 * param_.pad[2] -
+          (1 * (ksize_x - 1) + 1)) / param_.stride[2] + 1;
+      return true;
+    } else {
+      LOG(FATAL) << "Unknown convolution type";
+      return false;
+    }
   }
 
   bool InferType(std::vector<int> *in_type,
diff --git a/src/operator/cudnn_convolution-inl.h b/src/operator/cudnn_convolution-inl.h
index c307c8336ee4..34a6c7295d41 100644
--- a/src/operator/cudnn_convolution-inl.h
+++ b/src/operator/cudnn_convolution-inl.h
@@ -10,6 +10,8 @@
 #include <algorithm>
 #include <vector>
 #include "./convolution-inl.h"
+#include "../common/cuda_utils.h"
+
 namespace mxnet {
 namespace op {
 #if MXNET_USE_CUDNN == 1
@@ -36,7 +38,6 @@ class CuDNNConvolutionOp : public Operator {
     // convert MB to words
     param_.workspace = (param_.workspace << 20) / sizeof(DType);
     init_cudnn_ = false;
-    // TODO(xxx): fp16
     dtype_ = mshadow::DataType<DType>::kCudnnFlag;
 
     if (param.cudnn_tune != conv::kOff) {
@@ -63,50 +64,69 @@ class CuDNNConvolutionOp : public Operator {
                        const std::vector<TBlob> &aux_args) {
     using namespace mshadow;
     size_t expected = param_.no_bias ? 2 : 3;
+    DType *data_ptr = NULL;
+    DType *wmat_ptr = NULL;
+    DType *out_ptr = NULL;
     CHECK_EQ(in_data.size(), expected);
     CHECK_EQ(out_data.size(), 1);
     Stream<gpu> *s = ctx.get_stream<gpu>();
-    Tensor<gpu, 4, DType> data = in_data[conv::kData].get<gpu, 4, DType>(s);
-    Tensor<gpu, 4, DType> wmat = in_data[conv::kWeight].get<gpu, 4, DType>(s);
-    Tensor<gpu, 4, DType> out = out_data[conv::kOut].get<gpu, 4, DType>(s);
-    CHECK_EQ(data.CheckContiguous(), true);
-    CHECK_EQ(wmat.CheckContiguous(), true);
-    CHECK_EQ(out.CheckContiguous(), true);
     if (!init_cudnn_) {
       Init(s, in_data, out_data);
     }
     Tensor<gpu, 1, DType> workspace =
         ctx.requested[conv::kTempSpace].get_space_typed<gpu, 1, DType>(
                                  mshadow::Shape1(forward_workspace_), s);
+
+    if (param_.kernel.ndim() == 2) {
+      Tensor<gpu, 4, DType> data = in_data[conv::kData].get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> wmat = in_data[conv::kWeight].get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> out = out_data[conv::kOut].get<gpu, 4, DType>(s);
+      CHECK_EQ(data.CheckContiguous(), true);
+      CHECK_EQ(wmat.CheckContiguous(), true);
+      CHECK_EQ(out.CheckContiguous(), true);
+      data_ptr = data.dptr_;
+      wmat_ptr = wmat.dptr_;
+      out_ptr = out.dptr_;
+    } else {
+      Tensor<gpu, 5, DType> data = in_data[conv::kData].get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> wmat = in_data[conv::kWeight].get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> out = out_data[conv::kOut].get<gpu, 5, DType>(s);
+      CHECK_EQ(data.CheckContiguous(), true);
+      CHECK_EQ(wmat.CheckContiguous(), true);
+      CHECK_EQ(out.CheckContiguous(), true);
+      data_ptr = data.dptr_;
+      wmat_ptr = wmat.dptr_;
+      out_ptr = out.dptr_;
+    }
     for (uint32_t g = 0; g < param_.num_group; ++g) {
       typename DataType<DType>::ScaleType alpha = 1.0f;
       typename DataType<DType>::ScaleType beta = 0.0f;
       CHECK_EQ(cudnnConvolutionForward(s->dnn_handle_,
                                        &alpha,
                                        in_desc_,
-                                       data.dptr_ + data_offset_ * g,
+                                       data_ptr + data_offset_ * g,
                                        filter_desc_,
-                                       wmat.dptr_ + weight_offset_ * g,
+                                       wmat_ptr + weight_offset_ * g,
                                        conv_desc_,
                                        algo_,
                                        workspace.dptr_,
                                        forward_workspace_byte_,
                                        &beta,
                                        out_desc_,
-                                       out.dptr_ + out_offset_ * g), CUDNN_STATUS_SUCCESS);
+                                       out_ptr + out_offset_ * g), CUDNN_STATUS_SUCCESS);
       if (!param_.no_bias) {
         beta = 1.0f;
         Tensor<gpu, 1, DType> bias = in_data[conv::kBias].get<gpu, 1, DType>(s);
-#if CUDNN_MAJOR >= 4
+        #if CUDNN_MAJOR >= 4
         CHECK_EQ(cudnnAddTensor(s->dnn_handle_,
-                                &alpha,
-                                bias_desc_,
-                                bias.dptr_ + bias_offset_ * g,
-                                &beta,
-                                out_desc_,
-                                out.dptr_ + out_offset_ * g), CUDNN_STATUS_SUCCESS);
-#endif
-#if CUDNN_MAJOR == 3
+                                  &alpha,
+                                  bias_desc_,
+                                  bias.dptr_ + bias_offset_ * g,
+                                  &beta,
+                                  out_desc_,
+                                  out_ptr + out_offset_ * g), CUDNN_STATUS_SUCCESS);
+        #endif
+        #if CUDNN_MAJOR == 3
         CHECK_EQ(cudnnAddTensor(s->dnn_handle_,
                                 CUDNN_ADD_SAME_C,
                                 &alpha,
@@ -115,7 +135,7 @@ class CuDNNConvolutionOp : public Operator {
                                 &beta,
                                 out_desc_,
-                                out.dptr_ + out_offset_ * g), CUDNN_STATUS_SUCCESS);
-#endif
+                                out_ptr + out_offset_ * g), CUDNN_STATUS_SUCCESS);
+        #endif
       }
     }
   }
@@ -130,14 +150,37 @@ class CuDNNConvolutionOp : public Operator {
     using namespace mshadow;
     using namespace mshadow::expr;
     size_t expected = param_.no_bias == 0 ? 3 : 2;
+    DType *grad_ptr = NULL;
+    DType *wmat_ptr = NULL;
+    DType *gwmat_ptr = NULL;
+    DType *data_ptr = NULL;
+    DType *gdata_ptr = NULL;
     CHECK_EQ(out_grad.size(), 1);
     CHECK(in_data.size() == expected && in_grad.size() == expected);
     Stream<gpu> *s = ctx.get_stream<gpu>();
-    Tensor<gpu, 4, DType> grad = out_grad[conv::kOut].get<gpu, 4, DType>(s);
-    Tensor<gpu, 4, DType> wmat = in_data[conv::kWeight].get<gpu, 4, DType>(s);
-    Tensor<gpu, 4, DType> gwmat = in_grad[conv::kWeight].get<gpu, 4, DType>(s);
-    Tensor<gpu, 4, DType> data = in_data[conv::kData].get<gpu, 4, DType>(s);
-    Tensor<gpu, 4, DType> gdata = in_grad[conv::kData].get<gpu, 4, DType>(s);
+    if (param_.kernel.ndim() == 2) {
+      Tensor<gpu, 4, DType> grad = out_grad[conv::kOut].get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> wmat = in_data[conv::kWeight].get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> gwmat = in_grad[conv::kWeight].get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> data = in_data[conv::kData].get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> gdata = in_grad[conv::kData].get<gpu, 4, DType>(s);
+      grad_ptr = grad.dptr_;
+      wmat_ptr = wmat.dptr_;
+      gwmat_ptr = gwmat.dptr_;
+      data_ptr = data.dptr_;
+      gdata_ptr = gdata.dptr_;
+    } else {
+      Tensor<gpu, 5, DType> grad = out_grad[conv::kOut].get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> wmat = in_data[conv::kWeight].get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> gwmat = in_grad[conv::kWeight].get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> data = in_data[conv::kData].get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> gdata = in_grad[conv::kData].get<gpu, 5, DType>(s);
+      grad_ptr = grad.dptr_;
+      wmat_ptr = wmat.dptr_;
+      gwmat_ptr = gwmat.dptr_;
+      data_ptr = data.dptr_;
+      gdata_ptr = gdata.dptr_;
+    }
     Tensor<gpu, 1, DType> workspace =
       ctx.requested[conv::kTempSpace].get_space_typed<gpu, 1, DType>(
       mshadow::Shape1(backward_workspace_), s);
@@ -150,7 +193,7 @@ class CuDNNConvolutionOp : public Operator {
         CHECK_EQ(cudnnConvolutionBackwardBias(s->dnn_handle_,
                                               &alpha,
                                               out_desc_,
-                                              grad.dptr_ + out_offset_ * g,
+                                              grad_ptr + out_offset_ * g,
                                               req[conv::kBias] == kWriteTo ? &beta : &beta_add,
                                               bias_desc_,
                                               gbias.dptr_ + bias_offset_ * g),
@@ -160,59 +203,60 @@ class CuDNNConvolutionOp : public Operator {
       CHECK_EQ(cudnnConvolutionBackwardFilter_v3(s->dnn_handle_,
                &alpha,
                in_desc_,
-               data.dptr_ + data_offset_ * g,
+               data_ptr + data_offset_ * g,
                out_desc_,
-               grad.dptr_ + out_offset_ * g,
+               grad_ptr + out_offset_ * g,
                conv_desc_,
                back_algo_w_,
                workspace.dptr_,
                backward_workspace_byte_,
                req[conv::kWeight] == kWriteTo? &beta : &beta_add,
                filter_desc_,
-               gwmat.dptr_ + weight_offset_ * g), CUDNN_STATUS_SUCCESS);
+               gwmat_ptr + weight_offset_ * g), CUDNN_STATUS_SUCCESS);
       #elif CUDNN_MAJOR == 5
-      CHECK_EQ(cudnnConvolutionBackwardFilter(s->dnn_handle_,
+      back_algo_w_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+      CUDNN_CALL(cudnnConvolutionBackwardFilter(s->dnn_handle_,
                &alpha,
                in_desc_,
-               data.dptr_ + data_offset_ * g,
+               data_ptr + data_offset_ * g,
                out_desc_,
-               grad.dptr_ + out_offset_ * g,
+               grad_ptr + out_offset_ * g,
                conv_desc_,
                back_algo_w_,
                workspace.dptr_,
                backward_workspace_byte_,
                req[conv::kWeight] == kWriteTo? &beta : &beta_add,
                filter_desc_,
-               gwmat.dptr_ + weight_offset_ * g), CUDNN_STATUS_SUCCESS);
+               gwmat_ptr + weight_offset_ * g));
       #endif
       #if CUDNN_MAJOR <= 4
       CHECK_EQ(cudnnConvolutionBackwardData_v3(s->dnn_handle_,
                &alpha,
                filter_desc_,
-               wmat.dptr_ + weight_offset_ * g,
+               wmat_ptr + weight_offset_ * g,
                out_desc_,
-               grad.dptr_ + out_offset_ * g,
+               grad_ptr + out_offset_ * g,
                conv_desc_,
                back_algo_,
                workspace.dptr_,
                backward_workspace_byte_,
                &beta,
                in_desc_,
-               gdata.dptr_ + data_offset_ * g), CUDNN_STATUS_SUCCESS);
+               gdata_ptr + data_offset_ * g), CUDNN_STATUS_SUCCESS);
       #elif CUDNN_MAJOR == 5
       CHECK_EQ(cudnnConvolutionBackwardData(s->dnn_handle_,
                &alpha,
                filter_desc_,
-               wmat.dptr_ + weight_offset_ * g,
+               wmat_ptr + weight_offset_ * g,
                out_desc_,
-               grad.dptr_ + out_offset_ * g,
+               grad_ptr + out_offset_ * g,
                conv_desc_,
                back_algo_,
                workspace.dptr_,
                backward_workspace_byte_,
                &beta,
                in_desc_,
-               gdata.dptr_ + data_offset_ * g), CUDNN_STATUS_SUCCESS);
+               gdata_ptr + data_offset_ * g), CUDNN_STATUS_SUCCESS);
       #endif
     }
   }
@@ -233,71 +277,158 @@ class CuDNNConvolutionOp : public Operator {
       size_t workspace_byte = static_cast<size_t>(param_.workspace * sizeof(DType));
       size_t back_size = 0;
       size_t back_size_w = 0;
-      Tensor<gpu, 4, DType> data = in_data[conv::kData].get<gpu, 4, DType>(s);
-      Tensor<gpu, 4, DType> out = out_data[conv::kOut].get<gpu, 4, DType>(s);
-      data_offset_ = data.shape_[1] / param_.num_group * data.shape_[2] * data.shape_[3];
-      out_offset_ = out.shape_[1] /param_.num_group * out.shape_[2] * out.shape_[3];
-      weight_offset_ = param_.num_filter / param_.num_group * data.shape_[1] / param_.num_group
-                       * param_.kernel[0] * param_.kernel[1];
       CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnCreateTensorDescriptor(&bias_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnCreateFilterDescriptor(&filter_desc_), CUDNN_STATUS_SUCCESS);
       CHECK_EQ(cudnnCreateConvolutionDescriptor(&conv_desc_), CUDNN_STATUS_SUCCESS);
-      #if CUDNN_MAJOR == 5
-      CHECK_EQ(cudnnSetFilter4dDescriptor(filter_desc_,
-                                          dtype_,
-                                          format_,
-                                          param_.num_filter / param_.num_group,
-                                          data.shape_[1] / param_.num_group,
-                                          param_.kernel[0],
-                                          param_.kernel[1]), CUDNN_STATUS_SUCCESS);
-      #else
-      CHECK_EQ(cudnnSetFilter4dDescriptor(filter_desc_,
-                                          dtype_,
-                                          param_.num_filter / param_.num_group,
-                                          data.shape_[1] / param_.num_group,
-                                          param_.kernel[0],
-                                          param_.kernel[1]), CUDNN_STATUS_SUCCESS);
-      #endif
-      CHECK_EQ(cudnnSetConvolution2dDescriptor(conv_desc_,
-                                               param_.pad[0],
-                                               param_.pad[1],
-                                               param_.stride[0],
-                                               param_.stride[1],
-                                               1,
-                                               1,
-                                               CUDNN_CROSS_CORRELATION), CUDNN_STATUS_SUCCESS);
-      CHECK_EQ(cudnnSetTensor4dDescriptorEx(in_desc_,
+      if (param_.kernel.ndim() == 2) {
+        // 2d conv
+        Tensor<gpu, 4, DType> data = in_data[conv::kData].get<gpu, 4, DType>(s);
+        Tensor<gpu, 4, DType> out = out_data[conv::kOut].get<gpu, 4, DType>(s);
+        data_offset_ = data.shape_[1] / param_.num_group * data.shape_[2] * data.shape_[3];
+        out_offset_ = out.shape_[1] /param_.num_group * out.shape_[2] * out.shape_[3];
+        weight_offset_ = param_.num_filter / param_.num_group * data.shape_[1] / param_.num_group
+                        * param_.kernel[0] * param_.kernel[1];
+        #if CUDNN_MAJOR == 5
+        CHECK_EQ(cudnnSetFilter4dDescriptor(filter_desc_,
                                             dtype_,
-                                            data.shape_[0],
+                                            format_,
+                                            param_.num_filter / param_.num_group,
                                             data.shape_[1] / param_.num_group,
-                                            data.shape_[2],
-                                            data.shape_[3],
-                                            data.shape_[1] * data.shape_[2] * data.shape_[3],
-                                            data.shape_[2] * data.shape_[3],
-                                            data.shape_[3],
-                                            1), CUDNN_STATUS_SUCCESS);
-      CHECK_EQ(cudnnSetTensor4dDescriptorEx(out_desc_,
+                                            param_.kernel[0],
+                                            param_.kernel[1]), CUDNN_STATUS_SUCCESS);
+        #else
+        CHECK_EQ(cudnnSetFilter4dDescriptor(filter_desc_,
                                             dtype_,
-                                            out.shape_[0],
-                                            out.shape_[1] / param_.num_group,
-                                            out.shape_[2],
-                                            out.shape_[3],
-                                            out.shape_[1] * out.shape_[2] * out.shape_[3],
-                                            out.shape_[2] * out.shape_[3],
-                                            out.shape_[3],
-                                            1), CUDNN_STATUS_SUCCESS);
+                                            param_.num_filter / param_.num_group,
+                                            data.shape_[1] / param_.num_group,
+                                            param_.kernel[0],
+                                            param_.kernel[1]), CUDNN_STATUS_SUCCESS);
+        #endif
+        CHECK_EQ(cudnnSetConvolution2dDescriptor(conv_desc_,
+                                                param_.pad[0],
+                                                param_.pad[1],
+                                                param_.stride[0],
+                                                param_.stride[1],
+                                                1,
+                                                1,
+                                                CUDNN_CROSS_CORRELATION), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnSetTensor4dDescriptorEx(in_desc_,
+                                              dtype_,
+                                              data.shape_[0],
+                                              data.shape_[1] / param_.num_group,
+                                              data.shape_[2],
+                                              data.shape_[3],
+                                              data.shape_[1] * data.shape_[2] * data.shape_[3],
+                                              data.shape_[2] * data.shape_[3],
+                                              data.shape_[3],
+                                              1), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnSetTensor4dDescriptorEx(out_desc_,
+                                              dtype_,
+                                              out.shape_[0],
+                                              out.shape_[1] / param_.num_group,
+                                              out.shape_[2],
+                                              out.shape_[3],
+                                              out.shape_[1] * out.shape_[2] * out.shape_[3],
+                                              out.shape_[2] * out.shape_[3],
+                                              out.shape_[3],
+                                              1), CUDNN_STATUS_SUCCESS);
+      } else if (param_.kernel.ndim() == 3) {
+        // 3d conv
+        Tensor<gpu, 5, DType> data = in_data[conv::kData].get<gpu, 5, DType>(s);
+        Tensor<gpu, 5, DType> out = out_data[conv::kOut].get<gpu, 5, DType>(s);
+        data_offset_ = data.shape_[1] / param_.num_group * data.shape_[2] * \
+                                                           data.shape_[3] * \
+                                                           data.shape_[4];
+        out_offset_ = out.shape_[1] / param_.num_group * out.shape_[2] * \
+                                                         out.shape_[3] * \
+                                                         out.shape_[4];
+        weight_offset_ = param_.num_filter / param_.num_group * data.shape_[1] / param_.num_group
+                        * param_.kernel[0] * param_.kernel[1] * param_.kernel[2];
+        std::vector<int> filter_vec = {static_cast<int>(param_.num_filter / param_.num_group),
+                                       static_cast<int>(data.shape_[1] / param_.num_group),
+                                       static_cast<int>(param_.kernel[0]),
+                                       static_cast<int>(param_.kernel[1]),
+                                       static_cast<int>(param_.kernel[2])};
+
+        std::vector<int> pad_vec = {static_cast<int>(param_.pad[0]),
+                                    static_cast<int>(param_.pad[1]),
+                                    static_cast<int>(param_.pad[2])};
+
+        std::vector<int> stride_vec = {static_cast<int>(param_.stride[0]),
+                                       static_cast<int>(param_.stride[1]),
+                                       static_cast<int>(param_.stride[2])};
+
+        std::vector<int> upscale_vec = {1, 1, 1};
+
+        std::vector<int> ishape = {static_cast<int>(data.shape_[0]),
+                                   static_cast<int>(data.shape_[1]),
+                                   static_cast<int>(data.shape_[2]),
+                                   static_cast<int>(data.shape_[3]),
+                                   static_cast<int>(data.shape_[4])};
+
+        std::vector<int> istride = {static_cast<int>(ishape[1] * ishape[2] * ishape[3] * ishape[4]),
+                                    static_cast<int>(ishape[2] * ishape[3] * ishape[4]),
+                                    static_cast<int>(ishape[3] * ishape[4]),
+                                    static_cast<int>(ishape[4]),
+                                    1};
+
+        std::vector<int> oshape = {static_cast<int>(out.shape_[0]),
+                                   static_cast<int>(out.shape_[1]),
+                                   static_cast<int>(out.shape_[2]),
+                                   static_cast<int>(out.shape_[3]),
+                                   static_cast<int>(out.shape_[4])};
+
+        std::vector<int> ostride = {static_cast<int>(oshape[1] * oshape[2] * oshape[3] * oshape[4]),
+                                    static_cast<int>(oshape[2] * oshape[3] * oshape[4]),
+                                    static_cast<int>(oshape[3] * oshape[4]),
+                                    static_cast<int>(oshape[4]),
+                                    1};
+
+        #if CUDNN_MAJOR == 5
+        CHECK_EQ(cudnnSetFilterNdDescriptor(filter_desc_,
+                                            dtype_,
+                                            format_,
+                                            static_cast<int>(filter_vec.size()),
+                                            &filter_vec[0]), CUDNN_STATUS_SUCCESS);
+        #else
+        LOG(FATAL) << "Only support CUDNN V5 for 3D convolution";
+        #endif
+        CHECK_EQ(cudnnSetConvolutionNdDescriptor(conv_desc_,
+                                                 3,
+                                                 &pad_vec[0],
+                                                 &stride_vec[0],
+                                                 &upscale_vec[0],
+                                                 CUDNN_CROSS_CORRELATION,
+                                                 dtype_), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnSetTensorNdDescriptor(in_desc_,
+                                              dtype_,
+                                              static_cast<int>(ishape.size()),
+                                              &ishape[0],
+                                              &istride[0]), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnSetTensorNdDescriptor(out_desc_,
+                                              dtype_,
+                                              static_cast<int>(oshape.size()),
+                                              &oshape[0],
+                                              &ostride[0]), CUDNN_STATUS_SUCCESS);
+      }
       if (!param_.no_bias) {
         Tensor<gpu, 1, DType> bias = in_data[conv::kBias].get<gpu, 1, DType>(s);
         bias_offset_ = bias.shape_[0] / param_.num_group;
-        CHECK_EQ(cudnnSetTensor4dDescriptor(bias_desc_,
-                                            CUDNN_TENSOR_NCHW,
+        std::vector<int> bias_shape = {1,
+                                       static_cast<int>(bias.shape_[0] / param_.num_group),
+                                       1, 1};
+        std::vector<int> bias_stride = {static_cast<int>(bias_offset_), 1, 1, 1};
+        if (param_.kernel.ndim() == 3) {
+          bias_shape.push_back(1);
+          bias_stride.push_back(1);
+        }
+        CHECK_EQ(cudnnSetTensorNdDescriptor(bias_desc_,
                                             dtype_,
-                                            1,
-                                            bias.shape_[0] / param_.num_group,
-                                            1,
-                                            1), CUDNN_STATUS_SUCCESS);
+                                            static_cast<int>(bias_shape.size()),
+                                            &bias_shape[0],
+                                            &bias_stride[0]), CUDNN_STATUS_SUCCESS);
       }
 
       if (!param_.cudnn_tune) {
@@ -351,6 +482,11 @@ class CuDNNConvolutionOp : public Operator {
       }
       forward_workspace_ = forward_workspace_byte_ / sizeof(DType) + 1;
       backward_workspace_ = backward_workspace_byte_ / sizeof(DType) + 1;
+      // ugly workaround for CuDNN algorithm selection;
+      // safe to remove once CuDNN fixes its 3D conv algorithm selection
+      // if (param_.kernel.ndim() == 3) {
+      //   back_algo_w_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+      // }
     }
   }
 
diff --git a/src/operator/cudnn_convolution.cc b/src/operator/cudnn_convolution.cc
index 1e72afe6af96..9c7efb7ce9ff 100644
--- a/src/operator/cudnn_convolution.cc
+++ b/src/operator/cudnn_convolution.cc
@@ -11,6 +11,7 @@
 namespace mxnet {
 namespace op {
 #if MXNET_USE_CUDNN == 1
+// TODO(xxx): Refactor with Init CuDNN function, remove redundant code in initialization
 void TuneCudnnConvolution(ConvolutionParam param,
                           std::vector<TShape> *in_shape,
                           std::vector<TShape> *out_shape,
@@ -46,13 +47,45 @@ void TuneCudnnConvolution(ConvolutionParam param,
   CHECK_EQ(cudnnCreateFilterDescriptor(&filter_desc), CUDNN_STATUS_SUCCESS);
   CHECK_EQ(cudnnCreateConvolutionDescriptor(&conv_desc), CUDNN_STATUS_SUCCESS);
 #if CUDNN_MAJOR == 5
-  CHECK_EQ(cudnnSetFilter4dDescriptor(filter_desc,
-                                      dtype,
-                                      format,
-                                      param.num_filter / param.num_group,
-                                      x_shape[1] / param.num_group,
-                                      param.kernel[0],
-                                      param.kernel[1]), CUDNN_STATUS_SUCCESS);
+  if (in_shape->at(0).ndim() == 4) {
+    // 2d conv
+    CHECK_EQ(cudnnSetFilter4dDescriptor(filter_desc,
+                                        dtype,
+                                        format,
+                                        param.num_filter / param.num_group,
+                                        x_shape[1] / param.num_group,
+                                        param.kernel[0],
+                                        param.kernel[1]), CUDNN_STATUS_SUCCESS);
+  } else {
+    // 3d conv, only supported with cuDNN v5
+    std::vector<int> filter_vec = {static_cast<int>(param.num_filter / param.num_group),
+                                   static_cast<int>(x_shape[1] / param.num_group),
+                                   static_cast<int>(param.kernel[0]),
+                                   static_cast<int>(param.kernel[1]),
+                                   static_cast<int>(param.kernel[2])};
+
+    std::vector<int> pad_vec = {static_cast<int>(param.pad[0]),
+                                static_cast<int>(param.pad[1]),
+                                static_cast<int>(param.pad[2])};
+
+    std::vector<int> stride_vec = {static_cast<int>(param.stride[0]),
+                                   static_cast<int>(param.stride[1]),
+                                   static_cast<int>(param.stride[2])};
+
+    std::vector<int> upscale_vec = {1, 1, 1};
+    CHECK_EQ(cudnnSetConvolutionNdDescriptor(conv_desc,
+                                             3,
+                                             &pad_vec[0],
+                                             &stride_vec[0],
+                                             &upscale_vec[0],
+                                             CUDNN_CROSS_CORRELATION,
+                                             dtype), CUDNN_STATUS_SUCCESS);
+    CHECK_EQ(cudnnSetFilterNdDescriptor(filter_desc,
+                                        dtype,
+                                        format,
+                                        static_cast<int>(filter_vec.size()),
+                                        &filter_vec[0]), CUDNN_STATUS_SUCCESS);
+  }
 #else
   CHECK_EQ(cudnnSetFilter4dDescriptor(filter_desc,
                                       dtype,
@@ -61,43 +94,93 @@ void TuneCudnnConvolution(ConvolutionParam param,
                                       param.kernel[0],
                                       param.kernel[1]), CUDNN_STATUS_SUCCESS);
 #endif
-  CHECK_EQ(cudnnSetConvolution2dDescriptor(conv_desc,
-                                           param.pad[0],
-                                           param.pad[1],
-                                           param.stride[0],
-                                           param.stride[1],
-                                           1,
-                                           1,
-                                           CUDNN_CROSS_CORRELATION), CUDNN_STATUS_SUCCESS);
-  CHECK_EQ(cudnnSetTensor4dDescriptorEx(in_desc,
-                                        dtype,
-                                        x_shape[0],
-                                        x_shape[1] / param.num_group,
-                                        x_shape[2],
-                                        x_shape[3],
-                                        x_shape[1] * x_shape[2] * x_shape[3],
-                                        x_shape[2] * x_shape[3],
-                                        x_shape[3],
-                                        1), CUDNN_STATUS_SUCCESS);
-  CHECK_EQ(cudnnSetTensor4dDescriptorEx(out_desc,
+  if (param.kernel.ndim() == 2) {
+    // 2d conv
+    CHECK_EQ(cudnnSetConvolution2dDescriptor(conv_desc,
+                                             param.pad[0],
+                                             param.pad[1],
+                                             param.stride[0],
+                                             param.stride[1],
+                                             1,
+                                             1,
+                                             CUDNN_CROSS_CORRELATION), CUDNN_STATUS_SUCCESS);
+    CHECK_EQ(cudnnSetTensor4dDescriptorEx(in_desc,
+                                          dtype,
+                                          x_shape[0],
+                                          x_shape[1] / param.num_group,
+                                          x_shape[2],
+                                          x_shape[3],
+                                          x_shape[1] * x_shape[2] * x_shape[3],
+                                          x_shape[2] * x_shape[3],
+                                          x_shape[3],
+                                          1), CUDNN_STATUS_SUCCESS);
+    CHECK_EQ(cudnnSetTensor4dDescriptorEx(out_desc,
+                                          dtype,
+                                          y_shape[0],
+                                          y_shape[1] / param.num_group,
+                                          y_shape[2],
+                                          y_shape[3],
+                                          y_shape[1] * y_shape[2] * y_shape[3],
+                                          y_shape[2] * y_shape[3],
+                                          y_shape[3],
+                                          1), CUDNN_STATUS_SUCCESS);
+    if (!param.no_bias) {
+      TShape bias_shape = (*in_shape)[conv::kBias];
+      CHECK_EQ(cudnnSetTensor4dDescriptor(bias_desc,
+                                          CUDNN_TENSOR_NCHW,
+                                          dtype,
+                                          1,
+                                          bias_shape[0] / param.num_group,
+                                          1,
+                                          1), CUDNN_STATUS_SUCCESS);
+    }
+  } else {
+    // 3d conv
+    std::vector<int> ishape = {static_cast<int>(in_shape->at(conv::kData)[0]),
+                               static_cast<int>(in_shape->at(conv::kData)[1]),
+                               static_cast<int>(in_shape->at(conv::kData)[2]),
+                               static_cast<int>(in_shape->at(conv::kData)[3]),
+                               static_cast<int>(in_shape->at(conv::kData)[4])};
+
+    std::vector<int> istride = {static_cast<int>(ishape[1] * ishape[2] * ishape[3] * ishape[4]),
+                                static_cast<int>(ishape[2] * ishape[3] * ishape[4]),
+                                static_cast<int>(ishape[3] * ishape[4]),
+                                static_cast<int>(ishape[4]),
+                                1};
+
+    std::vector<int> oshape = {static_cast<int>(out_shape->at(conv::kOut)[0]),
+                               static_cast<int>(out_shape->at(conv::kOut)[1]),
+                               static_cast<int>(out_shape->at(conv::kOut)[2]),
+                               static_cast<int>(out_shape->at(conv::kOut)[3]),
+                               static_cast<int>(out_shape->at(conv::kOut)[4])};
+
+    std::vector<int> ostride = {static_cast<int>(oshape[1] * oshape[2] * oshape[3] * oshape[4]),
+                                static_cast<int>(oshape[2] * oshape[3] * oshape[4]),
+                                static_cast<int>(oshape[3] * oshape[4]),
+                                static_cast<int>(oshape[4]),
+                                1};
+    CHECK_EQ(cudnnSetTensorNdDescriptor(in_desc,
                                         dtype,
-                                        y_shape[0],
-                                        y_shape[1] / param.num_group,
-                                        y_shape[2],
-                                        y_shape[3],
-                                        y_shape[1] * y_shape[2] * y_shape[3],
-                                        y_shape[2] * y_shape[3],
-                                        y_shape[3],
-                                        1), CUDNN_STATUS_SUCCESS);
-  if (!param.no_bias) {
-    TShape bias_shape = (*in_shape)[conv::kBias];
-    CHECK_EQ(cudnnSetTensor4dDescriptor(bias_desc,
-                                        CUDNN_TENSOR_NCHW,
+                                        static_cast<int>(ishape.size()),
+                                        &ishape[0],
+                                        &istride[0]), CUDNN_STATUS_SUCCESS);
+    CHECK_EQ(cudnnSetTensorNdDescriptor(out_desc,
                                         dtype,
-                                        1,
-                                        bias_shape[0] / param.num_group,
-                                        1,
-                                        1), CUDNN_STATUS_SUCCESS);
+                                        static_cast<int>(oshape.size()),
+                                        &oshape[0],
+                                        &ostride[0]), CUDNN_STATUS_SUCCESS);
+    if (!param.no_bias) {
+      TShape bias_shape = (*in_shape)[conv::kBias];
+      index_t bias_offset = bias_shape[0] / param.num_group;
+      std::vector<int> bshape = {1, static_cast<int>(bias_shape[0] / param.num_group),
+                                     1, 1, 1};
+      std::vector<int> bias_stride = {static_cast<int>(bias_offset), 1, 1, 1, 1};
+      CHECK_EQ(cudnnSetTensorNdDescriptor(bias_desc,
+                                          dtype,
+                                          static_cast<int>(bshape.size()),
+                                          &bshape[0],
+                                          &bias_stride[0]), CUDNN_STATUS_SUCCESS);
+    }
   }
 
   Engine::VarHandle var = Engine::Get()->NewVariable();
diff --git a/src/operator/cudnn_pooling-inl.h b/src/operator/cudnn_pooling-inl.h
index c7fa214aa55a..e995a1b289b0 100644
--- a/src/operator/cudnn_pooling-inl.h
+++ b/src/operator/cudnn_pooling-inl.h
@@ -51,27 +51,46 @@ class CuDNNPoolingOp : public Operator {
     CHECK_EQ(in_data.size(), 1);
     CHECK_EQ(out_data.size(), 1);
     Stream<gpu> *s = ctx.get_stream<gpu>();
-    Tensor<gpu, 4> data = in_data[pool_enum::kData].get<gpu, 4, real_t>(s);
-    Tensor<gpu, 4> out = out_data[pool_enum::kOut].get<gpu, 4, real_t>(s);
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
-    if (!init_cudnn_) {
-      this->Init(s, in_data, out_data);
-    }
-    if (param_.global_pool) {
-      this->InitGlobalPool(data.shape_);
-    }
     float alpha = 1.0f;
     float beta = 0.0f;
-    CHECK_EQ(data.CheckContiguous(), true);
-    CHECK_EQ(out.CheckContiguous(), true);
-    CHECK_EQ(cudnnPoolingForward(s->dnn_handle_,
-                                 pooling_desc_,
-                                 &alpha,
-                                 in_desc_,
-                                 data.dptr_,
-                                 &beta,
-                                 out_desc_,
-                                 out.dptr_), CUDNN_STATUS_SUCCESS);
+    if (param_.kernel.ndim() == 2) {
+      // 2d pool
+      Tensor<gpu, 4> data = in_data[pool_enum::kData].get<gpu, 4, real_t>(s);
+      Tensor<gpu, 4> out = out_data[pool_enum::kOut].get<gpu, 4, real_t>(s);
+      if (!init_cudnn_) {
+        this->Init(s, in_data, out_data);
+      }
+      CHECK_EQ(data.CheckContiguous(), true);
+      CHECK_EQ(out.CheckContiguous(), true);
+      CHECK_EQ(cudnnPoolingForward(s->dnn_handle_,
+                                   pooling_desc_,
+                                   &alpha,
+                                   in_desc_,
+                                   data.dptr_,
+                                   &beta,
+                                   out_desc_,
+                                   out.dptr_), CUDNN_STATUS_SUCCESS);
+    } else if (param_.kernel.ndim() == 3) {
+      // 3d pool
+      Tensor<gpu, 5> data = in_data[pool_enum::kData].get<gpu, 5, real_t>(s);
+      Tensor<gpu, 5> out = out_data[pool_enum::kOut].get<gpu, 5, real_t>(s);
+      if (!init_cudnn_) {
+        this->Init(s, in_data, out_data);
+      }
+      CHECK_EQ(data.CheckContiguous(), true);
+      CHECK_EQ(out.CheckContiguous(), true);
+      CHECK_EQ(cudnnPoolingForward(s->dnn_handle_,
+                                   pooling_desc_,
+                                   &alpha,
+                                   in_desc_,
+                                   data.dptr_,
+                                   &beta,
+                                   out_desc_,
+                                   out.dptr_), CUDNN_STATUS_SUCCESS);
+    } else {
+      LOG(FATAL) << "Only support 2D or 3D pooling";
+    }
   }
 
   virtual void Backward(const OpContext &ctx,
@@ -90,14 +109,34 @@ class CuDNNPoolingOp : public Operator {
     CHECK_EQ(in_grad.size(), 1);
 
     Stream<gpu> *s = ctx.get_stream<gpu>();
-    Tensor<gpu, 4> m_out_grad = out_grad[pool_enum::kOut].get<gpu, 4, real_t>(s);
-    Tensor<gpu, 4> m_in_data = in_data[pool_enum::kData].get<gpu, 4, real_t>(s);
-    Tensor<gpu, 4> m_out_data = out_data[pool_enum::kOut].get<gpu, 4, real_t>(s);
-    Tensor<gpu, 4> m_in_grad = in_grad[pool_enum::kData].get<gpu, 4, real_t>(s);
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
     float alpha = 1.0f;
     float beta = 0.0f;
-    CHECK_EQ(cudnnPoolingBackward(s->dnn_handle_,
+    if (param_.kernel.ndim() == 2) {
+      // 2d pool
+      Tensor<gpu, 4> m_out_grad = out_grad[pool_enum::kOut].get<gpu, 4, real_t>(s);
+      Tensor<gpu, 4> m_in_data = in_data[pool_enum::kData].get<gpu, 4, real_t>(s);
+      Tensor<gpu, 4> m_out_data = out_data[pool_enum::kOut].get<gpu, 4, real_t>(s);
+      Tensor<gpu, 4> m_in_grad = in_grad[pool_enum::kData].get<gpu, 4, real_t>(s);
+      CHECK_EQ(cudnnPoolingBackward(s->dnn_handle_,
+                                  pooling_desc_,
+                                  &alpha,
+                                  out_desc_,
+                                  m_out_data.dptr_,
+                                  out_desc_,
+                                  m_out_grad.dptr_,
+                                  in_desc_,
+                                  m_in_data.dptr_,
+                                  &beta,
+                                  in_desc_,
+                                  m_in_grad.dptr_), CUDNN_STATUS_SUCCESS);
+    } else if (param_.kernel.ndim() == 3) {
+      // 3d pool
+      Tensor<gpu, 5> m_out_grad = out_grad[pool_enum::kOut].get<gpu, 5, real_t>(s);
+      Tensor<gpu, 5> m_in_data = in_data[pool_enum::kData].get<gpu, 5, real_t>(s);
+      Tensor<gpu, 5> m_out_data = out_data[pool_enum::kOut].get<gpu, 5, real_t>(s);
+      Tensor<gpu, 5> m_in_grad = in_grad[pool_enum::kData].get<gpu, 5, real_t>(s);
+      CHECK_EQ(cudnnPoolingBackward(s->dnn_handle_,
                                   pooling_desc_,
                                   &alpha,
                                   out_desc_,
@@ -109,34 +148,12 @@ class CuDNNPoolingOp : public Operator {
                                   &beta,
                                   in_desc_,
                                   m_in_grad.dptr_), CUDNN_STATUS_SUCCESS);
+    } else {
+      LOG(FATAL) << "Only support 2D or 3D pooling";
+    }
   }
 
  private:
-  inline void InitGlobalPool(const mshadow::Shape<4> &dshape) {
-    #if CUDNN_MAJOR == 5
-      CHECK_EQ(cudnnSetPooling2dDescriptor(pooling_desc_,
-                                           mode_,
-                                           nan_prop_,
-                                           param_.global_pool ? dshape[2] : param_.kernel[0],
-                                           param_.global_pool ? dshape[3] : param_.kernel[1],
-                                           param_.pad[0],
-                                           param_.pad[1],
-                                           param_.global_pool ? 1 : param_.stride[0],
-                                           param_.global_pool ? 1 :param_.stride[1]),
-               CUDNN_STATUS_SUCCESS);
-      #else
-      CHECK_EQ(cudnnSetPooling2dDescriptor(pooling_desc_,
-                                           mode_,
-                                           param_.global_pool ? dshape[2] : param_.kernel[0],
-                                           param_.global_pool ? dshape[3] : param_.kernel[1],
-                                           param_.pad[0],
-                                           param_.pad[1],
-                                           param_.global_pool ? 1 : param_.stride[0],
-                                           param_.global_pool ? 1 : param_.stride[1]),
-               CUDNN_STATUS_SUCCESS);
-      #endif
-  }
-
   inline void Init(mshadow::Stream<gpu> *s,
                    const std::vector<TBlob> &in_data,
                    const std::vector<TBlob> &out_data) {
@@ -148,45 +165,117 @@ class CuDNNPoolingOp : public Operator {
     CHECK_EQ(out_data.size(), 1);
     if (!init_cudnn_) {
       init_cudnn_ = true;
-      Tensor<gpu, 4> data = in_data[pool_enum::kData].get<gpu, 4, real_t>(s);
-      Tensor<gpu, 4> out = out_data[pool_enum::kOut].get<gpu, 4, real_t>(s);
-      CHECK_EQ(cudnnCreatePoolingDescriptor(&pooling_desc_), CUDNN_STATUS_SUCCESS);
-      CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS);
-      CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS);
-      CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_,
-                                          CUDNN_TENSOR_NCHW,
-                                          dtype_,
-                                          data.shape_[0],
-                                          data.shape_[1],
-                                          data.shape_[2],
-                                          data.shape_[3]), CUDNN_STATUS_SUCCESS);
-      CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_,
-                                          CUDNN_TENSOR_NCHW,
-                                          dtype_,
-                                          out.shape_[0],
-                                          out.shape_[1],
-                                          out.shape_[2],
-                                          out.shape_[3]), CUDNN_STATUS_SUCCESS);
-      #if CUDNN_MAJOR == 5
-      CHECK_EQ(cudnnSetPooling2dDescriptor(pooling_desc_,
-                                           mode_,
-                                           nan_prop_,
-                                           param_.kernel[0],
-                                           param_.kernel[1],
-                                           param_.pad[0],
-                                           param_.pad[1],
-                                           param_.stride[0],
-                                           param_.stride[1]), CUDNN_STATUS_SUCCESS);
-      #else
-      CHECK_EQ(cudnnSetPooling2dDescriptor(pooling_desc_,
-                                           mode_,
-                                           param_.kernel[0],
-                                           param_.kernel[1],
-                                           param_.pad[0],
-                                           param_.pad[1],
-                                           param_.stride[0],
-                                           param_.stride[1]), CUDNN_STATUS_SUCCESS);
-      #endif
+      if (param_.kernel.ndim() == 2) {
+        // 2d pool
+        Tensor<gpu, 4> data = in_data[pool_enum::kData].get<gpu, 4, real_t>(s);
+        Tensor<gpu, 4> out = out_data[pool_enum::kOut].get<gpu, 4, real_t>(s);
+        mshadow::Shape<4> dshape = data.shape_;
+        CHECK_EQ(cudnnCreatePoolingDescriptor(&pooling_desc_), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_,
+                                            CUDNN_TENSOR_NCHW,
+                                            dtype_,
+                                            data.shape_[0],
+                                            data.shape_[1],
+                                            data.shape_[2],
+                                            data.shape_[3]), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_,
+                                            CUDNN_TENSOR_NCHW,
+                                            dtype_,
+                                            out.shape_[0],
+                                            out.shape_[1],
+                                            out.shape_[2],
+                                            out.shape_[3]), CUDNN_STATUS_SUCCESS);
+        #if CUDNN_MAJOR == 5
+          CHECK_EQ(cudnnSetPooling2dDescriptor(pooling_desc_,
+                                               mode_,
+                                               nan_prop_,
+                                               param_.global_pool ? dshape[2] : param_.kernel[0],
+                                               param_.global_pool ? dshape[3] : param_.kernel[1],
+                                               param_.pad[0],
+                                               param_.pad[1],
+                                               param_.global_pool ? 1 : param_.stride[0],
+                                               param_.global_pool ? 1 :param_.stride[1]),
+                                               CUDNN_STATUS_SUCCESS);
+        #else
+          CHECK_EQ(cudnnSetPooling2dDescriptor(pooling_desc_,
+                                               mode_,
+                                               param_.global_pool ? dshape[2] : param_.kernel[0],
+                                               param_.global_pool ? dshape[3] : param_.kernel[1],
+                                               param_.pad[0],
+                                               param_.pad[1],
+                                               param_.global_pool ? 1 : param_.stride[0],
+                                               param_.global_pool ? 1 : param_.stride[1]),
+                                               CUDNN_STATUS_SUCCESS);
+        #endif
+      } else {
+        Tensor<gpu, 5> data = in_data[pool_enum::kData].get<gpu, 5, real_t>(s);
+        Tensor<gpu, 5> out = out_data[pool_enum::kOut].get<gpu, 5, real_t>(s);
+        CHECK_EQ(cudnnCreatePoolingDescriptor(&pooling_desc_), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS);
+        std::vector<int> ishape = {static_cast<int>(data.shape_[0]),
+                                   static_cast<int>(data.shape_[1]),
+                                   static_cast<int>(data.shape_[2]),
+                                   static_cast<int>(data.shape_[3]),
+                                   static_cast<int>(data.shape_[4])};
+
+        std::vector<int> istride = {static_cast<int>(ishape[1] * ishape[2] * ishape[3] * ishape[4]),
+                                    static_cast<int>(ishape[2] * ishape[3] * ishape[4]),
+                                    static_cast<int>(ishape[3] * ishape[4]),
+                                    static_cast<int>(ishape[4]),
+                                    1};
+
+        std::vector<int> oshape = {static_cast<int>(out.shape_[0]),
+                                   static_cast<int>(out.shape_[1]),
+                                   static_cast<int>(out.shape_[2]),
+                                   static_cast<int>(out.shape_[3]),
+                                   static_cast<int>(out.shape_[4])};
+
+        std::vector<int> ostride = {static_cast<int>(oshape[1] * oshape[2] * oshape[3] * oshape[4]),
+                                    static_cast<int>(oshape[2] * oshape[3] * oshape[4]),
+                                    static_cast<int>(oshape[3] * oshape[4]),
+                                    static_cast<int>(oshape[4]),
+                                    1};
+
+        std::vector<int> kernel_vec = {param_.global_pool ? ishape[2] :
+                                                            static_cast<int>(param_.kernel[0]),
+                                       param_.global_pool ? ishape[3] :
+                                                            static_cast<int>(param_.kernel[1]),
+                                       param_.global_pool ? ishape[4] :
+                                                            static_cast<int>(param_.kernel[2])};
+
+        std::vector<int> pad_vec = {param_.global_pool ? 0 : static_cast<int>(param_.pad[0]),
+                                    param_.global_pool ? 0 : static_cast<int>(param_.pad[1]),
+                                    param_.global_pool ? 0 : static_cast<int>(param_.pad[2])};
+
+        std::vector<int> stride_vec = {param_.global_pool ? 1 : static_cast<int>(param_.stride[0]),
+                                       param_.global_pool ? 1 : static_cast<int>(param_.stride[1]),
+                                       param_.global_pool ? 1 : static_cast<int>(param_.stride[2])};
+
+        CHECK_EQ(cudnnSetTensorNdDescriptor(in_desc_,
+                                            dtype_,
+                                            static_cast<int>(ishape.size()),
+                                            &ishape[0],
+                                            &istride[0]), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnSetTensorNdDescriptor(out_desc_,
+                                            dtype_,
+                                            static_cast<int>(oshape.size()),
+                                            &oshape[0],
+                                            &ostride[0]), CUDNN_STATUS_SUCCESS);
+        #if CUDNN_MAJOR == 5
+        CHECK_EQ(cudnnSetPoolingNdDescriptor(pooling_desc_,
+                                             mode_,
+                                             nan_prop_,
+                                             static_cast<int>(kernel_vec.size()),
+                                             &(kernel_vec[0]),
+                                             &(pad_vec[0]),
+                                             &(stride_vec[0])), CUDNN_STATUS_SUCCESS);
+        #else
+        LOG(FATAL) << "3D pooling only support CUDNN v5 and abouve";
+        #endif
+      }
     }
   }
   bool init_cudnn_;
diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h
index ac6190d38c93..630e61852bee 100644
--- a/src/operator/pooling-inl.h
+++ b/src/operator/pooling-inl.h
@@ -39,8 +39,8 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
               "This is useful for input with different shape");
 
     DMLC_DECLARE_FIELD(kernel)
-    .set_expect_ndim(2).enforce_nonzero()
-    .describe("pooling kernel size: (y, x)");
+    .enforce_nonzero()
+    .describe("pooling kernel size: (y, x) or (d, y, x)");
 
     DMLC_DECLARE_FIELD(pool_type)
     .add_enum("max", pool_enum::kMaxPooling)
@@ -50,13 +50,12 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
 
     int stride_shape[] = {1, 1};
     DMLC_DECLARE_FIELD(stride).set_default(TShape(stride_shape, stride_shape + 2))
-    .set_expect_ndim(2).enforce_nonzero()
-    .describe("stride: for pooling (y, x)");
+    .enforce_nonzero()
+    .describe("stride: for pooling (y, x) or (d, y, x)");
 
     int pad_shape[] = {0, 0};
     DMLC_DECLARE_FIELD(pad).set_default(TShape(pad_shape, pad_shape + 2))
-    .set_expect_ndim(2)
-    .describe("pad for pooling: (y, x)");
+    .describe("pad for pooling: (y, x) or (d, y, x)");
   }
 };
 
@@ -77,6 +76,9 @@ class PoolingOp : public Operator {
     CHECK_EQ(in_data.size(), 1);
     CHECK_EQ(out_data.size(), 1);
     Stream<xpu> *s = ctx.get_stream<xpu>();
+    if (param_.kernel.ndim() == 3) {
+      LOG(FATAL) << "Not implmented";
+    }
     Tensor<xpu, 4> data = in_data[pool_enum::kData].get<xpu, 4, real_t>(s);
     Tensor<xpu, 4> out = out_data[pool_enum::kOut].get<xpu, 4, real_t>(s);
     mshadow::Shape<2> out_shape = Shape2(out.shape_[2], out.shape_[3]);
@@ -119,6 +121,9 @@ class PoolingOp : public Operator {
     CHECK_EQ(req.size(), 1);
     CHECK_EQ(in_grad.size(), 1);
     // TODO(bing): remove pad (0,0)
+    if (param_.kernel.ndim() == 3) {
+      LOG(FATAL) << "Not implmented";
+    }
     Stream<xpu> *s = ctx.get_stream<xpu>();
     Tensor<xpu, 4> grad = out_grad[pool_enum::kOut].get<xpu, 4, real_t>(s);
     Tensor<xpu, 4> data = in_data[pool_enum::kData].get<xpu, 4, real_t>(s);
@@ -179,22 +184,37 @@ class PoolingProp : public OperatorProperty {
                   std::vector<TShape> *aux_shape) const override {
     CHECK_EQ(in_shape->size(), 1);
     const TShape &dshape = (*in_shape)[0];
-    CHECK_EQ(dshape.ndim(), 4) << \
-                               "Pooling: Input data should be 4D in (batch, channel, y, x)";
+    CHECK_GE(dshape.ndim(), 4) << "Pooling: Input data should be 4D in (batch, channel, y, x) "
+                               << "Or 5D in (batch, channel, d, y, x)";
     TShape oshape = dshape;
     if (dshape.ndim() ==  0) return false;
-    if (param_.global_pool) {
-      oshape[2] = 1;
-      oshape[3] = 1;
-    } else {
-      oshape[2] = std::min(dshape[2] + 2 * param_.pad[0] - param_.kernel[0] + param_.stride[0] - 1,
-                          dshape[2] + 2 * param_.pad[0] - 1) / param_.stride[0] + 1;
-      oshape[3] = std::min(dshape[3] + 2 * param_.pad[1] - param_.kernel[1] + param_.stride[1] - 1,
-                          dshape[3] + 2 * param_.pad[1] - 1) / param_.stride[1] + 1;
+    if (param_.kernel.ndim() == 2) {
+      CHECK_EQ(dshape.ndim(), 4) << "Pooling: Input data should be 4D in (batch, channel, y, x)";
+      if (param_.global_pool) {
+        oshape[2] = 1;
+        oshape[3] = 1;
+      } else {
+        oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / param_.stride[0];
+        oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / param_.stride[1];
+      }
+      CHECK(oshape[2] > 0 && oshape[3] > 0) << "Pooling: kernel size exceed input";
+      out_shape->clear();
+      out_shape->push_back(oshape);
+    } else if (param_.kernel.ndim() == 3) {
+      CHECK_EQ(dshape.ndim(), 5) << "Pooling: Input data should be 5D in (batch, channel, d, y, x)";
+      if (param_.global_pool) {
+        oshape[2] = 1;
+        oshape[3] = 1;
+        oshape[4] = 1;
+      } else {
+        oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / param_.stride[0];
+        oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / param_.stride[1];
+        oshape[4] = 1 + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) / param_.stride[2];
+      }
+      CHECK(oshape[2] > 0 && oshape[3] > 0 && oshape[4] > 0) << "Pooling: kernel size exceed input";
+      out_shape->clear();
+      out_shape->push_back(oshape);
     }
-    CHECK(oshape[2] > 0 && oshape[3] > 0) << "Pooling: kernel size exceed input";
-    out_shape->clear();
-    out_shape->push_back(oshape);
     return true;
   }
 
diff --git a/tests/python/common/get_data.py b/tests/python/common/get_data.py
index 65e8ac59ad6f..db7165e2903c 100644
--- a/tests/python/common/get_data.py
+++ b/tests/python/common/get_data.py
@@ -18,7 +18,7 @@ def GetMNIST_ubyte():
        (not os.path.exists('data/train-labels-idx1-ubyte')) or \
        (not os.path.exists('data/t10k-images-idx3-ubyte')) or \
        (not os.path.exists('data/t10k-labels-idx1-ubyte')):
-        os.system("wget http://webdocs.cs.ualberta.ca/~bx3/data/mnist.zip -P data/")
+        os.system("wget http://data.dmlc.ml/mxnet/data/mnist.zip -P data/")
         os.chdir("./data")
         os.system("unzip -u mnist.zip")
         os.chdir("..")
@@ -28,7 +28,7 @@ def GetCifar10():
     if not os.path.isdir("data/"):
         os.system("mkdir data/")
     if not os.path.exists('data/cifar10.zip'):
-        os.system("wget http://webdocs.cs.ualberta.ca/~bx3/data/cifar10.zip -P data/")
+        os.system("wget http://data.dmlc.ml/mxnet/data/cifar10.zip -P data/")
         os.chdir("./data")
         os.system("unzip -u cifar10.zip")
         os.chdir("..")
diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh
index aaf908d92cc8..edbc2f99ebaa 100755
--- a/tests/travis/run_test.sh
+++ b/tests/travis/run_test.sh
@@ -74,7 +74,7 @@ if [ ${TASK} == "r_test" ]; then
 
     Rscript tests/travis/r_vignettes.R
 
-    wget http://webdocs.cs.ualberta.ca/~bx3/data/Inception.zip
+    wget http://data.dmlc.ml/mxnet/data/Inception.zip
     unzip Inception.zip && rm -rf Inception.zip
     wget https://s3-us-west-2.amazonaws.com/mxnet/train.csv -O train.csv
     wget https://s3-us-west-2.amazonaws.com/mxnet/test.csv -O test.csv
diff --git a/tests/travis/setup.sh b/tests/travis/setup.sh
index 9e7fa00b7490..8e9e581fe66f 100755
--- a/tests/travis/setup.sh
+++ b/tests/travis/setup.sh
@@ -8,7 +8,6 @@ fi
 if [ ${TRAVIS_OS_NAME} == "osx" ]; then
     brew update
     brew tap homebrew/science
-    brew info opencv
     brew install opencv
     brew install python3
     brew install fftw

From 7acf93b6ea8921268175e7f70e8a53e1da0b8056 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sun, 10 Jul 2016 10:40:14 -0700
Subject: [PATCH 093/126] update mshadow to fix the docsystem (#2665)

---
 mshadow | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mshadow b/mshadow
index 0186f06e3c1f..46ead0cf1d51 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit 0186f06e3c1ffd0777775fedd670d82052317674
+Subproject commit 46ead0cf1d5173f1bc595db4ca21dcfbdac6baf9

From f1c6efe0533db89bb715f0de13d3be0efd4a843f Mon Sep 17 00:00:00 2001
From: lightingghost <zhenp3ngzhou@gmail.com>
Date: Sun, 10 Jul 2016 17:42:26 -0700
Subject: [PATCH 094/126] fixed python3 compatibility (#2668)

---
 python/mxnet/module/bucketing_module.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/mxnet/module/bucketing_module.py b/python/mxnet/module/bucketing_module.py
index 0d040dea4881..94f47948415e 100644
--- a/python/mxnet/module/bucketing_module.py
+++ b/python/mxnet/module/bucketing_module.py
@@ -199,7 +199,7 @@ def switch_bucket(self, bucket_key, data_shapes, label_shapes=None):
             Typically `data_batch.provide_label`.
         """
         assert self.binded, 'call bind before switching bucket'
-        if not self._buckets.has_key(bucket_key):
+        if not bucket_key in self._buckets:
             symbol, data_names, label_names = self._sym_gen(bucket_key)
             module = Module(symbol, data_names, label_names,
                             logger=self.logger, context=self._context,
@@ -236,7 +236,7 @@ def init_optimizer(self, kvstore='local', optimizer='sgd',
 
         self._curr_module.init_optimizer(kvstore, optimizer, optimizer_params,
                                          force_init=force_init)
-        for mod in self._buckets.itervalues():
+        for mod in self._buckets.values():
             if mod is not self._curr_module:
                 mod.borrow_optimizer(self._curr_module)
 

From b7df20d6bea07a191552c89e4b14021eeedd5504 Mon Sep 17 00:00:00 2001
From: Eric Junyuan Xie <piiswrong@users.noreply.github.com>
Date: Mon, 11 Jul 2016 22:13:13 -0700
Subject: [PATCH 095/126] fixes (#2667)

* mem allocator remove 4GB limit

* fix monitor for bulk execution

* add resume to optimizer
---
 python/mxnet/optimizer.py            |  7 ++++---
 src/storage/cpu_device_storage.h     | 10 +++++-----
 src/storage/gpu_device_storage.h     |  5 ++++-
 src/storage/pooled_storage_manager.h | 28 +++++++++++++++++-----------
 src/storage/storage.cc               |  3 +--
 src/symbol/graph_executor.cc         | 13 +++++++------
 6 files changed, 38 insertions(+), 28 deletions(-)

diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index fecd838a775b..94a84232f81f 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -90,7 +90,7 @@ def _init_cc_optimizer(name, param_keys, param_vals):
 
     def __init__(self, rescale_grad=1., param_idx2name=None, wd=0.,
                  clip_gradient=None, learning_rate=0.01,
-                 lr_scheduler=None, sym=None):
+                 lr_scheduler=None, sym=None, begin_num_update=0):
         self.rescale_grad = rescale_grad
         self.lr = learning_rate
         self.lr_scheduler = lr_scheduler
@@ -100,7 +100,8 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0.,
         self.wd = wd
         self.lr_mult = {}
         self.wd_mult = {}
-        self.num_update = 0
+        self.begin_num_update = begin_num_update
+        self.num_update = begin_num_update
         self._index_update_count = {}
         self.clip_gradient = clip_gradient
 
@@ -176,7 +177,7 @@ def _update_count(self, index):
             The index will be updated
         """
         if index not in self._index_update_count:
-            self._index_update_count[index] = 0
+            self._index_update_count[index] = self.begin_num_update
         self._index_update_count[index] += 1
         self.num_update = max(self._index_update_count[index], self.num_update)
 
diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h
index 6838af037535..2afb658bb9c6 100644
--- a/src/storage/cpu_device_storage.h
+++ b/src/storage/cpu_device_storage.h
@@ -8,6 +8,7 @@
 
 #include <dmlc/logging.h>
 #include <cstdlib>
+#include <new>
 #include "mxnet/base.h"
 
 namespace mxnet {
@@ -38,16 +39,15 @@ class CPUDeviceStorage {
 };  // class CPUDeviceStorage
 
 inline void* CPUDeviceStorage::Alloc(size_t size) {
-#if _MSC_VER
   void* ptr;
+#if _MSC_VER
   ptr = _aligned_malloc(size, alignment_);
-  return CHECK_NOTNULL(ptr);
+  if (ptr == NULL) throw std::bad_alloc();
 #else
-  void* ptr;
   int ret = posix_memalign(&ptr, alignment_, size);
-  CHECK_EQ(ret, 0) << "Allocation failed";
-  return ptr;
+  if (ret != 0) throw std::bad_alloc();
 #endif
+  return ptr;
 }
 
 inline void CPUDeviceStorage::Free(void* ptr) {
diff --git a/src/storage/gpu_device_storage.h b/src/storage/gpu_device_storage.h
index f92918ac7dc9..10684905a861 100644
--- a/src/storage/gpu_device_storage.h
+++ b/src/storage/gpu_device_storage.h
@@ -11,6 +11,7 @@
 #if MXNET_USE_CUDA
 #include <cuda_runtime.h>
 #endif  // MXNET_USE_CUDA
+#include <new>
 
 namespace mxnet {
 namespace storage {
@@ -36,7 +37,9 @@ class GPUDeviceStorage {
 inline void* GPUDeviceStorage::Alloc(size_t size) {
   void* ret = nullptr;
 #if MXNET_USE_CUDA
-  CUDA_CALL(cudaMalloc(&ret, size));
+  cudaError_t e = cudaMalloc(&ret, size);
+  if (e != cudaSuccess && e != cudaErrorCudartUnloading)
+    throw std::bad_alloc();
 #else   // MXNET_USE_CUDA
   LOG(FATAL) << "Please compile with CUDA enabled";
 #endif  // MXNET_USE_CUDA
diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h
index 7d3c0dcb7802..6e613f81cdb4 100644
--- a/src/storage/pooled_storage_manager.h
+++ b/src/storage/pooled_storage_manager.h
@@ -10,6 +10,7 @@
 #include <unordered_map>
 #include <vector>
 #include <mutex>
+#include <new>
 #include "./storage_manager.h"
 
 namespace mxnet {
@@ -18,7 +19,7 @@ namespace storage {
 /*!
  * \brief Storage manager with a memory pool.
  */
-template <class DeviceStorage, size_t kThreshold>
+template <class DeviceStorage>
 class PooledStorageManager final : public StorageManager {
  public:
   /*!
@@ -45,16 +46,21 @@ class PooledStorageManager final : public StorageManager {
   DISALLOW_COPY_AND_ASSIGN(PooledStorageManager);
 };  // class PooledStorageManager
 
-template <class DeviceStorage, size_t kThreshold>
-void* PooledStorageManager<DeviceStorage, kThreshold>::Alloc(size_t size) {
+template <class DeviceStorage>
+void* PooledStorageManager<DeviceStorage>::Alloc(size_t size) {
   std::lock_guard<std::mutex> lock(mutex_);
   auto&& reuse_it = memory_pool_.find(size);
   if (reuse_it == memory_pool_.end() || reuse_it->second.size() == 0) {
-    if (kThreshold <= used_memory_) {
-      ReleaseAll();
-    }
     used_memory_ += size;
-    return DeviceStorage::Alloc(size);
+    for (int i = 0; i < 2; ++i) {
+      try {
+        return DeviceStorage::Alloc(size);
+      } catch (const std::bad_alloc& e) {
+        ReleaseAll();
+      }
+    }
+    LOG(FATAL) << "Memory allocation failed.";
+    return NULL;
   } else {
     auto&& reuse_pool = reuse_it->second;
     auto ret = reuse_pool.back();
@@ -63,16 +69,16 @@ void* PooledStorageManager<DeviceStorage, kThreshold>::Alloc(size_t size) {
   }
 }
 
-template <class DeviceStorage, size_t kThreshold>
-void PooledStorageManager<DeviceStorage, kThreshold>::Free(void* ptr,
+template <class DeviceStorage>
+void PooledStorageManager<DeviceStorage>::Free(void* ptr,
                                                            size_t size) {
   std::lock_guard<std::mutex> lock(mutex_);
   auto&& reuse_pool = memory_pool_[size];
   reuse_pool.push_back(ptr);
 }
 
-template <class DeviceStorage, size_t kThreshold>
-void PooledStorageManager<DeviceStorage, kThreshold>::ReleaseAll() {
+template <class DeviceStorage>
+void PooledStorageManager<DeviceStorage>::ReleaseAll() {
   for (auto&& i : memory_pool_) {
     for (auto&& j : i.second) {
       DeviceStorage::Free(j);
diff --git a/src/storage/storage.cc b/src/storage/storage.cc
index 4aeebd5681b7..177d95e257f9 100644
--- a/src/storage/storage.cc
+++ b/src/storage/storage.cc
@@ -25,13 +25,12 @@ class StorageImpl : public Storage {
   virtual ~StorageImpl() = default;
 
  private:
-  static constexpr size_t kPoolThreshold = 4096 * 1024 * 1024ul;
   static constexpr size_t kMaxNumberOfDevices = Context::kMaxDevType + 1;
   static constexpr size_t kMaxNumberOfDeviceIDs = Context::kMaxDevID + 1;
 
   template <class DeviceStorage>
   using CurrentStorageManager =
-      storage::PooledStorageManager<DeviceStorage, kPoolThreshold>;
+      storage::PooledStorageManager<DeviceStorage>;
 
   static void ActivateDevice(Context ctx) {
     switch (ctx.dev_type) {
diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc
index acd9c0743250..4ea774829e2c 100644
--- a/src/symbol/graph_executor.cc
+++ b/src/symbol/graph_executor.cc
@@ -847,7 +847,6 @@ void GraphExecutor::InitOpSegs() {
   cached_seg_opr_.resize(topo_order_.size(), p);
 
   if (!prefer_bulk_execution_) return;
-  if (monitor_callback_) return;
   if (num_forward_nodes_ == topo_order_.size()) {
     cached_seg_opr_[0] = this->CreateCachedSegOpr(0, topo_order_.size());
     return;
@@ -902,11 +901,13 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) {
   }
 
   for (size_t i = topo_start; i < topo_end; ++i) {
-    auto seg_op = cached_seg_opr_[i];
-    if (seg_op.opr != nullptr && seg_op.topo_end <= topo_end) {
-      Engine::Get()->Push(seg_op.opr, seg_op.ctx);
-      i = seg_op.topo_end - 1;
-      continue;
+    if (!monitor_callback_) {
+      auto seg_op = cached_seg_opr_[i];
+      if (seg_op.opr != nullptr && seg_op.topo_end <= topo_end) {
+        Engine::Get()->Push(seg_op.opr, seg_op.ctx);
+        i = seg_op.topo_end - 1;
+        continue;
+      }
     }
 
     uint32_t nid = topo_order_[i];

From 8590200b7af78468e29ca0213daf0decdb170b24 Mon Sep 17 00:00:00 2001
From: "wei.shen" <wshenx@users.noreply.github.com>
Date: Tue, 12 Jul 2016 15:52:26 +0800
Subject: [PATCH 096/126] Update model.py

adding `eval_data.reset()` at the end of evaluation session to avoid training session being skipped when eval_data and train_data are the same data.
---
 python/mxnet/model.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/mxnet/model.py b/python/mxnet/model.py
index 41e5c032311a..18987a51eaac 100644
--- a/python/mxnet/model.py
+++ b/python/mxnet/model.py
@@ -304,6 +304,7 @@ def _train_multi_device(symbol, ctx, arg_names, param_names, aux_names,
             name_value = eval_metric.get_name_value()
             for name, value in name_value:
                 logger.info('Epoch[%d] Validation-%s=%f', epoch, name, value)
+            eval_data.reset()
     # end of all epochs
     return
 

From 449e88e8a6ba5a04eb25ad42e257ce981b03049b Mon Sep 17 00:00:00 2001
From: Eric Junyuan Xie <piiswrong@users.noreply.github.com>
Date: Tue, 12 Jul 2016 16:08:52 -0700
Subject: [PATCH 097/126] fix build issues (#2679)

---
 dmlc-core                            | 2 +-
 include/mxnet/ndarray.h              | 5 +++--
 mshadow                              | 2 +-
 src/operator/cudnn_convolution-inl.h | 2 +-
 src/storage/pooled_storage_manager.h | 3 +--
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/dmlc-core b/dmlc-core
index 25e80e916b8b..ddef90217681 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit 25e80e916b8bb42e2f027ce352970904389ba76b
+Subproject commit ddef9021768181add97e67b6d80fe25abd8e16b3
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index 3649b36e0e05..e4f15082b398 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -65,11 +65,12 @@ class NDArray {
    * \return the data TBlob
    */
   inline TBlob data() const {
+    TBlob res;
     MSHADOW_TYPE_SWITCH(dtype_, DType, {
-      return TBlob(static_cast<DType*>(ptr_->shandle.dptr)
+      res = TBlob(static_cast<DType*>(ptr_->shandle.dptr)
         + offset_, shape_, ptr_->shandle.ctx.dev_mask());
     });
-    return TBlob();
+    return res;
   }
   /*!
    * \return the context of NDArray, this function is only valid when the NDArray is not empty
diff --git a/mshadow b/mshadow
index 46ead0cf1d51..11dba2e69bd4 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit 46ead0cf1d5173f1bc595db4ca21dcfbdac6baf9
+Subproject commit 11dba2e69bd4d0d274933ab2f04e878fe9847069
diff --git a/src/operator/cudnn_convolution-inl.h b/src/operator/cudnn_convolution-inl.h
index 34a6c7295d41..5edde29f12fe 100644
--- a/src/operator/cudnn_convolution-inl.h
+++ b/src/operator/cudnn_convolution-inl.h
@@ -134,7 +134,7 @@ class CuDNNConvolutionOp : public Operator {
                                 bias.dptr_ + bias_offset_ * g,
                                 &beta,
                                 out_desc_,
-                                out.dptr_ + out_offset_ * g), CUDNN_STATUS_SUCCESS);
+                                out_ptr + out_offset_ * g), CUDNN_STATUS_SUCCESS);
         #endif
       }
     }
diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h
index 6e613f81cdb4..5fcf781a67f0 100644
--- a/src/storage/pooled_storage_manager.h
+++ b/src/storage/pooled_storage_manager.h
@@ -70,8 +70,7 @@ void* PooledStorageManager<DeviceStorage>::Alloc(size_t size) {
 }
 
 template <class DeviceStorage>
-void PooledStorageManager<DeviceStorage>::Free(void* ptr,
-                                                           size_t size) {
+void PooledStorageManager<DeviceStorage>::Free(void* ptr, size_t size) {
   std::lock_guard<std::mutex> lock(mutex_);
   auto&& reuse_pool = memory_pool_[size];
   reuse_pool.push_back(ptr);

From b496758901e68bbbf5ad07b41d49da73a4889d85 Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Wed, 13 Jul 2016 16:42:44 -0700
Subject: [PATCH 098/126] fix lint

---
 include/mxnet/mxrtc.h |  2 +-
 src/common/mxrtc.cc   | 26 +++++++++++++-------------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/include/mxnet/mxrtc.h b/include/mxnet/mxrtc.h
index a45badb1d3dc..9de59f63da2a 100644
--- a/include/mxnet/mxrtc.h
+++ b/include/mxnet/mxrtc.h
@@ -60,7 +60,7 @@ class MXRtc {
             unsigned int  block_dim_Z);
 
  private:
-  static const std::string str_type;
+  static const char str_type[];
   static std::unordered_map<std::string, char*> kernel_registry;
 
   std::string name_;
diff --git a/src/common/mxrtc.cc b/src/common/mxrtc.cc
index 4fd687267409..c1ab065db627 100644
--- a/src/common/mxrtc.cc
+++ b/src/common/mxrtc.cc
@@ -7,7 +7,7 @@
 #include <mxnet/mxrtc.h>
 #if ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC))
 namespace mxnet {
-const std::string MXRtc::str_type = "float";
+const char MXRtc::str_type[] = "float";
 std::unordered_map<std::string, char*> MXRtc::kernel_registry;
 
 MXRtc::MXRtc(const std::string& name,
@@ -79,36 +79,36 @@ std::string MXRtc::decorate(const std::string& name,
                          std::vector<std::pair<std::string, NDArray> > const& output,
                          const std::string kernel) {
     std::string source;
-    source += "\nextern \"C\" __global__ void " + name + "(";
+    source = source + "\nextern \"C\" __global__ void " + name + "(";
     for (auto &i : input) {
-        source += "const " + str_type + "* " + i.first + ",";
+        source = source + "const " + str_type + "* " + i.first + ",";
     }
     for (auto &i : output) {
-        source += str_type + "* " + i.first + ",";
+        source = source + str_type + "* " + i.first + ",";
     }
     source.pop_back();
     source = source + ") {\n";
     for (auto &i : input) {
-        source += "const int " + i.first + "_ndim = " +
+        source = source + "const int " + i.first + "_ndim = " +
                   std::to_string(i.second.shape().ndim()) + ";\n";
-        source += "const int " + i.first + "_dims[] = {";
+        source = source + "const int " + i.first + "_dims[] = {";
         for (index_t j = 0; j < i.second.shape().ndim(); ++j) {
-            source += std::to_string(i.second.shape()[j]) + ",";
+            source = source + std::to_string(i.second.shape()[j]) + ",";
         }
         source.pop_back();
-        source += "};\n";
+        source = source + "};\n";
     }
     for (auto &i : output) {
-        source += "const int " + i.first + "_ndim = " +
+        source = source + "const int " + i.first + "_ndim = " +
                   std::to_string(i.second.shape().ndim()) + ";\n";
-        source += "const int " + i.first + "_dims[] = {";
+        source = source + "const int " + i.first + "_dims[] = {";
         for (index_t j = 0; j < i.second.shape().ndim(); ++j) {
-            source += std::to_string(i.second.shape()[j]) + ",";
+            source = source + std::to_string(i.second.shape()[j]) + ",";
         }
         source.pop_back();
-        source += "};\n";
+        source = source + "};\n";
     }
-    source += kernel + "\n}\n";
+    source = source + kernel + "\n}\n";
     return source;
 }
 

From 44daeb4c9d8e65f69a449f9368d6891e135ef138 Mon Sep 17 00:00:00 2001
From: liming <zlmzju@gmail.com>
Date: Thu, 14 Jul 2016 17:05:39 +0800
Subject: [PATCH 099/126] fix bug of image resize in im2rec.py

---
 tools/im2rec.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/im2rec.py b/tools/im2rec.py
index c602f773bb03..bdb398e18a93 100644
--- a/tools/im2rec.py
+++ b/tools/im2rec.py
@@ -100,9 +100,9 @@ def image_encode(item, q_out):
                 img = img[:, margin:margin + img.shape[0]]
         if args.resize:
             if img.shape[0] > img.shape[1]:
-                newsize = (img.shape[0] * args.resize / img.shape[1], args.resize)
+                newsize = (args.resize, img.shape[0] * args.resize / img.shape[1])  # (w, h)
             else:
-                newsize = (args.resize, img.shape[1] * args.resize / img.shape[0])
+                newsize = (img.shape[1] * args.resize / img.shape[0], args.resize)
             img = cv2.resize(img, newsize)
         header = mx.recordio.IRHeader(0, item[2], item[0], 0)
 

From aa187664757955a4fc5562355af02daea9bf2f1f Mon Sep 17 00:00:00 2001
From: Jonas Rocha Lima Amaro <jonasrla@gmail.com>
Date: Thu, 14 Jul 2016 21:30:57 -0300
Subject: [PATCH 100/126] SVM Output cpu implementation + test + example
 (#2708)

* get rid of annoying virtualenv

* implements svm_output

* adds a usage example for SVMOutput

* Adds tests

* jonasrla as new contributor!

* fix .gitignore back

* style fix
---
 CONTRIBUTORS.md                        |   1 +
 example/svm_mnist/README.md            |  11 ++
 example/svm_mnist/svm_mnist.py         |  84 ++++++++++
 src/operator/svm_output-inl.h          | 206 +++++++++++++++++++++++++
 src/operator/svm_output.cc             |  82 ++++++++++
 src/operator/svm_output.cu             |  42 +++++
 tests/python/unittest/test_operator.py |  60 +++++++
 7 files changed, 486 insertions(+)
 create mode 100644 example/svm_mnist/README.md
 create mode 100644 example/svm_mnist/svm_mnist.py
 create mode 100644 src/operator/svm_output-inl.h
 create mode 100644 src/operator/svm_output.cc
 create mode 100644 src/operator/svm_output.cu

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index a9f61ae69bf0..5dc64ed541e6 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -108,3 +108,4 @@ List of Contributors
 * [Depeng Liang](https://github.com/Ldpe2G)
 * [Kiko Qiu](https://github.com/kikoqiu)
 * [Yang Bo](https://github.com/Atry)
+* [Jonas Amaro](https://github.com/jonasrla)
\ No newline at end of file
diff --git a/example/svm_mnist/README.md b/example/svm_mnist/README.md
new file mode 100644
index 000000000000..082c2053f27e
--- /dev/null
+++ b/example/svm_mnist/README.md
@@ -0,0 +1,11 @@
+# Use case with Support Vector Machine
+
+To check that the implementation not only learns but can also outperform softmax, as [this article](https://arxiv.org/pdf/1306.0239.pdf) suggests, I ran the svm_mnist.py script. It is based on the MNIST experiment described in the article and on [this tutorial](https://github.com/dmlc/mxnet-gtc-tutorial/blob/master/tutorial.ipynb).
+
+
+## To run this you will need
+
+* [Numpy](http://www.scipy.org/scipylib/download.html)
+* [Sklearn](http://scikit-learn.org/stable/install.html)
+
+I recommend installing [matplotlib](http://matplotlib.org/users/installing.html) to visualize examples.
\ No newline at end of file
diff --git a/example/svm_mnist/svm_mnist.py b/example/svm_mnist/svm_mnist.py
new file mode 100644
index 000000000000..f36a0457616f
--- /dev/null
+++ b/example/svm_mnist/svm_mnist.py
@@ -0,0 +1,84 @@
+
+#############################################################
+## Please read the README.md document for better reference ##
+#############################################################
+
+import mxnet as mx
+import numpy as np
+from sklearn.datasets import fetch_mldata
+from sklearn.decomposition import PCA
+# import matplotlib.pyplot as plt
+import logging
+
+logger = logging.getLogger()
+logger.setLevel(logging.DEBUG)
+
+# Network declaration as symbols. The following pattern was based
+# on the article, but feel free to play with the number of nodes
+# and with the activation function
+data = mx.symbol.Variable('data')
+fc1  = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=512)
+act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
+fc2  = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 512)
+act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
+fc3  = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=10)
+
+# Here we add the final (output) layer based on the L2-SVM objective
+mlp = mx.symbol.SVMOutput(data=fc3, name='svm')
+
+# To use L1-SVM objective, comment the line above and uncomment the line below
+# mlp = mx.symbol.SVMOutput(data=fc3, name='svm', use_linear=True)
+
+# Now we fetch the MNIST dataset, project it onto 70 PCA components, add noise
+# (as the article suggests), and permute the examples to be used on our network
+mnist = fetch_mldata('MNIST original')
+mnist_pca = PCA(n_components=70).fit_transform(mnist.data)
+noise = np.random.normal(size=mnist_pca.shape)  # drawn before the seed below is set
+mnist_pca += noise
+np.random.seed(1234) # set seed for deterministic ordering
+p = np.random.permutation(mnist_pca.shape[0])
+X = mnist_pca[p]
+Y = mnist.target[p]
+X_show = mnist.data[p]
+
+# Cast to float32 and divide by 255 (NOTE(review): X is PCA output plus noise, so
+# this does not bound it to [0,1] -- confirm intent), then split train and test sets
+X = X.astype(np.float32)/255
+X_train = X[:60000]
+X_test = X[60000:]
+X_show = X_show[60000:]
+Y_train = Y[:60000]
+Y_test = Y[60000:]
+
+# Article's suggestion on batch size
+batch_size = 200
+train_iter = mx.io.NDArrayIter(X_train, Y_train, batch_size=batch_size)
+test_iter = mx.io.NDArrayIter(X_test, Y_test, batch_size=batch_size)
+
+# A quick workaround: rename the label to 'svm_label' so mxnet does not complain about a missing softmax_label
+train_iter.label =  mx.io._init_data(Y_train, allow_empty=True, default_name='svm_label')
+test_iter.label =  mx.io._init_data(Y_test, allow_empty=True, default_name='svm_label')
+
+# Here we instantiate and fit the model for our data.
+# The article actually suggests using 400 epochs,
+# but I reduced it to 10, for convenience
+model = mx.model.FeedForward(
+    ctx = mx.cpu(0),      # Run on CPU 0
+    symbol = mlp,         # Use the network we just defined
+    num_epoch = 10,       # Train for 10 epochs
+    learning_rate = 0.1,  # Learning rate
+    momentum = 0.9,       # Momentum for SGD with momentum
+    wd = 0.00001,         # Weight decay for regularization
+    )
+model.fit(
+    X=train_iter,  # Training data set
+    eval_data=test_iter,  # Testing data set. MXNet computes scores on test set every epoch
+    batch_end_callback = mx.callback.Speedometer(batch_size, 200))  # Logging module to print out progress
+
+# Uncomment to view an example
+# plt.imshow((X_show[0].reshape((28,28))*255).astype(np.uint8), cmap='Greys_r')
+# plt.show()
+# print 'Result:', model.predict(X_test[0:1])[0].argmax()
+
+# Now print how well the network did for this configuration
+print 'Accuracy:', model.score(test_iter)*100, '%'
\ No newline at end of file
diff --git a/src/operator/svm_output-inl.h b/src/operator/svm_output-inl.h
new file mode 100644
index 000000000000..1221bf923cda
--- /dev/null
+++ b/src/operator/svm_output-inl.h
@@ -0,0 +1,206 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file svm_output-inl.h
+ * \brief
+ * \author Jonas Amaro
+*/
+#ifndef MXNET_OPERATOR_SVM_OUTPUT_INL_H_
+#define MXNET_OPERATOR_SVM_OUTPUT_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+#include <utility>
+#include "./operator_common.h"
+#include "./mshadow_op.h"
+
+namespace mxnet {
+namespace op {
+
+namespace svm_enum {
+enum SVMOutputOpInputs {kData, kLabel};
+enum SVMOutputOpOutputs {kOut};
+enum SVMOutputNormType {kNull, kBatch, kValid};
+enum SVMOutputOpResource {kTempSpace};
+}  // namespace svm_enum
+
+
+struct SVMOutputParam : public dmlc::Parameter<SVMOutputParam> {  // SVMOutput hyper-parameters
+  float margin;  // margin handed to L1_SVM / L2_SVM by SVMOutputOp::Backward
+  float regularization_coefficient;  // multiplier applied to the gradient (reg_coef)
+  bool use_linear;  // true: Backward calls L1_SVM; false (default): L2_SVM
+  DMLC_DECLARE_PARAMETER(SVMOutputParam) {
+    DMLC_DECLARE_FIELD(margin).set_default(1.0f)
+    .describe("Scale the margin for activation size");
+    DMLC_DECLARE_FIELD(regularization_coefficient).set_default(1.0f)
+    .describe("Scale the coefficient that balances coefficient size and error tradeoff");
+    DMLC_DECLARE_FIELD(use_linear).set_default(false)
+    .describe("If set true, uses L1-SVM objective function. Default uses L2-SVM objective");
+  };
+};
+
+template<typename xpu, typename DType>
+class SVMOutputOp : public Operator {  // identity on forward; L1_SVM / L2_SVM gradient on backward
+ public:
+  explicit SVMOutputOp(SVMOutputParam param) : param_(param) {}
+
+  virtual void Forward(const OpContext &ctx,  // forward pass: out = identity(data)
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 2) << "Expecting [data, label]";
+    CHECK_EQ(out_data.size(), 1) << "Expecting [output]";
+    CHECK_EQ(req.size(), 1) << "Expecting output.size() == req.size()";
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 2, DType> data = in_data[svm_enum::kData].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> out = out_data[svm_enum::kOut].FlatTo2D<xpu, DType>(s);
+    Assign(out, req[svm_enum::kOut], F<mshadow_op::identity>(data));  // label is not read here
+  }
+
+  virtual void Backward(const OpContext &ctx,  // writes the SVM gradient into in_grad[kData]
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 2);
+    CHECK_EQ(out_grad.size(), 1);  // out_grad is only size-checked; its values are not read
+    CHECK_GE(in_grad.size(), 1);
+    CHECK_GE(req.size(), 1);  // NOTE(review): req is only size-checked in this method
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    const TShape& label_shape = in_data[svm_enum::kLabel].shape_;
+
+    Tensor<xpu, 1, DType> label = in_data[svm_enum::kLabel].get_with_shape<xpu, 1, DType>(
+        Shape1(label_shape.ProdShape(0, label_shape.ndim())), s);  // flatten label to 1-D
+    Tensor<xpu, 2, DType> out = out_data[svm_enum::kOut].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> grad = in_grad[svm_enum::kData].FlatTo2D<xpu, DType>(s);
+    CHECK_EQ(grad.shape_, out.shape_) << "SVMOutputs: shape mismatch";
+
+    if (param_.use_linear) {  // L1-SVM objective when use_linear is set
+      L1_SVM(DType(param_.margin), DType(param_.regularization_coefficient), grad, label, out);
+    } else {  // default: L2-SVM objective
+      L2_SVM(DType(param_.margin), DType(param_.regularization_coefficient), grad, label, out);
+    }
+  }
+
+ private:
+  SVMOutputParam param_;  // margin, regularization_coefficient, use_linear
+};  // class SVMOutputOp
+
+// Factory function, specialized per device type xpu; dtype picks the element type of the operator
+template<typename xpu>
+Operator* CreateOp(SVMOutputParam param, int dtype);
+
+#if DMLC_USE_CXX11
+class SVMOutputProp : public OperatorProperty {  // shape/type inference and dependencies for SVMOutput
+ public:
+  std::vector<std::string> ListArguments() const override {
+    return {"data", "label"};  // order matches svm_enum::kData, svm_enum::kLabel
+  }
+
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.Init(kwargs);
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,  // label = data minus last axis; out = data
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 2) << "Input:[data, label]";
+    const TShape &dshape = in_shape->at(0);
+    if (dshape.ndim() == 0) return false;  // presumably: data shape not known yet
+    TShape label_shape(dshape.ndim() - 1);
+    for (index_t i = 0; i + 1 < dshape.ndim(); ++i)  // copy every axis except the last
+      label_shape[i] = dshape[i];
+    SHAPE_ASSIGN_CHECK(*in_shape, svm_enum::kLabel, label_shape);
+    out_shape->clear();
+    out_shape->push_back(dshape);  // output has the same shape as data
+    return true;
+  }
+
+  bool InferType(std::vector<int> *in_type,  // all inputs and the output share data's type
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_GE(in_type->size(), 1);
+    int dtype = (*in_type)[0];
+    CHECK_NE(dtype, -1) << "First input must have specified type";
+    for (index_t i = 0; i < in_type->size(); ++i) {
+      if ((*in_type)[i] == -1) {
+        (*in_type)[i] = dtype;  // unspecified inputs inherit the first input's type
+      } else {
+        CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. "
+                                       << "Expected " << dtype << " v.s. given "
+                                       << (*in_type)[i] << " at " << ListArguments()[i];
+      }
+    }
+    out_type->clear();
+    out_type->push_back(dtype);
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new SVMOutputProp();  // caller receives a new heap-allocated copy
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "SVMOutput";
+  }
+
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return {in_data[svm_enum::kLabel], out_data[svm_enum::kOut]};  // out_grad is not listed
+  }
+
+  std::vector<std::pair<int, void*> > BackwardInplaceOption(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data,
+    const std::vector<void*> &in_grad) const override {
+    return {{out_data[svm_enum::kOut], in_grad[svm_enum::kData]}};  // pairs out with data's grad
+  }
+
+  std::vector<std::pair<int, void*> > ForwardInplaceOption(
+    const std::vector<int> &in_data,
+    const std::vector<void*> &out_data) const override {
+    return {{in_data[svm_enum::kData], out_data[svm_enum::kOut]}};  // pairs data with out
+  }
+
+  std::vector<ResourceRequest> BackwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return {ResourceRequest::kTempSpace};  // NOTE(review): Backward never reads ctx.requested
+  }
+
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented.";  // untyped creation is unsupported; see CreateOperatorEx
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
+
+ protected:
+  SVMOutputParam param_;
+};  // class SVMOutputProp
+#endif  // DMLC_USE_CXX11
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_SVM_OUTPUT_INL_H_
diff --git a/src/operator/svm_output.cc b/src/operator/svm_output.cc
new file mode 100644
index 000000000000..87b9f0a96c28
--- /dev/null
+++ b/src/operator/svm_output.cc
@@ -0,0 +1,82 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file svm_output.cc
+ * \brief
+ * \author Jonas Amaro
+*/
+#include "./svm_output-inl.h"
+#include "./mshadow_op.h"
+
+namespace mshadow {
+  template<typename DType>
+  inline void L1_SVM(const DType & margin,
+                     const DType & reg_coef,
+                     Tensor<cpu, 2, DType> dst,
+                     const Tensor<cpu, 1, DType> & label,
+                     const Tensor<cpu, 2, DType> & src) {
+    // fills dst with the L1-SVM step for every (row y, class x), scaled by reg_coef
+    for (index_t y = 0; y < dst.size(0); y++) {
+      const index_t k = static_cast<int>(label[y]);
+      for (index_t x = 0; x < dst.size(1); x++) {
+        // labelled class k: -1 while its score is below margin;
+        // any other class: +1 while its score is above -margin; otherwise 0
+        const DType step = (x == k) ? -DType(margin > src[y][x]) : DType(margin > -src[y][x]);
+        dst[y][x] = step * reg_coef;
+      }
+    }
+  }
+
+
+  template<typename DType>
+  inline void L2_SVM(const DType & margin,
+                     const DType & reg_coef,
+                     Tensor<cpu, 2, DType> dst,
+                     const Tensor<cpu, 1, DType> & label,
+                     const Tensor<cpu, 2, DType> & src) {
+    // fills dst with the L2-SVM step for every (row y, class x), scaled by -reg_coef
+    for (index_t y = 0; y < dst.size(0); y++) {
+      const index_t k = static_cast<int>(label[y]);
+      for (index_t x = 0; x < dst.size(1); x++) {
+        // slope stays zero unless the margin is violated for this class
+        DType slope = DType(0.0f);
+        if (x == k ? margin > src[y][x] : margin > -src[y][x]) {
+          slope = (x == k) ? 2*(margin - src[y][x]) : (-2)*(margin + src[y][x]);
+        }
+        dst[y][x] = slope * -reg_coef;
+      }
+    }
+  }
+}  // namespace mshadow
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<cpu>(SVMOutputParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new SVMOutputOp<cpu, DType>(param);
+  })
+  return op;
+}
+
+// DO_BIND_DISPATCH comes from operator_common.h
+Operator *SVMOutputProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                     std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferType(in_type, &out_type, &aux_type));
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
+}
+
+DMLC_REGISTER_PARAMETER(SVMOutputParam);
+
+MXNET_REGISTER_OP_PROPERTY(SVMOutput, SVMOutputProp)
+.describe("Support Vector Machine based transformation on input, backprop L2-SVM")
+.add_argument("data", "Symbol", "Input data to svm.")
+.add_argument("label", "Symbol", "Label data.")
+.add_arguments(SVMOutputParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
+
diff --git a/src/operator/svm_output.cu b/src/operator/svm_output.cu
new file mode 100644
index 000000000000..589eac75f458
--- /dev/null
+++ b/src/operator/svm_output.cu
@@ -0,0 +1,42 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file svm_output.cu
+ * \brief
+ * \author Jonas Amaro
+*/
+
+#include "./svm_output-inl.h"
+
+namespace mshadow {
+  template<typename DType>
+  inline void L1_SVM(const DType & margin,
+                   const DType & reg_coef,
+                   Tensor<gpu, 2, DType> dst,
+                   const Tensor<gpu, 1, DType> & label,
+                   const Tensor<gpu, 2, DType> & src) {
+    LOG(FATAL) << "Not Implemented.";
+  }
+  template<typename DType>
+  inline void L2_SVM(const DType & margin,
+               const DType & reg_coef,
+               Tensor<gpu, 2, DType> dst,
+               const Tensor<gpu, 1, DType> & label,
+               const Tensor<gpu, 2, DType> & src) {
+    LOG(FATAL) << "Not Implemented.";
+  }
+}  // namespace mshadow
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<gpu>(SVMOutputParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new SVMOutputOp<gpu, DType>(param);
+  })
+  return op;
+}
+
+}  // namespace op
+}  // namespace mxnet
+
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 06537778b7de..785abe616729 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -1419,6 +1419,64 @@ def test_batch_dot(ctx=mx.cpu()):
                     assert reldiff(exe.grad_dict['b'].asnumpy(), bgrad_npy) < 1E-5
 
 
+def test_support_vector_machine_l1_svm():
+    xpu = mx.cpu()
+    shape = (20, 10)
+
+    X = mx.symbol.Variable('X')
+    L = mx.symbol.Variable('L')
+    Y = mx.symbol.SVMOutput(data=X, label=L, use_linear=True)
+    x = mx.nd.empty(shape, ctx = xpu)
+    l = mx.nd.empty((shape[0],), ctx = xpu)
+    x_np = np.random.rand(*shape)
+    l_np = np.random.randint(0, shape[1], (shape[0],))
+    x[:] = x_np
+    l[:] = l_np
+
+    grad = mx.nd.empty(shape, ctx = xpu)
+    exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad})
+    exec1.forward()
+
+    assert_allclose(x_np, exec1.outputs[0].asnumpy())
+    
+    exec1.backward()
+
+    l_mask = np.equal(l_np.reshape(shape[0],1),range(shape[1]))
+    l_mask = np.array(l_mask, dtype=np.float32)*2 -1
+    grad_np = (-1) * l_mask * np.greater(1 - l_mask * x_np, 0)
+
+    assert_allclose(grad_np, grad.asnumpy())
+
+def test_support_vector_machine_l2_svm():
+    """Check SVMOutput without use_linear: identity forward, squared-hinge-style gradient."""
+    xpu = mx.cpu()
+    shape = (20, 10)
+    X = mx.symbol.Variable('X')
+    L = mx.symbol.Variable('L')
+    Y = mx.symbol.SVMOutput(data=X, label=L)
+    x = mx.nd.empty(shape, ctx = xpu)
+    l = mx.nd.empty((shape[0],), ctx = xpu)
+    x_np = np.random.rand(*shape)
+    x_np = x_np.astype(np.float32)
+    l_np = np.random.randint(0, shape[1], (shape[0],))
+    x[:] = x_np
+    l[:] = l_np
+
+    grad = mx.nd.empty(shape, ctx = xpu)
+    exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad})
+    exec1.forward()
+
+    assert_allclose(x_np, exec1.outputs[0].asnumpy())
+    # the forward output equals the input; only the gradient is checked below
+    exec1.backward()
+    # reference: l_mask is +1 at the labelled class and -1 everywhere else
+    l_mask = np.equal(l_np.reshape(shape[0],1),range(shape[1]))
+    l_mask = np.array(l_mask, dtype=np.float32)*2 -1
+    grad_np = (-2)*l_mask*np.maximum(1-l_mask*x_np,0)
+    grad_np = grad_np.astype(np.float32)
+    assert_allclose(grad_np, grad.asnumpy())
+
+
 if __name__ == '__main__':
     test_expand_dims()
     test_slice_axis()
@@ -1457,3 +1515,5 @@ def test_batch_dot(ctx=mx.cpu()):
     test_dot()
     test_batch_dot()
     test_correlation()
+    test_support_vector_machine_l1_svm()
+    test_support_vector_machine_l2_svm()

From f3cc5b179c8ce6e901f21b247106de76947ff406 Mon Sep 17 00:00:00 2001
From: Eric Junyuan Xie <piiswrong@users.noreply.github.com>
Date: Fri, 15 Jul 2016 14:28:05 -0700
Subject: [PATCH 101/126] Update module.py (#2721)

---
 python/mxnet/module/module.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py
index 99efea3a2e5a..06141d945ef8 100644
--- a/python/mxnet/module/module.py
+++ b/python/mxnet/module/module.py
@@ -260,8 +260,7 @@ def bind(self, data_shapes, label_shapes=None, for_training=True,
             self.params_initialized = True
             self._arg_params = shared_module._arg_params
             self._aux_params = shared_module._aux_params
-
-        if self.params_initialized:
+        elif self.params_initialized:
             # if the parameters are already initialized, we are re-binding
             # so automatically copy the already initialized params
             self._exec_group.set_params(self._arg_params, self._aux_params)

From 88fd2ff1cbdf936d1f3cd41d290e04c8e5d9cf91 Mon Sep 17 00:00:00 2001
From: Xingjian Shi <xshiab@ust.hk>
Date: Sat, 16 Jul 2016 15:20:46 +0800
Subject: [PATCH 102/126] Use a smaller threshold (#2726)

---
 tests/python/unittest/test_operator.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 785abe616729..d0a9ecffcdd7 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -1387,10 +1387,10 @@ def test_dot(ctx=mx.cpu()):
                 c = mx.sym.dot(a, b)
                 exe = c.simple_bind(ctx=ctx, a=a_npy.shape, b=b_npy.shape)
                 outputs = exe.forward(is_train=True, a=a_npy, b=b_npy)
-                assert reldiff(outputs[0].asnumpy(), c_npy) < 1E-5
+                assert reldiff(outputs[0].asnumpy(), c_npy) < 1E-3
                 exe.backward(out_grads=[mx.nd.array(ograd_npy, ctx=exe._ctx)])
-                assert reldiff(exe.grad_dict['a'].asnumpy(), agrad_npy) < 1E-5
-                assert reldiff(exe.grad_dict['b'].asnumpy(), bgrad_npy) < 1E-5
+                assert reldiff(exe.grad_dict['a'].asnumpy(), agrad_npy) < 1E-3
+                assert reldiff(exe.grad_dict['b'].asnumpy(), bgrad_npy) < 1E-3
 
 
 def test_batch_dot(ctx=mx.cpu()):
@@ -1413,10 +1413,10 @@ def test_batch_dot(ctx=mx.cpu()):
                     c = mx.sym.batch_dot(a, b)
                     exe = c.simple_bind(ctx=ctx, a=a_npy.shape, b=b_npy.shape)
                     outputs = exe.forward(is_train=True, a=a_npy, b=b_npy)
-                    assert reldiff(outputs[0].asnumpy(), c_npy) < 1E-5
+                    assert reldiff(outputs[0].asnumpy(), c_npy) < 1E-3
                     exe.backward(out_grads=[mx.nd.array(ograd_npy, ctx=exe._ctx)])
-                    assert reldiff(exe.grad_dict['a'].asnumpy(), agrad_npy) < 1E-5
-                    assert reldiff(exe.grad_dict['b'].asnumpy(), bgrad_npy) < 1E-5
+                    assert reldiff(exe.grad_dict['a'].asnumpy(), agrad_npy) < 1E-3
+                    assert reldiff(exe.grad_dict['b'].asnumpy(), bgrad_npy) < 1E-3
 
 
 def test_support_vector_machine_l1_svm():

From 0ef25dea6544affa87bb9f034e69f4e844f2d304 Mon Sep 17 00:00:00 2001
From: Devinsuit <r.poryvaev@hotmail.com>
Date: Sat, 16 Jul 2016 14:22:51 +0300
Subject: [PATCH 103/126] Update link in imagenet_full.md  #2729

---
 docs/tutorials/imagenet_full.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/tutorials/imagenet_full.md b/docs/tutorials/imagenet_full.md
index ba6b26648cdf..f0e722ed975d 100644
--- a/docs/tutorials/imagenet_full.md
+++ b/docs/tutorials/imagenet_full.md
@@ -68,7 +68,7 @@ We should note that this result is by no means optimal, as we did not carefully
 ## The Code and Model
 The code and step guide is publically available at [https://github.com/dmlc/mxnet/tree/master/example/image-classification](https://github.com/dmlc/mxnet/tree/master/example/image-classification)
 
-We also release a pretrained model under [https://github.com/dmlc/mxnet-model-gallery/tree/master/imagenet-21k-inception](https://github.com/dmlc/mxnet-model-gallery/tree/master/imagenet-21k-inception)
+We also release a pretrained model under [https://github.com/dmlc/mxnet-model-gallery/tree/master/imagenet-21k-inception](https://github.com/dmlc/mxnet-model-gallery/tree/master/imagenet-21k-inception.md)
 
 ## How to Use The Model
 We should point out it 21k classes is much more challenging than 1k. Directly use the raw prediction is not a reasonable way.

From 3790c123e21d92004e7e7f6a727b6235005b5e52 Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Sat, 16 Jul 2016 20:37:50 -0700
Subject: [PATCH 104/126] fix convolution speed

---
 src/operator/cudnn_convolution-inl.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/operator/cudnn_convolution-inl.h b/src/operator/cudnn_convolution-inl.h
index 5edde29f12fe..d37f24b40b56 100644
--- a/src/operator/cudnn_convolution-inl.h
+++ b/src/operator/cudnn_convolution-inl.h
@@ -214,7 +214,6 @@ class CuDNNConvolutionOp : public Operator {
                filter_desc_,
                gwmat_ptr + weight_offset_ * g), CUDNN_STATUS_SUCCESS);
       #elif CUDNN_MAJOR == 5
-      back_algo_w_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
       CUDNN_CALL(cudnnConvolutionBackwardFilter(s->dnn_handle_,
                &alpha,
                in_desc_,

From 5f58c7a34ed40854cd354d0eefa2cdc8a6330ae1 Mon Sep 17 00:00:00 2001
From: sxjscience <xshiab@ust.hk>
Date: Thu, 14 Jul 2016 23:09:38 +0800
Subject: [PATCH 105/126] 1. Accelerate embedding using the new kernel

2. Update mshadow
---
 mshadow                      |  2 +-
 src/operator/embedding-inl.h | 33 ++++++++++++++++++++++++++++-----
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/mshadow b/mshadow
index 11dba2e69bd4..867be36a5ada 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit 11dba2e69bd4d0d274933ab2f04e878fe9847069
+Subproject commit 867be36a5adabba2a2565fdddb1f88a2b68f9005
diff --git a/src/operator/embedding-inl.h b/src/operator/embedding-inl.h
index f9765daaee3d..fc8b7154fa97 100644
--- a/src/operator/embedding-inl.h
+++ b/src/operator/embedding-inl.h
@@ -22,6 +22,7 @@ namespace op {
 namespace embedding {
 enum EmbeddingOpInputs {kData, kWeight};
 enum EmbeddingOpOutputs {kOut};
+enum EmbeddingOpResource {kTempSpace};
 }  // namespace embedding
 
 struct EmbeddingParam: public dmlc::Parameter<EmbeddingParam> {
@@ -94,11 +95,28 @@ class EmbeddingOp : public Operator {
     Tensor<xpu, 2, DType> grad_out = out_grad[embedding::kOut].get_with_shape<xpu, 2, DType>(
          Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s);
     Tensor<xpu, 2, DType> grad_in = in_grad[embedding::kWeight].get<xpu, 2, DType>(s);
-    if (req[embedding::kWeight] == kWriteTo) {
-      grad_in = scalar<DType>(0.0f);
-      AddTakeGrad(grad_in, data, grad_out);
-    } else if (req[embedding::kWeight] == kAddTo) {
-      AddTakeGrad(grad_in, data, grad_out);
+    if (req[embedding::kWeight] == kWriteTo || req[embedding::kWeight] == kAddTo) {
+      if (req[embedding::kWeight] == kWriteTo) {
+#ifdef __CUDACC__
+        cudaMemsetAsync(grad_in.dptr_, 0, grad_in.MSize() * sizeof(DType),
+                        Stream<gpu>::GetStream(s));
+#else
+        grad_in = scalar<DType>(0.0f);
+#endif
+      }
+      if ((grad_out.shape_[0] < grad_out.shape_[1]) && (grad_out.shape_[0] < 512)) {
+        AddTakeGrad(grad_in, data, grad_out);
+      } else {
+        Tensor<xpu, 2, int> workspace =
+          ctx.requested[embedding::kTempSpace].get_space_typed<xpu, 2, int>(
+          mshadow::Shape2(2, data.shape_.Size()), s);
+        Tensor<xpu, 1, int> sorted_data = workspace[0];
+        Tensor<xpu, 1, int> original_index = workspace[1];
+        sorted_data = tcast<int>(data);
+        original_index = range<int>(0, data.shape_.Size());
+        SortByKey(sorted_data, original_index, true);
+        AddTakeGradLargeBatch(grad_in, sorted_data, original_index, grad_out);
+      }
     } else {
       LOG(FATAL) << "wrong req";
     }
@@ -183,6 +201,11 @@ class EmbeddingProp : public OperatorProperty {
     return {out_grad[embedding::kOut], in_data[embedding::kData]};
   }
 
+  std::vector<ResourceRequest> BackwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return {ResourceRequest::kTempSpace};  // read as ctx.requested[embedding::kTempSpace]
+  }
+
   Operator* CreateOperator(Context ctx) const override {
     LOG(FATAL) << "Not Implemented.";
     return NULL;

From 5cf77293d078399db56809741e21f96198d572e7 Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Sun, 17 Jul 2016 01:07:43 -0700
Subject: [PATCH 106/126] fix 3d conv shape inference

---
 src/operator/convolution-inl.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h
index 59d82465cfb6..bc46f28d5c47 100644
--- a/src/operator/convolution-inl.h
+++ b/src/operator/convolution-inl.h
@@ -398,7 +398,9 @@ class ConvolutionProp : public OperatorProperty {
           << "incorrect stride size: " << param_.stride;
       CHECK_GT(param_.dilate.Size(), 0) \
           << "incorrect dilate size: " << param_.dilate;
-      CHECK(ksize_d < dshape[2] && ksize_y <= dshape[3] && ksize_x <= dshape[4])
+      CHECK(ksize_d < dshape[2] + 2 * param_.pad[0]
+            && ksize_y <= dshape[3] + 2 * param_.pad[1]
+            && ksize_x <= dshape[4] + 2 * param_.pad[2])
           << "kernel size exceed input";
       if (param_.dilate.Size() != 1) {
         LOG(INFO) << "Dilate is not supported in 3d convolution";

From 4f0c8c9195d92f158d29f2a37b5769e67ae7212f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=A2=81=E5=BE=B7=E6=BE=8E?= <liangdepeng@gmail.com>
Date: Mon, 18 Jul 2016 00:54:41 +0800
Subject: [PATCH 107/126] add scala example lstm (#2737)

---
 .../examples/scripts/rnn/run_test_charrnn.sh  |  20 ++
 .../examples/scripts/rnn/run_train_charrnn.sh |  20 ++
 .../ml/dmlc/mxnet/examples/rnn/ButketIo.scala | 204 ++++++++++++++++++
 .../ml/dmlc/mxnet/examples/rnn/Lstm.scala     | 148 +++++++++++++
 .../ml/dmlc/mxnet/examples/rnn/RnnModel.scala |  55 +++++
 .../dmlc/mxnet/examples/rnn/TestCharRnn.scala |  88 ++++++++
 .../mxnet/examples/rnn/TrainCharRnn.scala     | 160 ++++++++++++++
 .../ml/dmlc/mxnet/examples/rnn/Utils.scala    | 137 ++++++++++++
 8 files changed, 832 insertions(+)
 create mode 100644 scala-package/examples/scripts/rnn/run_test_charrnn.sh
 create mode 100644 scala-package/examples/scripts/rnn/run_train_charrnn.sh
 create mode 100644 scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/ButketIo.scala
 create mode 100644 scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Lstm.scala
 create mode 100644 scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/RnnModel.scala
 create mode 100644 scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/TestCharRnn.scala
 create mode 100644 scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/TrainCharRnn.scala
 create mode 100644 scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Utils.scala

diff --git a/scala-package/examples/scripts/rnn/run_test_charrnn.sh b/scala-package/examples/scripts/rnn/run_test_charrnn.sh
new file mode 100644
index 000000000000..3abe98917a8f
--- /dev/null
+++ b/scala-package/examples/scripts/rnn/run_test_charrnn.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+MXNET_ROOT=$(cd "$(dirname $0)/../../../.."; pwd)
+CLASS_PATH=$MXNET_ROOT/scala-package/assembly/linux-x86_64-gpu/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/*
+
+# you can get the training data file using the following command
+# wget http://data.dmlc.ml/mxnet/data/lab_data.zip
+# unzip -o lab_data.zip
+# for example ./datas/obama.txt
+DATA_PATH=$1
+# for example ./models/obama
+MODEL_PREFIX=$2
+# feel free to change the starter sentence
+STARTER_SENTENCE="The joke"
+
+java -Xmx4G -cp $CLASS_PATH \
+	ml.dmlc.mxnet.examples.rnn.TestCharRnn \
+	--data-path $DATA_PATH \
+	--model-prefix $MODEL_PREFIX \
+	--starter-sentence "$STARTER_SENTENCE"
diff --git a/scala-package/examples/scripts/rnn/run_train_charrnn.sh b/scala-package/examples/scripts/rnn/run_train_charrnn.sh
new file mode 100644
index 000000000000..04379d33401a
--- /dev/null
+++ b/scala-package/examples/scripts/rnn/run_train_charrnn.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+MXNET_ROOT=$(cd "$(dirname $0)/../../../.."; pwd)
+CLASS_PATH=$MXNET_ROOT/scala-package/assembly/linux-x86_64-gpu/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/*
+
+# which gpu card to use, -1 means cpu
+GPU=$1
+# you can get the training data file using the following command
+# wget http://data.dmlc.ml/mxnet/data/lab_data.zip
+# unzip -o lab_data.zip
+# for example ./datas/obama.txt
+DATA_PATH=$2
+# for example ./models
+SAVE_MODEL_PATH=$3
+
+java -Xmx4G -cp $CLASS_PATH \
+	ml.dmlc.mxnet.examples.rnn.TrainCharRnn \
+	--data-path $DATA_PATH \
+	--save-model-path $SAVE_MODEL_PATH \
+	--gpu $GPU \
diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/ButketIo.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/ButketIo.scala
new file mode 100644
index 000000000000..8573bbdbe05d
--- /dev/null
+++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/ButketIo.scala
@@ -0,0 +1,204 @@
+package ml.dmlc.mxnet.examples.rnn
+
+import ml.dmlc.mxnet.{DataBatch, DataIter, NDArray, Shape}
+import org.slf4j.LoggerFactory
+import scala.io.Source
+import scala.util.Random
+
+/**
+ * @author Depeng Liang
+ */
+object ButketIo {
+
+  type Text2Id = (String, Map[String, Int]) => Array[Int]
+  type ReadContent = String => String
+
+  def defaultReadContent(path: String): String = {
+    val source = Source.fromFile(path)
+    // literal replace: with replaceAll the "." in ". " is a regex wildcard for any char
+    try source.mkString.replace("\n", " <eos> ").replace(". ", " <eos> ")
+    finally source.close()
+  }
+
+  def defaultText2Id(sentence: String, theVocab: Map[String, Int]): Array[Int] = {
+    // split on single spaces, drop the empty tokens left by repeated spaces,
+    // then look every remaining word up through theVocab.apply
+    sentence.split(" ")
+      .filter(_.nonEmpty)
+      .map(theVocab)
+  }
+
+  // Picks bucket lengths so that every bucket, except possibly the last one,
+  // collects at least batchSize sentences; the lengths are returned ascending.
+  // Sentence lengths are counted in ids produced by defaultText2Id(_, theVocab).
+  def defaultGenBuckets(sentences: Array[String], batchSize: Int,
+      theVocab: Map[String, Int]): List[Int] = {
+    // histogram: sentence length -> number of sentences (empty sentences are skipped)
+    val lenDict = scala.collection.mutable.Map[Int, Int]()
+    for (sentence <- sentences) {
+      val wordsLen = defaultText2Id(sentence, theVocab).length
+      if (wordsLen > 0) {
+        lenDict(wordsLen) = lenDict.getOrElse(wordsLen, 0) + 1
+      }
+    }
+
+    var tl = 0
+    var buckets = List[Int]()
+    // visit the lengths in ascending order (map iteration order is arbitrary) so
+    // that tl only counts sentences short enough to fit the bucket being closed
+    for ((l, n) <- lenDict.toSeq.sortBy(_._1)) {
+      if (n + tl >= batchSize) {
+        buckets = buckets :+ l
+        tl = 0
+      } else {
+        tl += n
+      }
+    }
+    // sentences left without a closed bucket share one of the maximum length
+    if (tl > 0) buckets = buckets :+ lenDict.keys.max
+    buckets
+  }
+
+  class BucketSentenceIter(path: String, vocab: Map[String, Int], var buckets: List[Int],
+      _batchSize: Int, initStates: IndexedSeq[(String, (Int, Int))],
+      seperateChar: String = " <eos> ", text2Id: Text2Id = defaultText2Id,
+      readContent: ReadContent = defaultReadContent) extends DataIter {
+
+    private val logger = LoggerFactory.getLogger(classOf[BucketSentenceIter])
+
+    private val content = readContent(path)
+    private val sentences = content.split(seperateChar)
+
+    if (buckets.length == 0) {
+      buckets = defaultGenBuckets(sentences, batchSize, vocab)
+    }
+    buckets = buckets.sorted
+    // pre-allocate with the largest bucket for better memory sharing
+    private val defaultBucketKey = (buckets(0) /: buckets.drop(1)) { (max, elem) =>
+      if (max < elem) elem else max
+    }
+    // sentences longer than the largest bucket are ignored; every other
+    // sentence is zero-padded and stored in the smallest bucket that fits
+    private val data = buckets.indices.map(x => Array[Array[Float]]()).toArray
+    for (sentence <- sentences) {
+      val ids = text2Id(sentence, vocab)
+      if (ids.length > 0) {
+        // buckets is sorted ascending, so the first match is the tightest fit;
+        // adding the sentence to every larger bucket as well would duplicate it
+        val idx = buckets.indexWhere(_ >= ids.length)
+        if (idx >= 0) {
+          data(idx) = data(idx) :+ ids.map(_.toFloat).padTo(buckets(idx), 0f)
+        }
+      }
+    }
+
+    // Get the size of each bucket, so that we could sample
+    // uniformly from the bucket
+    private val bucketSizes = data.map(_.length)
+    logger.info("Summary of dataset ==================")
+    buckets.zip(bucketSizes).foreach {
+      case (bkt, size) => logger.info(s"bucket of len $bkt : $size samples")
+    }
+
+     // make a random data iteration plan
+     // truncate each bucket into multiple of batch-size
+    private var bucketNBatches = Array[Int]()
+    for (i <- data.indices) {
+      bucketNBatches = bucketNBatches :+ (data(i).length / _batchSize)
+      data(i) = data(i).take(bucketNBatches(i) * _batchSize)
+    }
+
+    private val bucketPlan = {
+      val plan = bucketNBatches.zipWithIndex.map(x => Array.fill[Int](x._1)(x._2)).flatten
+      Random.shuffle(plan.toList)
+    }
+
+    private val bucketIdxAll = data.map(_.length).toList
+                                        .map(l => Random.shuffle((0 until l).toList))
+    private val bucketCurrIdx = data.map(x => 0)
+
+    private var dataBuffer = Array[NDArray]()
+    private var labelBuffer = Array[NDArray]()
+    for (iBucket <- data.indices) {
+      dataBuffer = dataBuffer :+ NDArray.zeros(_batchSize, buckets(iBucket))
+      labelBuffer = labelBuffer :+ NDArray.zeros(_batchSize, buckets(iBucket))
+    }
+
+    private val _provideData = {
+      val tmp = Map("data" -> Shape(_batchSize, defaultBucketKey))
+      tmp ++ initStates.map(x => x._1 -> Shape(x._2._1, x._2._2))
+    }
+    private val _provideLabel = Map("softmax_label" -> Shape(_batchSize, defaultBucketKey))
+
+    private var iBucket = 0
+
+    override def next(): DataBatch = {
+      val bucketIdx = bucketPlan(iBucket)
+      val dataBuf = dataBuffer(bucketIdx)
+      val iIdx = bucketCurrIdx(bucketIdx)
+      val idx = bucketIdxAll(bucketIdx).drop(iIdx).take(_batchSize)
+      bucketCurrIdx(bucketIdx) = bucketCurrIdx(bucketIdx) + _batchSize
+
+      val datas = idx.map(i => data(bucketIdx)(i)).toArray
+      for (sentence <- datas) {
+        assert(sentence.length == buckets(bucketIdx))
+      }
+      dataBuf.set(datas.flatten)
+
+      val labelBuf = labelBuffer(bucketIdx)
+      val labels = idx.map(i => data(bucketIdx)(i).drop(1) :+ 0f).toArray
+      labelBuf.set(labels.flatten)
+
+      iBucket += 1
+      new DataBatch(IndexedSeq(dataBuf),
+                                   IndexedSeq(labelBuf),
+                                   getIndex(),
+                                   getPad())
+    }
+
+    /**
+     * rewind the bucket plan and every per-bucket read position to the start
+     */
+    override def reset(): Unit = {
+      iBucket = 0
+      bucketCurrIdx.indices.foreach(i => bucketCurrIdx(i) = 0)
+    }
+
+    override def batchSize: Int = _batchSize
+
+    /**
+     * data buffer selected by bucketPlan(iBucket); NOTE(review): next() advances iBucket
+     * before returning, so this may not be the buffer of the batch it returned - confirm
+     */
+    override def getData(): IndexedSeq[NDArray] = IndexedSeq(dataBuffer(bucketPlan(iBucket)))
+
+    /**
+     * label buffer selected by bucketPlan(iBucket); NOTE(review): next() advances iBucket
+     * before returning, so this may not be the buffer of the batch it returned - confirm
+     */
+    override def getLabel(): IndexedSeq[NDArray] = IndexedSeq(labelBuffer(bucketPlan(iBucket)))
+
+    /**
+     * the index of current batch
+     * @return
+     */
+    override def getIndex(): IndexedSeq[Long] = IndexedSeq[Long]()
+
+    // The name and shape of label provided by this iterator
+    override def provideLabel: Map[String, Shape] = this._provideLabel
+
+    /**
+     * get the number of padding examples
+     * in current batch
+     * @return number of padding examples in current batch
+     */
+    override def getPad(): Int = 0
+
+    // The name and shape of data provided by this iterator
+    override def provideData: Map[String, Shape] = this._provideData
+
+    // batches remain until every entry of the shuffled bucket plan has been served
+    override def hasNext: Boolean =
+      iBucket < bucketPlan.length
+  }
+}
diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Lstm.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Lstm.scala
new file mode 100644
index 000000000000..6ac01a8d6644
--- /dev/null
+++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Lstm.scala
@@ -0,0 +1,148 @@
+package ml.dmlc.mxnet.examples.rnn
+
+import ml.dmlc.mxnet.Symbol
+import ml.dmlc.mxnet.Executor
+
+/**
+ * @author Depeng Liang
+ */
+object Lstm {
+
+  final case class LSTMState(c: Symbol, h: Symbol)
+  final case class LSTMParam(i2hWeight: Symbol, i2hBias: Symbol,
+                                                         h2hWeight: Symbol, h2hBias: Symbol)
+
+  // LSTM cell symbol: builds the next (c, h) state from the input and the previous state
+  def lstm(numHidden: Int, inData: Symbol, prevState: LSTMState,
+                   param: LSTMParam, seqIdx: Int, layerIdx: Int, dropout: Float = 0f): LSTMState = {
+    // dropout, when enabled, is applied to the cell input only;
+    // the previous hidden state enters the h2h projection unchanged
+    val cellInput =
+      if (dropout > 0f) Symbol.Dropout()(Map("data" -> inData, "p" -> dropout)) else inData
+    // both projections emit numHidden * 4 units, later sliced into the four gates
+    val i2h = Symbol.FullyConnected(s"t${seqIdx}_l${layerIdx}_i2h")(Map("data" -> cellInput,
+                                                       "weight" -> param.i2hWeight,
+                                                       "bias" -> param.i2hBias,
+                                                       "num_hidden" -> numHidden * 4))
+    val h2h = Symbol.FullyConnected(s"t${seqIdx}_l${layerIdx}_h2h")(Map("data" -> prevState.h,
+                                                       "weight" -> param.h2hWeight,
+                                                       "bias" -> param.h2hBias,
+                                                       "num_hidden" -> numHidden * 4))
+    val sliceGates = Symbol.SliceChannel(s"t${seqIdx}_l${layerIdx}_slice")(Array(i2h + h2h),
+        Map("num_outputs" -> 4))
+    def gate(slot: Int, actType: String): Symbol =
+      Symbol.Activation()(Map("data" -> sliceGates.get(slot), "act_type" -> actType))
+    val (ingate, inTransform) = (gate(0, "sigmoid"), gate(1, "tanh"))
+    val (forgetGate, outGate) = (gate(2, "sigmoid"), gate(3, "sigmoid"))
+    val nextC = (forgetGate * prevState.c) + (ingate * inTransform)
+    val nextH = outGate * Symbol.Activation()(Map("data" -> nextC, "act_type" -> "tanh"))
+    LSTMState(c = nextC, h = nextH)
+  }
+
+  // we define a new unrolling function here because the original
+  // one in lstm.py concatenates all the labels at the last layer together,
+  // making the mini-batch size of the label different from the data.
+  // I think the existing data-parallelization code needs some modification
+  // to allow this situation to work properly
+  def lstmUnroll(numLstmLayer: Int, seqLen: Int, inputSize: Int, numHidden: Int,
+                              numEmbed: Int, numLabel: Int, dropout: Float = 0f): Symbol = {
+    val embedWeight = Symbol.Variable("embed_weight")
+    val clsWeight = Symbol.Variable("cls_weight")
+    val clsBias = Symbol.Variable("cls_bias")
+
+    var paramCells = Array[LSTMParam]()
+    var lastStates = Array[LSTMState]()
+    for (i <- 0 until numLstmLayer) {
+      paramCells = paramCells :+ LSTMParam(i2hWeight = Symbol.Variable(s"l${i}_i2h_weight"),
+                                           i2hBias = Symbol.Variable(s"l${i}_i2h_bias"),
+                                           h2hWeight = Symbol.Variable(s"l${i}_h2h_weight"),
+                                           h2hBias = Symbol.Variable(s"l${i}_h2h_bias"))
+      lastStates = lastStates :+ LSTMState(c = Symbol.Variable(s"l${i}_init_c"),
+                                                  h = Symbol.Variable(s"l${i}_init_h"))
+    }
+    assert(lastStates.length == numLstmLayer)
+
+    // embedding layer
+    val data = Symbol.Variable("data")
+    var label = Symbol.Variable("softmax_label")
+    val embed = Symbol.Embedding("embed")(Map("data" -> data, "input_dim" -> inputSize,
+                                           "weight" -> embedWeight, "output_dim" -> numEmbed))
+    val wordvec = Symbol.SliceChannel()(Array(embed),
+      Map("num_outputs" -> seqLen, "squeeze_axis" -> true))
+
+    var hiddenAll = Array[Symbol]()
+    var dpRatio = 0f
+    var hidden: Symbol = null
+    for (seqIdx <- 0 until seqLen) {
+      hidden = wordvec.get(seqIdx)
+      // stack LSTM
+      for (i <- 0 until numLstmLayer) {
+        if (i == 0) dpRatio = 0f else dpRatio = dropout
+        val nextState = lstm(numHidden, inData = hidden,
+                                prevState = lastStates(i),
+                                param = paramCells(i),
+                                seqIdx = seqIdx, layerIdx = i, dropout = dpRatio)
+        hidden = nextState.h
+        lastStates(i) = nextState
+      }
+      // decoder
+      if (dropout > 0f) hidden = Symbol.Dropout()(Map("data" -> hidden, "p" -> dropout))
+      hiddenAll = hiddenAll :+ hidden
+    }
+    val hiddenConcat = Symbol.Concat()(hiddenAll, Map("dim" -> 0))
+    val pred = Symbol.FullyConnected("pred")(Map("data" -> hiddenConcat, "num_hidden" -> numLabel,
+                                            "weight" -> clsWeight, "bias" -> clsBias))
+    label = Symbol.transpose(label)
+    label = Symbol.Reshape()(Map("data" -> label, "target_shape" -> "(0,)"))
+    val sm = Symbol.SoftmaxOutput("softmax")(Map("data" -> pred, "label" -> label))
+    sm
+  }
+
+  def lstmInferenceSymbol(numLstmLayer: Int, inputSize: Int, numHidden: Int,
+                              numEmbed: Int, numLabel: Int, dropout: Float = 0f): Symbol = {
+    val seqIdx = 0
+    val embedWeight = Symbol.Variable("embed_weight")
+    val clsWeight = Symbol.Variable("cls_weight")
+    val clsBias = Symbol.Variable("cls_bias")
+
+    var paramCells = Array[LSTMParam]()
+    var lastStates = Array[LSTMState]()
+    for (i <- 0 until numLstmLayer) {
+      paramCells = paramCells :+ LSTMParam(i2hWeight = Symbol.Variable(s"l${i}_i2h_weight"),
+                                           i2hBias = Symbol.Variable(s"l${i}_i2h_bias"),
+                                           h2hWeight = Symbol.Variable(s"l${i}_h2h_weight"),
+                                           h2hBias = Symbol.Variable(s"l${i}_h2h_bias"))
+      lastStates = lastStates :+ LSTMState(c = Symbol.Variable(s"l${i}_init_c"),
+                                            h = Symbol.Variable(s"l${i}_init_h"))
+    }
+    assert(lastStates.length == numLstmLayer)
+
+    val data = Symbol.Variable("data")
+
+    var hidden = Symbol.Embedding("embed")(Map("data" -> data, "input_dim" -> inputSize,
+                                           "weight" -> embedWeight, "output_dim" -> numEmbed))
+
+    var dpRatio = 0f
+    // stack LSTM
+    for (i <- 0 until numLstmLayer) {
+      if (i == 0) dpRatio = 0f else dpRatio = dropout
+      val nextState = lstm(numHidden, inData = hidden,
+                              prevState = lastStates(i),
+                              param = paramCells(i),
+                              seqIdx = seqIdx, layerIdx = i, dropout = dpRatio)
+      hidden = nextState.h
+      lastStates(i) = nextState
+    }
+    // decoder
+    if (dropout > 0f) hidden = Symbol.Dropout()(Map("data" -> hidden, "p" -> dropout))
+    val fc = Symbol.FullyConnected("pred")(Map("data" -> hidden, "num_hidden" -> numLabel,
+                                      "weight" -> clsWeight, "bias" -> clsBias))
+    val sm = Symbol.SoftmaxOutput("softmax")(Map("data" -> fc))
+    var output = Array(sm)
+    for (state <- lastStates) {
+      output = output :+ state.c
+      output = output :+ state.h
+    }
+    Symbol.Group(output: _*)
+  }
+}
diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/RnnModel.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/RnnModel.scala
new file mode 100644
index 000000000000..62aacc5a2e78
--- /dev/null
+++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/RnnModel.scala
@@ -0,0 +1,55 @@
+package ml.dmlc.mxnet.examples.rnn
+
+import ml.dmlc.mxnet.Context
+import ml.dmlc.mxnet.NDArray
+import ml.dmlc.mxnet.Shape
+import ml.dmlc.mxnet.Symbol
+
+object RnnModel {
+  class LSTMInferenceModel(numLstmLayer: Int, inputSize: Int, numHidden: Int,
+                              numEmbed: Int, numLabel: Int, argParams: Map[String, NDArray],
+                              ctx: Context = Context.cpu(), dropout: Float = 0f) {
+    private val sym = Lstm.lstmInferenceSymbol(numLstmLayer,
+                                                              inputSize,
+                                                              numHidden,
+                                                              numEmbed,
+                                                              numLabel,
+                                                              dropout)
+    private val batchSize = 1
+    private val initC = (for (l <- 0 until numLstmLayer)
+                          yield (s"l${l}_init_c" -> Shape(batchSize, numHidden))).toMap
+    private val initH = (for (l <- 0 until numLstmLayer)
+                          yield (s"l${l}_init_h" -> Shape(batchSize, numHidden))).toMap
+    private val dataShape = Map("data" -> Shape(batchSize))
+    private val inputShape = initC ++ initH ++ dataShape
+    private val executor = sym.simpleBind(ctx = ctx, shapeDict = inputShape)
+
+    for (key <- this.executor.argDict.keys) {
+      if (!inputShape.contains(key) && argParams.contains(key) && key != "softmax_label") {
+        argParams(key).copyTo(this.executor.argDict(key))
+      }
+    }
+
+    // names of the state inputs; zipped below with the executor outputs after the first one
+    private val stateNames =
+      (0 until numLstmLayer).flatMap(i => Seq(s"l${i}_init_c", s"l${i}_init_h")).toArray
+
+    private val statesDict = stateNames.zip(this.executor.outputs.drop(1)).toMap
+
+    // One inference step: feeds inputData and returns the first executor output as a flat
+    // array; the states are copied back into the inputs, newSeq = true zeroes them first.
+    def forward(inputData: NDArray, newSeq: Boolean = false): Array[Float] = {
+      if (newSeq) {
+        for (key <- this.statesDict.keys) {
+          this.executor.argDict(key).set(0f)
+        }
+      }
+      inputData.copyTo(this.executor.argDict("data"))
+      this.executor.forward()
+      for ((name, state) <- this.statesDict) {
+        state.copyTo(this.executor.argDict(name))
+      }
+      this.executor.outputs(0).toArray
+    }
+  }
+}
diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/TestCharRnn.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/TestCharRnn.scala
new file mode 100644
index 000000000000..a3351ff12557
--- /dev/null
+++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/TestCharRnn.scala
@@ -0,0 +1,88 @@
+package ml.dmlc.mxnet.examples.rnn
+
+import ml.dmlc.mxnet._
+import org.kohsuke.args4j.{CmdLineParser, Option}
+import org.slf4j.LoggerFactory
+import scala.collection.JavaConverters._
+
+/**
+ * Follows the demo, to test the char rnn:
+ * https://github.com/dmlc/mxnet/blob/master/example/rnn/char-rnn.ipynb
+ * @author Depeng Liang
+ */
+object TestCharRnn {
+
+  private val logger = LoggerFactory.getLogger(classOf[TestCharRnn])
+
+  // Entry point: restores the epoch 75 checkpoint stored under --model-prefix,
+  // primes the model with --starter-sentence and logs the text it generates.
+  def main(args: Array[String]): Unit = {
+    val stcr = new TestCharRnn
+    val parser: CmdLineParser = new CmdLineParser(stcr)
+    try {
+      parser.parseArgument(args.toList.asJava)
+      // require throws an Exception (assert does not), so the handler below prints the usage
+      require(stcr.dataPath != null && stcr.modelPrefix != null && stcr.starterSentence != null,
+        "--data-path, --model-prefix and --starter-sentence are all mandatory")
+      require(stcr.starterSentence.nonEmpty, "--starter-sentence must not be empty")
+
+      // hidden unit in LSTM cell
+      val numHidden = 512
+      // embedding dimension, which is, map a char to a 256 dim vector
+      val numEmbed = 256
+      // number of lstm layer
+      val numLstmLayer = 3
+
+      // build char vocabulary from input
+      val vocab = Utils.buildVocab(stcr.dataPath)
+
+      // load from check-point
+      val (_, argParams, _) = Model.loadCheckpoint(stcr.modelPrefix, 75)
+
+      // build an inference model
+      val model = new RnnModel.LSTMInferenceModel(numLstmLayer, vocab.size + 1,
+                           numHidden = numHidden, numEmbed = numEmbed,
+                           numLabel = vocab.size + 1, argParams = argParams, dropout = 0.2f)
+
+      // run the model for 1200 steps, the starter sentence included
+      val seqLength = 1200
+      val inputNdarray = NDArray.zeros(1)
+      val revertVocab = Utils.makeRevertVocab(vocab)
+
+      // the generated text is appended to the starter sentence
+      var output = stcr.starterSentence
+      var newSentence = true
+      val ignoreLength = output.length()
+
+      for (i <- 0 until seqLength) {
+        val inputChar = if (i < ignoreLength) output(i) else output.last
+        Utils.makeInput(inputChar, vocab, inputNdarray)
+        val prob = model.forward(inputNdarray, newSentence)
+        newSentence = false
+        val nextChar = Utils.makeOutput(prob, revertVocab, sample = true)
+        if (nextChar.isEmpty) newSentence = true
+        // the prediction made for the last starter char is already the first generated char
+        if (i >= ignoreLength - 1) output += nextChar
+      }
+
+      // Let's see what we can learn from the chars in Obama's speech.
+      logger.info(output)
+    } catch {
+      case ex: Exception => {
+        logger.error(ex.getMessage, ex)
+        parser.printUsage(System.err)
+        sys.exit(1)
+      }
+    }
+  }
+}
+
+// Command line options of TestCharRnn, filled in by CmdLineParser.parseArgument.
+class TestCharRnn {
+  @Option(name = "--data-path", usage = "the input train data file")
+  private val dataPath: String = null
+  @Option(name = "--model-prefix", usage = "the model prefix")
+  private val modelPrefix: String = null
+  @Option(name = "--starter-sentence", usage = "the starter sentence")
+  private val starterSentence: String = null
+}
diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/TrainCharRnn.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/TrainCharRnn.scala
new file mode 100644
index 000000000000..ef15636f836b
--- /dev/null
+++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/TrainCharRnn.scala
@@ -0,0 +1,160 @@
+package ml.dmlc.mxnet.examples.rnn
+
+import ml.dmlc.mxnet._
+import org.kohsuke.args4j.{CmdLineParser, Option}
+import org.slf4j.LoggerFactory
+import scala.collection.JavaConverters._
+import ml.dmlc.mxnet.optimizer.Adam
+
+/**
+ * Follows the demo, to train the char rnn:
+ * https://github.com/dmlc/mxnet/blob/master/example/rnn/char-rnn.ipynb
+ * @author Depeng Liang
+ */
+object TrainCharRnn {
+
+  private val logger = LoggerFactory.getLogger(classOf[TrainCharRnn])
+
+  // Entry point: trains the unrolled LSTM on --data-path for numEpoch epochs and
+  // saves a checkpoint below --save-model-path at the end of every epoch.
+  def main(args: Array[String]): Unit = {
+    val incr = new TrainCharRnn
+    val parser: CmdLineParser = new CmdLineParser(incr)
+    try {
+      parser.parseArgument(args.toList.asJava)
+      // require throws an Exception (assert does not), so the handler below prints the usage
+      require(incr.dataPath != null && incr.saveModelPath != null,
+        "--data-path and --save-model-path are both mandatory")
+
+      // The batch size for training
+      val batchSize = 32
+      // We can support various length input
+      // For this problem, we cut each input sentence to length of 129
+      // So we only need fix length bucket
+      val buckets = List(129)
+      // hidden unit in LSTM cell
+      val numHidden = 512
+      // embedding dimension, which is, map a char to a 256 dim vector
+      val numEmbed = 256
+      // number of lstm layer
+      val numLstmLayer = 3
+      // number of passes over the training data
+      val numEpoch = 75
+      // learning rate
+      val learningRate = 0.001f
+
+      // train on the cpu unless a gpu id was given
+      val ctx = if (incr.gpu == -1) Context.cpu() else Context.gpu(incr.gpu)
+      val vocab = Utils.buildVocab(incr.dataPath)
+
+      // generate symbol for a length
+      def symGen(seqLen: Int): Symbol = {
+        Lstm.lstmUnroll(numLstmLayer, seqLen, vocab.size + 1,
+                    numHidden = numHidden, numEmbed = numEmbed,
+                    numLabel = vocab.size + 1, dropout = 0.2f)
+      }
+
+      // initialize states for LSTM
+      val initC = for (l <- 0 until numLstmLayer) yield (s"l${l}_init_c", (batchSize, numHidden))
+      val initH = for (l <- 0 until numLstmLayer) yield (s"l${l}_init_h", (batchSize, numHidden))
+      val initStates = initC ++ initH
+
+      val dataTrain = new ButketIo.BucketSentenceIter(incr.dataPath, vocab, buckets,
+                                          batchSize, initStates, seperateChar = "\n",
+                                          text2Id = Utils.text2Id, readContent = Utils.readContent)
+
+      // the network symbol
+      val symbol = symGen(buckets(0))
+
+      val datasAndLabels = dataTrain.provideData ++ dataTrain.provideLabel
+      val (argShapes, _, auxShapes) = symbol.inferShape(datasAndLabels)
+
+      val initializer = new Xavier(factorType = "in", magnitude = 2.34f)
+
+      val argNames = symbol.listArguments()
+      val argDict = argNames.zip(argShapes.map(NDArray.zeros(_, ctx))).toMap
+      val auxNames = symbol.listAuxiliaryStates()
+      val auxDict = auxNames.zip(auxShapes.map(NDArray.zeros(_, ctx))).toMap
+
+      // gradient buffers exist only for arguments that are not fed by the iterator
+      val gradDict = argNames.zip(argShapes).filter { case (name, shape) =>
+        !datasAndLabels.contains(name)
+      }.map(x => x._1 -> NDArray.empty(x._2, ctx) ).toMap
+
+      // initialize every argument that is not fed by the iterator
+      argDict.foreach { case (name, ndArray) =>
+        if (!datasAndLabels.contains(name)) {
+          initializer.initWeight(name, ndArray)
+        }
+      }
+
+      val data = argDict("data")
+      val label = argDict("softmax_label")
+
+      // bind the symbol to the argument and gradient arrays
+      val executor = symbol.bind(ctx, argDict, gradDict)
+
+      val opt = new Adam(learningRate = learningRate, wd = 0.0001f)
+
+      // pair every gradient with its index, its name and a fresh optimizer state
+      val paramsGrads = gradDict.toList.zipWithIndex.map { case ((name, grad), idx) =>
+        (idx, name, grad, opt.createState(idx, argDict(name)))
+      }
+
+      val evalMetric = new CustomMetric(Utils.perplexity, "perplexity")
+      val batchEndCallback = new Callback.Speedometer(batchSize, 50)
+      val epochEndCallback = Utils.doCheckpoint(s"${incr.saveModelPath}/obama")
+
+      for (epoch <- 0 until numEpoch) {
+        // Training phase
+        val tic = System.currentTimeMillis
+        evalMetric.reset()
+        var nBatch = 0
+        // Iterate over training data.
+        dataTrain.reset()
+        while (dataTrain.hasNext) {
+          val dataBatch = dataTrain.next()
+
+          data.set(dataBatch.data(0))
+          label.set(dataBatch.label(0))
+          executor.forward(isTrain = true)
+          executor.backward()
+          paramsGrads.foreach { case (idx, name, grad, optimState) =>
+            opt.update(idx, argDict(name), grad, optimState)
+          }
+
+          // evaluate at end, so out_cpu_array can lazy copy
+          evalMetric.update(dataBatch.label, executor.outputs)
+
+          nBatch += 1
+          batchEndCallback.invoke(epoch, nBatch, evalMetric)
+        }
+        // this epoch is done, rewind the iterator
+        dataTrain.reset()
+        val (name, value) = evalMetric.get
+        logger.info(s"Epoch[$epoch] Train-$name=$value")
+        val toc = System.currentTimeMillis
+        logger.info(s"Epoch[$epoch] Time cost=${toc - tic}")
+
+        epochEndCallback.invoke(epoch, symbol, argDict, auxDict)
+      }
+      executor.dispose()
+    } catch {
+      case ex: Exception => {
+        logger.error(ex.getMessage, ex)
+        parser.printUsage(System.err)
+        sys.exit(1)
+      }
+    }
+  }
+}
+
+// Command line options of TrainCharRnn, filled in by CmdLineParser.parseArgument.
+class TrainCharRnn {
+  @Option(name = "--data-path", usage = "the input train data file")
+  private val dataPath: String = null
+  @Option(name = "--save-model-path", usage = "the model saving path")
+  private val saveModelPath: String = null
+  @Option(name = "--gpu", usage = "which gpu card to use, default is -1, means using cpu")
+  private val gpu: Int = -1
+}
diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Utils.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Utils.scala
new file mode 100644
index 000000000000..a65655a6c4cb
--- /dev/null
+++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Utils.scala
@@ -0,0 +1,137 @@
+package ml.dmlc.mxnet.examples.rnn
+
+import scala.io.Source
+import ml.dmlc.mxnet.EvalMetric
+import ml.dmlc.mxnet.NDArray
+import ml.dmlc.mxnet.EpochEndCallback
+import ml.dmlc.mxnet.Model
+import ml.dmlc.mxnet.Symbol
+import scala.util.Random
+
+/**
+ * @author Depeng Liang
+ */
+object Utils {
+
+  // Read the whole file at path into a string; the underlying source is closed
+  // afterwards so that the file handle is not leaked.
+  def readContent(path: String): String = {
+    val source = Source.fromFile(path)
+    try source.mkString finally source.close()
+  }
+
+  // Build a vocabulary of what chars we have in the content.
+  // Ids are handed out from 1 in order of first appearance, 0 is left for zero padding.
+  // The content is split on "\n", so the newline itself never gets an id.
+  def buildVocab(path: String): Map[String, Int] = {
+    val lines = readContent(path).split("\n")
+    // keep the first occurrence of every char
+    val distinctChars = lines.flatMap(_.toSeq).distinct
+    distinctChars.zipWithIndex.map { case (char, pos) =>
+      char.toString -> (pos + 1)
+    }.toMap
+  }
+
+  // We will assign each char with a special numerical id: the result holds, char by
+  // char, the id that theVocab stores for it (an unknown char makes the lookup fail).
+  def text2Id(sentence: String, theVocab: Map[String, Int]): Array[Int] = {
+    sentence.toArray.map(symbol => theVocab(symbol.toString))
+  }
+
+  // Evaluation: perplexity, i.e. exp of the mean negative log-likelihood of the labels.
+  // label is read as a (shape(0), shape(1)) matrix and flattened column by column;
+  // entry i of that flattening is taken as the class index for row i of pred
+  // (presumably pred is laid out time step by time step -- confirm against lstmUnroll).
+  def perplexity(label: NDArray, pred: NDArray): Float = {
+    val shape = label.shape
+    val size = shape(0) * shape(1)
+    // transposing before flattening gives the column-major order
+    val labelT = label.toArray.grouped(shape(1)).toArray.transpose.flatten
+    val predArray = pred.toArray.grouped(pred.shape(1)).toArray
+
+    var loss = 0f
+    for (i <- 0 until pred.shape(0)) {
+      // clamp the probability so that the log stays finite
+      val prob = Math.max(1e-10, predArray(i)(labelT(i).toInt)).toFloat
+      loss += -Math.log(prob).toFloat
+    }
+
+    // loss / size alone is the mean negative log-likelihood (cross entropy);
+    // the perplexity this metric is named after is the exponential of it
+    val meanLoss = loss / size
+    Math.exp(meanLoss).toFloat
+  }
+
+  def doCheckpoint(prefix: String): EpochEndCallback = new EpochEndCallback {
+    override def invoke(epoch: Int, symbol: Symbol,
+                    argParams: Map[String, NDArray],
+                    auxStates: Map[String, NDArray]): Unit = {
+      Model.saveCheckpoint(prefix, epoch + 1, symbol, argParams, auxStates)
+    }
+  }
+
+  // helper structure for prediction: the inverse (id -> char) view of the vocabulary.
+  // Ids are expected to be unique; should two chars share an id, the one
+  // visited last while iterating over vocab is the one that is kept.
+  def makeRevertVocab(vocab: Map[String, Int]): Map[Int, String] = {
+    vocab.foldLeft(Map.empty[Int, String]) { case (reverted, (char, id)) =>
+      reverted.updated(id, char)
+    }
+  }
+
+  // make input from char: write the vocabulary id of char into arr.
+  // The id is stored straight into arr, so no temporary NDArray is allocated per call.
+  // NOTE: set fills every element of arr with the id -- arr is meant to hold one element.
+  def makeInput(char: Char, vocab: Map[String, Int], arr: NDArray): Unit = {
+    val idx = vocab(s"$char")
+    arr.set(idx)
+  }
+
+  // helper function for random sample: the normalised running totals of weights,
+  // i.e. entry i is (weights(0) + ... + weights(i)) / weights.sum.
+  // An empty input yields an empty result.
+  def cdf(weights: Array[Float]): Array[Float] = {
+    val total = weights.sum
+
+    // scanLeft starts with the seed 0, which is not part of the result;
+    // the additions run left to right, one weight at a time
+    val runningTotals = weights.scanLeft(0f)(_ + _).tail
+    runningTotals.map(_ / total)
+  }
+
+  // Draw one element of population at random; element i is picked with a
+  // probability proportional to weights(i).
+  def choice(population: Array[String], weights: Array[Float]): String = {
+    assert(population.length == weights.length)
+
+    val cdfVals = cdf(weights)
+    val x = Random.nextFloat()
+
+    // first position whose cumulative weight reaches x
+    val hit = cdfVals.indexWhere(_ >= x)
+    // no position qualifies (e.g. NaN weights): fall back to the first element
+    val idx = if (hit < 0) 0 else hit
+    population(idx)
+  }
+
+  // we can use random output or fixed output by choosing largest probability
+  def makeOutput(prob: Array[Float], vocab: Map[Int, String],
+      sample: Boolean = false, temperature: Float = 1f): String = {
+    var idx = -1
+    val char = if (sample == false) {
+      idx = ((-1f, -1) /: prob.zipWithIndex) { (max, elem) =>
+        if (max._1 < elem._1) elem else max
+      }._2
+      // an id that is missing from vocab maps to the empty string
+      vocab.getOrElse(idx, "")
+    } else {
+      val fixDict = Array("") ++ (1 until vocab.size + 1).map(i => vocab(i))
+      // clamp into [1e-6, 1 - 1e-6] before taking the log
+      val scaleProb = prob.map(x => if (x < 1e-6) 1e-6 else if (x > 1 - 1e-6) 1 - 1e-6 else x)
+      val tempered = scaleProb.map(x => Math.exp(Math.log(x) / temperature).toFloat)
+      val sum = tempered.sum.toFloat
+      choice(fixDict, tempered.map(_ / sum))
+    }
+    char
+  }
+}

From 1daf89a48ec47e2a9c097c346c7fed68c0f7d3aa Mon Sep 17 00:00:00 2001
From: Ldpe2G <liangdepeng@mail.com>
Date: Mon, 18 Jul 2016 09:20:12 +0800
Subject: [PATCH 108/126] fix some code style

---
 .../ml/dmlc/mxnet/examples/rnn/ButketIo.scala | 15 ++++++------
 .../ml/dmlc/mxnet/examples/rnn/Lstm.scala     | 24 +++++++++----------
 .../ml/dmlc/mxnet/examples/rnn/RnnModel.scala | 14 +++++------
 .../ml/dmlc/mxnet/examples/rnn/Utils.scala    |  6 ++---
 4 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/ButketIo.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/ButketIo.scala
index 8573bbdbe05d..9a11f6fa2950 100644
--- a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/ButketIo.scala
+++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/ButketIo.scala
@@ -15,8 +15,8 @@ object ButketIo {
 
   def defaultReadContent(path: String): String = {
     val content = Source.fromFile(path).mkString
-                                        .replaceAll("\n", " <eos> ")
-                                        .replaceAll(". ", " <eos> ")
+                        .replaceAll("\n", " <eos> ")
+                        .replaceAll(". ", " <eos> ")
     content
   }
 
@@ -29,7 +29,7 @@ object ButketIo {
   }
 
   def defaultGenBuckets(sentences: Array[String], batchSize: Int,
-      theVocab: Map[String, Int]): List[Int] = {
+                        theVocab: Map[String, Int]): List[Int] = {
     val lenDict = scala.collection.mutable.Map[Int, Int]()
     var maxLen = -1
     for (sentence <- sentences) {
@@ -59,7 +59,8 @@ object ButketIo {
     buckets
   }
 
-  class BucketSentenceIter(path: String, vocab: Map[String, Int], var buckets: List[Int],
+  class BucketSentenceIter(
+      path: String, vocab: Map[String, Int], var buckets: List[Int],
       _batchSize: Int, initStates: IndexedSeq[(String, (Int, Int))],
       seperateChar: String = " <eos> ", text2Id: Text2Id = defaultText2Id,
       readContent: ReadContent = defaultReadContent) extends DataIter {
@@ -151,9 +152,9 @@ object ButketIo {
 
       iBucket += 1
       new DataBatch(IndexedSeq(dataBuf),
-                                   IndexedSeq(labelBuf),
-                                   getIndex(),
-                                   getPad())
+                    IndexedSeq(labelBuf),
+                    getIndex(),
+                    getPad())
     }
 
     /**
diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Lstm.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Lstm.scala
index 6ac01a8d6644..0e2e5f7de66b 100644
--- a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Lstm.scala
+++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Lstm.scala
@@ -10,11 +10,11 @@ object Lstm {
 
   final case class LSTMState(c: Symbol, h: Symbol)
   final case class LSTMParam(i2hWeight: Symbol, i2hBias: Symbol,
-                                                         h2hWeight: Symbol, h2hBias: Symbol)
+                             h2hWeight: Symbol, h2hBias: Symbol)
 
   // LSTM Cell symbol
   def lstm(numHidden: Int, inData: Symbol, prevState: LSTMState,
-                   param: LSTMParam, seqIdx: Int, layerIdx: Int, dropout: Float = 0f): LSTMState = {
+           param: LSTMParam, seqIdx: Int, layerIdx: Int, dropout: Float = 0f): LSTMState = {
     val inDataa = {
       if (dropout > 0f) Symbol.Dropout()(Map("data" -> inData, "p" -> dropout))
       else inData
@@ -45,7 +45,7 @@ object Lstm {
   // I think the existing data-parallelization code need some modification
   // to allow this situation to work properly
   def lstmUnroll(numLstmLayer: Int, seqLen: Int, inputSize: Int, numHidden: Int,
-                              numEmbed: Int, numLabel: Int, dropout: Float = 0f): Symbol = {
+                 numEmbed: Int, numLabel: Int, dropout: Float = 0f): Symbol = {
     val embedWeight = Symbol.Variable("embed_weight")
     val clsWeight = Symbol.Variable("cls_weight")
     val clsBias = Symbol.Variable("cls_bias")
@@ -58,7 +58,7 @@ object Lstm {
                                            h2hWeight = Symbol.Variable(s"l${i}_h2h_weight"),
                                            h2hBias = Symbol.Variable(s"l${i}_h2h_bias"))
       lastStates = lastStates :+ LSTMState(c = Symbol.Variable(s"l${i}_init_c"),
-                                                  h = Symbol.Variable(s"l${i}_init_h"))
+                                           h = Symbol.Variable(s"l${i}_init_h"))
     }
     assert(lastStates.length == numLstmLayer)
 
@@ -79,9 +79,9 @@ object Lstm {
       for (i <- 0 until numLstmLayer) {
         if (i == 0) dpRatio = 0f else dpRatio = dropout
         val nextState = lstm(numHidden, inData = hidden,
-                                prevState = lastStates(i),
-                                param = paramCells(i),
-                                seqIdx = seqIdx, layerIdx = i, dropout = dpRatio)
+                             prevState = lastStates(i),
+                             param = paramCells(i),
+                             seqIdx = seqIdx, layerIdx = i, dropout = dpRatio)
         hidden = nextState.h
         lastStates(i) = nextState
       }
@@ -99,7 +99,7 @@ object Lstm {
   }
 
   def lstmInferenceSymbol(numLstmLayer: Int, inputSize: Int, numHidden: Int,
-                              numEmbed: Int, numLabel: Int, dropout: Float = 0f): Symbol = {
+                          numEmbed: Int, numLabel: Int, dropout: Float = 0f): Symbol = {
     val seqIdx = 0
     val embedWeight = Symbol.Variable("embed_weight")
     val clsWeight = Symbol.Variable("cls_weight")
@@ -113,7 +113,7 @@ object Lstm {
                                            h2hWeight = Symbol.Variable(s"l${i}_h2h_weight"),
                                            h2hBias = Symbol.Variable(s"l${i}_h2h_bias"))
       lastStates = lastStates :+ LSTMState(c = Symbol.Variable(s"l${i}_init_c"),
-                                            h = Symbol.Variable(s"l${i}_init_h"))
+                                           h = Symbol.Variable(s"l${i}_init_h"))
     }
     assert(lastStates.length == numLstmLayer)
 
@@ -127,9 +127,9 @@ object Lstm {
     for (i <- 0 until numLstmLayer) {
       if (i == 0) dpRatio = 0f else dpRatio = dropout
       val nextState = lstm(numHidden, inData = hidden,
-                              prevState = lastStates(i),
-                              param = paramCells(i),
-                              seqIdx = seqIdx, layerIdx = i, dropout = dpRatio)
+                           prevState = lastStates(i),
+                           param = paramCells(i),
+                           seqIdx = seqIdx, layerIdx = i, dropout = dpRatio)
       hidden = nextState.h
       lastStates(i) = nextState
     }
diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/RnnModel.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/RnnModel.scala
index 62aacc5a2e78..b91835f7d076 100644
--- a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/RnnModel.scala
+++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/RnnModel.scala
@@ -7,14 +7,14 @@ import ml.dmlc.mxnet.Symbol
 
 object RnnModel {
   class LSTMInferenceModel(numLstmLayer: Int, inputSize: Int, numHidden: Int,
-                              numEmbed: Int, numLabel: Int, argParams: Map[String, NDArray],
-                              ctx: Context = Context.cpu(), dropout: Float = 0f) {
+                           numEmbed: Int, numLabel: Int, argParams: Map[String, NDArray],
+                           ctx: Context = Context.cpu(), dropout: Float = 0f) {
     private val sym = Lstm.lstmInferenceSymbol(numLstmLayer,
-                                                              inputSize,
-                                                              numHidden,
-                                                              numEmbed,
-                                                              numLabel,
-                                                              dropout)
+                                               inputSize,
+                                               numHidden,
+                                               numEmbed,
+                                               numLabel,
+                                               dropout)
     private val batchSize = 1
     private val initC = (for (l <- 0 until numLstmLayer)
                           yield (s"l${l}_init_c" -> Shape(batchSize, numHidden))).toMap
diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Utils.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Utils.scala
index a65655a6c4cb..9fba8c1fd3e4 100644
--- a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Utils.scala
+++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Utils.scala
@@ -64,8 +64,8 @@ object Utils {
 
   def doCheckpoint(prefix: String): EpochEndCallback = new EpochEndCallback {
     override def invoke(epoch: Int, symbol: Symbol,
-                    argParams: Map[String, NDArray],
-                    auxStates: Map[String, NDArray]): Unit = {
+                        argParams: Map[String, NDArray],
+                        auxStates: Map[String, NDArray]): Unit = {
       Model.saveCheckpoint(prefix, epoch + 1, symbol, argParams, auxStates)
     }
   }
@@ -116,7 +116,7 @@ object Utils {
 
   // we can use random output or fixed output by choosing largest probability
   def makeOutput(prob: Array[Float], vocab: Map[Int, String],
-      sample: Boolean = false, temperature: Float = 1f): String = {
+                 sample: Boolean = false, temperature: Float = 1f): String = {
     var idx = -1
     val char = if (sample == false) {
       idx = ((-1f, -1) /: prob.zipWithIndex) { (max, elem) =>

From 72ea6f981979bcea2dc908b76fa079fd70004e75 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Mon, 18 Jul 2016 10:48:23 -0700
Subject: [PATCH 109/126] Refactor symbol ndarray register mechanism (#2745)

---
 python/mxnet/_ndarray_internal.py |   1 +
 python/mxnet/_symbol_internal.py  |   1 +
 python/mxnet/ndarray.py           | 102 +++++++++++++++---------------
 python/mxnet/operator.py          |  14 ++--
 python/mxnet/random.py            |   7 +-
 python/mxnet/symbol.py            |  47 +++++++-------
 6 files changed, 89 insertions(+), 83 deletions(-)
 create mode 100644 python/mxnet/_ndarray_internal.py
 create mode 100644 python/mxnet/_symbol_internal.py

diff --git a/python/mxnet/_ndarray_internal.py b/python/mxnet/_ndarray_internal.py
new file mode 100644
index 000000000000..cbe2bcd96220
--- /dev/null
+++ b/python/mxnet/_ndarray_internal.py
@@ -0,0 +1 @@
+"""NDArray namespace used to register internal functions"""
diff --git a/python/mxnet/_symbol_internal.py b/python/mxnet/_symbol_internal.py
new file mode 100644
index 000000000000..d798f8d3704a
--- /dev/null
+++ b/python/mxnet/_symbol_internal.py
@@ -0,0 +1 @@
+"""Symbol namespace used to register internal functions"""
diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py
index e26adbfbc45f..b23ded4418fb 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray.py
@@ -1,5 +1,5 @@
 # coding: utf-8
-# pylint: disable= too-many-lines, redefined-builtin
+# pylint: disable= too-many-lines, redefined-builtin, protected-access
 """NDArray API of mxnet."""
 from __future__ import absolute_import
 from __future__ import division
@@ -16,6 +16,7 @@
 from .base import ctypes2buffer
 from .base import check_call, ctypes2docstring
 from .context import Context
+from . import _ndarray_internal as _internal
 
 # pylint: disable= no-member
 _DTYPE_NP_TO_MX = {
@@ -103,9 +104,9 @@ def __iadd__(self, other):
         if not self.writable:
             raise ValueError('trying to add to a readonly NDArray')
         if isinstance(other, NDArray):
-            return NDArray._plus(self, other, out=self)
+            return _internal._plus(self, other, out=self)
         elif isinstance(other, numeric_types):
-            return NDArray._plus_scalar(self, float(other), out=self)
+            return _internal._plus_scalar(self, float(other), out=self)
         else:
             raise TypeError('type %s not supported' % str(type(other)))
 
@@ -119,9 +120,9 @@ def __isub__(self, other):
         if not self.writable:
             raise ValueError('trying to subtract from a readonly NDArray')
         if isinstance(other, NDArray):
-            return NDArray._minus(self, other, out=self)
+            return _internal._minus(self, other, out=self)
         elif isinstance(other, numeric_types):
-            return NDArray._minus_scalar(self, float(other), out=self)
+            return _internal._minus_scalar(self, float(other), out=self)
         else:
             raise TypeError('type %s not supported' % str(type(other)))
 
@@ -132,15 +133,15 @@ def __mul__(self, other):
         return multiply(self, other)
 
     def __neg__(self):
-        return NDArray._mul_scalar(self, -1.0)
+        return _internal._mul_scalar(self, -1.0)
 
     def __imul__(self, other):
         if not self.writable:
             raise ValueError('trying to multiply to a readonly NDArray')
         if isinstance(other, NDArray):
-            return NDArray._mul(self, other, out=self)
+            return _internal._mul(self, other, out=self)
         elif isinstance(other, numeric_types):
-            return NDArray._mul_scalar(self, float(other), out=self)
+            return _internal._mul_scalar(self, float(other), out=self)
         else:
             raise TypeError('type %s not supported' % str(type(other)))
 
@@ -157,9 +158,9 @@ def __idiv__(self, other):
         if not self.writable:
             raise ValueError('trying to divide from a readonly NDArray')
         if isinstance(other, NDArray):
-            return NDArray._div(self, other, out=self)
+            return _internal._div(self, other, out=self)
         elif isinstance(other, numeric_types):
-            return NDArray._div_scalar(self, float(other), out=self)
+            return _internal._div_scalar(self, float(other), out=self)
         else:
             raise TypeError('type %s not supported' % str(type(other)))
 
@@ -219,7 +220,7 @@ def __setitem__(self, in_slice, value):
             if value.handle is not self.handle:
                 value.copyto(self)
         elif isinstance(value, numeric_types):
-            NDArray._set_value(float(value), out=self)
+            _internal._set_value(float(value), out=self)
         elif isinstance(value, (np.ndarray, np.generic)):
             self._sync_copyfrom(value)
         else:
@@ -455,10 +456,10 @@ def copyto(self, other):
                 warnings.warn('copy an array to itself, is it intended?',
                               RuntimeWarning)
                 return
-            return NDArray._copyto(self, out=other)
+            return _internal._copyto(self, out=other)
         elif isinstance(other, Context):
             hret = NDArray(_new_alloc_handle(self.shape, other, True, self.dtype))
-            return NDArray._copyto(self, out=hret)
+            return _internal._copyto(self, out=hret)
         else:
             raise TypeError('copyto do not support type ' + str(type(other)))
 
@@ -510,7 +511,7 @@ def onehot_encode(indices, out):
         Same as out.
     """
     # pylint: disable= no-member, protected-access
-    return NDArray._onehot_encode(indices, out, out=out)
+    return _internal._onehot_encode(indices, out, out=out)
     # pylint: enable= no-member, protected-access
 
 
@@ -611,9 +612,9 @@ def add(lhs, rhs):
     return _ufunc_helper(
         lhs,
         rhs,
-        NDArray._plus,
+        _internal._plus,
         operator.add,
-        NDArray._plus_scalar,
+        _internal._plus_scalar,
         None)
     # pylint: enable= no-member, protected-access
 
@@ -637,10 +638,10 @@ def subtract(lhs, rhs):
     return _ufunc_helper(
         lhs,
         rhs,
-        NDArray._minus,
+        _internal._minus,
         operator.sub,
-        NDArray._minus_scalar,
-        NDArray._rminus_scalar)
+        _internal._minus_scalar,
+        _internal._rminus_scalar)
     # pylint: enable= no-member, protected-access
 
 def multiply(lhs, rhs):
@@ -663,9 +664,9 @@ def multiply(lhs, rhs):
     return _ufunc_helper(
         lhs,
         rhs,
-        NDArray._mul,
+        _internal._mul,
         operator.mul,
-        NDArray._mul_scalar,
+        _internal._mul_scalar,
         None)
     # pylint: enable= no-member, protected-access
 
@@ -689,10 +690,10 @@ def divide(lhs, rhs):
     return _ufunc_helper(
         lhs,
         rhs,
-        NDArray._div,
+        _internal._div,
         operator.truediv,
-        NDArray._div_scalar,
-        NDArray._rdiv_scalar)
+        _internal._div_scalar,
+        _internal._rdiv_scalar)
     # pylint: enable= no-member, protected-access
 
 def power(lhs, rhs):
@@ -715,10 +716,10 @@ def power(lhs, rhs):
     return _ufunc_helper(
         lhs,
         rhs,
-        NDArray._power,
+        _internal._power,
         operator.pow,
-        NDArray._power_scalar,
-        NDArray._rpower_scalar)
+        _internal._power_scalar,
+        _internal._rpower_scalar)
     # pylint: enable= no-member, protected-access
 
 def maximum(lhs, rhs):
@@ -741,9 +742,9 @@ def maximum(lhs, rhs):
     return _ufunc_helper(
         lhs,
         rhs,
-        NDArray._maximum,
+        _internal._maximum,
         lambda x, y: x if x > y else y,
-        NDArray._maximum_scalar,
+        _internal._maximum_scalar,
         None)
     # pylint: enable= no-member, protected-access
 
@@ -767,9 +768,9 @@ def minimum(lhs, rhs):
     return _ufunc_helper(
         lhs,
         rhs,
-        NDArray._minimum,
+        _internal._minimum,
         lambda x, y: x if x < y else y,
-        NDArray._minimum_scalar,
+        _internal._minimum_scalar,
         None)
     # pylint: enable= no-member, protected-access
 
@@ -1013,24 +1014,24 @@ def imdecode(str_img, clip_rect=(0, 0, 0, 0), out=None, index=0, channels=3, mea
     if mean is None:
         mean = NDArray(_new_empty_handle())
     if out is None:
-        return NDArray._imdecode(mean, index,
-                                 clip_rect[0],
-                                 clip_rect[1],
-                                 clip_rect[2],
-                                 clip_rect[3],
-                                 channels,
-                                 len(str_img),
-                                 str_img=str_img)
+        return _internal._imdecode(mean, index,
+                                   clip_rect[0],
+                                   clip_rect[1],
+                                   clip_rect[2],
+                                   clip_rect[3],
+                                   channels,
+                                   len(str_img),
+                                   str_img=str_img)
     else:
-        return NDArray._imdecode(mean, index,
-                                 clip_rect[0],
-                                 clip_rect[1],
-                                 clip_rect[2],
-                                 clip_rect[3],
-                                 channels,
-                                 len(str_img),
-                                 str_img=str_img,
-                                 out=out)
+        return _internal._imdecode(mean, index,
+                                   clip_rect[0],
+                                   clip_rect[1],
+                                   clip_rect[2],
+                                   clip_rect[3],
+                                   channels,
+                                   len(str_img),
+                                   str_img=str_img,
+                                   out=out)
 
 # pylint: disable=too-many-locals, invalid-name
 def _make_ndarray_function(handle):
@@ -1196,12 +1197,13 @@ def _init_ndarray_module():
                                     ctypes.byref(plist)))
 
     module_obj = sys.modules[__name__]
+    module_internal = sys.modules["mxnet._ndarray_internal"]
     for i in range(size.value):
         hdl = FunctionHandle(plist[i])
         function = _make_ndarray_function(hdl)
-        # if function name starts with underscore, register as static method of NDArray
+        # if function name starts with underscore, register it in the internal namespace
         if function.__name__.startswith('_'):
-            setattr(NDArray, function.__name__, staticmethod(function))
+            setattr(module_internal, function.__name__, function)
         else:
             fname = function.__name__
             fn_obj = getattr(module_obj, fname, None)
diff --git a/python/mxnet/operator.py b/python/mxnet/operator.py
index 9e186eed926f..e9c0e385022d 100644
--- a/python/mxnet/operator.py
+++ b/python/mxnet/operator.py
@@ -211,10 +211,10 @@ def list_arguments_entry(out, _):
                                  None, None, None, None, None)
         cb_ptr = format(cast(pointer(self.info_), c_void_p).value, 'x')
         # pylint: disable=E1101
-        sym = symbol.Symbol._Native(*args,
-                                    info=cb_ptr,
-                                    need_top_grad=self.need_top_grad(),
-                                    **kwargs)
+        sym = symbol._internal._Native(*args,
+                                       info=cb_ptr,
+                                       need_top_grad=self.need_top_grad(),
+                                       **kwargs)
         # keep a reference of ourself in PythonOp so we don't get garbage collected.
         PythonOp._ref_holder.append(self)
         return sym
@@ -358,9 +358,9 @@ def declare_backward_dependency(out_grad, in_data, out_data, num_dep, deps, _):
                                    None, None, None, None, None, None)
         cb_ptr = format(cast(pointer(self.info_), c_void_p).value, 'x')
         # pylint: disable=E1101
-        sym = symbol.Symbol._NDArray(*args,
-                                     info=cb_ptr,
-                                     **kwargs)
+        sym = symbol._internal._NDArray(*args,
+                                        info=cb_ptr,
+                                        **kwargs)
         # keep a reference of ourself in PythonOp so we don't get garbage collected.
         PythonOp._ref_holder.append(self)
         return sym
diff --git a/python/mxnet/random.py b/python/mxnet/random.py
index a3eec80e8122..b54e40d653bb 100644
--- a/python/mxnet/random.py
+++ b/python/mxnet/random.py
@@ -5,7 +5,8 @@
 
 import ctypes
 from .base import _LIB, check_call
-from .ndarray import NDArray, empty
+from .ndarray import empty
+from . import _ndarray_internal as _internal
 
 
 def uniform(low, high, shape=None, ctx=None, out=None):
@@ -38,7 +39,7 @@ def uniform(low, high, shape=None, ctx=None, out=None):
         if isinstance(shape, int):
             shape = (shape,)
         out = empty(shape, ctx)
-    return NDArray._sample_uniform(low=low, high=high, shape=out.shape, out=out)
+    return _internal._sample_uniform(low=low, high=high, shape=out.shape, out=out)
 
 
 def normal(loc, scale, shape=None, ctx=None, out=None):
@@ -71,7 +72,7 @@ def normal(loc, scale, shape=None, ctx=None, out=None):
         if isinstance(shape, int):
             shape = (shape,)
         out = empty(shape, ctx)
-    return NDArray._sample_normal(loc=loc, scale=scale, shape=out.shape, out=out)
+    return _internal._sample_normal(loc=loc, scale=scale, shape=out.shape, out=out)
 
 
 def seed(seed_state):
diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py
index fb0c3a5875f1..dfe1890bddea 100644
--- a/python/mxnet/symbol.py
+++ b/python/mxnet/symbol.py
@@ -19,7 +19,7 @@
 from .ndarray import NDArray, zeros, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP
 from .executor import Executor
 from .symbol_doc import SymbolDoc
-
+from . import _symbol_internal as _internal
 
 class Symbol(object):
     """Symbol is symbolic graph of the mxnet."""
@@ -37,9 +37,9 @@ def __init__(self, handle):
 
     def __add__(self, other):
         if isinstance(other, Symbol):
-            return Symbol._Plus(self, other)
+            return _internal._Plus(self, other)
         if isinstance(other, Number):
-            return Symbol._PlusScalar(self, scalar=other)
+            return _internal._PlusScalar(self, scalar=other)
         else:
             raise TypeError('type %s not supported' % str(type(other)))
 
@@ -48,23 +48,23 @@ def __radd__(self, other):
 
     def __sub__(self, other):
         if isinstance(other, Symbol):
-            return Symbol._Minus(self, other)
+            return _internal._Minus(self, other)
         if isinstance(other, Number):
-            return Symbol._MinusScalar(self, scalar=other)
+            return _internal._MinusScalar(self, scalar=other)
         else:
             raise TypeError('type %s not supported' % str(type(other)))
 
     def __rsub__(self, other):
         if isinstance(other, Number):
-            return Symbol._RMinusScalar(self, scalar=other)
+            return _internal._RMinusScalar(self, scalar=other)
         else:
             raise TypeError('type %s not supported' % str(type(other)))
 
     def __mul__(self, other):
         if isinstance(other, Symbol):
-            return Symbol._Mul(self, other)
+            return _internal._Mul(self, other)
         if isinstance(other, Number):
-            return Symbol._MulScalar(self, scalar=other)
+            return _internal._MulScalar(self, scalar=other)
         else:
             raise TypeError('type %s not supported' % str(type(other)))
 
@@ -73,15 +73,15 @@ def __rmul__(self, other):
 
     def __div__(self, other):
         if isinstance(other, Symbol):
-            return Symbol._Div(self, other)
+            return _internal._Div(self, other)
         if isinstance(other, Number):
-            return Symbol._DivScalar(self, scalar=other)
+            return _internal._DivScalar(self, scalar=other)
         else:
             raise TypeError('type %s not supported' % str(type(other)))
 
     def __rdiv__(self, other):
         if isinstance(other, Number):
-            return Symbol._RDivScalar(self, scalar=other)
+            return _internal._RDivScalar(self, scalar=other)
         else:
             raise TypeError('type %s not supported' % str(type(other)))
 
@@ -93,9 +93,9 @@ def __rtruediv__(self, other):
 
     def __pow__(self, other):
         if isinstance(other, Symbol):
-            return Symbol._Power(self, other)
+            return _internal._Power(self, other)
         if isinstance(other, Number):
-            return Symbol._PowerScalar(self, scalar=other)
+            return _internal._PowerScalar(self, scalar=other)
         else:
             raise TypeError('type %s not supported' % str(type(other)))
 
@@ -1091,11 +1091,12 @@ def _init_symbol_module():
     check_call(_LIB.MXSymbolListAtomicSymbolCreators(ctypes.byref(size),
                                                      ctypes.byref(plist)))
     module_obj = sys.modules[__name__]
+    module_internal = sys.modules["mxnet._symbol_internal"]
     for i in range(size.value):
         hdl = SymbolHandle(plist[i])
         function = _make_atomic_symbol_function(hdl)
         if function.__name__.startswith('_'):
-            setattr(Symbol, function.__name__, staticmethod(function))
+            setattr(module_internal, function.__name__, function)
         else:
             setattr(module_obj, function.__name__, function)
 
@@ -1118,11 +1119,11 @@ def pow(base, exp):
     result: Symbol or Number
     """
     if isinstance(base, Symbol) and isinstance(exp, Symbol):
-        return Symbol._Power(base, exp)
+        return _internal._Power(base, exp)
     if isinstance(base, Symbol) and isinstance(exp, Number):
-        return Symbol._PowerScalar(base, scalar=exp)
+        return _internal._PowerScalar(base, scalar=exp)
     if isinstance(base, Number) and isinstance(exp, Symbol):
-        return Symbol._RPowerScalar(exp, scalar=base)
+        return _internal._RPowerScalar(exp, scalar=base)
     if isinstance(base, Number) and isinstance(exp, Number):
         return base**exp
     else:
@@ -1144,11 +1145,11 @@ def maximum(left, right):
     result: Symbol or Number
     """
     if isinstance(left, Symbol) and isinstance(right, Symbol):
-        return Symbol._Maximum(left, right)
+        return _internal._Maximum(left, right)
     if isinstance(left, Symbol) and isinstance(right, Number):
-        return Symbol._MaximumScalar(left, scalar=right)
+        return _internal._MaximumScalar(left, scalar=right)
     if isinstance(left, Number) and isinstance(right, Symbol):
-        return Symbol._MaximumScalar(right, scalar=left)
+        return _internal._MaximumScalar(right, scalar=left)
     if isinstance(left, Number) and isinstance(right, Number):
         return left if left > right else right
     else:
@@ -1170,11 +1171,11 @@ def minimum(left, right):
     result: Symbol or Number
     """
     if isinstance(left, Symbol) and isinstance(right, Symbol):
-        return Symbol._Minimum(left, right)
+        return _internal._Minimum(left, right)
     if isinstance(left, Symbol) and isinstance(right, Number):
-        return Symbol._MinimumScalar(left, scalar=right)
+        return _internal._MinimumScalar(left, scalar=right)
     if isinstance(left, Number) and isinstance(right, Symbol):
-        return Symbol._MinimumScalar(right, scalar=left)
+        return _internal._MinimumScalar(right, scalar=left)
     if isinstance(left, Number) and isinstance(right, Number):
         return left if left > right else right
     else:

From 7e98334d62d61efbd765ca98016f3b7a0664cebd Mon Sep 17 00:00:00 2001
From: danielgordon10 <danielgordon10@gmail.com>
Date: Mon, 18 Jul 2016 18:54:39 -0700
Subject: [PATCH 110/126] Add MSRAPrelu initialization (#2647)

* added new MSRAPrelu initialization

* inheriting from xavier

* fixed typo
---
 python/mxnet/initializer.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/python/mxnet/initializer.py b/python/mxnet/initializer.py
index d308626a6377..47aa0bd3a7b9 100644
--- a/python/mxnet/initializer.py
+++ b/python/mxnet/initializer.py
@@ -92,6 +92,7 @@ def _init_default(self, name, _):
         raise ValueError('Unknown initialization pattern for %s' % name)
     # pylint: enable=no-self-use, missing-docstring, invalid-name
 
+
 class Load(object):
     """Initialize by loading pretrained param from file or dict
 
@@ -134,6 +135,7 @@ def __call__(self, name, arr):
             if self.verbose:
                 logging.info('Initialized %s by default', name)
 
+
 class Mixed(object):
     """Initialize with mixed Initializer
 
@@ -186,6 +188,7 @@ def __init__(self, sigma=0.01):
     def _init_weight(self, _, arr):
         random.normal(0, self.sigma, out=arr)
 
+
 class Orthogonal(Initializer):
     """Intialize weight as Orthogonal matrix
 
@@ -265,3 +268,19 @@ def _init_weight(self, _, arr):
             random.normal(0, scale, out=arr)
         else:
             raise ValueError("Unknown random type")
+
+class MSRAPrelu(Xavier):
+    """Initialize the weight with initialization scheme from
+        Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification.
+
+    Parameters
+    ----------
+    factor_type: str, optional
+        Use ```avg```, ```in```, or ```out``` to init
+
+    slope: float, optional
+        initial slope of any PReLU (or similar) nonlinearities.
+    """
+    def __init__(self, factor_type="avg", slope=0.25):
+        magnitude = 2. / (1 + slope ** 2)
+        super(MSRAPrelu, self).__init__("gaussian", factor_type, magnitude)

From 3c3cba4f5bf57c6735aa0d30411d908ff8086ed8 Mon Sep 17 00:00:00 2001
From: sxjscience <xshiab@ust.hk>
Date: Tue, 19 Jul 2016 00:58:09 +0800
Subject: [PATCH 111/126] Enable auto-broadcasting for ndarray.broadcast_to. We
 can broadcast (50,) to (20, 50)

---
 python/mxnet/ndarray.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py
index b23ded4418fb..3d61f95b6b21 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray.py
@@ -315,7 +315,20 @@ def broadcast_to(self, shape):
         shape : the shape to broadcast
             the broadcast shape
         """
-        return broadcast_to(self, shape=tuple(shape))
+        cur_shape = self.shape
+        err_str = 'operands could not be broadcast together with remapped shapes' \
+                  '[original->remapped]: {} and requested shape {}'.format(cur_shape, shape)
+        if len(shape) < len(cur_shape):
+            raise ValueError(err_str)
+        cur_shape = (1,) * (len(shape) - len(cur_shape)) + cur_shape
+        cur_shape_arr = np.array(cur_shape)
+        broadcasting_axes = np.nonzero(cur_shape_arr != np.array(shape))
+        if (cur_shape_arr[broadcasting_axes] != 1).any():
+            raise ValueError(err_str)
+        if cur_shape != self.shape:
+            return broadcast_to(self.reshape(cur_shape), shape=shape)
+        else:
+            return broadcast_to(self, shape=tuple(shape))
     # pylint: enable= undefined-variable
 
     def wait_to_read(self):

From d4150766027aec0f2f1d65a2b525ddf006c0ad42 Mon Sep 17 00:00:00 2001
From: Ldpe2G <liangdepeng@mail.com>
Date: Tue, 19 Jul 2016 21:42:31 +0800
Subject: [PATCH 112/126] scala-pkg, fix DataParallelExecutorManager

---
 scala-package/core/src/main/scala/ml/dmlc/mxnet/Executor.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Executor.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Executor.scala
index 7507b3584faa..281f208843bb 100644
--- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Executor.scala
+++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Executor.scala
@@ -406,7 +406,7 @@ class DataParallelExecutorManager(symbol: Symbol,
   private val trainExecs =
     ctx.zipWithIndex.map { case (context, i) =>
       val dataShapes =
-        trainData.provideData.map { case (name: String, shape: Shape) =>
+        (trainData.provideData ++ trainData.provideLabel).map { case (name: String, shape: Shape) =>
           (name, Shape(slices(i)._2 - slices(i)._1) ++ shape.drop(1))
         }
       symbol.simpleBind(context, "write", shapeDict = dataShapes)

From cbcdcebc90bba3661159133c7c0cc27d20eadf0b Mon Sep 17 00:00:00 2001
From: ziyeqinghan <ziyeqinghan@gmail.com>
Date: Mon, 18 Jul 2016 10:07:02 +0800
Subject: [PATCH 113/126] Add RMSProp optimizer in R

---
 R-package/R/optimizer.R         | 86 +++++++++++++++++++++++++++++++++
 R-package/man/mx.opt.rmsprop.Rd | 39 +++++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 R-package/man/mx.opt.rmsprop.Rd

diff --git a/R-package/R/optimizer.R b/R-package/R/optimizer.R
index 18c4d81aa9ac..36543931d1f6 100644
--- a/R-package/R/optimizer.R
+++ b/R-package/R/optimizer.R
@@ -66,6 +66,89 @@ mx.opt.sgd <- function(learning.rate,
   return(list(create.state=create.state, update=update))
 }
 
+#' Create an RMSProp optimizer with respective parameters.
+#' Reference: Tieleman T, Hinton G. Lecture 6.5-rmsprop: Divide the gradient by a running average of its recent magnitude[J]. COURSERA: Neural Networks for Machine Learning, 2012, 4(2).
+#' The code follows: http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013.
+#' 
+#' @param learning.rate float, default=0.002
+#'      Step size.
+#' @param gamma1 float, default=0.95
+#'      decay factor of moving average for gradient, gradient^2.
+#' @param gamma2 float, default=0.9
+#'      "momentum" factor.
+#' @param wd float, default=0.0
+#'      L2 regularization coefficient added to all the weights.
+#' @param rescale.grad float, default=1.0
+#'      rescaling factor of gradient.
+#' @param clip_gradient float, optional
+#'      clip gradient in range [-clip_gradient, clip_gradient].
+#' @param lr_scheduler function, optional
+#'      The learning rate scheduler.
+#'
+mx.opt.rmsprop <- function(learning.rate=0.002,
+                           gamma1=0.95,
+                           gamma2=0.9,
+                           wd=0,
+                           rescale.grad=1,
+                           clip_gradient = NULL, 
+                           lr_scheduler = NULL) {
+  # use lr as short for learning rate.
+  lr <- learning.rate
+  count       <- 0
+  num_update  <- 0
+
+  rmsprop <- new.env()
+  rmsprop$lr <- lr
+  rmsprop$count <- 0
+  rmsprop$num_update <- 0
+
+  create.state <- function(index, weight) {
+      return (list(n=mx.nd.zeros(dim(weight), ctx(weight)),
+                   g=mx.nd.zeros(dim(weight), ctx(weight)),
+                   delta=mx.nd.zeros(dim(weight), ctx(weight))))
+  }
+
+  update <- function(index, weight, grad, state) {
+    if (!is.null(lr_scheduler)){
+      lr_scheduler(rmsprop) ## changing lr
+      lr <- rmsprop$lr
+      ## update count
+      indexKey <- paste0('ik', index)
+      if (!exists(envir = rmsprop, x = indexKey)){
+        assign(x = indexKey, value = 0, envir = rmsprop)
+      } else {
+        indexValue <- get(envir = rmsprop, x = indexKey)
+        assign(x = indexKey, value = indexValue + 1, envir = rmsprop)
+        rmsprop$num_update <- max(rmsprop$num_update, get(envir = rmsprop, x = indexKey))
+      }
+    }
+    grad <- grad * rescale.grad
+    if (!is.null(clip_gradient)){
+      if(clip_gradient >= 0){
+          grad_ctx <- ctx(grad)
+          grad <- as.array(grad)
+          grad <- pmax(grad, -1 * clip_gradient)
+          grad <- pmin(grad, clip_gradient)
+          grad <- mx.nd.array(grad, grad_ctx)
+      } else {
+        stop("Error: clip_gradient should be positive number.")
+      }
+    }
+
+    n <- state$n
+    g <- state$g
+    delta <- state$delta
+    n <- gamma1 * n + (1 - gamma1) * (grad * grad)
+    g <- gamma1 * g + (1 - gamma1) * grad
+    delta <- gamma2 * delta - lr * (grad / mx.nd.sqrt(n - g*g + 1e-4) + wd * weight)
+    weight <- weight + delta
+    state <- list(n=n, g=g, delta=delta)
+
+    return(list(weight=weight, state=state))
+  }
+  return(list(create.state=create.state, update=update))
+}
+
 #' Create an optimizer by name and parameters
 #'
 #' @param name The name of the optimizer
@@ -76,6 +159,9 @@ mx.opt.create <- function(name, ...) {
   if (name == "sgd") {
     return(mx.opt.sgd(...))
   }
+  else if (name == "rmsprop") {
+    return (mx.opt.rmsprop(...))
+  }
   stop(paste("Unknown optimizer ", name))
 }
 
diff --git a/R-package/man/mx.opt.rmsprop.Rd b/R-package/man/mx.opt.rmsprop.Rd
new file mode 100644
index 000000000000..d51447200f2e
--- /dev/null
+++ b/R-package/man/mx.opt.rmsprop.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/optimizer.R
+\name{mx.opt.rmsprop}
+\alias{mx.opt.rmsprop}
+\title{Create an RMSProp optimizer with respective parameters.
+Reference: Tieleman T, Hinton G. Lecture 6.5-rmsprop: Divide the gradient by a running average of its recent magnitude[J]. COURSERA: Neural Networks for Machine Learning, 2012, 4(2).
+The code follows: http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013.}
+\usage{
+mx.opt.rmsprop(learning.rate = 0.002, gamma1 = 0.95, gamma2 = 0.9,
+  wd = 0, rescale.grad = 1, clip_gradient = NULL, lr_scheduler = NULL)
+}
+\arguments{
+\item{learning.rate}{float, default=0.002
+Step size.}
+
+\item{gamma1}{float, default=0.95
+decay factor of moving average for gradient, gradient^2.}
+
+\item{wd}{float, default=0.0
+L2 regularization coefficient added to all the weights.}
+
+\item{rescale.grad}{float, default=1.0
+rescaling factor of gradient.}
+
+\item{clip_gradient}{float, optional
+clip gradient in range [-clip_gradient, clip_gradient].}
+
+\item{lr_scheduler}{function, optional
+The learning rate scheduler.}
+
+\item{gamma2}{float, default=0.9
+"momentum" factor.}
+}
+\description{
+Create an RMSProp optimizer with respective parameters.
+Reference: Tieleman T, Hinton G. Lecture 6.5-rmsprop: Divide the gradient by a running average of its recent magnitude[J]. COURSERA: Neural Networks for Machine Learning, 2012, 4(2).
+The code follows: http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013.
+}
+

From 6b7f912e57b4d5daa45ed8b7e6873fbf4bf3550d Mon Sep 17 00:00:00 2001
From: Jian Guo <guojian710@gmail.com>
Date: Wed, 20 Jul 2016 19:33:26 +0800
Subject: [PATCH 114/126] update rcnn example to Faster R-CNN

---
 example/rcnn/LICENSE                          |  25 ++
 example/rcnn/README.md                        | 130 +++---
 example/rcnn/demo.py                          |  33 --
 example/rcnn/helper/dataset/imdb.py           |  85 +++-
 example/rcnn/helper/dataset/pascal_voc.py     |  65 ++-
 example/rcnn/helper/dataset/voc_eval.py       |   1 -
 .../rcnn/helper/processing/bbox_process.py    |  16 +
 .../rcnn/helper/processing/bbox_regression.py |  23 +-
 .../rcnn/helper/processing/bbox_transform.py  |  53 ++-
 .../rcnn/helper/processing/generate_anchor.py |  72 ++++
 .../helper/processing/image_processing.py     |  32 +-
 example/rcnn/helper/processing/roidb.py       |  41 +-
 example/rcnn/rcnn/callback.py                 |   4 +-
 example/rcnn/rcnn/config.py                   |  47 ++-
 example/rcnn/rcnn/detector.py                 |  42 +-
 example/rcnn/rcnn/loader.py                   | 298 ++++++++++++++
 example/rcnn/rcnn/metric.py                   |  57 ++-
 example/rcnn/rcnn/minibatch.py                | 374 ++++++++++++++----
 example/rcnn/rcnn/module.py                   | 193 +++++++++
 example/rcnn/rcnn/rpn/__init__.py             |   0
 example/rcnn/rcnn/rpn/generate.py             | 116 ++++++
 example/rcnn/rcnn/rpn/proposal.py             | 206 ++++++++++
 example/rcnn/rcnn/solver.py                   | 117 +++---
 example/rcnn/rcnn/symbol.py                   | 262 ++++++++----
 example/rcnn/rcnn/tester.py                   |  37 +-
 example/rcnn/test.py                          |  29 --
 example/rcnn/tools/fast-rcnn/__init__.py      |   0
 .../tools/{demo_net.py => fast-rcnn/demo.py}  |  39 +-
 example/rcnn/tools/fast-rcnn/test.py          |  57 +++
 example/rcnn/tools/fast-rcnn/train.py         | 101 +++++
 example/rcnn/tools/load_data.py               |  21 -
 example/rcnn/tools/test_final.py              |  61 +++
 example/rcnn/tools/test_rcnn.py               |  57 +++
 example/rcnn/tools/test_rpn.py                |  58 +++
 example/rcnn/tools/train_alternate.py         | 216 ++++++++++
 example/rcnn/tools/train_rcnn.py              | 140 +++++++
 example/rcnn/tools/train_rpn.py               | 145 +++++++
 example/rcnn/train.py                         |  38 --
 example/rcnn/utils/__init__.py                |   0
 example/rcnn/utils/caffe_convert.py           |  74 ++++
 example/rcnn/utils/combine_model.py           |  22 ++
 example/rcnn/utils/load_data.py               |  49 +++
 example/rcnn/{tools => utils}/load_model.py   |   3 +-
 example/rcnn/{tools => utils}/save_model.py   |   0
 44 files changed, 2899 insertions(+), 540 deletions(-)
 delete mode 100644 example/rcnn/demo.py
 create mode 100644 example/rcnn/helper/processing/bbox_process.py
 create mode 100644 example/rcnn/helper/processing/generate_anchor.py
 create mode 100644 example/rcnn/rcnn/loader.py
 create mode 100644 example/rcnn/rcnn/module.py
 create mode 100644 example/rcnn/rcnn/rpn/__init__.py
 create mode 100644 example/rcnn/rcnn/rpn/generate.py
 create mode 100644 example/rcnn/rcnn/rpn/proposal.py
 delete mode 100644 example/rcnn/test.py
 create mode 100644 example/rcnn/tools/fast-rcnn/__init__.py
 rename example/rcnn/tools/{demo_net.py => fast-rcnn/demo.py} (58%)
 create mode 100644 example/rcnn/tools/fast-rcnn/test.py
 create mode 100644 example/rcnn/tools/fast-rcnn/train.py
 delete mode 100644 example/rcnn/tools/load_data.py
 create mode 100644 example/rcnn/tools/test_final.py
 create mode 100644 example/rcnn/tools/test_rcnn.py
 create mode 100644 example/rcnn/tools/test_rpn.py
 create mode 100644 example/rcnn/tools/train_alternate.py
 create mode 100644 example/rcnn/tools/train_rcnn.py
 create mode 100644 example/rcnn/tools/train_rpn.py
 delete mode 100644 example/rcnn/train.py
 create mode 100644 example/rcnn/utils/__init__.py
 create mode 100644 example/rcnn/utils/caffe_convert.py
 create mode 100644 example/rcnn/utils/combine_model.py
 create mode 100644 example/rcnn/utils/load_data.py
 rename example/rcnn/{tools => utils}/load_model.py (97%)
 rename example/rcnn/{tools => utils}/save_model.py (100%)

diff --git a/example/rcnn/LICENSE b/example/rcnn/LICENSE
index 07b70c57b8d5..84eb07876986 100644
--- a/example/rcnn/LICENSE
+++ b/example/rcnn/LICENSE
@@ -42,6 +42,31 @@ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 OTHER DEALINGS IN THE SOFTWARE.
 
 
+Faster R-CNN
+
+The MIT License (MIT)
+
+Copyright (c) 2015 Microsoft Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+
 Caffe
 
 COPYRIGHT
diff --git a/example/rcnn/README.md b/example/rcnn/README.md
index 240234870dc8..93d897cdfcf4 100644
--- a/example/rcnn/README.md
+++ b/example/rcnn/README.md
@@ -1,4 +1,10 @@
-# Fast R-CNN in MXNet
+# Faster R-CNN in MXNet with distributed implementation and data parallelization
+
+Region Proposal Network solves object detection as a regression problem 
+from the objectness perspective. Bounding boxes are predicted by applying 
+learned bounding box deltas to base boxes, namely anchor boxes across 
+different positions in feature maps. Training process directly learns a 
+mapping from raw image intensities to bounding box transformation targets.
 
 Fast R-CNN treats general object detection as a classification problem and
 bounding box prediction as a regression problem. Classifying cropped region
@@ -7,9 +13,15 @@ detection results. Cropping feature maps instead of image input accelerates
 computation utilizing shared convolution maps. Bounding box displacements
 are simultaneously learned in the training process.
 
+Faster R-CNN utilizes an alternating optimization training process between RPN
+and Fast R-CNN. Fast R-CNN weights are used to initialize RPN for training.
+
 ## Getting Started
 
-* MXNet with `ROIPooling` and `smooth_l1` operators are required
+* Install a forked MXNet at [MXNet-detection](https://github.com/precedenceguo/mxnet/tree/detection).
+Follow the instructions at http://mxnet.readthedocs.io/en/latest/how_to/build.html. Install the python interface.
+Note that the link refers to `detection` branch of the fork. Use `git clone -b detection https://github.com/precedenceguo/mxnet.git`
+to clone or `git checkout detection` if you checked out the master.
 * Download data and place them to `data` folder according to `Data Folder Structure`.
   You might want to create a symbolic link to VOCdevkit folder
 ```
@@ -17,13 +29,12 @@ Pascal VOCdevkit
 http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
 http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
 http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCdevkit_08-Jun-2007.tar
-Ross's precomputed object proposals
-https://github.com/rbgirshick/fast-rcnn/blob/master/data/scripts/fetch_selective_search_data.sh
 ```
 * Data Folder Structure (suppose root is `data`)
 ```
 demo
-selective_search_data
+rpn_data (created by rpn)
+selective_search_data (can be omitted)
 cache (created by imdb)
 -- name + source + roidb.pkl (create by imdb)
 -- name (created by detection and evaluation)
@@ -36,91 +47,29 @@ VOCdevkit
 ```
 * Download VGG16 pretrained model, use `mxnet/tools/caffe_converter` to convert it,
   rename to `vgg16-symbol.json` and `vgg16-0001.params` and place it in `model` folder
-* Download 'demo' data and put it in `data/demo` from
-```
-https://github.com/rbgirshick/fast-rcnn/tree/master/data/demo
-```
 
 ## Training
-* Start training by run `python train.py`. Variable args can be found by run
-`python train.py --help`.
-* Training can be done in cpu, modify `train.py` accordingly.
-```
-usage: train.py [-h] [--image_set IMAGE_SET] [--year YEAR]
-                [--root_path ROOT_PATH] [--devkit_path DEVKIT_PATH]
-                [--pretrained PRETRAINED] [--epoch EPOCH] [--prefix PREFIX]
-                [--gpu GPU_ID] [--begin_epoch BEGIN_EPOCH]
-                [--end_epoch END_EPOCH] [--frequent FREQUENT]
-
-Train a Fast R-CNN network
-
-optional arguments:
-  -h, --help            show this help message and exit
-  --image_set IMAGE_SET
-                        can be trainval or train
-  --year YEAR           can be 2007, 2010, 2012
-  --root_path ROOT_PATH
-                        output data folder
-  --devkit_path DEVKIT_PATH
-                        VOCdevkit path
-  --pretrained PRETRAINED
-                        pretrained model prefix
-  --epoch EPOCH         epoch of pretrained model
-  --prefix PREFIX       new model prefix
-  --gpu GPU_ID          GPU device to train with
-  --begin_epoch BEGIN_EPOCH
-                        begin epoch of training
-  --end_epoch END_EPOCH
-                        end epoch of training
-  --frequent FREQUENT   frequency of logging
-```
+* Start training by running `python -m tools.train_alternate`. Available arguments can be listed by running
+`python -m tools.train_alternate --help`.
 
 ## Testing
-* Start testing by run `python test.py`. Variable args can be found by run
-`python test.py --help`.
-* Testing can be done in cpu, modify `test.py` accordingly.
-```
-usage: test.py [-h] [--image_set IMAGE_SET] [--year YEAR]
-               [--root_path ROOT_PATH] [--devkit_path DEVKIT_PATH]
-               [--prefix PREFIX] [--epoch EPOCH] [--gpu GPU_ID]
+* Start testing by running `python -m tools.test_final`. Available arguments can be listed by running
+`python -m tools.test_final --help`.
 
-Test a Fast R-CNN network
-
-optional arguments:
-  -h, --help            show this help message and exit
-  --image_set IMAGE_SET
-                        can be test
-  --year YEAR           can be 2007, 2010, 2012
-  --root_path ROOT_PATH
-                        output data folder
-  --devkit_path DEVKIT_PATH
-                        VOCdevkit path
-  --prefix PREFIX       new model prefix
-  --epoch EPOCH         epoch of pretrained model
-  --gpu GPU_ID          GPU device to test with
-```
-
-## Demonstration
-* If no training has been done, download reference model from Ross Girshick and use
-`mxnet/caffe/caffe_converter` to convert it to MXNet.
-```
-https://github.com/rbgirshick/fast-rcnn/blob/master/data/scripts/fetch_fast_rcnn_models.sh
-```
-* Run demo by `demo.py --gpu 0 --prefix path-to-model --epoch 0`, in which
-`path-to-model + '%4d' % epoch.params` will be the params file and
-`path-to-model + '-symbol.json'` will be the symbol json.
-* Demo can be run in cpu, modify `demo.py` accordingly.
-```
-usage: demo.py [-h] [--prefix PREFIX] [--epoch EPOCH] [--gpu GPU_ID]
-
-Demonstrate a Fast R-CNN network
-
-optional arguments:
-  -h, --help       show this help message and exit
-  --prefix PREFIX  new model prefix
-  --epoch EPOCH    epoch of pretrained model
-  --gpu GPU_ID     GPU device to test with
-```
+## Contributing Guide
+You are more than welcome to add new features to this implementation or fix any potential bugs. 
+Here are some topics to look at.
+* MXNet features superior and robust distributed training. This implementation
+has not yet fully utilized this power.
+* New approximate end to end training is available from Faster R-CNN python 
+implementation whose link can be found in Disclaimer. This implementation 
+does not support this feature.
+* MXNet has efficient data loading module which renders data IO irrelevant 
+in performance. This implementation has not used this module.
+* More object detection datasets are available online. The dataset module is designed
+to be simple and scalable. You are welcome to add support for more datasets to this implementation.
+* During inference, some operations are only conducted on the CPU. Reimplementing them may bring
+better performance at test time.
 
 ## Disclaimer
 This repository used code from [MXNet](https://github.com/dmlc/mxnet),
@@ -129,4 +78,13 @@ This repository used code from [MXNet](https://github.com/dmlc/mxnet),
 [caffe](https://github.com/BVLC/caffe). Training data are from
 [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/),
 [ImageNet](http://image-net.org/). Model comes from
-[VGG16](http://www.robots.ox.ac.uk/~vgg/research/very_deep/).
\ No newline at end of file
+[VGG16](http://www.robots.ox.ac.uk/~vgg/research/very_deep/).
+
+## References
+1. Tianqi Chen, Mu Li, Yutian Li, Min Lin, Naiyan Wang, Minjie Wang, Tianjun Xiao, Bing Xu, Chiyuan Zhang, and Zheng Zhang. MXNet: A Flexible and Efficient Machine Learning Library for Heterogeneous Distributed Systems. In Neural Information Processing Systems, Workshop on Machine Learning Systems, 2015
+2. Ross Girshick. "Fast R-CNN." In Proceedings of the IEEE International Conference on Computer Vision, 2015.
+3. Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. "Faster R-CNN: Towards real-time object detection with region proposal networks." In Advances in Neural Information Processing Systems, 2015.
+4. Yangqing Jia, Evan Shelhamer, Jeff Donahue, Sergey Karayev, Jonathan Long, Ross Girshick, Sergio Guadarrama, and Trevor Darrell. "Caffe: Convolutional architecture for fast feature embedding." In Proceedings of the ACM International Conference on Multimedia, 2014.
+5. Mark Everingham, Luc Van Gool, Christopher KI Williams, John Winn, and Andrew Zisserman. "The pascal visual object classes (voc) challenge." International journal of computer vision 88, no. 2 (2010): 303-338.
+6. Jia Deng, Wei Dong, Richard Socher, Li-Jia Li, Kai Li, and Li Fei-Fei. "ImageNet: A large-scale hierarchical image database." In Computer Vision and Pattern Recognition, IEEE Conference on, 2009.
+7. Karen Simonyan, and Andrew Zisserman. "Very deep convolutional networks for large-scale image recognition." arXiv preprint arXiv:1409.1556 (2014).
diff --git a/example/rcnn/demo.py b/example/rcnn/demo.py
deleted file mode 100644
index 768b1a7fe15a..000000000000
--- a/example/rcnn/demo.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import argparse
-import mxnet as mx
-import os
-from tools.load_model import load_param
-from rcnn.symbol import get_symbol_vgg_test
-from rcnn.detector import Detector
-from tools.demo_net import demo_net
-
-
-def get_net(prefix, epoch, ctx):
-    args, auxs = load_param(prefix, epoch, convert=True, ctx=ctx)
-    sym = get_symbol_vgg_test()
-    detector = Detector(sym, ctx, args, auxs)
-    return detector
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Demonstrate a Fast R-CNN network')
-    parser.add_argument('--prefix', dest='prefix', help='new model prefix',
-                        default=os.path.join(os.getcwd(), 'model', 'frcnn'), type=str)
-    parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model',
-                        default=9, type=int)
-    parser.add_argument('--gpu', dest='gpu_id', help='GPU device to test with',
-                        default=0, type=int)
-    args = parser.parse_args()
-    return args
-
-if __name__ == '__main__':
-    args = parse_args()
-    ctx = mx.gpu(args.gpu_id)
-    detector = get_net(args.prefix, args.epoch, ctx)
-    demo_net(detector, os.path.join(os.getcwd(), 'data', 'demo', '000004'))
-    demo_net(detector, os.path.join(os.getcwd(), 'data', 'demo', '001551'))
diff --git a/example/rcnn/helper/dataset/imdb.py b/example/rcnn/helper/dataset/imdb.py
index 3c431ff5bfd1..8f53ce5412e6 100644
--- a/example/rcnn/helper/dataset/imdb.py
+++ b/example/rcnn/helper/dataset/imdb.py
@@ -32,7 +32,7 @@ def roidb(self, gt_roidb):
     def create_roidb_from_box_list(self, box_list, gt_roidb):
         """
         given ground truth, prepare roidb
-        :param box_list: [image_index][box_index][x1, x2, y1, y2]
+        :param box_list: [image_index] ndarray of [box_index][x1, y1, x2, y2]
         :param gt_roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped']
         :return: roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped']
         """
@@ -43,7 +43,7 @@ def create_roidb_from_box_list(self, box_list, gt_roidb):
             num_boxes = boxes.shape[0]
             overlaps = np.zeros((num_boxes, self.num_classes), dtype=np.float32)
 
-            if gt_roidb is not None:
+            if gt_roidb is not None and gt_roidb[i]['boxes'].size > 0:
                 gt_boxes = gt_roidb[i]['boxes']
                 gt_classes = gt_roidb[i]['gt_classes']
                 # n boxes and k gt_boxes => n * k overlap
@@ -106,5 +106,86 @@ def append_flipped_images(self, roidb):
         self.image_set_index *= 2
         return roidb
 
+    def evaluate_recall(self, roidb, candidate_boxes=None, thresholds=None, area='all', limit=None):
+        """
+        evaluate detection proposal recall metrics
+        greedily match each gt box to one proposal and record the IoU of every match
+        :param roidb: roidb providing 'boxes', 'gt_classes' and 'gt_overlaps' per image
+        :param candidate_boxes: [image_index] proposals; if not given, use roidb's non-gt boxes
+        :param thresholds: array-like IoU thresholds, default 0.5 to 0.95 in steps of 0.05
+        :param area: name of the gt box area range to evaluate, a key of `areas` below
+        :param limit: if given, only the first `limit` boxes of each image are evaluated
+        :return: None, the results are only printed:
+        average recall over all thresholds and the recall at each IoU threshold
+        (recall = matched gt boxes with IoU >= threshold / number of gt boxes in range)
+        """
+        areas = {'all': 0, 'small': 1, 'medium': 2, 'large': 3,
+                 '96-128': 4, '128-256': 5, '256-512': 6, '512-inf': 7}
+        area_ranges = [[0**2, 1e5**2], [0**2, 32**2], [32**2, 96**2], [96**2, 1e5**2],
+                       [96**2, 128**2], [128**2, 256**2], [256**2, 512**2], [512**2, 1e5**2]]
+        assert area in areas, 'unknown area range: {}'.format(area)
+        area_range = area_ranges[areas[area]]
+        gt_overlaps = np.zeros(0)
+        num_pos = 0
+        for i in range(self.num_images):
+            # check for max_overlaps == 1 avoids including crowd annotations
+            max_gt_overlaps = roidb[i]['gt_overlaps'].toarray().max(axis=1)
+            gt_inds = np.where((roidb[i]['gt_classes'] > 0) & (max_gt_overlaps == 1))[0]
+            gt_boxes = roidb[i]['boxes'][gt_inds, :].astype(np.float64)  # uint16 gt boxes would overflow in the area product
+            gt_areas = (gt_boxes[:, 2] - gt_boxes[:, 0] + 1) * (gt_boxes[:, 3] - gt_boxes[:, 1] + 1)
+            valid_gt_inds = np.where((gt_areas >= area_range[0]) & (gt_areas <= area_range[1]))[0]
+            gt_boxes = gt_boxes[valid_gt_inds, :]
+            num_pos += len(valid_gt_inds)
+
+            if candidate_boxes is None:
+                # default is use the non-gt boxes from roidb
+                non_gt_inds = np.where(roidb[i]['gt_classes'] == 0)[0]
+                boxes = roidb[i]['boxes'][non_gt_inds, :]
+            else:
+                boxes = candidate_boxes[i]
+            if boxes.shape[0] == 0:
+                continue
+            if limit is not None and boxes.shape[0] > limit:
+                boxes = boxes[:limit, :]
+
+            overlaps = bbox_overlaps(boxes.astype(np.float64), gt_boxes.astype(np.float64))
+
+            _gt_overlaps = np.zeros((gt_boxes.shape[0]))
+            for j in range(min(boxes.shape[0], gt_boxes.shape[0])):  # a proposal is matched at most once; unmatched gt keep IoU 0
+                # find which proposal maximally covers each gt box
+                argmax_overlaps = overlaps.argmax(axis=0)
+                # get the IoU amount of coverage for each gt box
+                max_overlaps = overlaps.max(axis=0)
+                # find which gt box is covered by most IoU
+                gt_ind = max_overlaps.argmax()
+                gt_ovr = max_overlaps.max()
+                assert (gt_ovr >= 0)
+                # find the proposal box that covers the best covered gt box
+                box_ind = argmax_overlaps[gt_ind]
+                # record the IoU coverage of this gt box
+                _gt_overlaps[j] = overlaps[box_ind, gt_ind]
+                assert (_gt_overlaps[j] == gt_ovr)
+                # mark the proposal box and the gt box as used
+                overlaps[box_ind, :] = -1
+                overlaps[:, gt_ind] = -1
+            # append recorded IoU coverage level
+            gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps))
+
+        gt_overlaps = np.sort(gt_overlaps)
+        if thresholds is None:
+            step = 0.05
+            thresholds = np.arange(0.5, 0.95 + 1e-5, step)
+        recalls = np.zeros_like(thresholds)
+
+        # compute recall for each IoU threshold
+        for i, t in enumerate(thresholds):
+            recalls[i] = (gt_overlaps >= t).sum() / float(num_pos)
+        ar = recalls.mean()
+
+        # print results (single-argument print() behaves the same on python 2 and 3)
+        print('average recall: {:.3f}'.format(ar))
+        for threshold, recall in zip(thresholds, recalls):
+            print('recall @{:.2f}: {:.3f}'.format(threshold, recall))
+
     def evaluate_detections(self, detections):
         raise NotImplementedError
diff --git a/example/rcnn/helper/dataset/pascal_voc.py b/example/rcnn/helper/dataset/pascal_voc.py
index 7d2356ba3eef..9ae27f4b91e5 100644
--- a/example/rcnn/helper/dataset/pascal_voc.py
+++ b/example/rcnn/helper/dataset/pascal_voc.py
@@ -13,6 +13,7 @@
 import cPickle
 from imdb import IMDB
 from voc_eval import voc_eval
+from helper.processing.bbox_process import unique_boxes, filter_small_boxes
 
 
 class PascalVOC(IMDB):
@@ -43,7 +44,8 @@ def __init__(self, image_set, year, root_path, devkit_path):
         self.num_images = len(self.image_set_index)
 
         self.config = {'comp_id': 'comp4',
-                       'use_diff': True}
+                       'use_diff': False,
+                       'min_size': 2}
 
     @property
     def cache_path(self):
@@ -102,17 +104,14 @@ def load_pascal_annotation(self, index):
         :param index: index of a specific image
         :return: record['boxes', 'gt_classes', 'gt_overlaps', 'flipped']
         """
-        import xml.dom.minidom as minidom
+        import xml.etree.ElementTree as ET
         filename = os.path.join(self.data_path, 'Annotations', index + '.xml')
 
-        # print 'Loading: {}'.format(filename)
-        def get_data_from_tag(node, tag):
-            return node.getElementsByTagName(tag)[0].childNodes[0].data
-
-        with open(filename) as f:
-            data = minidom.parseString(f.read())
-
-        objs = data.getElementsByTagName('object')
+        tree = ET.parse(filename)
+        objs = tree.findall('object')
+        if not self.config['use_diff']:
+            non_diff_objs = [obj for obj in objs if int(obj.find('difficult').text) == 0]
+            objs = non_diff_objs
         num_objs = len(objs)
 
         boxes = np.zeros((num_objs, 4), dtype=np.uint16)
@@ -122,13 +121,13 @@ def get_data_from_tag(node, tag):
         class_to_index = dict(zip(self.classes, range(self.num_classes)))
         # Load object bounding boxes into a data frame.
         for ix, obj in enumerate(objs):
+            bbox = obj.find('bndbox')
             # Make pixel indexes 0-based
-            x1 = float(get_data_from_tag(obj, 'xmin')) - 1
-            y1 = float(get_data_from_tag(obj, 'ymin')) - 1
-            x2 = float(get_data_from_tag(obj, 'xmax')) - 1
-            y2 = float(get_data_from_tag(obj, 'ymax')) - 1
-            cls = class_to_index[
-                str(get_data_from_tag(obj, "name")).lower().strip()]
+            x1 = float(bbox.find('xmin').text) - 1
+            y1 = float(bbox.find('ymin').text) - 1
+            x2 = float(bbox.find('xmax').text) - 1
+            y2 = float(bbox.find('ymax').text) - 1
+            cls = class_to_index[obj.find('name').text.lower().strip()]
             boxes[ix, :] = [x1, y1, x2, y2]
             gt_classes[ix] = cls
             overlaps[ix, cls] = 1.0
@@ -155,7 +154,12 @@ def load_selective_search_roidb(self, gt_roidb):
 
         box_list = []
         for i in range(raw_data.shape[0]):
-            box_list.append(raw_data[i][:, (1, 0, 3, 2)] - 1)  # pascal voc dataset starts from 1.
+            boxes = raw_data[i][:, (1, 0, 3, 2)] - 1  # pascal voc dataset starts from 1.
+            keep = unique_boxes(boxes)
+            boxes = boxes[keep, :]
+            keep = filter_small_boxes(boxes, self.config['min_size'])
+            boxes = boxes[keep, :]
+            box_list.append(boxes)
 
         return self.create_roidb_from_box_list(box_list, gt_roidb)
 
@@ -183,6 +187,33 @@ def selective_search_roidb(self, gt_roidb):
 
         return roidb
 
+    def load_rpn_roidb(self, gt_roidb):
+        """
+        load pickled rpn boxes from root_path/rpn_data/<name>_rpn.pkl and turn them into roidb
+        :param gt_roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped']
+        :return: roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped']
+        """
+        rpn_file = os.path.join(self.root_path, 'rpn_data', self.name + '_rpn.pkl')
+        print 'loading {}'.format(rpn_file)
+        assert os.path.exists(rpn_file), 'rpn data not found at {}'.format(rpn_file)
+        with open(rpn_file, 'rb') as f:
+            box_list = cPickle.load(f)  # NOTE(review): unpickling can execute code; only load trusted rpn files
+        return self.create_roidb_from_box_list(box_list, gt_roidb)
+
+    def rpn_roidb(self, gt_roidb):
+        """
+        get rpn roidb, merged with the ground truth roidb unless image_set is 'test'
+        :param gt_roidb: ground truth roidb
+        :return: roidb of rpn (ground truth included except for the 'test' image set)
+        """
+        if self.image_set != 'test':
+            rpn_roidb = self.load_rpn_roidb(gt_roidb)
+            roidb = IMDB.merge_roidbs(gt_roidb, rpn_roidb)
+        else:
+            print 'rpn database need not be used in test'
+            roidb = self.load_rpn_roidb(gt_roidb)
+        return roidb
+
     def evaluate_detections(self, detections):
         """
         top level evaluations
diff --git a/example/rcnn/helper/dataset/voc_eval.py b/example/rcnn/helper/dataset/voc_eval.py
index 3b2c153c0de5..8975b619b708 100644
--- a/example/rcnn/helper/dataset/voc_eval.py
+++ b/example/rcnn/helper/dataset/voc_eval.py
@@ -95,7 +95,6 @@ def voc_eval(detpath, annopath, imageset_file, classname, cache_dir, ovthresh=0.
     else:
         with open(cache_file, 'r') as f:
             recs = cPickle.load(f)
-        print 'ground truth annotations loaded from cache file {}'.format(cache_file)
 
     # extract objects in :param classname:
     class_recs = {}
diff --git a/example/rcnn/helper/processing/bbox_process.py b/example/rcnn/helper/processing/bbox_process.py
new file mode 100644
index 000000000000..60d8a7af86bd
--- /dev/null
+++ b/example/rcnn/helper/processing/bbox_process.py
@@ -0,0 +1,16 @@
+import numpy as np
+
+
+def unique_boxes(boxes, scale=1.0):
+    """ return sorted indices of the first occurrence of each unique box (compared after scaling and rounding) """
+    v = np.array([1, 1e3, 1e6, 1e9])  # packs 4 coords into one number; presumably assumes scaled coords < 1000 -- TODO confirm
+    hashes = np.round(boxes * scale).dot(v)
+    _, index = np.unique(hashes, return_index=True)
+    return np.sort(index)
+
+
+def filter_small_boxes(boxes, min_size):
+    """ return indices of boxes whose width (x2 - x1) and height (y2 - y1) are both >= min_size """
+    w = boxes[:, 2] - boxes[:, 0]
+    h = boxes[:, 3] - boxes[:, 1]
+    return np.where((w >= min_size) & (h >= min_size))[0]
diff --git a/example/rcnn/helper/processing/bbox_regression.py b/example/rcnn/helper/processing/bbox_regression.py
index 7e58324fc541..840a96cc5ec5 100644
--- a/example/rcnn/helper/processing/bbox_regression.py
+++ b/example/rcnn/helper/processing/bbox_regression.py
@@ -5,6 +5,7 @@
 import numpy as np
 
 from rcnn.config import config
+from bbox_transform import bbox_transform
 
 
 def bbox_overlaps(boxes, query_boxes):
@@ -43,6 +44,8 @@ def compute_bbox_regression_targets(rois, overlaps, labels):
 
     # Indices of ground-truth ROIs
     gt_inds = np.where(overlaps == 1)[0]
+    if len(gt_inds) == 0:
+        print 'something wrong : zero ground truth rois'
     # Indices of examples for which we try to make predictions
     ex_inds = np.where(overlaps >= config.TRAIN.BBOX_REGRESSION_THRESH)[0]
 
@@ -55,27 +58,9 @@ def compute_bbox_regression_targets(rois, overlaps, labels):
     gt_rois = rois[gt_inds[gt_assignment], :]
     ex_rois = rois[ex_inds, :]
 
-    ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + config['EPS']
-    ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + config['EPS']
-    ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
-    ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights
-
-    gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + config['EPS']
-    gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + config['EPS']
-    gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
-    gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights
-
-    targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths
-    targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights
-    targets_dw = np.log(gt_widths / ex_widths)
-    targets_dh = np.log(gt_heights / ex_heights)
-
     targets = np.zeros((rois.shape[0], 5), dtype=np.float32)
     targets[ex_inds, 0] = labels[ex_inds]
-    targets[ex_inds, 1] = targets_dx
-    targets[ex_inds, 2] = targets_dy
-    targets[ex_inds, 3] = targets_dw
-    targets[ex_inds, 4] = targets_dh
+    targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois)
     return targets
 
 
diff --git a/example/rcnn/helper/processing/bbox_transform.py b/example/rcnn/helper/processing/bbox_transform.py
index ba5187f2ab0c..0757a70eedd7 100644
--- a/example/rcnn/helper/processing/bbox_transform.py
+++ b/example/rcnn/helper/processing/bbox_transform.py
@@ -1,10 +1,37 @@
 """
-This file has functions about bounding box post processing.
+This file has functions about bounding box processing.
 """
 
 import numpy as np
 
 
+def bbox_transform(ex_rois, gt_rois):
+    """
+    compute bounding box regression targets (dx, dy, dw, dh) from ex_rois to gt_rois
+    :param ex_rois: [N, 4] example boxes as (x1, y1, x2, y2)
+    :param gt_rois: [N, 4] target boxes as (x1, y1, x2, y2), paired row by row with ex_rois
+    :return: [N, 4] one (dx, dy, dw, dh) row per box pair
+    """
+    ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0  # x2 - x1 + 1: both end pixels are counted
+    ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
+    ex_ctr_x = ex_rois[:, 0] + 0.5 * (ex_widths - 1.0)
+    ex_ctr_y = ex_rois[:, 1] + 0.5 * (ex_heights - 1.0)
+
+    gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0
+    gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0
+    gt_ctr_x = gt_rois[:, 0] + 0.5 * (gt_widths - 1.0)
+    gt_ctr_y = gt_rois[:, 1] + 0.5 * (gt_heights - 1.0)
+
+    targets_dx = (gt_ctr_x - ex_ctr_x) / (ex_widths + 1e-14)  # center shift relative to example box size
+    targets_dy = (gt_ctr_y - ex_ctr_y) / (ex_heights + 1e-14)
+    targets_dw = np.log(gt_widths / ex_widths)  # log of the size ratio
+    targets_dh = np.log(gt_heights / ex_heights)
+
+    targets = np.vstack(
+        (targets_dx, targets_dy, targets_dw, targets_dh)).transpose()
+    return targets
+
+
 def bbox_pred(boxes, box_deltas):
     """
     Transform the set of class-agnostic boxes into class-specific boxes
@@ -17,10 +44,10 @@ def bbox_pred(boxes, box_deltas):
         return np.zeros((0, box_deltas.shape[1]))
 
     boxes = boxes.astype(np.float, copy=False)
-    widths = boxes[:, 2] - boxes[:, 0] + 1e-14
-    heights = boxes[:, 3] - boxes[:, 1] + 1e-14
-    ctr_x = boxes[:, 0] + 0.5 * widths
-    ctr_y = boxes[:, 1] + 0.5 * heights
+    widths = boxes[:, 2] - boxes[:, 0] + 1.0
+    heights = boxes[:, 3] - boxes[:, 1] + 1.0
+    ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0)
+    ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0)
 
     dx = box_deltas[:, 0::4]
     dy = box_deltas[:, 1::4]
@@ -34,13 +61,13 @@ def bbox_pred(boxes, box_deltas):
 
     pred_boxes = np.zeros(box_deltas.shape)
     # x1
-    pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w
+    pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * (pred_w - 1.0)
     # y1
-    pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h
+    pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * (pred_h - 1.0)
     # x2
-    pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w
+    pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * (pred_w - 1.0)
     # y2
-    pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h
+    pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * (pred_h - 1.0)
 
     return pred_boxes
 
@@ -53,11 +80,11 @@ def clip_boxes(boxes, im_shape):
     :return: [N, 4* num_classes]
     """
     # x1 >= 0
-    boxes[:, 0::4] = np.maximum(boxes[:, 0::4], 0)
+    boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
     # y1 >= 0
-    boxes[:, 1::4] = np.maximum(boxes[:, 1::4], 0)
+    boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
     # x2 < im_shape[1]
-    boxes[:, 2::4] = np.minimum(boxes[:, 2::4], im_shape[1] - 1)
+    boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
     # y2 < im_shape[0]
-    boxes[:, 3::4] = np.minimum(boxes[:, 3::4], im_shape[0] - 1)
+    boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
     return boxes
diff --git a/example/rcnn/helper/processing/generate_anchor.py b/example/rcnn/helper/processing/generate_anchor.py
new file mode 100644
index 000000000000..8996a3aaab48
--- /dev/null
+++ b/example/rcnn/helper/processing/generate_anchor.py
@@ -0,0 +1,72 @@
+"""
+Generate base anchors on index 0
+"""
+
+import numpy as np
+
+
+def generate_anchors(base_size=16, ratios=[0.5, 1, 2],
+                     scales=2 ** np.arange(3, 6)):
+    """
+    Generate anchor (reference) windows by enumerating aspect ratios X
+    scales wrt a reference (0, 0, base_size - 1, base_size - 1) window.
+    :param ratios, scales: aspect ratios (height / width) and scale factors to enumerate
+    :return: [len(ratios) * len(scales), 4] anchors, all scales of one ratio before the next ratio
+    """
+    base_anchor = np.array([1, 1, base_size, base_size]) - 1
+    ratio_anchors = _ratio_enum(base_anchor, ratios)
+    anchors = np.vstack([_scale_enum(anchor, scales) for anchor in ratio_anchors])  # no xrange: runs on python 2 and 3
+    return anchors
+
+
+def _whctrs(anchor):
+    """
+    Return width, height, x center, and y center for an anchor (window).
+    """
+
+    w = anchor[2] - anchor[0] + 1
+    h = anchor[3] - anchor[1] + 1
+    x_ctr = anchor[0] + 0.5 * (w - 1)
+    y_ctr = anchor[1] + 0.5 * (h - 1)
+    return w, h, x_ctr, y_ctr
+
+
+def _mkanchors(ws, hs, x_ctr, y_ctr):
+    """
+    Given a vector of widths (ws) and heights (hs) around a center
+    (x_ctr, y_ctr), output a set of anchors (windows).
+    """
+
+    ws = ws[:, np.newaxis]
+    hs = hs[:, np.newaxis]
+    anchors = np.hstack((x_ctr - 0.5 * (ws - 1),
+                         y_ctr - 0.5 * (hs - 1),
+                         x_ctr + 0.5 * (ws - 1),
+                         y_ctr + 0.5 * (hs - 1)))
+    return anchors
+
+
+def _ratio_enum(anchor, ratios):
+    """
+    Enumerate a set of anchors for each aspect ratio wrt an anchor.
+    """
+
+    w, h, x_ctr, y_ctr = _whctrs(anchor)
+    size = w * h
+    size_ratios = size / ratios
+    ws = np.round(np.sqrt(size_ratios))
+    hs = np.round(ws * ratios)
+    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
+    return anchors
+
+
+def _scale_enum(anchor, scales):
+    """
+    Enumerate a set of anchors for each scale wrt an anchor.
+    """
+
+    w, h, x_ctr, y_ctr = _whctrs(anchor)
+    ws = w * scales
+    hs = h * scales
+    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
+    return anchors
diff --git a/example/rcnn/helper/processing/image_processing.py b/example/rcnn/helper/processing/image_processing.py
index 833c2c74caa5..5646c557929f 100644
--- a/example/rcnn/helper/processing/image_processing.py
+++ b/example/rcnn/helper/processing/image_processing.py
@@ -12,7 +12,7 @@ def resize(im, target_size, max_size):
     """
     im_shape = im.shape
     im_size_min = np.min(im_shape[0:2])
-    im_size_max = np.min(im_shape[0:2])
+    im_size_max = np.max(im_shape[0:2])
     im_scale = float(target_size) / float(im_size_min)
     # prevent bigger axis from being more than max_size:
     if np.round(im_scale * im_size_max) > max_size:
@@ -60,17 +60,23 @@ def transform_inverse(im_tensor, pixel_means):
     return im
 
 
-def tensor_vstack(im_list):
+def tensor_vstack(tensor_list, pad=0):
     """
-    stack input image (usually 2 image) to obtain input to CNN
-    extra regions are padded with zero
-    :param im_list: list of image to be stacked vertically
-    :return: im_tensor [batch, channel, height, width]
+    stack tensors along axis 0, padding every other axis up to the largest size in the list
+    :param tensor_list: list of ndarray with the same number of dimensions (left unmodified)
+    :param pad: constant value used to fill the padded regions
+    :return: ndarray of all tensors stacked vertically (1-d tensors are concatenated unpadded)
     """
-    max_channel = max([im_tensor.shape[1] for im_tensor in im_list])
-    max_height = max([im_tensor.shape[2] for im_tensor in im_list])
-    max_width = max([im_tensor.shape[3] for im_tensor in im_list])
-    im_tensor = np.zeros((len(im_list), max_channel, max_height, max_width))
-    for ind, im in enumerate(im_list):
-        im_tensor[ind, :im.shape[1], :im.shape[2], :im.shape[3]] = im
-    return im_tensor
+    ndim = len(tensor_list[0].shape)
+    if ndim == 1:
+        return np.hstack(tensor_list)
+    # target extent of every non-batch axis is the largest one found in the list
+    dimensions = [0]
+    for dim in range(1, ndim):
+        dimensions.append(max([tensor.shape[dim] for tensor in tensor_list]))
+    padded = []  # collect padded copies instead of overwriting the caller's list
+    for tensor in tensor_list:
+        pad_shape = [(0, 0)] + [(0, dimensions[dim] - tensor.shape[dim])
+                                for dim in range(1, ndim)]
+        padded.append(np.lib.pad(tensor, pad_shape, 'constant', constant_values=pad))
+    return np.vstack(padded)
diff --git a/example/rcnn/helper/processing/roidb.py b/example/rcnn/helper/processing/roidb.py
index d68ddb94290c..7ad1b26c182f 100644
--- a/example/rcnn/helper/processing/roidb.py
+++ b/example/rcnn/helper/processing/roidb.py
@@ -4,6 +4,7 @@
 extended ['image', 'max_classes', 'max_overlaps', 'bbox_targets']
 """
 
+import cv2
 import numpy as np
 
 from bbox_regression import compute_bbox_regression_targets
@@ -17,8 +18,13 @@ def prepare_roidb(imdb, roidb):
     :param roidb: roidb
     :return: None
     """
+    print 'prepare roidb'
     for i in range(len(roidb)):  # image_index
         roidb[i]['image'] = imdb.image_path_from_index(imdb.image_set_index[i])
+        if config.TRAIN.ASPECT_GROUPING:
+            size = cv2.imread(roidb[i]['image']).shape
+            roidb[i]['height'] = size[0]
+            roidb[i]['width'] = size[1]
         gt_overlaps = roidb[i]['gt_overlaps'].toarray()
         max_overlaps = gt_overlaps.max(axis=1)
         max_classes = gt_overlaps.argmax(axis=1)
@@ -51,22 +57,27 @@ def add_bbox_regression_targets(roidb):
         max_classes = roidb[im_i]['max_classes']
         roidb[im_i]['bbox_targets'] = compute_bbox_regression_targets(rois, max_overlaps, max_classes)
 
-    # compute mean, std values
-    class_counts = np.zeros((num_classes, 1)) + config.EPS
-    sums = np.zeros((num_classes, 4))
-    squared_sums = np.zeros((num_classes, 4))
-    for im_i in range(num_images):
-        targets = roidb[im_i]['bbox_targets']
-        for cls in range(1, num_classes):
-            cls_indexes = np.where(targets[:, 0] == cls)[0]
-            if cls_indexes.size > 0:
-                class_counts[cls] += cls_indexes.size
-                sums[cls, :] += targets[cls_indexes, 1:].sum(axis=0)
-                squared_sums[cls, :] += (targets[cls_indexes, 1:] ** 2).sum(axis=0)
+    if config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED:
+        # use fixed / precomputed means and stds instead of empirical values
+        means = np.tile(np.array(config.TRAIN.BBOX_MEANS), (num_classes, 1))
+        stds = np.tile(np.array(config.TRAIN.BBOX_STDS), (num_classes, 1))
+    else:
+        # compute mean, std values
+        class_counts = np.zeros((num_classes, 1)) + config.EPS
+        sums = np.zeros((num_classes, 4))
+        squared_sums = np.zeros((num_classes, 4))
+        for im_i in range(num_images):
+            targets = roidb[im_i]['bbox_targets']
+            for cls in range(1, num_classes):
+                cls_indexes = np.where(targets[:, 0] == cls)[0]
+                if cls_indexes.size > 0:
+                    class_counts[cls] += cls_indexes.size
+                    sums[cls, :] += targets[cls_indexes, 1:].sum(axis=0)
+                    squared_sums[cls, :] += (targets[cls_indexes, 1:] ** 2).sum(axis=0)
 
-    means = sums / class_counts
-    # var(x) = E(x^2) - E(x)^2
-    stds = np.sqrt(squared_sums / class_counts - means ** 2)
+        means = sums / class_counts
+        # var(x) = E(x^2) - E(x)^2
+        stds = np.sqrt(squared_sums / class_counts - means ** 2)
 
     # normalized targets
     for im_i in range(num_images):
diff --git a/example/rcnn/rcnn/callback.py b/example/rcnn/rcnn/callback.py
index bf5a8e72d24d..7b05628829ad 100644
--- a/example/rcnn/rcnn/callback.py
+++ b/example/rcnn/rcnn/callback.py
@@ -22,10 +22,8 @@ def __call__(self, param):
                 speed = self.frequent * self.batch_size / (time.time() - self.tic)
                 if param.eval_metric is not None:
                     name, value = param.eval_metric.get()
-                    cls, cls_value = param.cls_metric.get()
-                    bbox, bbox_value = param.bbox_metric.get()
                     logging.info("Epoch[%d] Batch [%d]\tSpeed: %.2f samples/sec\tTrain-%s=%f,\t%s=%f,\t%s=%f",
-                                 param.epoch, count, speed, name, value, cls, cls_value, bbox, bbox_value)
+                                 param.epoch, count, speed, name[0], value[0], name[1], value[1], name[2], value[2])
                 else:
                     logging.info("Iter[%d] Batch [%d]\tSpeed: %.2f samples/sec",
                                  param.epoch, count, speed)
diff --git a/example/rcnn/rcnn/config.py b/example/rcnn/rcnn/config.py
index 9ae69cb8adc1..fb9826c1b186 100644
--- a/example/rcnn/rcnn/config.py
+++ b/example/rcnn/rcnn/config.py
@@ -3,26 +3,65 @@
 
 config = edict()
 
+# image processing config
 config.EPS = 1e-14
 config.PIXEL_MEANS = np.array([[[123.68, 116.779, 103.939]]])
+config.SCALES = (600, )  # single scale training and testing
+config.MAX_SIZE = 1000
+
+# nms config
+config.USE_GPU_NMS = True
+config.GPU_ID = 0
 
 config.TRAIN = edict()
 
-config.TRAIN.SCALES = (600, )
-config.TRAIN.MAX_SIZE = 1000
+# R-CNN and RPN
+config.TRAIN.FINETUNE = False
+config.TRAIN.BATCH_SIZE = 128  # used in grad_scale
 
+# R-CNN
+config.TRAIN.HAS_RPN = False
+config.TRAIN.ASPECT_GROUPING = True
 config.TRAIN.BATCH_IMAGES = 2
-config.TRAIN.BATCH_SIZE = 128
 config.TRAIN.FG_FRACTION = 0.25
 config.TRAIN.FG_THRESH = 0.5
 config.TRAIN.BG_THRESH_HI = 0.5
 config.TRAIN.BG_THRESH_LO = 0.1
 
+# R-CNN bounding box regression
 config.TRAIN.BBOX_REGRESSION_THRESH = 0.5
 config.TRAIN.BBOX_INSIDE_WEIGHTS = np.array([1.0, 1.0, 1.0, 1.0])
 
+# RPN anchor loader
+config.TRAIN.RPN_BATCH_SIZE = 256
+config.TRAIN.RPN_FG_FRACTION = 0.5
+config.TRAIN.RPN_POSITIVE_OVERLAP = 0.7
+config.TRAIN.RPN_NEGATIVE_OVERLAP = 0.3
+config.TRAIN.RPN_CLOBBER_POSITIVES = False
+config.TRAIN.RPN_BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
+config.TRAIN.RPN_POSITIVE_WEIGHT = -1.0
+
+# used for end2end training
+# RPN proposal
+config.TRAIN.RPN_NMS_THRESH = 0.7
+config.TRAIN.RPN_PRE_NMS_TOP_N = 12000
+config.TRAIN.RPN_POST_NMS_TOP_N = 6000
+config.TRAIN.RPN_MIN_SIZE = 16
+# approximate bounding box regression
+config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED = False
+config.TRAIN.BBOX_MEANS = (0.0, 0.0, 0.0, 0.0)
+config.TRAIN.BBOX_STDS = (0.1, 0.1, 0.2, 0.2)
+
 config.TEST = edict()
 
-config.TEST.SCALES = (600, )
+# R-CNN testing
+config.TEST.HAS_RPN = False
+config.TEST.BATCH_IMAGES = 1
 config.TEST.NMS = 0.3
 config.TEST.DEDUP_BOXES = 1. / 16.
+
+# RPN proposal
+config.TEST.RPN_NMS_THRESH = 0.7
+config.TEST.RPN_PRE_NMS_TOP_N = 6000
+config.TEST.RPN_POST_NMS_TOP_N = 300
+config.TEST.RPN_MIN_SIZE = 16
diff --git a/example/rcnn/rcnn/detector.py b/example/rcnn/rcnn/detector.py
index cc9787d3fff4..8e424c973108 100644
--- a/example/rcnn/rcnn/detector.py
+++ b/example/rcnn/rcnn/detector.py
@@ -16,15 +16,16 @@ def __init__(self, symbol, ctx=None,
         self.aux_params = aux_params
         self.executor = None
 
-    def im_detect(self, im_array, roi_array):
+    def im_detect(self, im_array, im_info=None, roi_array=None):
         """
         perform detection of designated im, box, must follow minibatch.get_testbatch format
         :param im_array: numpy.ndarray [b c h w]
+        :param im_info: numpy.ndarray [b 3]
         :param roi_array: numpy.ndarray [roi_num 5]
         :return: scores, pred_boxes
         """
         # remove duplicate feature rois
-        if config.TEST.DEDUP_BOXES > 0:
+        if config.TEST.DEDUP_BOXES > 0 and not config.TEST.HAS_RPN:
             roi_array = roi_array
             # rank roi by v .* (b, dx, dy, dw, dh)
             v = np.array([1, 1e3, 1e6, 1e9, 1e12])
@@ -33,27 +34,44 @@ def im_detect(self, im_array, roi_array):
             _, index, inv_index = np.unique(hashes, return_index=True, return_inverse=True)
             roi_array = roi_array[index, :]
 
-        self.arg_params['data'] = mx.nd.array(im_array, self.ctx)
-        self.arg_params['rois'] = mx.nd.array(roi_array, self.ctx)
-        arg_shapes, out_shapes, aux_shapes = \
-            self.symbol.infer_shape(data=self.arg_params['data'].shape, rois=self.arg_params['rois'].shape)
+        # fill in data
+        if config.TEST.HAS_RPN:
+            self.arg_params['data'] = mx.nd.array(im_array, self.ctx)
+            self.arg_params['im_info'] = mx.nd.array(im_info, self.ctx)
+            arg_shapes, out_shapes, aux_shapes = \
+                self.symbol.infer_shape(data=self.arg_params['data'].shape, im_info=self.arg_params['im_info'].shape)
+        else:
+            self.arg_params['data'] = mx.nd.array(im_array, self.ctx)
+            self.arg_params['rois'] = mx.nd.array(roi_array, self.ctx)
+            arg_shapes, out_shapes, aux_shapes = \
+                self.symbol.infer_shape(data=self.arg_params['data'].shape, rois=self.arg_params['rois'].shape)
+
+        # fill in label and aux
         arg_shapes_dict = {name: shape for name, shape in zip(self.symbol.list_arguments(), arg_shapes)}
         self.arg_params['cls_prob_label'] = mx.nd.zeros(arg_shapes_dict['cls_prob_label'], self.ctx)
-
         aux_names = self.symbol.list_auxiliary_states()
         self.aux_params = {k: mx.nd.zeros(s, self.ctx) for k, s in zip(aux_names, aux_shapes)}
+
+        # execute
         self.executor = self.symbol.bind(self.ctx, self.arg_params, args_grad=None,
                                          grad_req='null', aux_states=self.aux_params)
         output_dict = {name: nd for name, nd in zip(self.symbol.list_outputs(), self.executor.outputs)}
-
         self.executor.forward(is_train=False)
-        scores = output_dict['cls_prob_output'].asnumpy()
-        bbox_deltas = output_dict['bbox_pred_output'].asnumpy()
 
-        pred_boxes = bbox_pred(roi_array[:, 1:], bbox_deltas)
+        # save output
+        scores = output_dict['cls_prob_reshape_output'].asnumpy()[0]
+        bbox_deltas = output_dict['bbox_pred_reshape_output'].asnumpy()[0]
+        if config.TEST.HAS_RPN:
+            rois = output_dict['rois_output'].asnumpy()
+            rois = rois[:, 1:].copy()  # scale back
+        else:
+            rois = roi_array[:, 1:]
+
+        # post processing
+        pred_boxes = bbox_pred(rois, bbox_deltas)
         pred_boxes = clip_boxes(pred_boxes, im_array[0].shape[-2:])
 
-        if config.TEST.DEDUP_BOXES > 0:
+        if config.TEST.DEDUP_BOXES > 0 and not config.TEST.HAS_RPN:
             # map back to original
             scores = scores[inv_index, :]
             pred_boxes = pred_boxes[inv_index, :]
diff --git a/example/rcnn/rcnn/loader.py b/example/rcnn/rcnn/loader.py
new file mode 100644
index 000000000000..cea0900245a3
--- /dev/null
+++ b/example/rcnn/rcnn/loader.py
@@ -0,0 +1,298 @@
+import mxnet as mx
+import numpy as np
+import minibatch
+from config import config
+from mxnet.executor_manager import _split_input_slice
+from helper.processing.image_processing import tensor_vstack
+
+
+class ROIIter(mx.io.DataIter):
+    def __init__(self, roidb, batch_size=2, shuffle=False, mode='train', ctx=None, work_load_list=None):
+        """
+        This Iter will provide roi data to Fast R-CNN network
+        :param roidb: must be preprocessed
+        :param batch_size: must divide BATCH_SIZE(128)
+        :param shuffle: bool
+        :param mode: control returned info
+        :param ctx: list of contexts
+        :param work_load_list: list of work load
+        :return: ROIIter
+        """
+        super(ROIIter, self).__init__()
+
+        self.roidb = roidb
+        self.batch_size = batch_size
+        self.shuffle = shuffle
+        self.mode = mode
+        self.ctx = ctx
+        if self.ctx is None:
+            self.ctx = [mx.cpu()]
+        self.work_load_list = work_load_list
+
+        self.cur = 0
+        self.size = len(roidb)
+        self.index = np.arange(self.size)
+        self.num_classes = self.roidb[0]['gt_overlaps'].shape[1]
+        self.reset()
+
+        self.batch = None
+        self.data = None
+        self.label = None
+        self.get_batch()
+        self.data_name = ['data', 'rois']
+        self.label_name = ['label', 'bbox_target', 'bbox_inside_weight', 'bbox_outside_weight']
+
+    @property
+    def provide_data(self):
+        if self.mode == 'train':
+            return [('data', self.data[0].shape), ('rois', self.data[1].shape)]
+        else:
+            return [(k, v.shape) for k, v in self.data.items()]
+
+    @property
+    def provide_label(self):
+        if self.mode == 'train':
+            return [('label', self.label[0].shape),
+                    ('bbox_target', self.label[1].shape),
+                    ('bbox_inside_weight', self.label[2].shape),
+                    ('bbox_outside_weight', self.label[3].shape)]
+        else:
+            return [(k, v.shape) for k, v in self.data.items()]
+
+    def reset(self):
+        self.cur = 0
+        if self.shuffle:
+            if config.TRAIN.ASPECT_GROUPING:
+                widths = np.array([r['width'] for r in self.roidb])
+                heights = np.array([r['height'] for r in self.roidb])
+                horz = (widths >= heights)
+                vert = np.logical_not(horz)
+                horz_inds = np.where(horz)[0]
+                vert_inds = np.where(vert)[0]
+                inds = np.hstack((np.random.permutation(horz_inds), np.random.permutation(vert_inds)))
+                inds = np.reshape(inds, (-1, 2))
+                row_perm = np.random.permutation(np.arange(inds.shape[0]))
+                inds = np.reshape(inds[row_perm, :], (-1, ))
+                self.index = inds
+            else:
+                np.random.shuffle(self.index)
+
+    def iter_next(self):
+        return self.cur + self.batch_size <= self.size
+
+    def next(self):
+        if self.iter_next():
+            self.get_batch()
+            self.cur += self.batch_size
+            return mx.io.DataBatch(data=self.data, label=self.label,
+                                   pad=self.getpad(), index=self.getindex(),
+                                   provide_data=self.provide_data, provide_label=self.provide_label)
+        else:
+            raise StopIteration
+
+    def getindex(self):
+        return self.cur / self.batch_size
+
+    def getpad(self):
+        if self.cur + self.batch_size > self.size:
+            return self.cur + self.batch_size - self.size
+        else:
+            return 0
+
+    def get_batch(self):
+        cur_from = self.cur
+        cur_to = min(cur_from + self.batch_size, self.size)
+        roidb = [self.roidb[self.index[i]] for i in range(cur_from, cur_to)]
+        if self.mode == 'test':
+            self.data, self.label = minibatch.get_minibatch(roidb, self.num_classes, self.mode)
+        else:
+            work_load_list = self.work_load_list
+            ctx = self.ctx
+            if work_load_list is None:
+                work_load_list = [1] * len(ctx)
+            assert isinstance(work_load_list, list) and len(work_load_list) == len(ctx), \
+                "Invalid settings for work load. "
+            slices = _split_input_slice(self.batch_size, work_load_list)
+
+            data_list = []
+            label_list = []
+            for islice in slices:
+                iroidb = [roidb[i] for i in range(islice.start, islice.stop)]
+                data, label = minibatch.get_minibatch(iroidb, self.num_classes, self.mode)
+                data_list.append(data)
+                label_list.append(label)
+
+            all_data = dict()
+            for key in data_list[0].keys():
+                all_data[key] = tensor_vstack([batch[key] for batch in data_list])
+
+            all_label = dict()
+            for key in label_list[0].keys():
+                all_label[key] = tensor_vstack([batch[key] for batch in label_list])
+
+            self.data = [mx.nd.array(all_data['data']),
+                         mx.nd.array(all_data['rois'])]
+            self.label = [mx.nd.array(all_label['label']),
+                          mx.nd.array(all_label['bbox_target']),
+                          mx.nd.array(all_label['bbox_inside_weight']),
+                          mx.nd.array(all_label['bbox_outside_weight'])]
+
+
+class AnchorLoader(mx.io.DataIter):
+    def __init__(self, feat_sym, roidb, batch_size=1, shuffle=False, mode='train', ctx=None, work_load_list=None,
+                 feat_stride=16, anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2), allowed_border=0):
+        """
+        This Iter will provide image data and assigned anchor labels to RPN
+        :param feat_sym: to infer shape of assign_output
+        :param roidb: must be preprocessed
+        :param batch_size: must divide BATCH_SIZE(128)
+        :param shuffle: bool
+        :param mode: control returned info
+        :param ctx: list of contexts
+        :param work_load_list: list of work load
+        :return: AnchorLoader
+        """
+        super(AnchorLoader, self).__init__()
+
+        self.feat_sym = feat_sym
+        self.roidb = roidb
+        self.batch_size = batch_size
+        self.shuffle = shuffle
+        self.mode = mode
+        self.ctx = ctx
+        if self.ctx is None:
+            self.ctx = [mx.cpu()]
+        self.work_load_list = work_load_list
+        self.feat_stride = feat_stride
+        self.anchor_scales = anchor_scales
+        self.anchor_ratios = anchor_ratios
+        self.allowed_border = allowed_border
+
+        self.cur = 0
+        self.size = len(roidb)
+        self.index = np.arange(self.size)
+        self.num_classes = self.roidb[0]['gt_overlaps'].shape[1]
+        self.reset()
+
+        self.batch = None
+        self.data = None
+        self.label = None
+        self.get_batch()
+        self.data_name = ['data', 'im_info']
+        self.label_name = ['label', 'bbox_target', 'bbox_inside_weight', 'bbox_outside_weight']
+
+    @property
+    def provide_data(self):
+        if self.mode == 'train':
+            return [('data', self.data[0].shape)]
+        else:
+            return [(k, v.shape) for k, v in self.data.items()]
+
+    @property
+    def provide_label(self):
+        if self.mode == 'train':
+            return [('label', self.label[0].shape),
+                    ('bbox_target', self.label[1].shape),
+                    ('bbox_inside_weight', self.label[2].shape),
+                    ('bbox_outside_weight', self.label[3].shape)]
+        else:
+            return [(k, v.shape) for k, v in self.data.items()]
+
+    def reset(self):
+        self.cur = 0
+        if self.shuffle:
+            if config.TRAIN.ASPECT_GROUPING:
+                widths = np.array([r['width'] for r in self.roidb])
+                heights = np.array([r['height'] for r in self.roidb])
+                horz = (widths >= heights)
+                vert = np.logical_not(horz)
+                horz_inds = np.where(horz)[0]
+                vert_inds = np.where(vert)[0]
+                inds = np.hstack((np.random.permutation(horz_inds), np.random.permutation(vert_inds)))
+                inds = np.reshape(inds, (-1, 2))
+                row_perm = np.random.permutation(np.arange(inds.shape[0]))
+                inds = np.reshape(inds[row_perm, :], (-1, ))
+                self.index = inds
+            else:
+                np.random.shuffle(self.index)
+
+    def iter_next(self):
+        return self.cur + self.batch_size <= self.size
+
+    def next(self):
+        if self.iter_next():
+            self.get_batch()
+            self.cur += self.batch_size
+            return mx.io.DataBatch(data=self.data, label=self.label,
+                                   pad=self.getpad(), index=self.getindex(),
+                                   provide_data=self.provide_data, provide_label=self.provide_label)
+        else:
+            raise StopIteration
+
+    def getindex(self):
+        return self.cur / self.batch_size
+
+    def getpad(self):
+        if self.cur + self.batch_size > self.size:
+            return self.cur + self.batch_size - self.size
+        else:
+            return 0
+
+    def get_batch(self):
+        cur_from = self.cur
+        cur_to = min(cur_from + self.batch_size, self.size)
+        roidb = [self.roidb[self.index[i]] for i in range(cur_from, cur_to)]
+        if self.mode == 'test':
+            self.data, self.label = minibatch.get_minibatch(roidb, self.num_classes, self.mode)
+        else:
+            work_load_list = self.work_load_list
+            ctx = self.ctx
+            if work_load_list is None:
+                work_load_list = [1] * len(ctx)
+            assert isinstance(work_load_list, list) and len(work_load_list) == len(ctx), \
+                "Invalid settings for work load. "
+            slices = _split_input_slice(self.batch_size, work_load_list)
+
+            data_list = []
+            label_list = []
+            for islice in slices:
+                iroidb = [roidb[i] for i in range(islice.start, islice.stop)]
+                data, label = minibatch.get_minibatch(iroidb, self.num_classes, self.mode)
+                data_list.append(data)
+                label_list.append(label)
+
+            # pad data first and then assign anchor (read label)
+            data_tensor = tensor_vstack([batch['data'] for batch in data_list])
+            for data, data_pad in zip(data_list, data_tensor):
+                data['data'] = data_pad[np.newaxis, :]
+
+            new_label_list = []
+            for data, label in zip(data_list, label_list):
+                # infer label shape
+                data_shape = {k: v.shape for k, v in data.items()}
+                del data_shape['im_info']
+                _, feat_shape, _ = self.feat_sym.infer_shape(**data_shape)
+                feat_shape = [int(i) for i in feat_shape[0]]
+
+                # assign anchor for label
+                label = minibatch.assign_anchor(feat_shape, label['gt_boxes'], data['im_info'],
+                                                self.feat_stride, self.anchor_scales,
+                                                self.anchor_ratios, self.allowed_border)
+                del data['im_info']
+                new_label_list.append(label)
+
+            all_data = dict()
+            for key in ['data']:
+                all_data[key] = tensor_vstack([batch[key] for batch in data_list])
+
+            all_label = dict()
+            all_label['label'] = tensor_vstack([batch['label'] for batch in new_label_list], pad=-1)
+            for key in ['bbox_target', 'bbox_inside_weight', 'bbox_outside_weight']:
+                all_label[key] = tensor_vstack([batch[key] for batch in new_label_list])
+
+            self.data = [mx.nd.array(all_data['data'])]
+
+            self.label = [mx.nd.array(all_label['label']),
+                          mx.nd.array(all_label['bbox_target']),
+                          mx.nd.array(all_label['bbox_inside_weight']),
+                          mx.nd.array(all_label['bbox_outside_weight'])]
diff --git a/example/rcnn/rcnn/metric.py b/example/rcnn/rcnn/metric.py
index 8bf5119dffc3..b8bd90875604 100644
--- a/example/rcnn/rcnn/metric.py
+++ b/example/rcnn/rcnn/metric.py
@@ -4,14 +4,52 @@
 from rcnn.config import config
 
 
+class AccuracyMetric(mx.metric.EvalMetric):
+    def __init__(self, use_ignore=False, ignore=None):
+        super(AccuracyMetric, self).__init__('Accuracy')
+        self.use_ignore = use_ignore
+        self.ignore = ignore
+        self.has_rpn = config.TRAIN.HAS_RPN
+        if self.has_rpn:
+            assert self.use_ignore and self.ignore is not None
+
+    def update(self, labels, preds):
+        if self.has_rpn:
+            pred_label = mx.ndarray.argmax_channel(preds[0]).asnumpy().astype('int32')
+            label = labels[0].asnumpy().astype('int32')
+            non_ignore_inds = np.where(label != self.ignore)
+            pred_label = pred_label[non_ignore_inds]
+            label = label[non_ignore_inds]
+        else:
+            last_dim = preds[0].shape[-1]
+            pred_label = preds[0].asnumpy().reshape(-1, last_dim).argmax(axis=1).astype('int32')
+            label = labels[0].asnumpy().reshape(-1,).astype('int32')
+
+        self.sum_metric += (pred_label.flat == label.flat).sum()
+        self.num_inst += len(pred_label.flat)
+
+
 class LogLossMetric(mx.metric.EvalMetric):
-    def __init__(self):
+    def __init__(self, use_ignore=False, ignore=None):
         super(LogLossMetric, self).__init__('LogLoss')
+        self.use_ignore = use_ignore
+        self.ignore = ignore
+        self.has_rpn = config.TRAIN.HAS_RPN
+        if self.has_rpn:
+            assert self.use_ignore and self.ignore is not None
 
     def update(self, labels, preds):
-        pred_cls = preds[0].asnumpy()
-        label = labels[0].asnumpy().astype('int32')
-        cls = pred_cls[np.arange(label.shape[0]), label]
+        if self.has_rpn:
+            pred_cls = preds[0].asnumpy()[0]
+            label = labels[0].asnumpy().astype('int32')[0]
+            non_ignore_inds = np.where(label != self.ignore)[0]
+            label = label[non_ignore_inds]
+            cls = pred_cls[label, non_ignore_inds]
+        else:
+            last_dim = preds[0].shape[-1]
+            pred_cls = preds[0].asnumpy().reshape(-1, last_dim)
+            label = labels[0].asnumpy().reshape(-1,).astype('int32')
+            cls = pred_cls[np.arange(label.shape[0]), label]
         cls += config.EPS
         cls_loss = -1 * np.log(cls)
         cls_loss = np.sum(cls_loss)
@@ -22,10 +60,15 @@ def update(self, labels, preds):
 class SmoothL1LossMetric(mx.metric.EvalMetric):
     def __init__(self):
         super(SmoothL1LossMetric, self).__init__('SmoothL1Loss')
+        self.has_rpn = config.TRAIN.HAS_RPN
 
     def update(self, labels, preds):
-        bbox_loss = preds[0].asnumpy()
-        label = labels[0].asnumpy()
+        bbox_loss = preds[1].asnumpy()
+        if self.has_rpn:
+            bbox_loss = bbox_loss.reshape((bbox_loss.shape[0], -1))
+        else:
+            first_dim = bbox_loss.shape[0] * bbox_loss.shape[1]
+            bbox_loss = bbox_loss.reshape(first_dim, -1)
+        self.num_inst += bbox_loss.shape[0]
         bbox_loss = np.sum(bbox_loss)
         self.sum_metric += bbox_loss
-        self.num_inst += label.shape[0]
diff --git a/example/rcnn/rcnn/minibatch.py b/example/rcnn/rcnn/minibatch.py
index b47ec0a7cf07..920d27eef22b 100644
--- a/example/rcnn/rcnn/minibatch.py
+++ b/example/rcnn/rcnn/minibatch.py
@@ -1,18 +1,24 @@
 """
 To construct data iterator from imdb, batch sampling procedure are defined here
-training minibatch =
+RPN:
+data =
     {'data': [num_images, c, h, w],
-    'rois': [num_rois, 5],
-    'labels': [num_rois],
-    'bbox_targets': [num_rois, 4 * num_classes],
-    'bbox_inside_weights': [num_rois, 4 * num_classes],
-    'bbox_outside_weights': [num_rois, 4 * num_classes]}
-    num_images should divide config['TRAIN_BATCH_SIZE'] and num_rois = config['TRAIN_BATCH_SIZE'] / num_images
-validation minibatch is similar except num_images = 1 and num_rois = all rois
-testing minibatch =
+    'im_info': [num_images, 3] (optional)}
+label =
+prototype: {'gt_boxes': [num_boxes, 5]}
+final:  {'label': [batch_size, 1] <- [batch_size, num_anchors, feat_height, feat_width],
+         'bbox_target': [batch_size, num_anchors, feat_height, feat_width],
+         'bbox_inside_weight': [batch_size, num_anchors, feat_height, feat_width],
+         'bbox_outside_weight': [batch_size, num_anchors, feat_height, feat_width]}
+Fast R-CNN:
+data =
     {'data': [num_images, c, h, w],
-    'rois': [num_rois, 5]}
-    num_images = 1 and num_rois = all rois
+    'rois': [num_images, num_rois, 5]}
+label =
+    {'label': [num_images, num_rois],
+    'bbox_target': [num_images, num_rois, 4 * num_classes],
+    'bbox_inside_weight': [num_images, num_rois, 4 * num_classes],
+    'bbox_outside_weight': [num_images, num_rois, 4 * num_classes]}
 """
 
 import cv2
@@ -21,87 +27,101 @@
 
 from helper.processing import image_processing
 from helper.processing.bbox_regression import expand_bbox_regression_targets
+from helper.processing.generate_anchor import generate_anchors
+from helper.processing.bbox_regression import bbox_overlaps
+from helper.processing.bbox_transform import bbox_transform
 from rcnn.config import config
 
 
-def get_minibatch(roidb, num_classes):
+def get_minibatch(roidb, num_classes, mode='test'):
     """
     return minibatch of images in roidb
-    :param roidb: subset of main database
+    :param roidb: a list of dict, whose length controls batch size
     :param num_classes: number of classes is used in bbox regression targets
-    :return: minibatch: {'data', 'rois', 'labels', 'bbox_targets', 'bbox_inside_weights', 'bbox_outside_weights'}
+    :param mode: 'train' returns filled labels, any other mode returns an empty label dict
+    :return: data, label
     """
+    # build im_array: [num_images, c, h, w]
     num_images = len(roidb)
-    random_scale_indexes = npr.randint(0, high=len(config.TRAIN.SCALES), size=num_images)
-    assert config.TRAIN.BATCH_SIZE % num_images == 0, \
-        'num_images {} must devide BATCHSIZE {}'.format(num_images, config.TRAIN.BATCH_SIZE)
-    rois_per_image = config.TRAIN.BATCH_SIZE / num_images
-    fg_rois_per_image = np.round(config.TRAIN.FG_FRACTION * rois_per_image).astype(int)
-
-    # im_array: [num_images, c, h, w]
-    im_array, im_scales = get_image_array(roidb, config.TRAIN.SCALES, random_scale_indexes)
-
-    rois_array = list()
-    labels_array = list()
-    bbox_targets_array = list()
-    bbox_inside_array = list()
-
-    for im_i in range(num_images):
-        im_rois, labels, bbox_targets, bbox_inside_weights, overlaps = \
-            sample_rois(roidb[im_i], fg_rois_per_image, rois_per_image, num_classes)
-
-        # project im_rois
-        # do not round roi
-        rois = im_rois * im_scales[im_i]
-        batch_index = im_i * np.ones((rois.shape[0], 1))
-        rois_array_this_image = np.hstack((batch_index, rois))
-        rois_array.append(rois_array_this_image)
-
-        # add labels
-        labels_array.append(labels)
-        bbox_targets_array.append(bbox_targets)
-        bbox_inside_array.append(bbox_inside_weights)
-
-    rois_array = np.vstack(rois_array)
-    labels_array = np.hstack(labels_array)
-    bbox_targets_array = np.vstack(bbox_targets_array)
-    bbox_inside_array = np.vstack(bbox_inside_array)
-    bbox_outside_array = np.array(bbox_inside_array > 0).astype(np.float32)
-
-    minibatch = {'data': im_array,
-                 'rois': rois_array,
-                 'labels': labels_array,
-                 'bbox_targets': bbox_targets_array,
-                 'bbox_inside_weights': bbox_inside_array,
-                 'bbox_outside_weights': bbox_outside_array}
-    return minibatch
-
-
-def get_testbatch(roidb, num_classes):
-    """
-    return test batch of given roidb
-    actually, there is only one testing scale and len(roidb) is 1
-    :param roidb: subset of main database
-    :param num_classes: number of classes is used in bbox regression targets
-    :return: minibatch: {'data', 'rois'}
-    """
-    num_images = len(roidb)
-    random_scale_indexes = npr.randint(0, high=len(config.TEST.SCALES), size=num_images)
-    im_array, im_scales = get_image_array(roidb, config.TEST.SCALES, random_scale_indexes)
+    random_scale_indexes = npr.randint(0, high=len(config.SCALES), size=num_images)
+    im_array, im_scales = get_image_array(roidb, config.SCALES, random_scale_indexes)
+
+    if mode == 'train':
+        cfg_key = 'TRAIN'
+    else:
+        cfg_key = 'TEST'
+
+    if config[cfg_key].HAS_RPN:
+        assert len(roidb) == 1, 'Single batch only'
+        assert len(im_scales) == 1, 'Single batch only'
+        im_info = np.array([[im_array.shape[2], im_array.shape[3], im_scales[0]]], dtype=np.float32)
+
+        data = {'data': im_array,
+                'im_info': im_info}
+        label = {}
+
+        if mode == 'train':
+            # gt boxes: (x1, y1, x2, y2, cls)
+            gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0]
+            gt_boxes = np.empty((roidb[0]['boxes'].shape[0], 5), dtype=np.float32)
+            gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0]
+            gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds]
+            label = {'gt_boxes': gt_boxes}
+    else:
+        if mode == 'train':
+            assert config.TRAIN.BATCH_SIZE % config.TRAIN.BATCH_IMAGES == 0, \
+                'BATCHIMAGES {} must devide BATCHSIZE {}'.format(config.TRAIN.BATCH_IMAGES, config.TRAIN.BATCH_SIZE)
+            rois_per_image = config.TRAIN.BATCH_SIZE / config.TRAIN.BATCH_IMAGES
+            fg_rois_per_image = np.round(config.TRAIN.FG_FRACTION * rois_per_image).astype(int)
+
+            rois_array = list()
+            labels_array = list()
+            bbox_targets_array = list()
+            bbox_inside_array = list()
+
+            for im_i in range(num_images):
+                im_rois, labels, bbox_targets, bbox_inside_weights, overlaps = \
+                    sample_rois(roidb[im_i], fg_rois_per_image, rois_per_image, num_classes)
 
-    rois_array = list()
-    for im_i in range(num_images):
-        im_rois = roidb[im_i]['boxes']
-        rois = im_rois * im_scales[im_i]
-        batch_index = im_i * np.ones((rois.shape[0], 1))
-        rois_array_this_image = np.hstack((batch_index, rois))
-        rois_array.append(rois_array_this_image)
+                # project im_rois
+                # do not round roi
+                rois = im_rois * im_scales[im_i]
+                batch_index = im_i * np.ones((rois.shape[0], 1))
+                rois_array_this_image = np.hstack((batch_index, rois))
+                rois_array.append(rois_array_this_image)
 
-    rois_array = np.vstack(rois_array)
+                # add labels
+                labels_array.append(labels)
+                bbox_targets_array.append(bbox_targets)
+                bbox_inside_array.append(bbox_inside_weights)
 
-    testbatch = {'data': im_array,
-                 'rois': rois_array}
-    return testbatch
+            rois_array = np.array(rois_array)
+            labels_array = np.array(labels_array)
+            bbox_targets_array = np.array(bbox_targets_array)
+            bbox_inside_array = np.array(bbox_inside_array)
+            bbox_outside_array = np.array(bbox_inside_array > 0).astype(np.float32)
+
+            data = {'data': im_array,
+                    'rois': rois_array}
+            label = {'label': labels_array,
+                     'bbox_target': bbox_targets_array,
+                     'bbox_inside_weight': bbox_inside_array,
+                     'bbox_outside_weight': bbox_outside_array}
+        else:
+            rois_array = list()
+            for im_i in range(num_images):
+                im_rois = roidb[im_i]['boxes']
+                rois = im_rois * im_scales[im_i]
+                batch_index = im_i * np.ones((rois.shape[0], 1))
+                rois_array_this_image = np.hstack((batch_index, rois))
+                rois_array.append(rois_array_this_image)
+            rois_array = np.vstack(rois_array)
+
+            data = {'data': im_array,
+                    'rois': rois_array}
+            label = {}
+
+    return data, label
 
 
 def get_image_array(roidb, scales, scale_indexes):
@@ -120,7 +140,7 @@ def get_image_array(roidb, scales, scale_indexes):
         if roidb[i]['flipped']:
             im = im[:, ::-1, :]
         target_size = scales[scale_indexes[i]]
-        im, im_scale = image_processing.resize(im, target_size, config.TRAIN.MAX_SIZE)
+        im, im_scale = image_processing.resize(im, target_size, config.MAX_SIZE)
         im_tensor = image_processing.transform(im, config.PIXEL_MEANS)
         processed_ims.append(im_tensor)
         im_scales.append(im_scale)
@@ -179,3 +199,193 @@ def sample_rois(roidb, fg_rois_per_image, rois_per_image, num_classes):
         expand_bbox_regression_targets(roidb['bbox_targets'][keep_indexes, :], num_classes)
 
     return rois, labels, bbox_targets, bbox_inside_weights, overlaps
+
+
+def assign_anchor(feat_shape, gt_boxes, im_info, feat_stride=16,
+                  scales=(8, 16, 32), ratios=(0.5, 1, 2), allowed_border=0):
+    """
+    assign ground truth boxes to anchor positions (single image: only im_info[0] is used, outputs have batch dim 1)
+    :param feat_shape: feature map shape, the last two entries are read as (feat_height, feat_width)
+    :param gt_boxes: ground truth array with 5 columns, the first 4 are the box coordinates; may be empty
+    :param im_info: im_info[0] is read as (height, width, ...) to filter out anchors overlapped with edges
+    :param feat_stride: anchor position step
+    :param scales: used to generate anchors, affects num_anchors (per location)
+    :param ratios: aspect ratios of generated anchors
+    :param allowed_border: filter out anchors with edge overlap > allowed_border
+    :return: dict of label
+    'label': of shape (1, num_anchors * feat_height * feat_width), 1 is positive, 0 is negative, -1 is dont care
+    'bbox_target': of shape (1, num_anchors * 4, feat_height, feat_width)
+    'bbox_inside_weight': same shape, RPN_BBOX_INSIDE_WEIGHTS on positive anchors and 0 elsewhere
+    'bbox_outside_weight': same shape, used to normalize the bbox_loss (see RPN_POSITIVE_WEIGHT)
+    """
+    def _unmap(data, count, inds, fill=0):
+        """ unmap a subset inds of data into original data of size count """
+        if len(data.shape) == 1:
+            ret = np.empty((count,), dtype=np.float32)
+            ret.fill(fill)
+            ret[inds] = data
+        else:
+            ret = np.empty((count,) + data.shape[1:], dtype=np.float32)
+            ret.fill(fill)
+            ret[inds, :] = data
+        return ret
+
+    def _compute_targets(ex_rois, gt_rois):
+        """ compute bbox targets for an image """
+        assert ex_rois.shape[0] == gt_rois.shape[0]
+        assert ex_rois.shape[1] == 4
+        assert gt_rois.shape[1] == 5
+
+        return bbox_transform(ex_rois, gt_rois[:, :4]).astype(np.float32, copy=False)
+
+    DEBUG = False
+    im_info = im_info[0]
+    scales = np.array(scales, dtype=np.float32)
+    base_anchors = generate_anchors(base_size=16, ratios=list(ratios), scales=scales)
+    num_anchors = base_anchors.shape[0]
+    feat_height, feat_width = feat_shape[-2:]
+
+    if DEBUG:
+        print 'anchors:'
+        print base_anchors
+        print 'anchor shapes:'
+        print np.hstack((base_anchors[:, 2::4] - base_anchors[:, 0::4],
+                         base_anchors[:, 3::4] - base_anchors[:, 1::4]))
+        print 'im_info', im_info
+        print 'height', feat_height, 'width', feat_width
+        print 'gt_boxes shape', gt_boxes.shape
+        print 'gt_boxes', gt_boxes
+
+    # 1. generate all anchors by shifting the base anchors over the feature map
+    shift_x = np.arange(0, feat_width) * feat_stride
+    shift_y = np.arange(0, feat_height) * feat_stride
+    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
+    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel())).transpose()
+    # add A anchors (1, A, 4) to
+    # cell K shifts (K, 1, 4) to get
+    # shift anchors (K, A, 4)
+    # reshape to (K*A, 4) shifted anchors
+    A = num_anchors
+    K = shifts.shape[0]
+    all_anchors = base_anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2))
+    all_anchors = all_anchors.reshape((K * A, 4))
+    total_anchors = int(K * A)
+
+    # only keep anchors inside the image
+    inds_inside = np.where((all_anchors[:, 0] >= -allowed_border) &
+                           (all_anchors[:, 1] >= -allowed_border) &
+                           (all_anchors[:, 2] < im_info[1] + allowed_border) &
+                           (all_anchors[:, 3] < im_info[0] + allowed_border))[0]
+    if DEBUG:
+        print 'total_anchors', total_anchors
+        print 'inds_inside', len(inds_inside)
+
+    # keep only inside anchors
+    anchors = all_anchors[inds_inside, :]
+    if DEBUG:
+        print 'anchors shape', anchors.shape
+
+    # label: 1 is positive, 0 is negative, -1 is dont care
+    labels = np.empty((len(inds_inside),), dtype=np.float32)
+    labels.fill(-1)
+
+    if gt_boxes.size > 0:
+        # overlap between the anchors and the gt boxes
+        # overlaps (ex, gt)
+        overlaps = bbox_overlaps(anchors.astype(np.float), gt_boxes.astype(np.float))
+        argmax_overlaps = overlaps.argmax(axis=1)
+        max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]
+        gt_argmax_overlaps = overlaps.argmax(axis=0)
+        gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])]
+        gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
+
+        if not config.TRAIN.RPN_CLOBBER_POSITIVES:
+            # assign bg labels first so that positive labels can clobber them
+            labels[max_overlaps < config.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
+
+        # fg label: for each gt, anchor with highest overlap
+        labels[gt_argmax_overlaps] = 1
+
+        # fg label: above threshold IoU
+        labels[max_overlaps >= config.TRAIN.RPN_POSITIVE_OVERLAP] = 1
+
+        if config.TRAIN.RPN_CLOBBER_POSITIVES:
+            # assign bg labels last so that negative labels can clobber positives
+            labels[max_overlaps < config.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
+    else:
+        labels[:] = 0
+
+    # subsample positive labels if we have too many
+    num_fg = int(config.TRAIN.RPN_FG_FRACTION * config.TRAIN.RPN_BATCH_SIZE)
+    fg_inds = np.where(labels == 1)[0]
+    if len(fg_inds) > num_fg:
+        disable_inds = npr.choice(fg_inds, size=(len(fg_inds) - num_fg), replace=False)
+        if DEBUG:
+            disable_inds = fg_inds[:(len(fg_inds) - num_fg)]
+        labels[disable_inds] = -1
+
+    # subsample negative labels if we have too many
+    num_bg = config.TRAIN.RPN_BATCH_SIZE - np.sum(labels == 1)
+    bg_inds = np.where(labels == 0)[0]
+    if len(bg_inds) > num_bg:
+        disable_inds = npr.choice(bg_inds, size=(len(bg_inds) - num_bg), replace=False)
+        if DEBUG:
+            disable_inds = bg_inds[:(len(bg_inds) - num_bg)]
+        labels[disable_inds] = -1
+
+    bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32)
+    if gt_boxes.size > 0:
+        bbox_targets[:] = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])
+
+    bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
+    bbox_inside_weights[labels == 1, :] = np.array(config.TRAIN.RPN_BBOX_INSIDE_WEIGHTS)
+
+    bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
+    if config.TRAIN.RPN_POSITIVE_WEIGHT < 0:
+        # uniform weighting of exampling (given non-uniform sampling)
+        num_examples = np.sum(labels >= 0)
+        positive_weights = np.ones((1, 4)) * 1.0 / num_examples
+        negative_weights = np.ones((1, 4)) * 1.0 / num_examples
+    else:
+        assert ((config.TRAIN.RPN_POSITIVE_WEIGHT > 0) & (config.TRAIN.RPN_POSITIVE_WEIGHT < 1))
+        positive_weights = config.TRAIN.RPN_POSITIVE_WEIGHT / np.sum(labels == 1)
+        negative_weights = (1.0 - config.TRAIN.RPN_POSITIVE_WEIGHT) / np.sum(labels == 0)  # bg share over bg count
+    bbox_outside_weights[labels == 1, :] = positive_weights
+    bbox_outside_weights[labels == 0, :] = negative_weights
+
+    if DEBUG:
+        _sums = bbox_targets[labels == 1, :].sum(axis=0)
+        _squared_sums = (bbox_targets[labels == 1, :] ** 2).sum(axis=0)
+        _counts = config.EPS + np.sum(labels == 1)
+        means = _sums / _counts
+        stds = np.sqrt(_squared_sums / _counts - means ** 2)
+        print 'means', means
+        print 'stdevs', stds
+
+    # map up to original set of anchors
+    labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
+    bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)
+    bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0)
+    bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0)
+
+    if DEBUG:
+        print 'rpn: max max_overlaps', np.max(max_overlaps)
+        print 'rpn: num_positives', np.sum(labels == 1)
+        print 'rpn: num_negatives', np.sum(labels == 0)
+        _fg_sum = np.sum(labels == 1)
+        _bg_sum = np.sum(labels == 0)
+        _count = 1
+        print 'rpn: num_positive avg', _fg_sum / _count
+        print 'rpn: num_negative avg', _bg_sum / _count
+
+    labels = labels.reshape((1, feat_height, feat_width, A)).transpose(0, 3, 1, 2)
+    labels = labels.reshape((1, A * feat_height * feat_width))
+    bbox_targets = bbox_targets.reshape((1, feat_height, feat_width, A * 4)).transpose(0, 3, 1, 2)
+    bbox_inside_weights = bbox_inside_weights.reshape((1, feat_height, feat_width, A * 4)).transpose((0, 3, 1, 2))
+    bbox_outside_weights = bbox_outside_weights.reshape((1, feat_height, feat_width, A * 4)).transpose((0, 3, 1, 2))
+
+    label = {'label': labels,
+             'bbox_target': bbox_targets,
+             'bbox_inside_weight': bbox_inside_weights,
+             'bbox_outside_weight': bbox_outside_weights}
+    return label
diff --git a/example/rcnn/rcnn/module.py b/example/rcnn/rcnn/module.py
new file mode 100644
index 000000000000..23fb4ce7dad8
--- /dev/null
+++ b/example/rcnn/rcnn/module.py
@@ -0,0 +1,193 @@
+"""A `MutableModule` implements the `BaseModule` API and allows input shapes
+to vary between training iterations. If shapes vary, executors are rebound,
+using shared arrays from the initial module bound with the maximum shapes.
+"""
+
+import logging
+
+from mxnet import context as ctx
+from mxnet.initializer import Uniform
+from mxnet.module.base_module import BaseModule
+from mxnet.module.module import Module
+# import numpy as np
+
+class MutableModule(BaseModule):
+    """A mutable module is a module that supports variable input data.
+
+    Parameters
+    ----------
+    symbol : Symbol
+    data_names : list of str
+    label_names : list of str
+    logger : Logger
+    context : Context or list of Context
+    work_load_list : list of number
+    max_data_shapes : list of (name, shape) tuple, maximum shapes of the data inputs whose shapes vary
+    max_label_shapes : list of (name, shape) tuple, maximum shapes of the label inputs whose shapes vary
+    """
+    def __init__(self, symbol, data_names, label_names,
+                 logger=logging, context=ctx.cpu(), work_load_list=None,
+                 max_data_shapes=None, max_label_shapes=None):
+        super(MutableModule, self).__init__(logger=logger)
+        self._symbol = symbol
+        self._data_names = data_names
+        self._label_names = label_names
+        self._context = context
+        self._work_load_list = work_load_list
+
+        self._curr_module = None
+        self._max_data_shapes = max_data_shapes
+        self._max_label_shapes = max_label_shapes
+        if self._max_data_shapes is None:
+            self._max_data_shapes = []
+        if self._max_label_shapes is None:
+            self._max_label_shapes = []
+
+    def _reset_bind(self):
+        """Drop the current inner module and mark this module as not bound."""
+        self.binded = False
+        self._curr_module = None
+
+    @property
+    def data_names(self):
+        return self._data_names
+
+    @property
+    def output_names(self):
+        return self._symbol.list_outputs()
+
+    @property
+    def data_shapes(self):
+        assert self.binded
+        return self._curr_module.data_shapes
+
+    @property
+    def label_shapes(self):
+        assert self.binded
+        return self._curr_module.label_shapes
+
+    @property
+    def output_shapes(self):
+        assert self.binded
+        return self._curr_module.output_shapes
+
+    def get_params(self):
+        """Return get_params() of the current inner module; requires bind and init_params first."""
+        assert self.binded and self.params_initialized
+        return self._curr_module.get_params()
+
+    def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None,
+                    allow_missing=False, force_init=False):
+        """Initialize the inner module's parameters; does nothing if already done unless force_init."""
+        if self.params_initialized and not force_init:
+            return
+        assert self.binded, 'call bind before initializing the parameters'
+        self._curr_module.init_params(initializer=initializer, arg_params=arg_params,
+                                      aux_params=aux_params, allow_missing=allow_missing,
+                                      force_init=force_init)
+        self.params_initialized = True
+
+    def bind(self, data_shapes, label_shapes=None, for_training=True,
+             inputs_need_grad=False, force_rebind=False, shared_module=None):
+        """Bind an inner Module, using the maximum shapes given at construction for the inputs
+        named there and the passed shapes for every other input. shared_module must be None."""
+        # in case we already initialized params, keep it
+        if self.params_initialized:
+            arg_params, aux_params = self.get_params()
+
+        # force rebinding is typically used when one want to switch from
+        # training to prediction phase.
+        if force_rebind:
+            self._reset_bind()
+
+        if self.binded:
+            self.logger.warning('Already binded, ignoring bind()')
+            return
+
+        assert shared_module is None, 'shared_module for MutableModule is not supported'
+
+        self.for_training = for_training
+        self.inputs_need_grad = inputs_need_grad
+        self.binded = True
+
+        max_shapes_dict = dict(self._max_data_shapes + self._max_label_shapes)
+        max_data_shapes = list()
+        for name, shape in data_shapes:
+            if name in max_shapes_dict:
+                max_data_shapes.append((name, max_shapes_dict[name]))
+            else:
+                max_data_shapes.append((name, shape))
+        # label_shapes defaults to None (no labels, e.g. prediction): pass None on
+        # to Module.bind instead of iterating over it
+        max_label_shapes = None
+        if label_shapes is not None:
+            max_label_shapes = [(name, max_shapes_dict.get(name, shape))
+                                for name, shape in label_shapes]
+
+        module = Module(self._symbol, self._data_names, self._label_names, logger=self.logger,
+                        context=self._context, work_load_list=self._work_load_list)
+        module.bind(max_data_shapes, max_label_shapes, for_training, inputs_need_grad,
+                    force_rebind=False, shared_module=None)
+        self._curr_module = module
+
+        # copy back saved params, if already initialized
+        if self.params_initialized:
+            self.set_params(arg_params, aux_params)
+
+    def init_optimizer(self, kvstore='local', optimizer='sgd',
+                       optimizer_params=(('learning_rate', 0.01),), force_init=False):
+        """Initialize the inner module's optimizer; warns and returns if already done unless force_init."""
+        assert self.binded and self.params_initialized
+        if self.optimizer_initialized and not force_init:
+            self.logger.warning('optimizer already initialized, ignoring.')
+            return
+
+        self._curr_module.init_optimizer(kvstore, optimizer, optimizer_params,
+                                         force_init=force_init)
+        self.optimizer_initialized = True
+
+    def forward(self, data_batch, is_train=None):
+        """Run forward on data_batch. If its shapes differ from the current module's, a new
+        Module is first bound to the batch shapes with the current module as shared_module."""
+        assert self.binded and self.params_initialized
+
+        # label shapes are None when the module was bound, or the batch was built,
+        # without labels; treat that as "no label inputs" when comparing shapes
+        curr_label_shapes = self._curr_module.label_shapes or []
+        current_shapes = dict(self._curr_module.data_shapes + curr_label_shapes)
+        input_shapes = dict(data_batch.provide_data + (data_batch.provide_label or []))
+        shape_changed = any(v != input_shapes[k] for k, v in current_shapes.items())
+
+        if shape_changed:
+            module = Module(self._symbol, self._data_names, self._label_names,
+                            logger=self.logger, context=self._context,
+                            work_load_list=self._work_load_list)
+            module.bind(data_batch.provide_data, data_batch.provide_label, self._curr_module.for_training,
+                        self._curr_module.inputs_need_grad, force_rebind=False,
+                        shared_module=self._curr_module)
+            self._curr_module = module
+
+        self._curr_module.forward(data_batch, is_train=is_train)
+
+    def backward(self, out_grads=None):
+        """Run backward on the current inner module."""
+        assert self.binded and self.params_initialized
+        self._curr_module.backward(out_grads=out_grads)
+
+    def update(self):
+        """Apply one optimizer update on the current inner module."""
+        assert self.binded and self.params_initialized and self.optimizer_initialized
+        self._curr_module.update()
+
+    def get_outputs(self, merge_multi_context=True):
+        """Return the outputs of the current inner module."""
+        assert self.binded and self.params_initialized
+        return self._curr_module.get_outputs(merge_multi_context=merge_multi_context)
+
+    def get_input_grads(self, merge_multi_context=True):
+        assert self.binded and self.params_initialized and self.inputs_need_grad
+        return self._curr_module.get_input_grads(merge_multi_context=merge_multi_context)
+
+    def update_metric(self, eval_metric, labels):
+        assert self.binded and self.params_initialized
+        self._curr_module.update_metric(eval_metric, labels)
diff --git a/example/rcnn/rcnn/rpn/__init__.py b/example/rcnn/rcnn/rpn/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/example/rcnn/rcnn/rpn/generate.py b/example/rcnn/rcnn/rpn/generate.py
new file mode 100644
index 000000000000..f1c8ddbef2e3
--- /dev/null
+++ b/example/rcnn/rcnn/rpn/generate.py
@@ -0,0 +1,116 @@
+import mxnet as mx
+import numpy as np
+import os
+import cPickle
+
+
+class Detector(object):  # forward-only inference of `symbol` with the given parameters
+    def __init__(self, symbol, ctx=None,
+                 arg_params=None, aux_params=None):
+        self.symbol = symbol
+        self.ctx = ctx
+        if self.ctx is None:
+            self.ctx = mx.cpu()
+        self.executor = None  # bound in im_detect
+        self.arg_params = arg_params
+        self.aux_params = aux_params
+
+    def im_detect(self, im, im_info):
+        """
+        perform detection of im, im_info; binds a new executor (with zeroed aux states) on every call
+        :param im: numpy.ndarray [b, c, h, w]
+        :param im_info: numpy.ndarray [b, 3]
+        :return: boxes [b, 5], scores [b,] (the 'rois_output' and 'rois_score' outputs as numpy arrays)
+        """
+        self.arg_params['data'] = mx.nd.array(im, self.ctx)  # NOTE: stores the inputs in the dict given to __init__
+        self.arg_params['im_info'] = mx.nd.array(im_info, self.ctx)
+        arg_shapes, out_shapes, aux_shapes = \
+            self.symbol.infer_shape(data=self.arg_params['data'].shape, im_info=self.arg_params['im_info'].shape)
+        aux_names = self.symbol.list_auxiliary_states()
+        self.aux_params = {k: mx.nd.zeros(s, self.ctx) for k, s in zip(aux_names, aux_shapes)}  # replaces aux_params
+        self.executor = self.symbol.bind(self.ctx, self.arg_params, args_grad=None,
+                                         grad_req='null', aux_states=self.aux_params)
+        output_dict = {name: nd for name, nd in zip(self.symbol.list_outputs(), self.executor.outputs)}
+
+        self.executor.forward(is_train=False)
+        boxes = output_dict['rois_output'].asnumpy()
+        scores = output_dict['rois_score'].asnumpy()
+
+        return boxes, scores
+
+
+def generate_detections(detector, test_data, imdb, vis=False):
+    """
+    Generate detections results using RPN and pickle them to <imdb.root_path>/rpn_data/<imdb.name>_rpn.pkl.
+    :param detector: Detector
+    :param test_data: data iterator, must be non-shuffled (asserted via test_data.shuffle)
+    :param imdb: image database; num_images, root_path and name are read from it
+    :param vis: controls visualization
+    :return: list of detected boxes, one array per batch, batch index column dropped and divided by the scale
+    """
+    assert not test_data.shuffle
+
+    i = 0
+    imdb_boxes = list()
+    for databatch in test_data:
+        if i % 10 == 0:
+            print 'generating detections {}/{}'.format(i, imdb.num_images)
+
+        boxes, scores = detector.im_detect(databatch.data['data'], databatch.data['im_info'])
+        scale = databatch.data['im_info'][0, 2]  # third im_info entry of the first row is used as the scale
+        # drop the batch index
+        boxes = boxes[:, 1:].copy() / scale
+        imdb_boxes.append(boxes)
+        if vis:
+            dets = np.hstack((boxes * scale, scores))  # multiply by scale again to draw on the network input
+            vis_detection(databatch.data['data'], dets, thresh=0.9)
+        i += 1
+
+    assert len(imdb_boxes) == imdb.num_images, 'calculations not complete'  # expects one batch per image
+    rpn_folder = os.path.join(imdb.root_path, 'rpn_data')
+    if not os.path.exists(rpn_folder):
+        os.mkdir(rpn_folder)
+    rpn_file = os.path.join(rpn_folder, imdb.name + '_rpn.pkl')
+    with open(rpn_file, 'wb') as f:
+        cPickle.dump(imdb_boxes, f, cPickle.HIGHEST_PROTOCOL)
+    print 'wrote rpn proposals to {}'.format(rpn_file)
+    return imdb_boxes
+
+
+def vis_detection(im, dets, thresh=0.):
+    """
+    draw detected bounding boxes
+    :param im: [b, c, h, w] in rgb
+    :param dets: only one class, [N * [4 coordinates score]]
+    :param thresh: thresh for valid detections
+    :return: None; shows a matplotlib figure with at most the 20 best boxes scoring >= thresh
+    """
+    from rcnn.config import config
+    from helper.processing.image_processing import transform_inverse
+    import matplotlib.pyplot as plt
+    inds = np.where(dets[:, -1] >= thresh)[0]
+    if len(inds) == 0:
+        return
+    inds = inds[np.argsort(dets[inds, -1])[::-1]]  # sort only the boxes that passed thresh, best first
+    inds = inds[:20]
+
+    class_name = 'obj'
+    fig, ax = plt.subplots(figsize=(12, 12))
+    im = transform_inverse(im, config.PIXEL_MEANS)
+    ax.imshow(im, aspect='equal')
+    for i in inds:
+        bbox = dets[i, :4]
+        score = dets[i, -1]
+        rect = plt.Rectangle((bbox[0], bbox[1]),
+                             bbox[2] - bbox[0],
+                             bbox[3] - bbox[1], fill=False,
+                             edgecolor='red', linewidth=3.5)
+        ax.add_patch(rect)
+        ax.text(bbox[0], bbox[1] - 2,
+                '{:s} {:.3f}'.format(class_name, score),
+                bbox=dict(facecolor='blue', alpha=0.5), fontsize=14, color='white')
+    ax.set_title('{} detections with p({} | box) >= {:.1f}'.format(class_name, class_name, thresh), fontsize=14)
+    plt.axis('off')
+    plt.tight_layout()
+    plt.draw()
+    plt.show()
diff --git a/example/rcnn/rcnn/rpn/proposal.py b/example/rcnn/rcnn/rpn/proposal.py
new file mode 100644
index 000000000000..b0303c5cfd84
--- /dev/null
+++ b/example/rcnn/rcnn/rpn/proposal.py
@@ -0,0 +1,206 @@
+"""
+The Proposal Operator transforms anchor coordinates into ROI coordinates, using the predicted
+classification probabilities and bounding box deltas together with the image size and scale information.
+"""
+
+import mxnet as mx
+import numpy as np
+import numpy.random as npr
+
+from rcnn.config import config
+from helper.processing.generate_anchor import generate_anchors
+from helper.processing.bbox_transform import bbox_pred, clip_boxes
+from helper.processing.nms import nms
+
+DEBUG = False
+
+
+class ProposalOperator(mx.operator.CustomOp):
+    def __init__(self, feat_stride, scales, ratios, is_train=False, output_score=False):
+        super(ProposalOperator, self).__init__()
+        self._feat_stride = float(feat_stride)
+        self._scales = np.fromstring(scales[1:-1], dtype=float, sep=',')
+        self._ratios = np.fromstring(ratios[1:-1], dtype=float, sep=',').tolist()
+        self._anchors = generate_anchors(base_size=self._feat_stride, scales=self._scales, ratios=self._ratios)
+        self._num_anchors = self._anchors.shape[0]
+        # custom op arguments may reach __init__ as strings (scales/ratios are parsed as such
+        # above), and the string 'False' is truthy: parse the flags instead of testing them
+        self._output_score = str(output_score).lower() in ('true', '1')
+
+        if DEBUG:
+            print 'feat_stride: {}'.format(self._feat_stride)
+            print 'anchors:'
+            print self._anchors
+
+        is_train = str(is_train).lower() in ('true', '1')
+        self.cfg_key = 'TRAIN' if is_train else 'TEST'
+
+    def forward(self, is_train, req, in_data, out_data, aux):
+        # for each (H, W) location i
+        #   generate A anchor boxes centered on cell i
+        #   apply predicted bbox deltas at cell i to each of the A anchors
+        # clip predicted boxes to image
+        # remove predicted boxes with either height or width < threshold
+        # sort all (proposal, score) pairs by score from highest to lowest
+        # take top pre_nms_topN proposals before NMS
+        # apply NMS with threshold 0.7 to remaining proposals
+        # take after_nms_topN proposals after NMS
+        # return the top proposals (-> RoIs top, scores top)
+
+        pre_nms_topN = config[self.cfg_key].RPN_PRE_NMS_TOP_N
+        post_nms_topN = config[self.cfg_key].RPN_POST_NMS_TOP_N
+        nms_thresh = config[self.cfg_key].RPN_NMS_THRESH
+        min_size = config[self.cfg_key].RPN_MIN_SIZE
+
+        # the first set of anchors are background probabilities
+        # keep the second part
+        scores = in_data[0].asnumpy()[:, self._num_anchors:, :, :]
+        bbox_deltas = in_data[1].asnumpy()
+        im_info = in_data[2].asnumpy()[0, :]
+
+        if DEBUG:
+            print 'im_size: ({}, {})'.format(im_info[0], im_info[1])
+            print 'scale: {}'.format(im_info[2])
+
+        # 1. Generate proposals from bbox_deltas and shifted anchors
+        height, width = scores.shape[-2:]
+
+        if DEBUG:
+            print 'score map size: {}'.format(scores.shape)
+
+        # Enumerate all shifts
+        shift_x = np.arange(0, width) * self._feat_stride
+        shift_y = np.arange(0, height) * self._feat_stride
+        shift_x, shift_y = np.meshgrid(shift_x, shift_y)
+        shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel())).transpose()
+
+        # Enumerate all shifted anchors:
+        #
+        # add A anchors (1, A, 4) to
+        # cell K shifts (K, 1, 4) to get
+        # shift anchors (K, A, 4)
+        # reshape to (K*A, 4) shifted anchors
+        A = self._num_anchors
+        K = shifts.shape[0]
+        anchors = self._anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2))
+        anchors = anchors.reshape((K * A, 4))
+
+        # Transpose and reshape predicted bbox transformations to get them
+        # into the same order as the anchors:
+        #
+        # bbox deltas will be (1, 4 * A, H, W) format
+        # transpose to (1, H, W, 4 * A)
+        # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a)
+        # in slowest to fastest order
+        bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4))
+
+        # Same story for the scores:
+        #
+        # scores are (1, A, H, W) format
+        # transpose to (1, H, W, A)
+        # reshape to (1 * H * W * A, 1) where rows are ordered by (h, w, a)
+        scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1))
+
+        # Convert anchors into proposals via bbox transformations
+        proposals = bbox_pred(anchors, bbox_deltas)
+
+        # 2. clip predicted boxes to image
+        proposals = clip_boxes(proposals, im_info[:2])
+
+        # 3. remove predicted boxes with either height or width < threshold
+        # (NOTE: convert min_size to input image scale stored in im_info[2])
+        keep = ProposalOperator._filter_boxes(proposals, min_size * im_info[2])
+        proposals = proposals[keep, :]
+        scores = scores[keep]
+
+        # 4. sort all (proposal, score) pairs by score from highest to lowest
+        # 5. take top pre_nms_topN (e.g. 6000)
+        order = scores.ravel().argsort()[::-1]
+        if pre_nms_topN > 0:
+            order = order[:pre_nms_topN]
+        proposals = proposals[order, :]
+        scores = scores[order]
+
+        # 6. apply nms (e.g. threshold = 0.7)
+        # 7. take after_nms_topN (e.g. 300)
+        # 8. return the top proposals (-> RoIs top)
+        keep = nms(np.hstack((proposals, scores)), nms_thresh)
+        if post_nms_topN > 0:
+            keep = keep[:post_nms_topN]
+        # pad to ensure output size remains unchanged
+        if len(keep) < post_nms_topN:
+            pad = npr.choice(keep, size=post_nms_topN - len(keep))
+            keep = np.hstack((keep, pad))
+        proposals = proposals[keep, :]
+        scores = scores[keep]
+
+        # Output rois array
+        # Our RPN implementation only supports a single input image, so all
+        # batch inds are 0
+        batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
+        blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))
+        self.assign(out_data[0], req[0], blob)
+
+        if self._output_score:
+            self.assign(out_data[1], req[1], scores.astype(np.float32, copy=False))
+
+    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
+        pass
+
+    @staticmethod
+    def _filter_boxes(boxes, min_size):
+        """ Remove all boxes with any side smaller than min_size """
+        ws = boxes[:, 2] - boxes[:, 0] + 1
+        hs = boxes[:, 3] - boxes[:, 1] + 1
+        keep = np.where((ws >= min_size) & (hs >= min_size))[0]
+        return keep
+
+
+@mx.operator.register("proposal")
+class ProposalProp(mx.operator.CustomOpProp):
+    def __init__(self, feat_stride, scales, ratios, is_train=False, output_score=False):
+        super(ProposalProp, self).__init__(need_top_grad=False)
+        self._feat_stride = feat_stride
+        self._scales = scales
+        self._ratios = ratios
+        # custom op arguments may arrive as strings, and the string 'False' is truthy:
+        # parse the flags here so the config section and the output list are chosen correctly
+        self._is_train = str(is_train).lower() in ('true', '1')
+        self._output_score = str(output_score).lower() in ('true', '1')
+
+        # config section whose RPN_POST_NMS_TOP_N sizes the outputs in infer_shape
+        self.cfg_key = 'TRAIN' if self._is_train else 'TEST'
+
+    def list_arguments(self):
+        return ['cls_prob', 'bbox_pred', 'im_info']
+
+    def list_outputs(self):
+        if self._output_score:
+            return ['output', 'score']
+        else:
+            return ['output']
+
+    def infer_shape(self, in_shape):
+        cfg_key = self.cfg_key
+        cls_prob_shape = in_shape[0]
+        bbox_pred_shape = in_shape[1]
+        assert cls_prob_shape[0] == bbox_pred_shape[0], 'ROI number does not equal in cls and reg'
+
+        batch_size = cls_prob_shape[0]
+        if batch_size > 1:
+            raise ValueError("Only single item batches are supported")
+
+        im_info_shape = (batch_size, 3)
+        output_shape = (config[cfg_key].RPN_POST_NMS_TOP_N, 5)
+        score_shape = (config[cfg_key].RPN_POST_NMS_TOP_N, 1)
+
+        if self._output_score:
+            return [cls_prob_shape, bbox_pred_shape, im_info_shape], [output_shape, score_shape]
+        else:
+            return [cls_prob_shape, bbox_pred_shape, im_info_shape], [output_shape]
+
+    def create_operator(self, ctx, shapes, dtypes):
+        return ProposalOperator(self._feat_stride, self._scales, self._ratios, self._is_train, self._output_score)
+
+    def declare_backward_dependency(self, out_grad, in_data, out_data):
+        return []
diff --git a/example/rcnn/rcnn/solver.py b/example/rcnn/rcnn/solver.py
index 8f5d6efbb5f8..d82bdbfd0f15 100644
--- a/example/rcnn/rcnn/solver.py
+++ b/example/rcnn/rcnn/solver.py
@@ -2,7 +2,6 @@
 import logging
 import metric
 
-from collections import namedtuple
 from callback import Speedometer
 from config import config
 
@@ -11,97 +10,93 @@ class Solver(object):
     def __init__(self, prefix,
                  symbol, ctx=None,
                  begin_epoch=0, num_epoch=None,
+                 kv_store='local',
                  arg_params=None, aux_params=None,
-                 optimizer='sgd', **kwargs):
+                 optimizer='sgd',
+                 mutable_data_shape=False, max_data_shape=None, max_label_shape=None, **kwargs):
         self.prefix = prefix
         self.symbol = symbol
         self.ctx = ctx
         if self.ctx is None:
-            self.ctx = mx.cpu()
+            self.ctx = [mx.cpu()]
         self.begin_epoch = begin_epoch
         self.num_epoch = num_epoch
+        self.kv_store = kv_store
         self.arg_params = arg_params
         self.aux_params = aux_params
-        self.grad_params = None
-        self.executor = None
         self.optimizer = optimizer
         self.updater = None
+        self.mutable_data_shape = mutable_data_shape
+        self.max_data_shape = max_data_shape
+        self.max_label_shape = max_label_shape
         self.kwargs = kwargs.copy()
 
-    def get_params(self, grad_req):
+        self.check_params()
+        self.arg_names = None
+        self.param_names = None
+        self.aux_names = None
+
+    def get_params(self, grad_req, data_shapes):
         arg_names = self.symbol.list_arguments()
-        arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(data=(1, 3, 224, 224), rois=(1, 5))
+        self.arg_names = arg_names
+
+        arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(**dict(data_shapes))
         if grad_req != 'null':
-            self.grad_params = {}
+            param_names = []
             for name, shape in zip(arg_names, arg_shapes):
                 if not (name.endswith('data') or name.endswith('rois') or
+                        name.endswith('im_info') or name.endswith('gt_boxes') or
                         name.endswith('inside_weight') or name.endswith('outside_weight') or
                         name.endswith('label') or name.endswith('target') or
                         name.startswith('conv1') or name.startswith('conv2')):
-                    self.grad_params[name] = mx.nd.zeros(shape, self.ctx)
+                    if not (config.TRAIN.FINETUNE and name.startswith('conv')):
+                        param_names.append(name)
+            self.param_names = param_names
+
         aux_names = self.symbol.list_auxiliary_states()
-        self.aux_params = {k: mx.nd.zeros(s, self.ctx) for k, s in zip(aux_names, aux_shapes)}
+        self.aux_names = aux_names
+
+    def check_params(self):
+        arg_names = set(self.symbol.list_arguments())
+        self.arg_params = {k: v for k, v in self.arg_params.items() if k in arg_names}
+        aux_names = set(self.symbol.list_arguments())
+        self.aux_params = {k: v for k, v in self.aux_params.items() if k in aux_names}
 
     def fit(self, train_data,
             grad_req='write',
             frequent=20,
             logger=None):
+        (kvstore, update_on_kvstore) = mx.model._create_kvstore(self.kv_store, len(self.ctx), self.arg_params)
         if logger is None:
             logger = logging
-        logger.info('Start training with %s', str(self.ctx))
-        speedometer_param = namedtuple('BatchEndParams',
-                                       ['epoch', 'nbatch', 'eval_metric', 'cls_metric', 'bbox_metric'])
+
         batch_end_callback = Speedometer(train_data.batch_size, frequent=frequent)
         epoch_end_callback = mx.callback.do_checkpoint(self.prefix)
 
-        self.get_params(grad_req)
-        self.optimizer = mx.optimizer.create(self.optimizer, rescale_grad=(1.0 / config.TRAIN.BATCH_SIZE), **self.kwargs)
-        self.updater = mx.optimizer.get_updater(self.optimizer)
+        self.get_params(grad_req, train_data.provide_data + train_data.provide_label)
 
-        eval_metric = mx.metric.create("accuracy")
-        cls_metric = metric.LogLossMetric()
+        if config.TRAIN.HAS_RPN is True:
+            eval_metric = metric.AccuracyMetric(use_ignore=True, ignore=-1)
+            cls_metric = metric.LogLossMetric(use_ignore=True, ignore=-1)
+        else:
+            eval_metric = metric.AccuracyMetric()
+            cls_metric = metric.LogLossMetric()
         bbox_metric = metric.SmoothL1LossMetric()
+        eval_metrics = mx.metric.CompositeEvalMetric()
+        for child_metric in [eval_metric, cls_metric, bbox_metric]:
+            eval_metrics.add(child_metric)
+        mutable_data_shape = self.mutable_data_shape
+        max_data_shape = self.max_data_shape
+        max_label_shape = self.max_label_shape
 
-        # begin training
-        for epoch in range(self.begin_epoch, self.num_epoch):
-            nbatch = 0
-            train_data.reset()
-            eval_metric.reset()
-            cls_metric.reset()
-            bbox_metric.reset()
-            for databatch in train_data:
-                nbatch += 1
-                for k, v in databatch.data.items():
-                    self.arg_params[k] = mx.nd.array(v, self.ctx)
-                for k, v in databatch.label.items():
-                    self.arg_params[k] = mx.nd.array(v, self.ctx)
-                self.executor = self.symbol.bind(self.ctx, self.arg_params, args_grad=self.grad_params,
-                                                 grad_req=grad_req, aux_states=self.aux_params)
-                assert len(self.symbol.list_arguments()) == len(self.executor.grad_arrays)
-                update_dict = {name: nd for name, nd
-                               in zip(self.symbol.list_arguments(), self.executor.grad_arrays) if nd}
-                output_dict = {name: nd for name, nd
-                               in zip(self.symbol.list_outputs(), self.executor.outputs)}
-                self.executor.forward(is_train=True)
-                self.executor.backward()
-
-                for key, arr in update_dict.items():
-                    self.updater(key, arr, self.arg_params[key])
-
-                label = self.arg_params['cls_prob_label']
-                pred = output_dict['cls_prob_output']
-                bb_target = self.arg_params['bbox_loss_target']
-                bb_loss = output_dict['bbox_loss_output']
-                eval_metric.update([label], [pred])
-                cls_metric.update([label], [pred])
-                bbox_metric.update([bb_target], [bb_loss])
-
-                # print speed and accuracy metric
-                batch_end_params = speedometer_param(epoch=epoch, nbatch=nbatch, eval_metric=eval_metric,
-                                                     cls_metric=cls_metric, bbox_metric=bbox_metric)
-                batch_end_callback(batch_end_params)
-
-            if epoch_end_callback:
-                epoch_end_callback(epoch, self.symbol, self.arg_params, self.aux_params)
-            name, value = eval_metric.get()
-            logger.info("                     --->Epoch[%d] Train-%s=%f", epoch, name, value)
+        self.optimizer = mx.optimizer.create(self.optimizer,
+                                             rescale_grad=(1.0 / config.TRAIN.BATCH_SIZE), **self.kwargs)
+        mx.model._train_multi_device(self.symbol, self.ctx, self.arg_names, self.param_names, self.aux_names,
+                                     self.arg_params, self.aux_params, self.begin_epoch, self.num_epoch,
+                                     epoch_size=None, optimizer=self.optimizer,
+                                     kvstore=kvstore, update_on_kvstore=update_on_kvstore,
+                                     train_data=train_data, eval_data=None, eval_metric=eval_metrics,
+                                     epoch_end_callback=epoch_end_callback, batch_end_callback=batch_end_callback,
+                                     logger=logger, work_load_list=None, monitor=None,
+                                     mutable_data_shape=mutable_data_shape, max_data_shape=max_data_shape,
+                                     max_label_shape=max_label_shape)
diff --git a/example/rcnn/rcnn/symbol.py b/example/rcnn/rcnn/symbol.py
index dcbbf53ece74..e483fdc4f0b2 100644
--- a/example/rcnn/rcnn/symbol.py
+++ b/example/rcnn/rcnn/symbol.py
@@ -1,18 +1,14 @@
 import mxnet as mx
+import rpn.proposal
+from config import config
 
 
-def get_symbol_vgg(num_classes=21):
+def get_vgg_conv(data):
     """
-    Fast R-CNN with VGG 16 conv layers
-    :param num_classes: used to determine output size
+    shared convolutional layers
+    :param data: Symbol
     :return: Symbol
     """
-    data = mx.symbol.Variable(name="data")
-    rois = mx.symbol.Variable(name='rois')
-    cls_prob_label = mx.symbol.Variable(name='cls_prob_label')
-    bbox_loss_target = mx.symbol.Variable(name='bbox_loss_target')
-    bbox_loss_inside_weight = mx.symbol.Variable(name='bbox_loss_inside_weight')
-    bbox_loss_outside_weight = mx.symbol.Variable(name='bbox_loss_outside_weight')
     # group 1
     conv1_1 = mx.symbol.Convolution(
         data=data, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_1")
@@ -65,6 +61,34 @@ def get_symbol_vgg(num_classes=21):
     conv5_3 = mx.symbol.Convolution(
         data=relu5_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_3")
     relu5_3 = mx.symbol.Activation(data=conv5_3, act_type="relu", name="relu5_3")
+
+    return relu5_3
+
+
+def get_vgg_rcnn(num_classes=21):
+    """
+    Fast R-CNN with VGG 16 conv layers
+    :param num_classes: used to determine output size
+    :return: Symbol
+    """
+    data = mx.symbol.Variable(name="data")
+    rois = mx.symbol.Variable(name='rois')
+    label = mx.symbol.Variable(name='label')
+    bbox_target = mx.symbol.Variable(name='bbox_target')
+    bbox_inside_weight = mx.symbol.Variable(name='bbox_inside_weight')
+    bbox_outside_weight = mx.symbol.Variable(name='bbox_outside_weight')
+
+    # reshape input
+    rois = mx.symbol.Reshape(data=rois, shape=(-1, 5), name='rois_reshape')
+    label = mx.symbol.Reshape(data=label, shape=(-1, ), name='label_reshape')
+    bbox_target = mx.symbol.Reshape(data=bbox_target, shape=(-1, 4 * num_classes), name='bbox_target_reshape')
+    bbox_inside_weight = mx.symbol.Reshape(data=bbox_inside_weight, shape=(-1, 4 * num_classes), name='bbox_inside_weight_reshape')
+    bbox_outside_weight = mx.symbol.Reshape(data=bbox_outside_weight, shape=(-1, 4 * num_classes), name='bbox_outside_weight_reshape')
+
+    # shared convolutional layers
+    relu5_3 = get_vgg_conv(data)
+
+    # Fast R-CNN
     pool5 = mx.symbol.ROIPooling(
         name='roi_pool5', data=relu5_3, rois=rois, pooled_size=(7, 7), spatial_scale=0.0625)
     # group 6
@@ -78,78 +102,39 @@ def get_symbol_vgg(num_classes=21):
     drop7 = mx.symbol.Dropout(data=relu7, p=0.5, name="drop7")
     # classification
     cls_score = mx.symbol.FullyConnected(name='cls_score', data=drop7, num_hidden=num_classes)
-    cls_prob = mx.symbol.SoftmaxOutput(name='cls_prob', data=cls_score, label=cls_prob_label)
+    cls_prob = mx.symbol.SoftmaxOutput(name='cls_prob', data=cls_score, label=label)
     # bounding box regression
     bbox_pred = mx.symbol.FullyConnected(name='bbox_pred', data=drop7, num_hidden=num_classes * 4)
-    bbox_loss_ = bbox_loss_outside_weight * \
+    bbox_loss_ = bbox_outside_weight * \
                  mx.symbol.smooth_l1(name='bbox_loss_', scalar=1.0,
-                                     data=bbox_loss_inside_weight * (bbox_pred - bbox_loss_target))
+                                     data=bbox_inside_weight * (bbox_pred - bbox_target))
     bbox_loss = mx.sym.MakeLoss(name='bbox_loss', data=bbox_loss_)
+
+    # reshape output
+    cls_prob = mx.symbol.Reshape(data=cls_prob, shape=(config.TRAIN.BATCH_IMAGES, -1, num_classes), name='cls_prob_reshape')
+    bbox_loss = mx.symbol.Reshape(data=bbox_loss, shape=(config.TRAIN.BATCH_IMAGES, -1, 4 * num_classes), name='bbox_loss_reshape')
+
     # group output
     group = mx.symbol.Group([cls_prob, bbox_loss])
     return group
 
 
-def get_symbol_vgg_test(num_classes=21):
+def get_vgg_rcnn_test(num_classes=21):
     """
-    Fast R-CNN test with VGG 16 conv layers
+    Fast R-CNN Network with VGG
     :param num_classes: used to determine output size
     :return: Symbol
     """
     data = mx.symbol.Variable(name="data")
     rois = mx.symbol.Variable(name='rois')
-    # group 1
-    conv1_1 = mx.symbol.Convolution(
-        data=data, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_1")
-    relu1_1 = mx.symbol.Activation(data=conv1_1, act_type="relu", name="relu1_1")
-    conv1_2 = mx.symbol.Convolution(
-        data=relu1_1, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_2")
-    relu1_2 = mx.symbol.Activation(data=conv1_2, act_type="relu", name="relu1_2")
-    pool1 = mx.symbol.Pooling(
-        data=relu1_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool1")
-    # group 2
-    conv2_1 = mx.symbol.Convolution(
-        data=pool1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_1")
-    relu2_1 = mx.symbol.Activation(data=conv2_1, act_type="relu", name="relu2_1")
-    conv2_2 = mx.symbol.Convolution(
-        data=relu2_1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_2")
-    relu2_2 = mx.symbol.Activation(data=conv2_2, act_type="relu", name="relu2_2")
-    pool2 = mx.symbol.Pooling(
-        data=relu2_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool2")
-    # group 3
-    conv3_1 = mx.symbol.Convolution(
-        data=pool2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_1")
-    relu3_1 = mx.symbol.Activation(data=conv3_1, act_type="relu", name="relu3_1")
-    conv3_2 = mx.symbol.Convolution(
-        data=relu3_1, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_2")
-    relu3_2 = mx.symbol.Activation(data=conv3_2, act_type="relu", name="relu3_2")
-    conv3_3 = mx.symbol.Convolution(
-        data=relu3_2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_3")
-    relu3_3 = mx.symbol.Activation(data=conv3_3, act_type="relu", name="relu3_3")
-    pool3 = mx.symbol.Pooling(
-        data=relu3_3, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool3")
-    # group 4
-    conv4_1 = mx.symbol.Convolution(
-        data=pool3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_1")
-    relu4_1 = mx.symbol.Activation(data=conv4_1, act_type="relu", name="relu4_1")
-    conv4_2 = mx.symbol.Convolution(
-        data=relu4_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_2")
-    relu4_2 = mx.symbol.Activation(data=conv4_2, act_type="relu", name="relu4_2")
-    conv4_3 = mx.symbol.Convolution(
-        data=relu4_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_3")
-    relu4_3 = mx.symbol.Activation(data=conv4_3, act_type="relu", name="relu4_3")
-    pool4 = mx.symbol.Pooling(
-        data=relu4_3, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool4")
-    # group 5
-    conv5_1 = mx.symbol.Convolution(
-        data=pool4, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_1")
-    relu5_1 = mx.symbol.Activation(data=conv5_1, act_type="relu", name="relu5_1")
-    conv5_2 = mx.symbol.Convolution(
-        data=relu5_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_2")
-    relu5_2 = mx.symbol.Activation(data=conv5_2, act_type="relu", name="relu5_2")
-    conv5_3 = mx.symbol.Convolution(
-        data=relu5_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_3")
-    relu5_3 = mx.symbol.Activation(data=conv5_3, act_type="relu", name="relu5_3")
+
+    # reshape rois
+    rois = mx.symbol.Reshape(data=rois, shape=(-1, 5), name='rois_reshape')
+
+    # shared convolutional layer
+    relu5_3 = get_vgg_conv(data)
+    
+    # Fast R-CNN
     pool5 = mx.symbol.ROIPooling(
         name='roi_pool5', data=relu5_3, rois=rois, pooled_size=(7, 7), spatial_scale=0.0625)
     # group 6
@@ -166,6 +151,151 @@ def get_symbol_vgg_test(num_classes=21):
     cls_prob = mx.symbol.SoftmaxOutput(name='cls_prob', data=cls_score)
     # bounding box regression
     bbox_pred = mx.symbol.FullyConnected(name='bbox_pred', data=drop7, num_hidden=num_classes * 4)
+
+    # reshape output
+    cls_prob = mx.symbol.Reshape(data=cls_prob, shape=(config.TEST.BATCH_IMAGES, -1, num_classes), name='cls_prob_reshape')
+    bbox_pred = mx.symbol.Reshape(data=bbox_pred, shape=(config.TEST.BATCH_IMAGES, -1, 4 * num_classes), name='bbox_pred_reshape')
+
     # group output
     group = mx.symbol.Group([cls_prob, bbox_pred])
     return group
+
+
+def get_vgg_rpn(num_classes=21, num_anchors=9):
+    """
+    Region Proposal Network with VGG
+    :param num_classes: used to determine output size
+    :param num_anchors: used to determine output size
+    :return: Symbol
+    """
+    data = mx.symbol.Variable(name="data")
+    label = mx.symbol.Variable(name='label')
+    bbox_target = mx.symbol.Variable(name='bbox_target')
+    bbox_inside_weight = mx.symbol.Variable(name='bbox_inside_weight')
+    bbox_outside_weight = mx.symbol.Variable(name='bbox_outside_weight')
+
+    # shared convolutional layers
+    relu5_3 = get_vgg_conv(data)
+
+    # RPN
+    rpn_conv = mx.symbol.Convolution(
+        data=relu5_3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="rpn_conv_3x3")
+    rpn_relu = mx.symbol.Activation(data=rpn_conv, act_type="relu", name="rpn_relu")
+    rpn_cls_score = mx.symbol.Convolution(
+        data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=2 * num_anchors, name="rpn_cls_score")
+    rpn_bbox_pred = mx.symbol.Convolution(
+        data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=4 * num_anchors, name="rpn_bbox_pred")
+
+    # prepare rpn data
+    rpn_cls_score_reshape = mx.symbol.Reshape(
+        data=rpn_cls_score, shape=(0, 2, -1), name="rpn_cls_score_reshape")
+
+    # classification
+    cls_prob = mx.symbol.SoftmaxOutput(data=rpn_cls_score_reshape, label=label, multi_output=True,
+                                       normalization='valid', use_ignore=True, ignore_label=-1, name="cls_prob")
+    # bounding box regression
+    bbox_loss_ = bbox_outside_weight * \
+                 mx.symbol.smooth_l1(name='bbox_loss_', scalar=3.0,
+                                     data=bbox_inside_weight * (rpn_bbox_pred - bbox_target))
+    bbox_loss = mx.sym.MakeLoss(name='bbox_loss', data=bbox_loss_)
+    # group output
+    group = mx.symbol.Group([cls_prob, bbox_loss])
+    return group
+
+
+def get_vgg_rpn_test(num_classes=21, num_anchors=9):
+    """
+    Region Proposal Network with VGG
+    :param num_classes: used to determine output size
+    :param num_anchors: used to determine output size
+    :return: Symbol
+    """
+    data = mx.symbol.Variable(name="data")
+    im_info = mx.symbol.Variable(name="im_info")
+
+    # shared convolutional layers
+    relu5_3 = get_vgg_conv(data)
+
+    # RPN
+    rpn_conv = mx.symbol.Convolution(
+        data=relu5_3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="rpn_conv_3x3")
+    rpn_relu = mx.symbol.Activation(data=rpn_conv, act_type="relu", name="rpn_relu")
+    rpn_cls_score = mx.symbol.Convolution(
+        data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=2 * num_anchors, name="rpn_cls_score")
+    rpn_bbox_pred = mx.symbol.Convolution(
+        data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=4 * num_anchors, name="rpn_bbox_pred")
+
+    # ROI Proposal
+    rpn_cls_score_reshape = mx.symbol.Reshape(
+        data=rpn_cls_score, shape=(0, 2, -1, 0), name="rpn_cls_score_reshape")
+    rpn_cls_prob = mx.symbol.SoftmaxActivation(
+        data=rpn_cls_score_reshape, mode="channel", name="rpn_cls_prob")
+    rpn_cls_prob_reshape = mx.symbol.Reshape(
+        data=rpn_cls_prob, shape=(0, 2 * num_anchors, -1, 0), name='rpn_cls_prob_reshape')
+    group = mx.symbol.Custom(
+        cls_prob=rpn_cls_prob_reshape, bbox_pred=rpn_bbox_pred, im_info=im_info, name='rois',
+        op_type='proposal', feat_stride=16, scales=(8, 16, 32), ratios=(0.5, 1, 2), output_score=True)
+    # rois = group[0]
+    # score = group[1]
+
+    return group
+
+
+def get_vgg_test(num_classes=21, num_anchors=9):
+    """
+    Faster R-CNN test with VGG 16 conv layers
+    :param num_classes: used to determine output size
+    :param num_anchors: used to determine output size
+    :return: Symbol
+    """
+    data = mx.symbol.Variable(name="data")
+    im_info = mx.symbol.Variable(name="im_info")
+
+    # shared convolutional layers
+    relu5_3 = get_vgg_conv(data)
+
+    # RPN
+    rpn_conv = mx.symbol.Convolution(
+        data=relu5_3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="rpn_conv_3x3")
+    rpn_relu = mx.symbol.Activation(data=rpn_conv, act_type="relu", name="rpn_relu")
+    rpn_cls_score = mx.symbol.Convolution(
+        data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=2 * num_anchors, name="rpn_cls_score")
+    rpn_bbox_pred = mx.symbol.Convolution(
+        data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=4 * num_anchors, name="rpn_bbox_pred")
+
+    # ROI Proposal
+    rpn_cls_score_reshape = mx.symbol.Reshape(
+        data=rpn_cls_score, shape=(0, 2, -1, 0), name="rpn_cls_score_reshape")
+    rpn_cls_prob = mx.symbol.SoftmaxActivation(
+        data=rpn_cls_score_reshape, mode="channel", name="rpn_cls_prob")
+    rpn_cls_prob_reshape = mx.symbol.Reshape(
+        data=rpn_cls_prob, shape=(0, 2 * num_anchors, -1, 0), name='rpn_cls_prob_reshape')
+    rois = mx.symbol.Custom(
+        cls_prob=rpn_cls_prob_reshape, bbox_pred=rpn_bbox_pred, im_info=im_info, name='rois',
+        op_type='proposal', feat_stride=16, scales=(8, 16, 32), ratios=(0.5, 1, 2))
+
+    # Fast R-CNN
+    pool5 = mx.symbol.ROIPooling(
+        name='roi_pool5', data=relu5_3, rois=rois, pooled_size=(7, 7), spatial_scale=0.0625)
+    # group 6
+    flatten = mx.symbol.Flatten(data=pool5, name="flatten")
+    fc6 = mx.symbol.FullyConnected(data=flatten, num_hidden=4096, name="fc6")
+    relu6 = mx.symbol.Activation(data=fc6, act_type="relu", name="relu6")
+    drop6 = mx.symbol.Dropout(data=relu6, p=0.5, name="drop6")
+    # group 7
+    fc7 = mx.symbol.FullyConnected(data=drop6, num_hidden=4096, name="fc7")
+    relu7 = mx.symbol.Activation(data=fc7, act_type="relu", name="relu7")
+    drop7 = mx.symbol.Dropout(data=relu7, p=0.5, name="drop7")
+    # classification
+    cls_score = mx.symbol.FullyConnected(name='cls_score', data=drop7, num_hidden=num_classes)
+    cls_prob = mx.symbol.SoftmaxOutput(name='cls_prob', data=cls_score)
+    # bounding box regression
+    bbox_pred = mx.symbol.FullyConnected(name='bbox_pred', data=drop7, num_hidden=num_classes * 4)
+
+    # reshape output
+    cls_prob = mx.symbol.Reshape(data=cls_prob, shape=(config.TEST.BATCH_IMAGES, -1, num_classes), name='cls_prob_reshape')
+    bbox_pred = mx.symbol.Reshape(data=bbox_pred, shape=(config.TEST.BATCH_IMAGES, -1, 4 * num_classes), name='bbox_pred_reshape')
+
+    # group output
+    group = mx.symbol.Group([rois, cls_prob, bbox_pred])
+    return group
diff --git a/example/rcnn/rcnn/tester.py b/example/rcnn/rcnn/tester.py
index 3f69daa6f822..1789c5a96855 100644
--- a/example/rcnn/rcnn/tester.py
+++ b/example/rcnn/rcnn/tester.py
@@ -19,7 +19,7 @@ def pred_eval(detector, test_data, imdb, vis=False):
     """
     assert not test_data.shuffle
 
-    thresh = 0.1
+    thresh = 0.05
     # limit detections to max_per_image over all classes
     max_per_image = 100
 
@@ -35,15 +35,17 @@ def pred_eval(detector, test_data, imdb, vis=False):
         if i % 10 == 0:
             print 'testing {}/{}'.format(i, imdb.num_images)
 
-        scores, boxes = detector.im_detect(databatch.data['data'], databatch.data['rois'])
-
-        # we used scaled image & roi to train, so it is necessary to transform them back
-        # visualization should also be from the original size
-        im_path = imdb.image_path_from_index(imdb.image_set_index[i])
-        im = cv2.imread(im_path)
-        im_height = im.shape[0]
-        scale = float(databatch.data['data'].shape[2]) / float(im_height)
-        im = image_processing.transform(im, config.PIXEL_MEANS)
+        if config.TEST.HAS_RPN:
+            scores, boxes = detector.im_detect(databatch.data['data'], im_info=databatch.data['im_info'])
+            scale = databatch.data['im_info'][0, 2]
+        else:
+            scores, boxes = detector.im_detect(databatch.data['data'], roi_array=databatch.data['rois'])
+            # we used scaled image & roi to train, so it is necessary to transform them back
+            # visualization should also be from the original size
+            im_path = imdb.image_path_from_index(imdb.image_set_index[i])
+            im = cv2.imread(im_path)
+            im_height = im.shape[0]
+            scale = float(databatch.data['data'].shape[2]) / float(im_height)
 
         for j in range(1, imdb.num_classes):
             indexes = np.where(scores[:, j] > thresh)[0]
@@ -64,7 +66,11 @@ def pred_eval(detector, test_data, imdb, vis=False):
 
         boxes_this_image = [[]] + [all_boxes[j][i] for j in range(1, imdb.num_classes)]
         if vis:
-            vis_all_detection(im, boxes_this_image,
+            # visualize the testing scale
+            for box in boxes_this_image:
+                if isinstance(box, np.ndarray):
+                    box *= scale
+            vis_all_detection(databatch.data['data'], boxes_this_image,
                               imdb_classes=imdb.classes)
         i += 1
 
@@ -78,7 +84,7 @@ def pred_eval(detector, test_data, imdb, vis=False):
     imdb.evaluate_detections(all_boxes)
 
 
-def vis_all_detection(im_array, detections, imdb_classes=None, thresh=0.):
+def vis_all_detection(im_array, detections, imdb_classes=None, thresh=0.7):
     """
     visualize all detections in one image
     :param im_array: [b=1 c h w] in rgb
@@ -101,8 +107,9 @@ def vis_all_detection(im_array, detections, imdb_classes=None, thresh=0.):
                 rect = plt.Rectangle((bbox[0], bbox[1]),
                                      bbox[2] - bbox[0],
                                      bbox[3] - bbox[1], fill=False,
-                                     edgecolor=color, linewidth=2)
+                                     edgecolor=color, linewidth=3.5)
                 plt.gca().add_patch(rect)
-                plt.gca().annotate('{} {:.3f}'.format(imdb_classes[j], score),
-                                   rect.get_xy(), color='w')
+                plt.gca().text(bbox[0], bbox[1] - 2,
+                               '{:s} {:.3f}'.format(imdb_classes[j], score),
+                               bbox=dict(facecolor=color, alpha=0.5), fontsize=12, color='white')
     plt.show()
diff --git a/example/rcnn/test.py b/example/rcnn/test.py
deleted file mode 100644
index be183c9ef7d1..000000000000
--- a/example/rcnn/test.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import argparse
-import mxnet as mx
-import os
-from tools.test_net import test_net
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Test a Fast R-CNN network')
-    parser.add_argument('--image_set', dest='image_set', help='can be test',
-                        default='test', type=str)
-    parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012',
-                        default='2007', type=str)
-    parser.add_argument('--root_path', dest='root_path', help='output data folder',
-                        default=os.path.join(os.getcwd(), 'data'), type=str)
-    parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path',
-                        default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str)
-    parser.add_argument('--prefix', dest='prefix', help='new model prefix',
-                        default=os.path.join(os.getcwd(), 'model', 'frcnn'), type=str)
-    parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model',
-                        default=9, type=int)
-    parser.add_argument('--gpu', dest='gpu_id', help='GPU device to test with',
-                        default=0, type=int)
-    args = parser.parse_args()
-    return args
-
-if __name__ == '__main__':
-    args = parse_args()
-    ctx = mx.gpu(args.gpu_id)
-    test_net(args.image_set, args.year, args.root_path, args.devkit_path, args.prefix, args.epoch, ctx)
diff --git a/example/rcnn/tools/fast-rcnn/__init__.py b/example/rcnn/tools/fast-rcnn/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/example/rcnn/tools/demo_net.py b/example/rcnn/tools/fast-rcnn/demo.py
similarity index 58%
rename from example/rcnn/tools/demo_net.py
rename to example/rcnn/tools/fast-rcnn/demo.py
index 4e9cdf7cd3eb..87e04da0feea 100644
--- a/example/rcnn/tools/demo_net.py
+++ b/example/rcnn/tools/fast-rcnn/demo.py
@@ -1,10 +1,26 @@
+import argparse
+import os
 import numpy as np
 import cv2
 import scipy.io as sio
+
+import mxnet as mx
+
 from helper.processing.image_processing import resize, transform
-from rcnn.config import config
 from helper.processing.nms import nms
+from rcnn.config import config
+from rcnn.detector import Detector
+from rcnn.symbol import get_vgg_rcnn_test
 from rcnn.tester import vis_all_detection
+from utils.load_model import load_param
+
+
+def get_net(prefix, epoch, ctx):
+    args, auxs = load_param(prefix, epoch, convert=True, ctx=ctx)
+    sym = get_vgg_rcnn_test()
+    detector = Detector(sym, ctx, args, auxs)
+    return detector
+
 
 CLASSES = ('__background__',
            'aeroplane', 'bicycle', 'bird', 'boat',
@@ -43,8 +59,27 @@ def demo_net(detector, image_name):
         cls_boxes = cls_boxes[keep, :]
         cls_scores = cls_scores[keep]
         dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])).astype(np.float32)
-        keep = nms(dets, NMS_THRESH)
+        keep = nms(dets.astype(np.float32), NMS_THRESH)
         all_boxes[cls_ind] = dets[keep, :]
 
     boxes_this_image = [[]] + [all_boxes[j] for j in range(1, len(CLASSES))]
     vis_all_detection(im_array, boxes_this_image, CLASSES, 0)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Demonstrate a Fast R-CNN network')
+    parser.add_argument('--prefix', dest='prefix', help='new model prefix',
+                        default=os.path.join(os.getcwd(), 'model', 'frcnn'), type=str)
+    parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model',
+                        default=9, type=int)
+    parser.add_argument('--gpu', dest='gpu_id', help='GPU device to test with',
+                        default=0, type=int)
+    args = parser.parse_args()
+    return args
+
+if __name__ == '__main__':
+    args = parse_args()
+    ctx = mx.gpu(args.gpu_id)
+    detector = get_net(args.prefix, args.epoch, ctx)
+    demo_net(detector, os.path.join(os.getcwd(), 'data', 'demo', '000004'))
+    demo_net(detector, os.path.join(os.getcwd(), 'data', 'demo', '001551'))
diff --git a/example/rcnn/tools/fast-rcnn/test.py b/example/rcnn/tools/fast-rcnn/test.py
new file mode 100644
index 000000000000..a2613e4602f8
--- /dev/null
+++ b/example/rcnn/tools/fast-rcnn/test.py
@@ -0,0 +1,57 @@
+import argparse
+import logging
+import os
+
+import mxnet as mx
+
+from rcnn.loader import ROIIter
+from rcnn.detector import Detector
+from rcnn.symbol import get_vgg_rcnn_test
+from rcnn.tester import pred_eval
+from utils.load_data import load_test_ss_roidb
+from utils.load_model import load_param
+
+
+def test_net(imageset, year, root_path, devkit_path, prefix, epoch, ctx, vis):
+    # set up logger
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+
+    # load testing data
+    voc, roidb = load_test_ss_roidb(imageset, year, root_path, devkit_path)
+    test_data = ROIIter(roidb, batch_size=1, shuffle=False, mode='test')
+
+    # load model
+    args, auxs = load_param(prefix, epoch, convert=True, ctx=ctx)
+
+    # load symbol
+    sym = get_vgg_rcnn_test()
+
+    # detect
+    detector = Detector(sym, ctx, args, auxs)
+    pred_eval(detector, test_data, voc, vis=vis)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Test a Fast R-CNN network')
+    parser.add_argument('--image_set', dest='image_set', help='can be test',
+                        default='test', type=str)
+    parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012',
+                        default='2007', type=str)
+    parser.add_argument('--root_path', dest='root_path', help='output data folder',
+                        default=os.path.join(os.getcwd(), 'data'), type=str)
+    parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path',
+                        default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str)
+    parser.add_argument('--prefix', dest='prefix', help='model to test with', type=str)
+    parser.add_argument('--epoch', dest='epoch', help='model to test with',
+                        default=8, type=int)
+    parser.add_argument('--gpu', dest='gpu_id', help='GPU device to test with',
+                        default=0, type=int)
+    parser.add_argument('--vis', dest='vis', help='turn on visualization', action='store_true')
+    args = parser.parse_args()
+    return args
+
+if __name__ == '__main__':
+    args = parse_args()
+    ctx = mx.gpu(args.gpu_id)
+    test_net(args.image_set, args.year, args.root_path, args.devkit_path, args.prefix, args.epoch, ctx, args.vis)
diff --git a/example/rcnn/tools/fast-rcnn/train.py b/example/rcnn/tools/fast-rcnn/train.py
new file mode 100644
index 000000000000..b1d0b9837f16
--- /dev/null
+++ b/example/rcnn/tools/fast-rcnn/train.py
@@ -0,0 +1,101 @@
+import argparse
+import logging
+import os
+
+import mxnet as mx
+
+from rcnn.config import config
+from rcnn.loader import ROIIter
+from rcnn.solver import Solver
+from rcnn.symbol import get_vgg_rcnn
+from utils.load_data import load_ss_roidb
+from utils.load_model import load_param
+from utils.save_model import save_checkpoint
+
+
+def train_net(image_set, year, root_path, devkit_path, pretrained, epoch,
+              prefix, ctx, begin_epoch, end_epoch, frequent, kv_store, work_load_list=None, resume=False):
+    # set up logger: raise the root logger to INFO
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+
+    # load symbol
+    sym = get_vgg_rcnn()
+
+    # setup multi-gpu: scales the shared config in place by the number of contexts
+    config.TRAIN.BATCH_IMAGES *= len(ctx)
+    config.TRAIN.BATCH_SIZE *= len(ctx)
+
+    # load training data; means/stds are reused after training to rescale bbox_pred
+    voc, roidb, means, stds = load_ss_roidb(image_set, year, root_path, devkit_path, flip=True)
+    train_data = ROIIter(roidb, batch_size=config.TRAIN.BATCH_IMAGES, shuffle=True, mode='train',
+                         ctx=ctx, work_load_list=work_load_list)
+
+    # infer max shape
+    max_data_shape = [('data', (1, 3, 1000, 1000))]
+
+    # load pretrained (`pretrained`/`epoch` name the starting checkpoint)
+    args, auxs = load_param(pretrained, epoch, convert=True)
+
+    # initialize params: skipped when resuming so the loaded head weights are kept
+    if not resume:
+        arg_shape, _, _ = sym.infer_shape(data=(1, 3, 224, 224), rois=(1, 5))
+        arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
+        args['cls_score_weight'] = mx.random.normal(mean=0, stdvar=0.01, shape=arg_shape_dict['cls_score_weight'])
+        args['cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['cls_score_bias'])
+        args['bbox_pred_weight'] = mx.random.normal(mean=0, stdvar=0.001, shape=arg_shape_dict['bbox_pred_weight'])
+        args['bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['bbox_pred_bias'])
+
+    # train
+    solver = Solver(prefix, sym, ctx, begin_epoch, end_epoch, kv_store, args, auxs, momentum=0.9, wd=0.0005,
+                    learning_rate=1e-3, lr_scheduler=mx.lr_scheduler.FactorScheduler(30000, 0.1),
+                    mutable_data_shape=True, max_data_shape=max_data_shape)
+    solver.fit(train_data, frequent=frequent)
+
+    # edit params and save: rescale bbox_pred of each trained checkpoint (assumes one per epoch under `prefix`)
+    for epoch in range(begin_epoch + 1, end_epoch + 1):
+        arg_params, aux_params = load_param(prefix, epoch, convert=True)  # trained checkpoint, not `pretrained`
+        arg_params['bbox_pred_weight'] = (arg_params['bbox_pred_weight'].T * mx.nd.array(stds)).T
+        arg_params['bbox_pred_bias'] = arg_params['bbox_pred_bias'] * mx.nd.array(stds) + \
+                                       mx.nd.array(means)
+        save_checkpoint(prefix, epoch, arg_params, aux_params)
+
+
+def parse_args():  # build and parse the command-line options of the Fast R-CNN training script
+    parser = argparse.ArgumentParser(description='Train a Fast R-CNN network')
+    parser.add_argument('--image_set', dest='image_set', help='can be trainval or train',
+                        default='trainval', type=str)
+    parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012',
+                        default='2007', type=str)
+    parser.add_argument('--root_path', dest='root_path', help='output data folder',
+                        default=os.path.join(os.getcwd(), 'data'), type=str)
+    parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path',
+                        default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str)
+    parser.add_argument('--pretrained', dest='pretrained', help='pretrained model prefix',
+                        default=os.path.join(os.getcwd(), 'model', 'vgg16'), type=str)
+    parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model',
+                        default=1, type=int)
+    parser.add_argument('--prefix', dest='prefix', help='new model prefix',
+                        default=os.path.join(os.getcwd(), 'model', 'rcnn'), type=str)
+    parser.add_argument('--gpus', dest='gpu_ids', help='GPU device to train with',
+                        default='0', type=str)  # comma-separated ids, split by the __main__ block
+    parser.add_argument('--begin_epoch', dest='begin_epoch', help='begin epoch of training',
+                        default=0, type=int)
+    parser.add_argument('--end_epoch', dest='end_epoch', help='end epoch of training',
+                        default=8, type=int)
+    parser.add_argument('--frequent', dest='frequent', help='frequency of logging',
+                        default=20, type=int)
+    parser.add_argument('--kv_store', dest='kv_store', help='the kv-store type',
+                        default='device', type=str)
+    parser.add_argument('--work_load_list', dest='work_load_list', help='comma-separated work load for different devices',
+                        default=None, type=lambda s: [float(w) for w in s.split(',')])  # type=list split into chars
+    parser.add_argument('--resume', dest='resume', help='continue training', action='store_true')
+    args = parser.parse_args()  # parses sys.argv[1:]
+    return args
+
+if __name__ == '__main__':
+    args = parse_args()
+    ctx = [mx.gpu(int(i)) for i in args.gpu_ids.split(',')]  # one GPU context per comma-separated id in --gpus
+    train_net(args.image_set, args.year, args.root_path, args.devkit_path, args.pretrained, args.epoch,
+              args.prefix, ctx, args.begin_epoch, args.end_epoch, args.frequent,
+              args.kv_store, args.work_load_list, args.resume)
diff --git a/example/rcnn/tools/load_data.py b/example/rcnn/tools/load_data.py
deleted file mode 100644
index 2ecdb512cb13..000000000000
--- a/example/rcnn/tools/load_data.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from helper.dataset.pascal_voc import PascalVOC
-from helper.processing.roidb import prepare_roidb, add_bbox_regression_targets
-
-
-def load_train_roidb(image_set, year, root_path, devkit_path, flip=False):
-    voc = PascalVOC(image_set, year, root_path, devkit_path)
-    gt_roidb = voc.gt_roidb()
-    ss_roidb = voc.selective_search_roidb(gt_roidb)
-    if flip:
-        ss_roidb = voc.append_flipped_images(ss_roidb)
-    prepare_roidb(voc, ss_roidb)
-    means, stds = add_bbox_regression_targets(ss_roidb)
-    return voc, ss_roidb, means, stds
-
-
-def load_test_roidb(image_set, year, root_path, devkit_path):
-    voc = PascalVOC(image_set, year, root_path, devkit_path)
-    gt_roidb = voc.gt_roidb()
-    ss_roidb = voc.selective_search_roidb(gt_roidb)
-    prepare_roidb(voc, ss_roidb)
-    return voc, ss_roidb
diff --git a/example/rcnn/tools/test_final.py b/example/rcnn/tools/test_final.py
new file mode 100644
index 000000000000..78fe07852ac3
--- /dev/null
+++ b/example/rcnn/tools/test_final.py
@@ -0,0 +1,61 @@
+import argparse
+import logging
+import os
+
+import mxnet as mx
+
+from rcnn.config import config
+from rcnn.loader import ROIIter
+from rcnn.detector import Detector
+from rcnn.symbol import get_vgg_test
+from rcnn.tester import pred_eval
+from utils.load_data import load_gt_roidb
+from utils.load_model import load_param
+
+
+def test_net(imageset, year, root_path, devkit_path, prefix, epoch, ctx, vis):
+    # set config: mutates the shared rcnn.config object, so the flag stays set after this call
+    config.TEST.HAS_RPN = True
+
+    # set up logger: raise the root logger to INFO
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+
+    # load testing data: batch_size=1, no shuffling
+    voc, roidb = load_gt_roidb(imageset, year, root_path, devkit_path)
+    test_data = ROIIter(roidb, batch_size=1, shuffle=False, mode='test')
+
+    # load model: checkpoint named by prefix/epoch, loaded with ctx passed through
+    args, auxs = load_param(prefix, epoch, convert=True, ctx=ctx)
+
+    # load symbol
+    sym = get_vgg_test()
+
+    # detect: pred_eval drives the detector over test_data; vis is forwarded unchanged
+    detector = Detector(sym, ctx, args, auxs)
+    pred_eval(detector, test_data, voc, vis=vis)
+
+
+def parse_args():  # build and parse the command-line options of this test script
+    parser = argparse.ArgumentParser(description='Test a Faster R-CNN network')
+    parser.add_argument('--image_set', dest='image_set', help='can be test',
+                        default='test', type=str)
+    parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012',
+                        default='2007', type=str)
+    parser.add_argument('--root_path', dest='root_path', help='output data folder',
+                        default=os.path.join(os.getcwd(), 'data'), type=str)  # default is relative to the cwd
+    parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path',
+                        default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str)
+    parser.add_argument('--prefix', dest='prefix', help='model to test with', type=str)  # no default: None unless given
+    parser.add_argument('--epoch', dest='epoch', help='model to test with',
+                        default=8, type=int)
+    parser.add_argument('--gpu', dest='gpu_id', help='GPU device to test with',
+                        default=0, type=int)
+    parser.add_argument('--vis', dest='vis', help='turn on visualization', action='store_true')
+    args = parser.parse_args()  # parses sys.argv[1:]
+    return args
+
+if __name__ == '__main__':
+    args = parse_args()
+    ctx = mx.gpu(args.gpu_id)  # single GPU context selected by --gpu
+    test_net(args.image_set, args.year, args.root_path, args.devkit_path, args.prefix, args.epoch, ctx, args.vis)
diff --git a/example/rcnn/tools/test_rcnn.py b/example/rcnn/tools/test_rcnn.py
new file mode 100644
index 000000000000..901828bfea7c
--- /dev/null
+++ b/example/rcnn/tools/test_rcnn.py
@@ -0,0 +1,57 @@
+import argparse
+import logging
+import os
+
+import mxnet as mx
+
+from rcnn.loader import ROIIter
+from rcnn.detector import Detector
+from rcnn.symbol import get_vgg_rcnn_test
+from rcnn.tester import pred_eval
+from utils.load_data import load_test_rpn_roidb
+from utils.load_model import load_param
+
+
+def test_net(imageset, year, root_path, devkit_path, prefix, epoch, ctx, vis):
+    # set up logger: raise the root logger to INFO
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+
+    # load testing data: batch_size=1, no shuffling
+    voc, roidb = load_test_rpn_roidb(imageset, year, root_path, devkit_path)
+    test_data = ROIIter(roidb, batch_size=1, shuffle=False, mode='test')
+
+    # load model: checkpoint named by prefix/epoch, loaded with ctx passed through
+    args, auxs = load_param(prefix, epoch, convert=True, ctx=ctx)
+
+    # load symbol
+    sym = get_vgg_rcnn_test()
+
+    # detect: pred_eval drives the detector over test_data; vis is forwarded unchanged
+    detector = Detector(sym, ctx, args, auxs)
+    pred_eval(detector, test_data, voc, vis=vis)
+
+
+def parse_args():  # build and parse the command-line options of this test script
+    parser = argparse.ArgumentParser(description='Test a Fast R-CNN network')
+    parser.add_argument('--image_set', dest='image_set', help='can be test',
+                        default='test', type=str)
+    parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012',
+                        default='2007', type=str)
+    parser.add_argument('--root_path', dest='root_path', help='output data folder',
+                        default=os.path.join(os.getcwd(), 'data'), type=str)  # default is relative to the cwd
+    parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path',
+                        default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str)
+    parser.add_argument('--prefix', dest='prefix', help='model to test with', type=str)  # no default: None unless given
+    parser.add_argument('--epoch', dest='epoch', help='model to test with',
+                        default=8, type=int)
+    parser.add_argument('--gpu', dest='gpu_id', help='GPU device to test with',
+                        default=0, type=int)
+    parser.add_argument('--vis', dest='vis', help='turn on visualization', action='store_true')
+    args = parser.parse_args()  # parses sys.argv[1:]
+    return args
+
+if __name__ == '__main__':
+    args = parse_args()
+    ctx = mx.gpu(args.gpu_id)  # single GPU context selected by --gpu
+    test_net(args.image_set, args.year, args.root_path, args.devkit_path, args.prefix, args.epoch, ctx, args.vis)
diff --git a/example/rcnn/tools/test_rpn.py b/example/rcnn/tools/test_rpn.py
new file mode 100644
index 000000000000..1108cdcd9517
--- /dev/null
+++ b/example/rcnn/tools/test_rpn.py
@@ -0,0 +1,58 @@
+import argparse
+import os
+
+import mxnet as mx
+
+from rcnn.config import config
+from rcnn.loader import ROIIter
+from rcnn.rpn.generate import Detector, generate_detections
+from rcnn.symbol import get_vgg_rpn_test
+from utils.load_data import load_gt_roidb
+from utils.load_model import load_param
+
+
+def test_rpn(image_set, year, root_path, devkit_path, prefix, epoch, ctx, vis):
+    # set config: mutates the shared rcnn.config object, so these values stay set after this call
+    config.TEST.HAS_RPN = True
+    config.TEST.RPN_PRE_NMS_TOP_N = -1
+    config.TEST.RPN_POST_NMS_TOP_N = 2000
+
+    # load symbol
+    sym = get_vgg_rpn_test()
+
+    # load testing data: batch_size=1, no shuffling
+    voc, roidb = load_gt_roidb(image_set, year, root_path, devkit_path)
+    test_data = ROIIter(roidb, batch_size=1, shuffle=False, mode='test')
+
+    # load model: checkpoint named by prefix/epoch, loaded with ctx passed through
+    args, auxs = load_param(prefix, epoch, convert=True, ctx=ctx)
+
+    # start testing: generate boxes, then hand them to evaluate_recall as candidate_boxes
+    detector = Detector(sym, ctx, args, auxs)
+    imdb_boxes = generate_detections(detector, test_data, voc, vis=vis)
+    voc.evaluate_recall(roidb, candidate_boxes=imdb_boxes)
+
+
+def parse_args():  # build and parse the command-line options of the RPN test script
+    parser = argparse.ArgumentParser(description='Test a Region Proposal Network')
+    parser.add_argument('--image_set', dest='image_set', help='can be trainval or train',
+                        default='trainval', type=str)
+    parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012',
+                        default='2007', type=str)
+    parser.add_argument('--root_path', dest='root_path', help='output data folder',
+                        default=os.path.join(os.getcwd(), 'data'), type=str)  # default is relative to the cwd
+    parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path',
+                        default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str)
+    parser.add_argument('--prefix', dest='prefix', help='model to test with', type=str)  # no default: None unless given
+    parser.add_argument('--epoch', dest='epoch', help='model to test with',
+                        default=8, type=int)
+    parser.add_argument('--gpu', dest='gpu_id', help='GPU device to test with',
+                        default=0, type=int)
+    parser.add_argument('--vis', dest='vis', help='turn on visualization', action='store_true')
+    args = parser.parse_args()  # parses sys.argv[1:]
+    return args
+
+if __name__ == '__main__':
+    args = parse_args()
+    ctx = mx.gpu(args.gpu_id)  # single GPU context selected by --gpu
+    test_rpn(args.image_set, args.year, args.root_path, args.devkit_path, args.prefix, args.epoch, ctx, args.vis)
diff --git a/example/rcnn/tools/train_alternate.py b/example/rcnn/tools/train_alternate.py
new file mode 100644
index 000000000000..e0d475c9396a
--- /dev/null
+++ b/example/rcnn/tools/train_alternate.py
@@ -0,0 +1,216 @@
+import argparse
+import logging
+import os
+
+import mxnet as mx
+
+from rcnn.config import config
+from rcnn.loader import AnchorLoader, ROIIter
+from rcnn.solver import Solver
+from rcnn.symbol import get_vgg_rpn, get_vgg_rpn_test, get_vgg_rcnn
+from utils.load_data import load_gt_roidb, load_rpn_roidb
+from utils.load_model import load_checkpoint, load_param
+from utils.save_model import save_checkpoint
+from utils.combine_model import combine_model
+
+
+def train_rpn(image_set, year, root_path, devkit_path, pretrained, epoch,
+              prefix, ctx, begin_epoch, end_epoch, frequent, kv_store, work_load_list=None):
+    # load symbol; feat_sym is the 'rpn_cls_score_output' internal of a second get_vgg_rpn() graph
+    sym = get_vgg_rpn()
+    feat_sym = get_vgg_rpn().get_internals()['rpn_cls_score_output']
+
+    # setup multi-gpu: scales the shared config in place by the number of contexts
+    config.TRAIN.BATCH_IMAGES *= len(ctx)
+    config.TRAIN.BATCH_SIZE *= len(ctx)
+
+    # load training data
+    voc, roidb = load_gt_roidb(image_set, year, root_path, devkit_path, flip=True)
+    train_data = AnchorLoader(feat_sym, roidb, batch_size=config.TRAIN.BATCH_SIZE, shuffle=True, mode='train',
+                              ctx=ctx, work_load_list=work_load_list)
+
+    # infer max shape: label shapes come from assign_anchor run on the feature shape of a 1000x1000 input
+    max_data_shape = [('data', (1, 3, 1000, 1000))]
+    max_data_shape_dict = {k: v for k, v in max_data_shape}
+    _, feat_shape, _ = feat_sym.infer_shape(**max_data_shape_dict)
+    from rcnn.minibatch import assign_anchor
+    import numpy as np
+    label = assign_anchor(feat_shape[0], np.zeros((0, 5)), [[1000, 1000, 1.0]])  # called with an empty (0, 5) array
+    max_label_shape = [('label', label['label'].shape),
+                       ('bbox_target', label['bbox_target'].shape),
+                       ('bbox_inside_weight', label['bbox_inside_weight'].shape),
+                       ('bbox_outside_weight', label['bbox_outside_weight'].shape)]
+    print 'providing maximum shape', max_data_shape, max_label_shape
+
+    # load pretrained (`pretrained`/`epoch` name the starting checkpoint)
+    args, auxs = load_param(pretrained, epoch, convert=True)
+
+    # initialize params: the rpn_* entries are always overwritten, whatever was loaded
+    arg_shape, _, _ = sym.infer_shape(data=(1, 3, 224, 224))
+    arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
+    args['rpn_conv_3x3_weight'] = mx.random.normal(mean=0, stdvar=0.01, shape=arg_shape_dict['rpn_conv_3x3_weight'])
+    args['rpn_conv_3x3_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_conv_3x3_bias'])
+    args['rpn_cls_score_weight'] = mx.random.normal(mean=0, stdvar=0.01, shape=arg_shape_dict['rpn_cls_score_weight'])
+    args['rpn_cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_cls_score_bias'])
+    args['rpn_bbox_pred_weight'] = mx.random.normal(mean=0, stdvar=0.01, shape=arg_shape_dict['rpn_bbox_pred_weight'])
+    args['rpn_bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_bbox_pred_bias'])
+
+    # train
+    solver = Solver(prefix, sym, ctx, begin_epoch, end_epoch, kv_store, args, auxs, momentum=0.9, wd=0.0005,
+                    learning_rate=1e-3, lr_scheduler=mx.lr_scheduler.FactorScheduler(60000, 0.1),
+                    mutable_data_shape=True, max_data_shape=max_data_shape, max_label_shape=max_label_shape)
+    solver.fit(train_data, frequent=frequent)
+
+
+def test_rpn(image_set, year, root_path, devkit_path, trained, epoch, ctx):
+    from rcnn.rpn.generate import Detector, generate_detections  # imported locally, only this function uses them
+
+    # load symbol
+    sym = get_vgg_rpn_test()
+
+    # load testing data: batch_size=1, no shuffling
+    voc, roidb = load_gt_roidb(image_set, year, root_path, devkit_path)
+    test_data = ROIIter(roidb, batch_size=1, shuffle=False, mode='test')
+
+    # load trained: ctx is a list here and only its first entry is used
+    args, auxs = load_param(trained, epoch, convert=True, ctx=ctx[0])
+
+    # start testing: generate boxes without visualization, then pass them to evaluate_recall
+    detector = Detector(sym, ctx[0], args, auxs)
+    imdb_boxes = generate_detections(detector, test_data, voc, vis=False)
+    voc.evaluate_recall(roidb, candidate_boxes=imdb_boxes)
+
+
+def train_rcnn(image_set, year, root_path, devkit_path, pretrained, epoch,
+               prefix, ctx, begin_epoch, end_epoch, frequent, kv_store, work_load_list=None):
+    # load symbol
+    sym = get_vgg_rcnn()
+
+    # setup multi-gpu: scales the shared config in place by the number of contexts
+    config.TRAIN.BATCH_IMAGES *= len(ctx)
+    config.TRAIN.BATCH_SIZE *= len(ctx)
+
+    # load training data; means/stds are reused after training to rescale bbox_pred
+    voc, roidb, means, stds = load_rpn_roidb(image_set, year, root_path, devkit_path, flip=True)
+    train_data = ROIIter(roidb, batch_size=config.TRAIN.BATCH_IMAGES, shuffle=True, mode='train',
+                         ctx=ctx, work_load_list=work_load_list)
+
+    # infer max shape
+    max_data_shape = [('data', (1, 3, 1000, 1000))]
+
+    # load pretrained (`pretrained`/`epoch` name the starting checkpoint)
+    args, auxs = load_param(pretrained, epoch, convert=True)
+
+    # initialize params: the cls_score/bbox_pred entries are always overwritten, whatever was loaded
+    arg_shape, _, _ = sym.infer_shape(data=(1, 3, 224, 224), rois=(1, 5))
+    arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
+    args['cls_score_weight'] = mx.random.normal(mean=0, stdvar=0.01, shape=arg_shape_dict['cls_score_weight'])
+    args['cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['cls_score_bias'])
+    args['bbox_pred_weight'] = mx.random.normal(mean=0, stdvar=0.001, shape=arg_shape_dict['bbox_pred_weight'])
+    args['bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['bbox_pred_bias'])
+
+    # train
+    solver = Solver(prefix, sym, ctx, begin_epoch, end_epoch, kv_store, args, auxs, momentum=0.9, wd=0.0005,
+                    learning_rate=1e-3, lr_scheduler=mx.lr_scheduler.FactorScheduler(30000, 0.1),
+                    mutable_data_shape=True, max_data_shape=max_data_shape)
+    solver.fit(train_data, frequent=frequent)
+
+    # edit params and save: rescale bbox_pred of each checkpoint under `prefix` by stds/means, then overwrite it
+    for epoch in range(begin_epoch + 1, end_epoch + 1):
+        arg_params, aux_params = load_checkpoint(prefix, epoch)
+        arg_params['bbox_pred_weight'] = (arg_params['bbox_pred_weight'].T * mx.nd.array(stds)).T
+        arg_params['bbox_pred_bias'] = arg_params['bbox_pred_bias'] * mx.nd.array(stds) + \
+                                       mx.nd.array(means)
+        save_checkpoint(prefix, epoch, arg_params, aux_params)
+
+
+def alternate_train(image_set, year, root_path, devkit_path, pretrained, epoch,
+                    ctx, begin_epoch, rpn_epoch, rcnn_epoch, frequent, kv_store, work_load_list=None):
+    # set up logger: root logger at INFO (logging.info and logger.info below both write to it)
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    config.TRAIN.BG_THRESH_LO = 0.0
+
+    logging.info('########## TRAIN RPN WITH IMAGENET INIT')
+    config.TRAIN.HAS_RPN = True
+    config.TRAIN.BATCH_SIZE = 1  # NOTE(review): BATCH_SIZE is reset per stage, BATCH_IMAGES is not, yet each stage does `*= len(ctx)`; confirm
+    train_rpn(image_set, year, root_path, devkit_path, pretrained, epoch,
+              'model/rpn1', ctx, begin_epoch, rpn_epoch, frequent, kv_store, work_load_list)
+
+    logging.info('########## GENERATE RPN DETECTION')
+    config.TEST.HAS_RPN = True
+    config.TEST.RPN_PRE_NMS_TOP_N = -1
+    config.TEST.RPN_POST_NMS_TOP_N = 2000
+    test_rpn(image_set, year, root_path, devkit_path, 'model/rpn1', rpn_epoch, ctx)
+
+    logging.info('########## TRAIN RCNN WITH IMAGENET INIT AND RPN DETECTION')
+    config.TRAIN.HAS_RPN = False
+    config.TRAIN.BATCH_SIZE = 128
+    train_rcnn(image_set, year, root_path, devkit_path, pretrained, epoch,
+               'model/rcnn1', ctx, begin_epoch, rcnn_epoch, frequent, kv_store, work_load_list)
+
+    logging.info('########## TRAIN RPN WITH RCNN INIT')
+    config.TRAIN.HAS_RPN = True
+    config.TRAIN.BATCH_SIZE = 1
+    config.TRAIN.FINETUNE = True  # set here and not cleared again in this function
+    train_rpn(image_set, year, root_path, devkit_path, 'model/rcnn1', rcnn_epoch,
+              'model/rpn2', ctx, begin_epoch, rpn_epoch, frequent, kv_store, work_load_list)
+
+    logging.info('########## GENERATE RPN DETECTION')
+    config.TEST.HAS_RPN = True
+    config.TEST.RPN_PRE_NMS_TOP_N = -1
+    config.TEST.RPN_POST_NMS_TOP_N = 2000
+    test_rpn(image_set, year, root_path, devkit_path, 'model/rpn2', rpn_epoch, ctx)
+
+    logger.info('########## COMBINE RPN2 WITH RCNN1')
+    combine_model('model/rpn2', rpn_epoch, 'model/rcnn1', rcnn_epoch, 'model/rcnn2', 0)
+
+    logger.info('########## TRAIN RCNN WITH RPN INIT AND DETECTION')
+    config.TRAIN.HAS_RPN = False
+    config.TRAIN.BATCH_SIZE = 128
+    train_rcnn(image_set, year, root_path, devkit_path, 'model/rcnn2', 0,
+               'model/rcnn2', ctx, begin_epoch, rcnn_epoch, frequent, kv_store, work_load_list)
+
+    logger.info('########## COMBINE RPN2 WITH RCNN2')
+    combine_model('model/rpn2', rpn_epoch, 'model/rcnn2', rcnn_epoch, 'model/final', 0)
+
+
+def parse_args():  # build and parse the command-line options of the alternate training script
+    parser = argparse.ArgumentParser(description='Train Faster R-CNN Network')
+    parser.add_argument('--image_set', dest='image_set', help='can be trainval or train',
+                        default='trainval', type=str)
+    parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012',
+                        default='2007', type=str)
+    parser.add_argument('--root_path', dest='root_path', help='output data folder',
+                        default=os.path.join(os.getcwd(), 'data'), type=str)
+    parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path',
+                        default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str)
+    parser.add_argument('--pretrained', dest='pretrained', help='pretrained model prefix',
+                        default=os.path.join(os.getcwd(), 'model', 'vgg16'), type=str)
+    parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model',
+                        default=1, type=int)
+    parser.add_argument('--prefix', dest='prefix', help='new model prefix',
+                        default=os.path.join(os.getcwd(), 'model', 'rcnn'), type=str)  # not passed on by __main__ here
+    parser.add_argument('--gpus', dest='gpu_ids', help='GPU device to train with',
+                        default='0', type=str)  # comma-separated ids, split by the __main__ block
+    parser.add_argument('--begin_epoch', dest='begin_epoch', help='begin epoch of training',
+                        default=0, type=int)
+    parser.add_argument('--rpn_epoch', dest='rpn_epoch', help='end epoch of rpn training',
+                        default=8, type=int)
+    parser.add_argument('--rcnn_epoch', dest='rcnn_epoch', help='end epoch of rcnn training',
+                        default=8, type=int)
+    parser.add_argument('--frequent', dest='frequent', help='frequency of logging',
+                        default=20, type=int)
+    parser.add_argument('--kv_store', dest='kv_store', help='the kv-store type',
+                        default='device', type=str)
+    parser.add_argument('--work_load_list', dest='work_load_list', help='comma-separated work load for different devices',
+                        default=None, type=lambda s: [float(w) for w in s.split(',')])  # type=list split into chars
+    args = parser.parse_args()  # parses sys.argv[1:]
+    return args
+
+if __name__ == '__main__':
+    args = parse_args()
+    ctx = [mx.gpu(int(i)) for i in args.gpu_ids.split(',')]  # one GPU context per comma-separated id in --gpus
+    alternate_train(args.image_set, args.year, args.root_path, args.devkit_path, args.pretrained, args.epoch,
+                    ctx, args.begin_epoch, args.rpn_epoch, args.rcnn_epoch, args.frequent,
+                    args.kv_store, args.work_load_list)
diff --git a/example/rcnn/tools/train_rcnn.py b/example/rcnn/tools/train_rcnn.py
new file mode 100644
index 000000000000..c1c9790149bc
--- /dev/null
+++ b/example/rcnn/tools/train_rcnn.py
@@ -0,0 +1,140 @@
+import argparse
+import logging
+import os
+
+import mxnet as mx
+
+from rcnn.callback import Speedometer
+from rcnn.config import config
+from rcnn.loader import ROIIter
+from rcnn.metric import AccuracyMetric, LogLossMetric, SmoothL1LossMetric
+from rcnn.module import MutableModule
+from rcnn.symbol import get_vgg_rcnn
+from utils.load_data import load_rpn_roidb
+from utils.load_model import load_checkpoint, load_param
+from utils.save_model import save_checkpoint
+
+config.TRAIN.BG_THRESH_LO = 0.0  # module-level override of the shared config; runs when this file is imported or executed
+config.TRAIN.ASPECT_GROUPING = False
+
+
+def train_net(image_set, year, root_path, devkit_path, pretrained, epoch,
+              prefix, ctx, begin_epoch, end_epoch, frequent, kv_store, work_load_list=None, resume=False):
+    # set up logger: raise the root logger to INFO; it is also handed to MutableModule below
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+
+    # load symbol
+    sym = get_vgg_rcnn()
+
+    # setup multi-gpu: scales the shared config in place by the number of contexts
+    config.TRAIN.BATCH_IMAGES *= len(ctx)
+    config.TRAIN.BATCH_SIZE *= len(ctx)
+
+    # load training data; means/stds are reused after training to rescale bbox_pred
+    voc, roidb, means, stds = load_rpn_roidb(image_set, year, root_path, devkit_path, flip=True)
+    train_data = ROIIter(roidb, batch_size=config.TRAIN.BATCH_IMAGES, shuffle=True, mode='train',
+                         ctx=ctx, work_load_list=work_load_list)
+
+    # infer max shape
+    max_data_shape = [('data', (config.TRAIN.BATCH_IMAGES, 3, 1000, 1000))]
+
+    # load pretrained (`pretrained`/`epoch` name the starting checkpoint)
+    args, auxs = load_param(pretrained, epoch, convert=True)
+
+    # initialize params: skipped when resuming so the loaded head weights are kept
+    if not resume:
+        input_shapes = {k: v for k, v in train_data.provide_data + train_data.provide_label}
+        arg_shape, _, _ = sym.infer_shape(**input_shapes)
+        arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
+        args['cls_score_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['cls_score_weight'])
+        args['cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['cls_score_bias'])
+        args['bbox_pred_weight'] = mx.random.normal(0, 0.001, shape=arg_shape_dict['bbox_pred_weight'])
+        args['bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['bbox_pred_bias'])
+
+    # prepare training
+    fixed_params_names = []  # NOTE(review): filled below but never referenced again in this function; confirm intent
+    for name in args.keys():
+        if config.TRAIN.FINETUNE and name.startswith('conv'):
+            fixed_params_names.append(name)
+        elif name.startswith('conv1') or name.startswith('conv2'):
+            fixed_params_names.append(name)
+    data_names = [k[0] for k in train_data.provide_data]
+    label_names = [k[0] for k in train_data.provide_label]
+    batch_end_callback = Speedometer(train_data.batch_size, frequent=frequent)
+    epoch_end_callback = mx.callback.do_checkpoint(prefix)
+    if config.TRAIN.HAS_RPN is True:
+        eval_metric = AccuracyMetric(use_ignore=True, ignore=-1)
+        cls_metric = LogLossMetric(use_ignore=True, ignore=-1)
+    else:
+        eval_metric = AccuracyMetric()
+        cls_metric = LogLossMetric()
+    bbox_metric = SmoothL1LossMetric()
+    eval_metrics = mx.metric.CompositeEvalMetric()
+    for child_metric in [eval_metric, cls_metric, bbox_metric]:
+        eval_metrics.add(child_metric)
+    optimizer_params = {'momentum': 0.9,
+                        'wd': 0.0005,
+                        'learning_rate': 0.001,
+                        'lr_scheduler': mx.lr_scheduler.FactorScheduler(30000, 0.1),
+                        'rescale_grad': (1.0 / config.TRAIN.BATCH_SIZE)}
+
+    # train
+    mod = MutableModule(sym, data_names=data_names, label_names=label_names,
+                        logger=logger, context=ctx, work_load_list=work_load_list,
+                        max_data_shapes=max_data_shape)
+    mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback,
+            batch_end_callback=batch_end_callback, kvstore=kv_store,
+            optimizer='sgd', optimizer_params=optimizer_params,
+            arg_params=args, aux_params=auxs, begin_epoch=begin_epoch, num_epoch=end_epoch)
+
+    # edit params and save: rescale bbox_pred of each checkpoint under `prefix` by stds/means, then overwrite it
+    for epoch in range(begin_epoch + 1, end_epoch + 1):
+        arg_params, aux_params = load_checkpoint(prefix, epoch)
+        arg_params['bbox_pred_weight'] = (arg_params['bbox_pred_weight'].T * mx.nd.array(stds)).T
+        arg_params['bbox_pred_bias'] = arg_params['bbox_pred_bias'] * mx.nd.array(stds) + \
+                                       mx.nd.array(means)
+        save_checkpoint(prefix, epoch, arg_params, aux_params)
+
+
+def parse_args():  # build and parse the command-line options of the R-CNN training script
+    parser = argparse.ArgumentParser(description='Train a Fast R-CNN network')
+    parser.add_argument('--image_set', dest='image_set', help='can be trainval or train',
+                        default='trainval', type=str)
+    parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012',
+                        default='2007', type=str)
+    parser.add_argument('--root_path', dest='root_path', help='output data folder',
+                        default=os.path.join(os.getcwd(), 'data'), type=str)
+    parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path',
+                        default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str)
+    parser.add_argument('--pretrained', dest='pretrained', help='pretrained model prefix',
+                        default=os.path.join(os.getcwd(), 'model', 'vgg16'), type=str)
+    parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model',
+                        default=1, type=int)
+    parser.add_argument('--prefix', dest='prefix', help='new model prefix',
+                        default=os.path.join(os.getcwd(), 'model', 'rcnn'), type=str)
+    parser.add_argument('--gpus', dest='gpu_ids', help='GPU device to train with',
+                        default='0', type=str)  # comma-separated ids, split by the __main__ block
+    parser.add_argument('--begin_epoch', dest='begin_epoch', help='begin epoch of training',
+                        default=0, type=int)
+    parser.add_argument('--end_epoch', dest='end_epoch', help='end epoch of training',
+                        default=8, type=int)
+    parser.add_argument('--frequent', dest='frequent', help='frequency of logging',
+                        default=20, type=int)
+    parser.add_argument('--kv_store', dest='kv_store', help='the kv-store type',
+                        default='device', type=str)
+    parser.add_argument('--work_load_list', dest='work_load_list', help='comma-separated work load for different devices',
+                        default=None, type=lambda s: [float(w) for w in s.split(',')])  # type=list split into chars
+    parser.add_argument('--finetune', dest='finetune', help='second round finetune', action='store_true')
+    parser.add_argument('--resume', dest='resume', help='continue training', action='store_true')
+    args = parser.parse_args()  # parses sys.argv[1:]
+    return args
+
+if __name__ == '__main__':
+    args = parse_args()
+    ctx = [mx.gpu(int(i)) for i in args.gpu_ids.split(',')]  # one GPU context per comma-separated id in --gpus
+    if args.finetune:
+        config.TRAIN.FINETUNE = True  # only ever switched on here; without --finetune the config value is left as is
+    train_net(args.image_set, args.year, args.root_path, args.devkit_path, args.pretrained, args.epoch,
+              args.prefix, ctx, args.begin_epoch, args.end_epoch, args.frequent,
+              args.kv_store, args.work_load_list, args.resume)
diff --git a/example/rcnn/tools/train_rpn.py b/example/rcnn/tools/train_rpn.py
new file mode 100644
index 000000000000..b2c100b29095
--- /dev/null
+++ b/example/rcnn/tools/train_rpn.py
@@ -0,0 +1,145 @@
+import argparse
+import logging
+import os
+
+import mxnet as mx
+
+from rcnn.callback import Speedometer
+from rcnn.config import config
+from rcnn.loader import AnchorLoader
+from rcnn.metric import AccuracyMetric, LogLossMetric, SmoothL1LossMetric
+from rcnn.module import MutableModule
+from rcnn.symbol import get_vgg_rpn
+from utils.load_data import load_gt_roidb
+from utils.load_model import load_param
+
+config.TRAIN.HAS_RPN = True
+config.TRAIN.BATCH_SIZE = 1
+config.TRAIN.ASPECT_GROUPING = False
+
+
+def train_net(image_set, year, root_path, devkit_path, pretrained, epoch,
+              prefix, ctx, begin_epoch, end_epoch, frequent, kv_store, work_load_list=None, resume=False):
+    # set up logger
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+
+    # load symbol
+    sym = get_vgg_rpn()
+    feat_sym = get_vgg_rpn().get_internals()['rpn_cls_score_output']
+
+    # setup multi-gpu
+    config.TRAIN.BATCH_IMAGES *= len(ctx)
+    config.TRAIN.BATCH_SIZE *= len(ctx)
+
+    # load training data
+    voc, roidb = load_gt_roidb(image_set, year, root_path, devkit_path, flip=True)
+    train_data = AnchorLoader(feat_sym, roidb, batch_size=config.TRAIN.BATCH_SIZE, shuffle=True, mode='train',
+                              ctx=ctx, work_load_list=work_load_list)
+
+    # infer max shape
+    max_data_shape = [('data', (config.TRAIN.BATCH_SIZE, 3, 1000, 1000))]
+    max_data_shape_dict = {k: v for k, v in max_data_shape}
+    _, feat_shape, _ = feat_sym.infer_shape(**max_data_shape_dict)
+    from rcnn.minibatch import assign_anchor
+    import numpy as np
+    label = assign_anchor(feat_shape[0], np.zeros((0, 5)), [[1000, 1000, 1.0]])
+    max_label_shape = [('label', label['label'].shape),
+                       ('bbox_target', label['bbox_target'].shape),
+                       ('bbox_inside_weight', label['bbox_inside_weight'].shape),
+                       ('bbox_outside_weight', label['bbox_outside_weight'].shape)]
+    print 'providing maximum shape', max_data_shape, max_label_shape
+
+    # load pretrained
+    args, auxs = load_param(pretrained, epoch, convert=True)
+
+    # initialize params
+    if not resume:
+        input_shapes = {k: v for k, v in train_data.provide_data + train_data.provide_label}
+        arg_shape, _, _ = sym.infer_shape(**input_shapes)
+        arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
+        args['rpn_conv_3x3_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['rpn_conv_3x3_weight'])
+        args['rpn_conv_3x3_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_conv_3x3_bias'])
+        args['rpn_cls_score_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['rpn_cls_score_weight'])
+        args['rpn_cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_cls_score_bias'])
+        args['rpn_bbox_pred_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['rpn_bbox_pred_weight'])
+        args['rpn_bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_bbox_pred_bias'])
+
+    # prepare training
+    fixed_params_names = []
+    for name in args.keys():
+        if config.TRAIN.FINETUNE and name.startswith('conv'):
+            fixed_params_names.append(name)
+        elif name.startswith('conv1') or name.startswith('conv2'):
+            fixed_params_names.append(name)
+    data_names = [k[0] for k in train_data.provide_data]
+    label_names = [k[0] for k in train_data.provide_label]
+    batch_end_callback = Speedometer(train_data.batch_size, frequent=frequent)
+    epoch_end_callback = mx.callback.do_checkpoint(prefix)
+    if config.TRAIN.HAS_RPN is True:
+        eval_metric = AccuracyMetric(use_ignore=True, ignore=-1)
+        cls_metric = LogLossMetric(use_ignore=True, ignore=-1)
+    else:
+        eval_metric = AccuracyMetric()
+        cls_metric = LogLossMetric()
+    bbox_metric = SmoothL1LossMetric()
+    eval_metrics = mx.metric.CompositeEvalMetric()
+    for child_metric in [eval_metric, cls_metric, bbox_metric]:
+        eval_metrics.add(child_metric)
+    optimizer_params = {'momentum': 0.9,
+                        'wd': 0.0005,
+                        'learning_rate': 0.001,
+                        'lr_scheduler': mx.lr_scheduler.FactorScheduler(60000, 0.1),
+                        'rescale_grad': (1.0 / config.TRAIN.BATCH_SIZE)}
+
+    # train
+    mod = MutableModule(sym, data_names=data_names, label_names=label_names,
+                        logger=logger, context=ctx, work_load_list=work_load_list,
+                        max_data_shapes=max_data_shape, max_label_shapes=max_label_shape)
+    mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback,
+            batch_end_callback=batch_end_callback, kvstore=kv_store,
+            optimizer='sgd', optimizer_params=optimizer_params,
+            arg_params=args, aux_params=auxs, begin_epoch=begin_epoch, num_epoch=end_epoch)
+
+
+def parse_args():  # parse the RPN-training command line into an argparse.Namespace
+    parser = argparse.ArgumentParser(description='Train a Region Proposal Network')
+    parser.add_argument('--image_set', dest='image_set', help='can be trainval or train',
+                        default='trainval', type=str)
+    parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012',
+                        default='2007', type=str)
+    parser.add_argument('--root_path', dest='root_path', help='output data folder',
+                        default=os.path.join(os.getcwd(), 'data'), type=str)
+    parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path',
+                        default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str)
+    parser.add_argument('--pretrained', dest='pretrained', help='pretrained model prefix',
+                        default=os.path.join(os.getcwd(), 'model', 'vgg16'), type=str)
+    parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model',
+                        default=1, type=int)
+    parser.add_argument('--prefix', dest='prefix', help='new model prefix',
+                        default=os.path.join(os.getcwd(), 'model', 'rpn'), type=str)
+    parser.add_argument('--gpus', dest='gpu_ids', help='GPU device to train with',
+                        default='0', type=str)
+    parser.add_argument('--begin_epoch', dest='begin_epoch', help='begin epoch of training',
+                        default=0, type=int)
+    parser.add_argument('--end_epoch', dest='end_epoch', help='end epoch of training',
+                        default=8, type=int)
+    parser.add_argument('--frequent', dest='frequent', help='frequency of logging',
+                        default=20, type=int)
+    parser.add_argument('--kv_store', dest='kv_store', help='the kv-store type',
+                        default='device', type=str)
+    # argparse applies `type` to the raw string: type=list would split '1,2' into ['1', ',', '2']
+    parser.add_argument('--work_load_list', dest='work_load_list', help='work load for different devices, comma-separated (e.g. 1,1)',
+                        default=None, type=lambda s: [float(w) for w in s.split(',')])
+    parser.add_argument('--finetune', dest='finetune', help='second round finetune', action='store_true')
+    parser.add_argument('--resume', dest='resume', help='continue training', action='store_true')
+    return parser.parse_args()
+
+if __name__ == '__main__':
+    args = parse_args()
+    ctx = [mx.gpu(int(i)) for i in args.gpu_ids.split(',')]
+    if args.finetune:
+        config.TRAIN.FINETUNE = True
+    train_net(args.image_set, args.year, args.root_path, args.devkit_path, args.pretrained, args.epoch,
+              args.prefix, ctx, args.begin_epoch, args.end_epoch, args.frequent,
+              args.kv_store, args.work_load_list, args.resume)
diff --git a/example/rcnn/train.py b/example/rcnn/train.py
deleted file mode 100644
index bca8585efd80..000000000000
--- a/example/rcnn/train.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import argparse
-import mxnet as mx
-import os
-from tools.train_net import train_net
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Train a Fast R-CNN network')
-    parser.add_argument('--image_set', dest='image_set', help='can be trainval or train',
-                        default='train', type=str)
-    parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012',
-                        default='2007', type=str)
-    parser.add_argument('--root_path', dest='root_path', help='output data folder',
-                        default=os.path.join(os.getcwd(), 'data'), type=str)
-    parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path',
-                        default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str)
-    parser.add_argument('--pretrained', dest='pretrained', help='pretrained model prefix',
-                        default=os.path.join(os.getcwd(), 'model', 'vgg16'), type=str)
-    parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model',
-                        default=1, type=int)
-    parser.add_argument('--prefix', dest='prefix', help='new model prefix',
-                        default=os.path.join(os.getcwd(), 'model', 'frcnn'), type=str)
-    parser.add_argument('--gpu', dest='gpu_id', help='GPU device to train with',
-                        default=0, type=int)
-    parser.add_argument('--begin_epoch', dest='begin_epoch', help='begin epoch of training',
-                        default=0, type=int)
-    parser.add_argument('--end_epoch', dest='end_epoch', help='end epoch of training',
-                        default=8, type=int)
-    parser.add_argument('--frequent', dest='frequent', help='frequency of logging',
-                        default=20, type=int)
-    args = parser.parse_args()
-    return args
-
-if __name__ == '__main__':
-    args = parse_args()
-    ctx = mx.gpu(args.gpu_id)
-    train_net(args.image_set, args.year, args.root_path, args.devkit_path, args.pretrained, args.epoch,
-              args.prefix, ctx, args.begin_epoch, args.end_epoch, args.frequent)
diff --git a/example/rcnn/utils/__init__.py b/example/rcnn/utils/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/example/rcnn/utils/caffe_convert.py b/example/rcnn/utils/caffe_convert.py
new file mode 100644
index 000000000000..4dfbfb4e186f
--- /dev/null
+++ b/example/rcnn/utils/caffe_convert.py
@@ -0,0 +1,74 @@
+# This script will not work unless all paths are set right
+
+import os
+import sys
+import mxnet as mx
+import numpy as np
+fast_rcnn_path = None
+sys.path.insert(0, os.path.join(fast_rcnn_path, 'caffe-fast-rcnn', 'python'))
+sys.path.insert(0, os.path.join(fast_rcnn_path, 'lib'))
+import caffe
+from rcnn.symbol import get_symbol_vgg_test
+
+def load_model(caffeproto, caffemodel, arg_shape_dic):
+    def get_caffe_iter(layer_names, layers):
+        for layer_idx, layer in enumerate(layers):
+            layer_name = layer_names[layer_idx].replace('/', '_')
+            layer_type = layer.type
+            layer_blobs = layer.blobs
+            yield (layer_name, layer_type, layer_blobs)
+
+    net_caffe = caffe.Net(caffeproto, caffemodel, caffe.TEST)
+    layer_names = net_caffe._layer_names
+    layers = net_caffe.layers
+    iter = ''
+    iter = get_caffe_iter(layer_names, layers)
+    first_conv = True
+
+    arg_params = {}
+    for layer_name, layer_type, layer_blobs in iter:
+        if layer_type == 'Convolution' or layer_type == 'InnerProduct' or layer_type == 4 or layer_type == 14:
+            assert(len(layer_blobs) == 2)
+            wmat = np.array(layer_blobs[0].data).reshape(layer_blobs[0].num, layer_blobs[0].channels, layer_blobs[0].height, layer_blobs[0].width)
+            bias = np.array(layer_blobs[1].data)
+            if first_conv:
+                print 'Swapping BGR of caffe into RGB in mxnet'
+                wmat[:, [0, 2], :, :] = wmat[:, [2, 0], :, :]
+
+            assert(wmat.flags['C_CONTIGUOUS'] is True)
+            assert(bias.flags['C_CONTIGUOUS'] is True)
+            print 'converting layer {0}, wmat shape = {1}, bias shape = {2}'.format(layer_name, wmat.shape, bias.shape)
+            wmat = wmat.reshape((wmat.shape[0], -1))
+            bias = bias.reshape((bias.shape[0], 1))
+            weight_name = layer_name + "_weight"
+            bias_name = layer_name + "_bias"
+            
+            if weight_name not in arg_shape_dic:
+                print weight_name + ' not found in arg_shape_dic.'
+                continue
+            wmat = wmat.reshape(arg_shape_dic[weight_name])
+            arg_params[weight_name] = mx.nd.zeros(wmat.shape)
+            arg_params[weight_name][:] = wmat
+
+            bias = bias.reshape(arg_shape_dic[bias_name])
+            arg_params[bias_name] = mx.nd.zeros(bias.shape)
+            arg_params[bias_name][:] = bias
+
+            if first_conv and (layer_type == 'Convolution' or layer_type == 4):
+                first_conv = False
+    
+    return arg_params
+
+proto_path = os.path.join(fast_rcnn_path, 'models', 'VGG16', 'test.prototxt')
+model_path = os.path.join(fast_rcnn_path, 'data', 'fast_rcnn_models', 'vgg16_fast_rcnn_iter_40000.caffemodel')
+
+symbol = get_symbol_vgg_test()
+arg_shapes, out_shapes, aux_shapes = symbol.infer_shape(**{'data': (1, 3, 224, 224), 'rois': (1, 5)})
+arg_shape_dic = { name: shape for name, shape in zip(symbol.list_arguments(), arg_shapes) }
+
+arg_params = load_model(proto_path, model_path, arg_shape_dic)
+
+model = mx.model.FeedForward(ctx=mx.cpu(), symbol=symbol, arg_params=arg_params,
+                             aux_params={}, num_epoch=1,
+                             learning_rate=0.01, momentum=0.9, wd=0.0001)
+model.save('model/ref')
diff --git a/example/rcnn/utils/combine_model.py b/example/rcnn/utils/combine_model.py
new file mode 100644
index 000000000000..5518dda4a989
--- /dev/null
+++ b/example/rcnn/utils/combine_model.py
@@ -0,0 +1,22 @@
+from load_model import load_checkpoint
+from save_model import save_checkpoint
+
+
+def combine_model(prefix1, epoch1, prefix2, epoch2, prefix_out, epoch_out):
+    args1, auxs1 = load_checkpoint(prefix1, epoch1)
+    args2, auxs2 = load_checkpoint(prefix2, epoch2)
+    arg_names = args1.keys() + args2.keys()
+    aux_names = auxs1.keys() + auxs2.keys()
+    args = dict()
+    for arg in arg_names:
+        if arg in args1:
+            args[arg] = args1[arg]
+        else:
+            args[arg] = args2[arg]
+    auxs = dict()
+    for aux in aux_names:
+        if aux in auxs1:
+            auxs[aux] = auxs1[aux]
+        else:
+            auxs[aux] = auxs2[aux]
+    save_checkpoint(prefix_out, epoch_out, args, auxs)
diff --git a/example/rcnn/utils/load_data.py b/example/rcnn/utils/load_data.py
new file mode 100644
index 000000000000..cc6317e0e74a
--- /dev/null
+++ b/example/rcnn/utils/load_data.py
@@ -0,0 +1,49 @@
+from helper.dataset.pascal_voc import PascalVOC
+from helper.processing.roidb import prepare_roidb, add_bbox_regression_targets
+
+
+def load_ss_roidb(image_set, year, root_path, devkit_path, flip=False):
+    voc = PascalVOC(image_set, year, root_path, devkit_path)
+    gt_roidb = voc.gt_roidb()
+    ss_roidb = voc.selective_search_roidb(gt_roidb)
+    if flip:
+        ss_roidb = voc.append_flipped_images(ss_roidb)
+    prepare_roidb(voc, ss_roidb)
+    means, stds = add_bbox_regression_targets(ss_roidb)
+    return voc, ss_roidb, means, stds
+
+
+def load_gt_roidb(image_set, year, root_path, devkit_path, flip=False):
+    voc = PascalVOC(image_set, year, root_path, devkit_path)
+    gt_roidb = voc.gt_roidb()
+    if flip:
+        gt_roidb = voc.append_flipped_images(gt_roidb)
+    prepare_roidb(voc, gt_roidb)
+    return voc, gt_roidb
+
+
+def load_rpn_roidb(image_set, year, root_path, devkit_path, flip=False):
+    voc = PascalVOC(image_set, year, root_path, devkit_path)
+    gt_roidb = voc.gt_roidb()
+    rpn_roidb = voc.rpn_roidb(gt_roidb)
+    if flip:
+        rpn_roidb = voc.append_flipped_images(rpn_roidb)
+    prepare_roidb(voc, rpn_roidb)
+    means, stds = add_bbox_regression_targets(rpn_roidb)
+    return voc, rpn_roidb, means, stds
+
+
+def load_test_ss_roidb(image_set, year, root_path, devkit_path):
+    voc = PascalVOC(image_set, year, root_path, devkit_path)
+    gt_roidb = voc.gt_roidb()
+    ss_roidb = voc.selective_search_roidb(gt_roidb)
+    prepare_roidb(voc, ss_roidb)
+    return voc, ss_roidb
+
+
+def load_test_rpn_roidb(image_set, year, root_path, devkit_path):
+    voc = PascalVOC(image_set, year, root_path, devkit_path)
+    gt_roidb = voc.gt_roidb()
+    rpn_roidb = voc.rpn_roidb(gt_roidb)
+    prepare_roidb(voc, rpn_roidb)
+    return voc, rpn_roidb
diff --git a/example/rcnn/tools/load_model.py b/example/rcnn/utils/load_model.py
similarity index 97%
rename from example/rcnn/tools/load_model.py
rename to example/rcnn/utils/load_model.py
index bd5a28ea23ef..c767661232e7 100644
--- a/example/rcnn/tools/load_model.py
+++ b/example/rcnn/utils/load_model.py
@@ -47,7 +47,8 @@ def load_param(prefix, epoch, convert=False, ctx=None):
     """
     arg_params, aux_params = load_checkpoint(prefix, epoch)
     if convert:
-        assert ctx is not None
+        if ctx is None:
+            ctx = mx.cpu()
         arg_params = convert_context(arg_params, ctx)
         aux_params = convert_context(aux_params, ctx)
     return arg_params, aux_params
diff --git a/example/rcnn/tools/save_model.py b/example/rcnn/utils/save_model.py
similarity index 100%
rename from example/rcnn/tools/save_model.py
rename to example/rcnn/utils/save_model.py

From 3c7bdcb6511acd802e53450f2632962885677aa4 Mon Sep 17 00:00:00 2001
From: sxjscience <xshiab@ust.hk>
Date: Wed, 20 Jul 2016 23:01:21 +0800
Subject: [PATCH 115/126] Revise Comment + Fix typo

---
 src/operator/elementwise_binary_broadcast_op-inl.h | 2 +-
 src/storage/storage.cc                             | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/operator/elementwise_binary_broadcast_op-inl.h b/src/operator/elementwise_binary_broadcast_op-inl.h
index 5abdafa6e85c..0723657cc3ef 100644
--- a/src/operator/elementwise_binary_broadcast_op-inl.h
+++ b/src/operator/elementwise_binary_broadcast_op-inl.h
@@ -55,7 +55,7 @@ namespace op {
 inline bool IsBroadcastNeeded_(const TShape& lhs,
                               const TShape& rhs) {
   // force ndim to be equal. do not smartly padding dims with 1s, which may confuse users
-  CHECK_EQ(lhs.ndim(), rhs.ndim());
+  CHECK_EQ(lhs.ndim(), rhs.ndim()) << "lhs:" << lhs << " rhs:" << rhs;
   for (index_t i = 0; i < lhs.ndim(); ++i) {
     if (lhs[i] != rhs[i]) return true;
   }
diff --git a/src/storage/storage.cc b/src/storage/storage.cc
index 177d95e257f9..79cc06c8dc0b 100644
--- a/src/storage/storage.cc
+++ b/src/storage/storage.cc
@@ -86,13 +86,13 @@ Storage::Handle StorageImpl::Alloc(size_t size, Context ctx) {
 void StorageImpl::Free(Storage::Handle handle) {
   const Context &ctx = handle.ctx;
   auto&& device = storage_managers_.at(ctx.dev_type);
-  storage::StorageManager *maneger = device.Get(
+  storage::StorageManager *manager = device.Get(
       ctx.dev_id, []() {
         LOG(FATAL) <<  "Cannot Free space to a device you have not allocated";
         return nullptr;
       });
   this->ActivateDevice(ctx);
-  maneger->Free(handle.dptr, handle.size);
+  manager->Free(handle.dptr, handle.size);
 }
 
 std::shared_ptr<Storage> Storage::_GetSharedRef() {

From ff550afae53c175f97e93493b6ef4f764730b2af Mon Sep 17 00:00:00 2001
From: Xiang Liang <xlvector@gmail.com>
Date: Fri, 22 Jul 2016 00:35:42 +0800
Subject: [PATCH 116/126] support cpu (#2793)

* support cpu

* wrong free for cpu

* typo
---
 dmlc-core                    | 2 +-
 plugin/warpctc/warpctc-inl.h | 5 ++---
 ps-lite                      | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/dmlc-core b/dmlc-core
index ddef90217681..755f577a38cf 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit ddef9021768181add97e67b6d80fe25abd8e16b3
+Subproject commit 755f577a38cf3aa07f38a2667ffc583d22195e52
diff --git a/plugin/warpctc/warpctc-inl.h b/plugin/warpctc/warpctc-inl.h
index e4ea4b99059f..b37132144cb9 100644
--- a/plugin/warpctc/warpctc-inl.h
+++ b/plugin/warpctc/warpctc-inl.h
@@ -155,8 +155,6 @@ class WarpCTCOp : public Operator {
                                     ctx.get_stream<gpu>()->stream_);
       CHECK_EQ(cuda_status, cudaSuccess) << "cuda memcpy label error";
 #endif
-    } else {
-      LOG(FATAL) << "Unknown device type " << data.dev_mask_;
     }
 
     int total_label_length = 0;
@@ -167,7 +165,6 @@ class WarpCTCOp : public Operator {
     int* cpu_labels = reinterpret_cast<int*>(
         malloc(sizeof(int) * total_label_length));
     removeBlank(cpu_raw_labels, cpu_labels, label.Size(), 0);
-    free(cpu_raw_labels);
 
     size_t alloc_bytes;
     throw_on_error(get_workspace_size(label_lengths.data(),
@@ -201,10 +198,12 @@ class WarpCTCOp : public Operator {
 
     if (data.dev_mask_ == cpu::kDevMask) {
       free(ctc_workspace);
+      free(cpu_labels);
     } else if (data.dev_mask_ == gpu::kDevMask) {
 #if MXNET_USE_CUDA
       cuda_status = cudaFree(ctc_workspace);
       CHECK_EQ(cuda_status, cudaSuccess) << "cuda free workspace fail";
+      free(cpu_raw_labels);
       free(cpu_labels);
 #endif
     }
diff --git a/ps-lite b/ps-lite
index 708d4daded09..35ddccd4cd03 160000
--- a/ps-lite
+++ b/ps-lite
@@ -1 +1 @@
-Subproject commit 708d4daded09e857faf315c13206671daf5c2928
+Subproject commit 35ddccd4cd0302f78ed2a05f1258860d4666e43c

From 28d1860b6ebe2345433a59c1028673bf4e0e5c1d Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Thu, 21 Jul 2016 00:52:58 -0700
Subject: [PATCH 117/126] fix cudnn auto tune

---
 src/operator/convolution-inl.h       |  6 ++-
 src/operator/cudnn_convolution-inl.h | 57 ++++++++++++++--------------
 src/operator/cudnn_convolution.cc    | 40 +++++++++++++++----
 3 files changed, 65 insertions(+), 38 deletions(-)

diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h
index bc46f28d5c47..2d74b2383203 100644
--- a/src/operator/convolution-inl.h
+++ b/src/operator/convolution-inl.h
@@ -62,9 +62,11 @@ struct ConvolutionParam : public dmlc::Parameter<ConvolutionParam> {
     .add_enum("off", conv::kOff)
     .add_enum("limited_workspace", conv::kLimited)
     .add_enum("fastest", conv::kFastest)
-    .set_default(conv::kLimited)
+    .set_default(dmlc::GetEnv("MXNET_CUDNN_AUTOTUNE_DEFAULT", 1))
     .describe("Whether to find convolution algo by running performance test."
-              "Leads to higher startup time but may give better speed");
+              "Leads to higher startup time but may give better speed."
+              "auto tune is turned on by default but can vastly slow down models with bucketing."
+              "Set environment varialbe MXNET_CUDNN_AUTOTUNE_DEFAULT=0 to turn off by default.");
   }
 };
 
diff --git a/src/operator/cudnn_convolution-inl.h b/src/operator/cudnn_convolution-inl.h
index d37f24b40b56..9f05c860c286 100644
--- a/src/operator/cudnn_convolution-inl.h
+++ b/src/operator/cudnn_convolution-inl.h
@@ -20,11 +20,9 @@ void TuneCudnnConvolution(ConvolutionParam param,
                           std::vector<TShape> *out_shape,
                           Context ctx,
                           cudnnDataType_t dtype,
-                          cudnnConvolutionFwdAlgo_t *algo_,
-                          cudnnConvolutionBwdDataAlgo_t *back_algo_,
-                          cudnnConvolutionBwdFilterAlgo_t *back_algo_w_,
-                          size_t *forward_workspace_byte_,
-                          size_t *backward_workspace_byte_);
+                          cudnnConvolutionFwdAlgo_t *algo,
+                          cudnnConvolutionBwdDataAlgo_t *back_algo,
+                          cudnnConvolutionBwdFilterAlgo_t *back_algo_w);
 
 template<typename DType>
 class CuDNNConvolutionOp : public Operator {
@@ -42,8 +40,7 @@ class CuDNNConvolutionOp : public Operator {
 
     if (param.cudnn_tune != conv::kOff) {
       TuneCudnnConvolution(param, in_shape, out_shape, ctx, dtype_,
-                           &algo_, &back_algo_, &back_algo_w_,
-                           &forward_workspace_byte_, &backward_workspace_byte_);
+                           &algo_, &back_algo_, &back_algo_w_);
     }
   }
 
@@ -456,29 +453,31 @@ class CuDNNConvolutionOp : public Operator {
                  CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
                  workspace_byte,
                  &back_algo_), CUDNN_STATUS_SUCCESS);
-        CHECK_EQ(cudnnGetConvolutionBackwardDataWorkspaceSize(s->dnn_handle_,
-                 filter_desc_,
-                 out_desc_,
-                 conv_desc_,
-                 in_desc_,
-                 back_algo_,
-                 &back_size), CUDNN_STATUS_SUCCESS);
-        CHECK_EQ(cudnnGetConvolutionBackwardFilterWorkspaceSize(s->dnn_handle_,
-                 in_desc_,
-                 out_desc_,
-                 conv_desc_,
-                 filter_desc_,
-                 back_algo_w_,
-                 &back_size_w), CUDNN_STATUS_SUCCESS);
-        backward_workspace_byte_ = std::max(back_size, back_size_w);
-        CHECK_EQ(cudnnGetConvolutionForwardWorkspaceSize(s->dnn_handle_,
-                 in_desc_,
-                 filter_desc_,
-                 conv_desc_,
-                 out_desc_,
-                 algo_,
-                 &forward_workspace_byte_), CUDNN_STATUS_SUCCESS);
       }
+
+      CHECK_EQ(cudnnGetConvolutionBackwardDataWorkspaceSize(s->dnn_handle_,
+               filter_desc_,
+               out_desc_,
+               conv_desc_,
+               in_desc_,
+               back_algo_,
+               &back_size), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnGetConvolutionBackwardFilterWorkspaceSize(s->dnn_handle_,
+               in_desc_,
+               out_desc_,
+               conv_desc_,
+               filter_desc_,
+               back_algo_w_,
+               &back_size_w), CUDNN_STATUS_SUCCESS);
+      backward_workspace_byte_ = std::max(back_size, back_size_w);
+      CHECK_EQ(cudnnGetConvolutionForwardWorkspaceSize(s->dnn_handle_,
+               in_desc_,
+               filter_desc_,
+               conv_desc_,
+               out_desc_,
+               algo_,
+               &forward_workspace_byte_), CUDNN_STATUS_SUCCESS);
+
       forward_workspace_ = forward_workspace_byte_ / sizeof(DType) + 1;
       backward_workspace_ = backward_workspace_byte_ / sizeof(DType) + 1;
       // ugly fix CUDNN algorithm selection
diff --git a/src/operator/cudnn_convolution.cc b/src/operator/cudnn_convolution.cc
index 9c7efb7ce9ff..40051b0b975a 100644
--- a/src/operator/cudnn_convolution.cc
+++ b/src/operator/cudnn_convolution.cc
@@ -5,12 +5,26 @@
  * \author Junyuan Xie
 */
 #include "./cudnn_convolution-inl.h"
+#include <sstream>
+#include <mutex>
+#include <unordered_map>
 #include <mxnet/base.h>
 #include <mxnet/ndarray.h>
 
 namespace mxnet {
 namespace op {
 #if MXNET_USE_CUDNN == 1
+namespace conv {
+struct CudnnAlgorithms
+{
+  cudnnConvolutionFwdAlgo_t fwd;
+  cudnnConvolutionBwdDataAlgo_t bwd;
+  cudnnConvolutionBwdFilterAlgo_t flt;
+};
+
+std::unordered_map<std::string,CudnnAlgorithms> g_cudnn_algo_reg;
+std::mutex g_reg_mutex;
+}  // namespace conv
 // TODO(xxx): Refactor with Init CuDNN function, remove redandent code in initalization
 void TuneCudnnConvolution(ConvolutionParam param,
                           std::vector<TShape> *in_shape,
@@ -19,9 +33,7 @@ void TuneCudnnConvolution(ConvolutionParam param,
                           cudnnDataType_t dtype,
                           cudnnConvolutionFwdAlgo_t *algo,
                           cudnnConvolutionBwdDataAlgo_t *back_algo,
-                          cudnnConvolutionBwdFilterAlgo_t *back_algo_w,
-                          size_t *forward_workspace_byte,
-                          size_t *backward_workspace_byte) {
+                          cudnnConvolutionBwdFilterAlgo_t *back_algo_w) {
   using namespace mshadow;
   // convert MB to bytes
 
@@ -32,8 +44,18 @@ void TuneCudnnConvolution(ConvolutionParam param,
   CHECK_EQ(in_shape->size(), expected);
   CHECK_EQ(out_shape->size(), 1);
   TShape &x_shape = (*in_shape)[conv::kData];
+  TShape &w_shape = (*in_shape)[conv::kWeight];
   TShape &y_shape = (*out_shape)[conv::kOut];
-
+  std::ostringstream oss;
+  oss << x_shape << ";" << y_shape << ";" << w_shape << ";" << param.workspace;
+  std::string key = oss.str();
+  std::unordered_map<std::string,conv::CudnnAlgorithms>::const_iterator iter = conv::g_cudnn_algo_reg.find(key);
+  if (iter != conv::g_cudnn_algo_reg.end()) {
+    *algo = iter->second.fwd;
+    *back_algo = iter->second.bwd;
+    *back_algo_w = iter->second.flt;
+    return;
+  }
 
   size_t workspace_byte = param.workspace << 20;
   cudnnTensorDescriptor_t in_desc;
@@ -208,7 +230,6 @@ void TuneCudnnConvolution(ConvolutionParam param,
     if (i == nalgo) {
       LOG(FATAL) << "Failed to find an convolution algorithm.";
     } else {
-      *forward_workspace_byte = fwd_algo[i].memory;
       *algo = fwd_algo[i].algo;
     }
 
@@ -229,7 +250,6 @@ void TuneCudnnConvolution(ConvolutionParam param,
     if (i == nalgo) {
       LOG(FATAL) << "Failed to find an convolution algorithm.";
     } else {
-      *backward_workspace_byte = bwd_filter_algo[i].memory;
       *back_algo_w = bwd_filter_algo[i].algo;
     }
 
@@ -250,13 +270,19 @@ void TuneCudnnConvolution(ConvolutionParam param,
     if (i == nalgo) {
       LOG(FATAL) << "Failed to find an convolution algorithm.";
     } else {
-      *backward_workspace_byte = std::max(*backward_workspace_byte, bwd_data_algo[i].memory);
       *back_algo = bwd_data_algo[i].algo;
     }
   }, ctx, {}, {var});
   Engine::Get()->WaitForVar(var);
   Engine::Get()->DeleteVariable([](RunContext s) {}, ctx, var);
 
+  conv::CudnnAlgorithms algs;
+  algs.fwd = *algo;
+  algs.bwd = *back_algo;
+  algs.flt = *back_algo_w;
+  std::lock_guard<std::mutex> guard(conv::g_reg_mutex);
+  conv::g_cudnn_algo_reg[key] = algs;
+
   CHECK_EQ(cudnnDestroyTensorDescriptor(in_desc), CUDNN_STATUS_SUCCESS);
   CHECK_EQ(cudnnDestroyTensorDescriptor(out_desc), CUDNN_STATUS_SUCCESS);
   CHECK_EQ(cudnnDestroyTensorDescriptor(bias_desc), CUDNN_STATUS_SUCCESS);

From b852065162951d95aceb150fbc33ce35f05e4006 Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Thu, 21 Jul 2016 01:03:44 -0700
Subject: [PATCH 118/126] turn off auto tune by default

---
 docs/how_to/env_var.md            |  3 +++
 src/operator/convolution-inl.h    |  6 +++---
 src/operator/cudnn_convolution.cc | 13 +++++++------
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/docs/how_to/env_var.md b/docs/how_to/env_var.md
index c63ba7a12a53..d15e11386bde 100644
--- a/docs/how_to/env_var.md
+++ b/docs/how_to/env_var.md
@@ -31,6 +31,9 @@ Usually you do not need to change these settings, but they are listed here for r
 * MXNET_KVSTORE_BIGARRAY_BOUND (default=1e6)
 	- The minimum size of "big array".
 	- When the array size is bigger than this threshold, MXNET_KVSTORE_REDUCTION_NTHREADS threads will be used for reduction.
+* MXNET_CUDNN_AUTOTUNE_DEFAULT (default=0)
+    - The default value of cudnn_tune for convolution layers.
+    - Auto tuning is turned off by default. Set to 1 to turn it on by default for benchmarking.
 
 Settings for Minimum Memory Usage
 ---------------------------------
diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h
index 2d74b2383203..57f66e2b9f9c 100644
--- a/src/operator/convolution-inl.h
+++ b/src/operator/convolution-inl.h
@@ -62,11 +62,11 @@ struct ConvolutionParam : public dmlc::Parameter<ConvolutionParam> {
     .add_enum("off", conv::kOff)
     .add_enum("limited_workspace", conv::kLimited)
     .add_enum("fastest", conv::kFastest)
-    .set_default(dmlc::GetEnv("MXNET_CUDNN_AUTOTUNE_DEFAULT", 1))
+    .set_default(dmlc::GetEnv("MXNET_CUDNN_AUTOTUNE_DEFAULT", 0))
     .describe("Whether to find convolution algo by running performance test."
               "Leads to higher startup time but may give better speed."
-              "auto tune is turned on by default but can vastly slow down models with bucketing."
-              "Set environment varialbe MXNET_CUDNN_AUTOTUNE_DEFAULT=0 to turn off by default.");
+              " auto tune is turned off by default."
+              " Set environment variable MXNET_CUDNN_AUTOTUNE_DEFAULT=1 to turn on by default.");
   }
 };
 
diff --git a/src/operator/cudnn_convolution.cc b/src/operator/cudnn_convolution.cc
index 40051b0b975a..b3d6b481b012 100644
--- a/src/operator/cudnn_convolution.cc
+++ b/src/operator/cudnn_convolution.cc
@@ -5,24 +5,24 @@
  * \author Junyuan Xie
 */
 #include "./cudnn_convolution-inl.h"
+#include <mxnet/base.h>
+#include <mxnet/ndarray.h>
+
 #include <sstream>
 #include <mutex>
 #include <unordered_map>
-#include <mxnet/base.h>
-#include <mxnet/ndarray.h>
 
 namespace mxnet {
 namespace op {
 #if MXNET_USE_CUDNN == 1
 namespace conv {
-struct CudnnAlgorithms
-{
+struct CudnnAlgorithms {
   cudnnConvolutionFwdAlgo_t fwd;
   cudnnConvolutionBwdDataAlgo_t bwd;
   cudnnConvolutionBwdFilterAlgo_t flt;
 };
 
-std::unordered_map<std::string,CudnnAlgorithms> g_cudnn_algo_reg;
+std::unordered_map<std::string, CudnnAlgorithms> g_cudnn_algo_reg;
 std::mutex g_reg_mutex;
 }  // namespace conv
 // TODO(xxx): Refactor with Init CuDNN function, remove redandent code in initalization
@@ -49,7 +49,8 @@ void TuneCudnnConvolution(ConvolutionParam param,
   std::ostringstream oss;
   oss << x_shape << ";" << y_shape << ";" << w_shape << ";" << param.workspace;
   std::string key = oss.str();
-  std::unordered_map<std::string,conv::CudnnAlgorithms>::const_iterator iter = conv::g_cudnn_algo_reg.find(key);
+  std::unordered_map<std::string, conv::CudnnAlgorithms>::const_iterator iter =
+    conv::g_cudnn_algo_reg.find(key);
   if (iter != conv::g_cudnn_algo_reg.end()) {
     *algo = iter->second.fwd;
     *back_algo = iter->second.bwd;

From 407871f4ee2562c1185aa3d34f8a784ecc50b1da Mon Sep 17 00:00:00 2001
From: Yan Li <godricly.li@gmail.com>
Date: Fri, 22 Jul 2016 13:28:58 +0800
Subject: [PATCH 119/126] Infershape_fix

---
 src/operator/convolution-inl.h |  3 ++-
 src/operator/pooling-inl.h     | 10 ++++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h
index 57f66e2b9f9c..4a3425fdbdbd 100644
--- a/src/operator/convolution-inl.h
+++ b/src/operator/convolution-inl.h
@@ -366,7 +366,8 @@ class ConvolutionProp : public OperatorProperty {
           << "incorrect stride size: " << param_.stride;
       CHECK_GT(param_.dilate.Size(), 0) \
           << "incorrect dilate size: " << param_.dilate;
-      CHECK(ksize_x <= dshape[3] && ksize_y <= dshape[2])
+      CHECK(ksize_y <= dshape[2] + 2 * param_.pad[0]
+            && ksize_x <= dshape[3] + 2 * param_.pad[1])
           << "kernel size exceed input";
       (*out_shape)[conv::kOut][1] = param_.num_filter;
       (*out_shape)[conv::kOut][2] = (dshape[2] + 2 * param_.pad[0] -
diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h
index 630e61852bee..f1fda56713e9 100644
--- a/src/operator/pooling-inl.h
+++ b/src/operator/pooling-inl.h
@@ -190,18 +190,25 @@ class PoolingProp : public OperatorProperty {
     if (dshape.ndim() ==  0) return false;
     if (param_.kernel.ndim() == 2) {
       CHECK_EQ(dshape.ndim(), 4) << "Pooling: Input data should be 4D in (batch, channel, y, x)";
+
       if (param_.global_pool) {
         oshape[2] = 1;
         oshape[3] = 1;
       } else {
+        CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]
+              && param_.kernel[1] <= dshape[3] + 2 * param_.pad[1])
+            << "kernel size exceed input";
         oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / param_.stride[0];
         oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / param_.stride[1];
       }
-      CHECK(oshape[2] > 0 && oshape[3] > 0) << "Pooling: kernel size exceed input";
       out_shape->clear();
       out_shape->push_back(oshape);
     } else if (param_.kernel.ndim() == 3) {
       CHECK_EQ(dshape.ndim(), 5) << "Pooling: Input data should be 5D in (batch, channel, d, y, x)";
+      CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]  // equal extent is valid
+            && param_.kernel[1] <= dshape[3] + 2 * param_.pad[1]
+            && param_.kernel[2] <= dshape[4] + 2 * param_.pad[2])
+          << "kernel size exceed input";
       if (param_.global_pool) {
         oshape[2] = 1;
         oshape[3] = 1;
@@ -211,7 +218,6 @@ class PoolingProp : public OperatorProperty {
         oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / param_.stride[1];
         oshape[4] = 1 + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) / param_.stride[2];
       }
-      CHECK(oshape[2] > 0 && oshape[3] > 0 && oshape[4] > 0) << "Pooling: kernel size exceed input";
       out_shape->clear();
       out_shape->push_back(oshape);
     }

From 70fa289e4e00761be4ac0b9b2b5569f918d194e6 Mon Sep 17 00:00:00 2001
From: Yan Li <godricly.li@gmail.com>
Date: Fri, 22 Jul 2016 13:42:12 +0800
Subject: [PATCH 120/126] CONTRIBUTORS.md

---
 CONTRIBUTORS.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 5dc64ed541e6..ee6fbcf057d3 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -108,4 +108,5 @@ List of Contributors
 * [Depeng Liang](https://github.com/Ldpe2G)
 * [Kiko Qiu](https://github.com/kikoqiu)
 * [Yang Bo](https://github.com/Atry)
-* [Jonas Amaro](https://github.com/jonasrla)
\ No newline at end of file
+* [Jonas Amaro](https://github.com/jonasrla)
+* [Yan Li](https://github.com/Godricly)

From fde63e89ff6e0bc0ca29afc2c68b111aa731c3c6 Mon Sep 17 00:00:00 2001
From: Michal Jamroz <michal@yerbamate.com.pl>
Date: Sat, 23 Jul 2016 02:03:42 +0200
Subject: [PATCH 121/126] Update im2rec.py (#2798)

typo dudes, typo
---
 tools/im2rec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/im2rec.py b/tools/im2rec.py
index bdb398e18a93..7df1f5a6c72f 100644
--- a/tools/im2rec.py
+++ b/tools/im2rec.py
@@ -100,7 +100,7 @@ def image_encode(item, q_out):
                 img = img[:, margin:margin + img.shape[0]]
         if args.resize:
             if img.shape[0] > img.shape[1]:
-                newsize = (args.resize, img.shape[0] * args.resize / img.shape[1]
+                newsize = (args.resize, img.shape[0] * args.resize / img.shape[1])
             else:
                 newsize = (img.shape[1] * args.resize / img.shape[0], args.resize)
             img = cv2.resize(img, newsize)

From a1b404cc20b0ba8dae356d80df30054a8b66096f Mon Sep 17 00:00:00 2001
From: cjfan <jasonvan@163.com>
Date: Sun, 24 Jul 2016 02:40:27 +0800
Subject: [PATCH 122/126] DataBatch support gpu (#2797)

* DataBatch support gpu

* DataBatch support gpu and storage data use NDArray

* DataBatch support gpu and storage data use NDArray

* DataBatch support gpu and storage data use NDArray

* fix DataBatch init error and support shuffle
---
 python/mxnet/io.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/python/mxnet/io.py b/python/mxnet/io.py
index 382db89a448a..532c6d12ebf2 100644
--- a/python/mxnet/io.py
+++ b/python/mxnet/io.py
@@ -16,6 +16,7 @@
 from .base import check_call, ctypes2docstring
 from .ndarray import NDArray
 from .ndarray import array
+from .ndarray import concatenate
 
 
 class DataBatch(object):
@@ -307,11 +308,11 @@ def _init_data(data, allow_empty, default_name):
         raise TypeError("Input must be NDArray, numpy.ndarray, " + \
                 "a list of them or dict with them as values")
     for k, v in data.items():
-        if isinstance(v, NDArray):
-            data[k] = v.asnumpy()
-    for k, v in data.items():
-        if not isinstance(v, np.ndarray):
-            raise TypeError(("Invalid type '%s' for %s, "  % (type(v), k)) + \
+        if not isinstance(v, NDArray):
+            try:
+                data[k] = array(v)
+            except Exception:  # not bare: let KeyboardInterrupt/SystemExit propagate
+                raise TypeError(("Invalid type '%s' for %s, "  % (type(v), k)) + \
                     "should be NDArray or numpy.ndarray")
 
     return list(data.items())
@@ -348,8 +349,8 @@ def __init__(self, data, label=None, batch_size=1, shuffle=False, last_batch_han
         if shuffle:
             idx = np.arange(self.data[0][1].shape[0])
             np.random.shuffle(idx)
-            self.data = [(k, v[idx]) for k, v in self.data]
-            self.label = [(k, v[idx]) for k, v in self.label]
+            self.data = [(k, array(v.asnumpy()[idx], v.context)) for k, v in self.data]
+            self.label = [(k, array(v.asnumpy()[idx], v.context)) for k, v in self.label]
 
         self.data_list = [x[1] for x in self.data] + [x[1] for x in self.label]
         self.num_source = len(self.data_list)
@@ -411,11 +412,10 @@ def _getdata(self, data_source):
         """Load data from underlying arrays, internal use only"""
         assert(self.cursor < self.num_data), "DataIter needs reset."
         if self.cursor + self.batch_size <= self.num_data:
-            return [array(x[1][self.cursor:self.cursor+self.batch_size]) for x in data_source]
+            return [x[1][self.cursor:self.cursor+self.batch_size] for x in data_source]
         else:
             pad = self.batch_size - self.num_data + self.cursor
-            return [array(np.concatenate((x[1][self.cursor:], x[1][:pad]),
-                                         axis=0)) for x in data_source]
+            return [concatenate([x[1][self.cursor:], x[1][:pad]]) for x in data_source]
 
     def getdata(self):
         return self._getdata(self.data)

From 3fb29a343f2666d197de826265a8980eff219e20 Mon Sep 17 00:00:00 2001
From: Allan Lu <allan@ream.at>
Date: Sat, 23 Jul 2016 23:07:54 -0700
Subject: [PATCH 123/126] Fix amalgamation compiling issue. (#2814)

* .gitignore
Add Eclipse project files

* Fix amalgamation compiling issue (#2602, #2808). Add minimum Android API
level requirement in README.

* Fix amalgamation compiling issue (#2602, #2808)

Keep ${OPENBLAS_ROOT}/include and ${OPENBLAS_ROOT}/lib in Makefile
---
 .gitignore                   |  5 +++++
 amalgamation/Makefile        | 12 ++++++------
 amalgamation/README.md       |  4 ++--
 amalgamation/amalgamation.py |  5 +++--
 4 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3eda10de52c5..749197668afc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -110,3 +110,8 @@ scala-package/*/*/target/
 !scala-package/*/bin
 *.bak
 */node_modules/
+
+# Eclipse project config
+.project
+.cproject
+.pydevproject
diff --git a/amalgamation/Makefile b/amalgamation/Makefile
index 1111305b5bba..c23210a1a8b5 100644
--- a/amalgamation/Makefile
+++ b/amalgamation/Makefile
@@ -1,14 +1,14 @@
 export MXNET_ROOT=`pwd`/..
 # Change this to path of openblas
-export OPENBLAS_ROOT=`pwd`/OpenBLAS
+export OPENBLAS_ROOT=${MXNET_ROOT}/../OpenBLAS
 
 # Whether use minimum build without blas and SSE, this will make the library super slow
 ifndef MIN
-	export MIN= 0
+	export MIN=0
 endif
 
 ifndef ANDROID
-        export ANDROID=0
+    export ANDROID=0
 endif
 
 
@@ -17,8 +17,8 @@ endif
 
 CFLAGS=-std=c++11 -Wno-unknown-pragmas -Wall
 ifneq ($(MIN), 1)
-	CFLAGS+= -I${OPENBLAS_ROOT}/include
-	LDFLAGS+=-L${OPENBLAS_ROOT}/lib -lopenblas
+	CFLAGS += -I${OPENBLAS_ROOT} -I${OPENBLAS_ROOT}/include
+	LDFLAGS+= -L${OPENBLAS_ROOT} -L${OPENBLAS_ROOT}/lib -lopenblas
 endif
 
 
@@ -68,4 +68,4 @@ ${MXNET_ROOT}/lib/libmxnet_predict.so:  mxnet_predict-all.o
 	ls -alh $@
 
 clean:
-	rm -f *.d *.o
+	rm -f *.d *.o *.so *.a mxnet_predict-all.cc
diff --git a/amalgamation/README.md b/amalgamation/README.md
index f96a11d7ed57..9d4e3fe9c8a3 100644
--- a/amalgamation/README.md
+++ b/amalgamation/README.md
@@ -24,7 +24,7 @@ This module is created by [Jack Deng](https://github.com/jdeng).
 
 Android
 ---------------
-Setup NDK and build your standalone toolchain. [Instructions](http://developer.android.com/ndk/guides/standalone_toolchain.html#itc) Use the Advanced Method!!! In particular set PATH, CC and CXX.
+Setup NDK and build your standalone toolchain. [Instructions](http://developer.android.com/ndk/guides/standalone_toolchain.html#itc) Use the Advanced Method!!! In particular set PATH, CC and CXX. The minimum API level required is 16.
 
 Example:
 ```
@@ -33,7 +33,7 @@ export CC=arm-linux-androideabi-gcc   # or export CC=arm-linux-androideabi-clang
 export CXX=arm-linux-androideabi-g++  # or export CXX=arm-linux-androideabi-clang++
 ```
 
-Build OpenBlas for Android: [Build OpenBlas](https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android)
+Build OpenBLAS for Android: [Build OpenBLAS](https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android). Please put the OpenBLAS source code outside the mxnet directory.
 Modify OPENBLAS_ROOT in Makefile
 Type ```make ANDROID=1```
 
diff --git a/amalgamation/amalgamation.py b/amalgamation/amalgamation.py
index 9016db58535b..be854180ceb1 100644
--- a/amalgamation/amalgamation.py
+++ b/amalgamation/amalgamation.py
@@ -30,14 +30,15 @@ def get_sources(def_file):
     sources = []
     files = []
     visited = set()
+    mxnet_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir))
     for line in open(def_file):
         files = files + line.strip().split(' ')
 
     for f in files:
         f = f.strip()
-        if not f or f.endswith('.o') or f == '\\': continue
+        if not f or f.endswith('.o:') or f == '\\': continue
         fn = os.path.relpath(f)
-        if fn.find('/usr/') < 0 and fn not in visited:
+        if os.path.abspath(f).startswith(mxnet_path + os.sep) and fn not in visited:
             sources.append(fn)
             visited.add(fn)
     return sources

From 2fa4f39271537e4fa4ac255ea8e70f1a928e3bfa Mon Sep 17 00:00:00 2001
From: sxjscience <xshiab@ust.hk>
Date: Sat, 23 Jul 2016 23:35:56 +0800
Subject: [PATCH 124/126] Accelerate batch_dot using gemmbatched

Update MShadow

Fix wrong stream
---
 mshadow                                |  2 +-
 src/operator/matrix_op-inl.h           | 29 ++++++++--
 tests/python/unittest/test_operator.py | 75 +++++++-------------------
 3 files changed, 44 insertions(+), 62 deletions(-)

diff --git a/mshadow b/mshadow
index 867be36a5ada..44d61f8ef9d8 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit 867be36a5adabba2a2565fdddb1f88a2b68f9005
+Subproject commit 44d61f8ef9d86e85e7bc62b2a1d4dc40554672f1
diff --git a/src/operator/matrix_op-inl.h b/src/operator/matrix_op-inl.h
index b21c6ab88e47..8acbdac363b0 100644
--- a/src/operator/matrix_op-inl.h
+++ b/src/operator/matrix_op-inl.h
@@ -300,8 +300,15 @@ void BatchDotForward_(const TBlob& lhs,
 
   if (lhs.shape_.ndim() == 3 && rhs.shape_.ndim() == 3) {
     mshadow::Tensor<xpu, 3, real_t> out = ret->get<xpu, 3, real_t>(s);
-    ASSIGN_DISPATCH(out, req, (batch_dot<false, false>(lhs.get<xpu, 3, real_t>(s),
-                                                       rhs.get<xpu, 3, real_t>(s))));
+    mshadow::Tensor<xpu, 3, real_t> mlhs = lhs.get<xpu, 3, real_t>(s);
+    mshadow::Tensor<xpu, 3, real_t> mrhs = rhs.get<xpu, 3, real_t>(s);
+    mshadow::Tensor<xpu, 1, real_t*> workspace =
+      env.resource[0].get_space_typed<xpu, 1, real_t*>(mshadow::Shape1(3 * out.size(0)), s);
+    if (kNullOp != req) {
+      mshadow::BatchGEMM<false, false>(out, mlhs, mrhs, 1.0f,
+                                       (kAddTo == req) ? 1.0f : 0.0f,
+                                       workspace);
+    }
   } else {
     LOG(FATAL) << "not reached";
   }
@@ -328,8 +335,21 @@ void BatchDotBackward_(const OutputGrad& out_grad,
     mshadow::Tensor<xpu, 3, real_t> mrhs_data = rhs.data.get<xpu, 3, real_t>(s);
     mshadow::Tensor<xpu, 3, real_t> mlhs_grad = lhs_grad->get<xpu, 3, real_t>(s);
     mshadow::Tensor<xpu, 3, real_t> mrhs_grad = rhs_grad->get<xpu, 3, real_t>(s);
-    ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, (batch_dot<true, false>(mlhs_data, mout_grad)));
-    ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, (batch_dot<false, true>(mout_grad, mrhs_data)));
+    mshadow::Tensor<xpu, 2, real_t*> workspace =
+      env.resource[0].get_space_typed<xpu, 2, real_t*>(
+        mshadow::Shape2(2, 3 * mout_grad.size(0)), s);
+    mshadow::Tensor<xpu, 1, real_t*> rhs_workspace = workspace[0];
+    mshadow::Tensor<xpu, 1, real_t*> lhs_workspace = workspace[1];
+    if (kNullOp != req_rhs_grad) {
+      mshadow::BatchGEMM<true, false>(mrhs_grad, mlhs_data, mout_grad, 1.0f,
+                                      (kAddTo == req_rhs_grad) ? 1.0f : 0.0f,
+                                      rhs_workspace);
+    }
+    if (kNullOp != req_lhs_grad) {
+      mshadow::BatchGEMM<false, true>(mlhs_grad, mout_grad, mrhs_data, 1.0f,
+                                      (kAddTo == req_lhs_grad) ? 1.0f : 0.0f,
+                                      lhs_workspace);
+    }
   } else {
     LOG(FATAL) << "not reached";
   }
@@ -672,6 +692,7 @@ MXNET_REGISTER_SIMPLE_OP(batch_dot, XPU)
 .set_function(XPU::kDevMask, BatchDotForward_<XPU>, kNoInplace, kRegisterSymbolic)
 .set_shape_function(BatchDotShape)
 .set_gradient(XPU::kDevMask, BatchDotBackward_<XPU>, kNoInplace)
+.set_resource_request(ResourceRequest::kTempSpace)
 .describe("Calculate batched dot product of two matrices."
           " (batch, M, K) batch_dot (batch, K, N) --> (batch, M, N)");
 }  // namespace op
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index d0a9ecffcdd7..a9fc189cc727 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -1190,10 +1190,10 @@ def test_dot(ctx=mx.cpu()):
                 c = mx.sym.dot(a, b)
                 exe = c.simple_bind(ctx=ctx, a=a_npy.shape, b=b_npy.shape)
                 outputs = exe.forward(is_train=True, a=a_npy, b=b_npy)
-                assert reldiff(outputs[0].asnumpy(), c_npy) < 1E-5
+                assert reldiff(outputs[0].asnumpy(), c_npy) < 1E-3
                 exe.backward(out_grads=[mx.nd.array(ograd_npy, ctx=exe._ctx)])
-                assert reldiff(exe.grad_dict['a'].asnumpy(), agrad_npy) < 1E-5
-                assert reldiff(exe.grad_dict['b'].asnumpy(), bgrad_npy) < 1E-5
+                assert reldiff(exe.grad_dict['a'].asnumpy(), agrad_npy) < 1E-3
+                assert reldiff(exe.grad_dict['b'].asnumpy(), bgrad_npy) < 1E-3
 
 
 def test_batch_dot(ctx=mx.cpu()):
@@ -1214,13 +1214,23 @@ def test_batch_dot(ctx=mx.cpu()):
                     a = mx.sym.Variable('a')
                     b = mx.sym.Variable('b')
                     c = mx.sym.batch_dot(a, b)
-                    exe = c.simple_bind(ctx=ctx, a=a_npy.shape, b=b_npy.shape)
+                    exe = c.simple_bind(ctx=ctx, a=a_npy.shape, b=b_npy.shape, grad_req='write')
+                    exe_add = c.simple_bind(ctx=ctx, a=a_npy.shape, b=b_npy.shape, grad_req='add')
+                    a_init_grad_npy = np.random.normal(size=(batch_size, m, k))
+                    b_init_grad_npy = np.random.normal(size=(batch_size, k, n))
+                    exe_add.grad_dict['a'][:] = a_init_grad_npy
+                    exe_add.grad_dict['b'][:] = b_init_grad_npy
                     outputs = exe.forward(is_train=True, a=a_npy, b=b_npy)
-                    assert reldiff(outputs[0].asnumpy(), c_npy) < 1E-5
+                    assert reldiff(outputs[0].asnumpy(), c_npy) < 1E-3
                     exe.backward(out_grads=[mx.nd.array(ograd_npy, ctx=exe._ctx)])
-                    assert reldiff(exe.grad_dict['a'].asnumpy(), agrad_npy) < 1E-5
-                    assert reldiff(exe.grad_dict['b'].asnumpy(), bgrad_npy) < 1E-5
-
+                    assert reldiff(exe.grad_dict['a'].asnumpy(), agrad_npy) < 1E-3
+                    assert reldiff(exe.grad_dict['b'].asnumpy(), bgrad_npy) < 1E-3
+                    exe_add.forward(is_train=True, a=a_npy, b=b_npy)
+                    exe_add.backward(out_grads=[mx.nd.array(ograd_npy, ctx=exe._ctx)])
+                    assert reldiff(exe_add.grad_dict['a'].asnumpy(),
+                                   agrad_npy + a_init_grad_npy) < 1E-3
+                    assert reldiff(exe_add.grad_dict['b'].asnumpy(),
+                                   bgrad_npy + b_init_grad_npy) < 1E-3
 
 def get_correlation(data1,data2,kernel_size,max_displacement,stride1,stride2,pad_size,is_multiply):
     
@@ -1368,55 +1378,6 @@ def test_correlation():
     unittest_correlation((5,1,4,4), kernel_size = 3,max_displacement = 1,stride1 = 2,stride2 = 1,pad_size = 2,is_multiply = False)
     unittest_correlation((5,1,6,4), kernel_size = 3,max_displacement = 1,stride1 = 2,stride2 = 1,pad_size = 2,is_multiply = False)
     unittest_correlation((5,1,11,11), kernel_size = 5,max_displacement = 1,stride1 = 1,stride2 = 1,pad_size = 2,is_multiply = False)
-    
-def test_dot(ctx=mx.cpu()):
-    for m in range(1, 5):
-        for k in range(1, 5):
-            for n in range(1, 5):
-                a_npy = np.random.normal(0, 1, (m, k))
-                b_npy = np.random.normal(0, 1, (k, n))
-                c_npy = np.empty((m, n))
-                ograd_npy = np.random.normal(0, 1, (m, n))
-                agrad_npy = np.empty((m, k))
-                bgrad_npy = np.empty((k, n))
-                c_npy[:, :] = np.dot(a_npy[:, :], b_npy[:, :])
-                bgrad_npy[:, :] = np.dot(a_npy[:, :].T, ograd_npy[:, :])
-                agrad_npy[:, :] = np.dot(ograd_npy[:, :], b_npy[:, :].T)
-                a = mx.sym.Variable('a')
-                b = mx.sym.Variable('b')
-                c = mx.sym.dot(a, b)
-                exe = c.simple_bind(ctx=ctx, a=a_npy.shape, b=b_npy.shape)
-                outputs = exe.forward(is_train=True, a=a_npy, b=b_npy)
-                assert reldiff(outputs[0].asnumpy(), c_npy) < 1E-3
-                exe.backward(out_grads=[mx.nd.array(ograd_npy, ctx=exe._ctx)])
-                assert reldiff(exe.grad_dict['a'].asnumpy(), agrad_npy) < 1E-3
-                assert reldiff(exe.grad_dict['b'].asnumpy(), bgrad_npy) < 1E-3
-
-
-def test_batch_dot(ctx=mx.cpu()):
-    for batch_size in range(1, 5):
-        for m in range(1, 5):
-            for k in range(1, 5):
-                for n in range(1, 5):
-                    a_npy = np.random.normal(0, 1, (batch_size, m, k))
-                    b_npy = np.random.normal(0, 1, (batch_size, k, n))
-                    c_npy = np.empty((batch_size, m, n))
-                    ograd_npy = np.random.normal(0, 1, (batch_size, m, n))
-                    agrad_npy = np.empty((batch_size, m, k))
-                    bgrad_npy = np.empty((batch_size, k, n))
-                    for i in range(batch_size):
-                        c_npy[i, :, :] = np.dot(a_npy[i, :, :], b_npy[i, :, :])
-                        bgrad_npy[i, :, :] = np.dot(a_npy[i, :, :].T, ograd_npy[i, :, :])
-                        agrad_npy[i, :, :] = np.dot(ograd_npy[i, :, :], b_npy[i, :, :].T)
-                    a = mx.sym.Variable('a')
-                    b = mx.sym.Variable('b')
-                    c = mx.sym.batch_dot(a, b)
-                    exe = c.simple_bind(ctx=ctx, a=a_npy.shape, b=b_npy.shape)
-                    outputs = exe.forward(is_train=True, a=a_npy, b=b_npy)
-                    assert reldiff(outputs[0].asnumpy(), c_npy) < 1E-3
-                    exe.backward(out_grads=[mx.nd.array(ograd_npy, ctx=exe._ctx)])
-                    assert reldiff(exe.grad_dict['a'].asnumpy(), agrad_npy) < 1E-3
-                    assert reldiff(exe.grad_dict['b'].asnumpy(), bgrad_npy) < 1E-3
 
 
 def test_support_vector_machine_l1_svm():

From 0460049d160ec3643eda1e4624774c30badad5ec Mon Sep 17 00:00:00 2001
From: Sebastian Bodenstein <sebastianbod@gmail.com>
Date: Mon, 25 Jul 2016 01:58:39 +0200
Subject: [PATCH 125/126] [Op] cuDNN RNN Symbol (#2795)

* - first commit

* - removed unnecessary commented out code
- fixed error in output shape inference

* - some renaming
- added cudnn destructors

* - added dropout

* - major refactor
- completed forward evaluation

* - added parameter size test
- fixed bug where cudnnGetRNNParamsSize needs to be called after cudnnSetRNNDescriptor

* - checks for contiguous input tensors
- more consistent param names
- removed 'batch_first' option for now. Might add it later again

* - fixed input names

* - added backward method

* - small fix for in/out names

* - fixed bug: parameters can't have underscore

* - fixed off-by-two error in weight shape inference for bidirectional net
- moved calculated param to cudnn_rnn-inl.h

* - added option to control num outputs

* - removed lint

* - correct handling of backward dependencies

* - fix lint

* - first commit

* - removed unnecessary commented out code
- fixed error in output shape inference

* - some renaming
- added cudnn destructors

* - added dropout

* - major refactor
- completed forward evaluation

* - added parameter size test
- fixed bug where cudnnGetRNNParamsSize needs to be called after cudnnSetRNNDescriptor

* - checks for contiguous input tensors
- more consistent param names
- removed 'batch_first' option for now. Might add it later again

* - fixed input names

* - added backward method

* - small fix for in/out names

* - fixed bug: parameters can't have underscore

* - fixed off-by-two error in weight shape inference for bidirectional net
- moved calculated param to cudnn_rnn-inl.h

* - added option to control num outputs

* - removed lint

* - correct handling of backward dependencies

* - fix lint

* - fix type narrowing bug

* - fixed incorrect dropout parameter
- added dropout states
- fixed incorrect handling of variable outputs

* - fix incorrect cell state forward handling

* - fixed lint by replacing unsigned long long with uint64_t
---
 src/operator/cudnn_rnn-inl.h | 504 +++++++++++++++++++++++++++++++++++
 src/operator/rnn-inl.h       | 315 ++++++++++++++++++++++
 src/operator/rnn.cc          |  42 +++
 src/operator/rnn.cu          |  30 +++
 4 files changed, 891 insertions(+)
 create mode 100644 src/operator/cudnn_rnn-inl.h
 create mode 100644 src/operator/rnn-inl.h
 create mode 100644 src/operator/rnn.cc
 create mode 100644 src/operator/rnn.cu

diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
new file mode 100644
index 000000000000..5707846a781f
--- /dev/null
+++ b/src/operator/cudnn_rnn-inl.h
@@ -0,0 +1,504 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file cudnn_rnn-inl.h
+ * \brief
+ * \author Sebastian Bodenstein
+*/
+#ifndef MXNET_OPERATOR_CUDNN_RNN_INL_H_
+#define MXNET_OPERATOR_CUDNN_RNN_INL_H_
+
+#include <vector>
+#include <map>
+#include <string>
+#include <utility>
+#include <cstdint>
+#include "./rnn-inl.h"
+
+namespace mxnet {
+namespace op {
+#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5
+template<typename DType>
+class CuDNNRNNOp : public Operator {
+ public:
+  /*!
+   * \brief Translate RNNParam into cuDNN enums; descriptors are built lazily by Init().
+   */
+  explicit CuDNNRNNOp(RNNParam param) {
+    this->param_ = param;
+    init_cudnn_ = false;
+    dtype_ = mshadow::DataType<DType>::kCudnnFlag;
+    // Defaults
+    input_mode_ = CUDNN_LINEAR_INPUT;  // Don't support this yet
+    // RNN Mode
+    switch (param_.mode) {
+      case rnn_enum::kRnnRelu:
+        mode_ = CUDNN_RNN_RELU;
+        break;
+      case rnn_enum::kRnnTanh:
+        mode_ = CUDNN_RNN_TANH;
+        break;
+      case rnn_enum::kLstm:
+        mode_ = CUDNN_LSTM;
+        break;
+      case rnn_enum::kGru:
+        mode_ = CUDNN_GRU;
+        break;
+      default:
+        LOG(FATAL) << "Not implemented";
+    }
+    // RNN Direction
+    direction_ = param_.bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL;
+    // Only LSTM carries a cell state, which adds one input and one optional output
+    param_.lstm_q_ = (param_.mode == rnn_enum::kLstm);
+  }
+
+  ~CuDNNRNNOp() {  // releases everything Init() created
+    if (init_cudnn_) {
+      for (size_t i = 0; i < x_desc_vec_.size(); ++i) {
+        CHECK_EQ(cudnnDestroyTensorDescriptor(x_desc_vec_[i]), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnDestroyTensorDescriptor(y_desc_vec_[i]), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnDestroyTensorDescriptor(dx_desc_vec_[i]), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnDestroyTensorDescriptor(dy_desc_vec_[i]), CUDNN_STATUS_SUCCESS);
+      }
+      CHECK_EQ(cudnnDestroyTensorDescriptor(hx_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnDestroyTensorDescriptor(cx_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnDestroyTensorDescriptor(hy_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnDestroyTensorDescriptor(dhx_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnDestroyTensorDescriptor(dcx_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnDestroyTensorDescriptor(dhy_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnDestroyTensorDescriptor(dcy_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnDestroyFilterDescriptor(w_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnDestroyFilterDescriptor(dw_desc_), CUDNN_STATUS_SUCCESS);  // was leaked
+      CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnDestroyDropoutDescriptor(dropout_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudaFree(dropout_states_), cudaSuccess);  // CUDA runtime status, not cuDNN
+    }
+  }
+
+  /*!
+   * \brief cuDNN forward pass; ctx.is_train selects the training or inference routine.
+   *  State outputs (and the LSTM cell state output) exist only if state_outputs is set.
+   */
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    size_t in_expected = param_.lstm_q_ ? 4 : 3;
+    size_t out_expected = param_.state_outputs ? (param_.lstm_q_ ? 3 : 2) : 1;
+    CHECK_EQ(in_data.size(), in_expected);
+    CHECK_EQ(out_data.size(), out_expected);
+    Stream<gpu> *s = ctx.get_stream<gpu>();
+    // get input + output tensors
+    Tensor<gpu, 3, DType> x = in_data[rnn_enum::kData].get<gpu, 3, DType>(s);
+    Tensor<gpu, 1, DType> w = in_data[rnn_enum::kParams].get<gpu, 1, DType>(s);
+    Tensor<gpu, 3, DType> hx = in_data[rnn_enum::kState].get<gpu, 3, DType>(s);
+    Tensor<gpu, 3, DType> y = out_data[rnn_enum::kOut].get<gpu, 3, DType>(s);
+
+    void * hy_ptr = NULL;
+    if (param_.state_outputs)
+      hy_ptr = out_data[rnn_enum::kStateOut].get<gpu, 3, DType>(s).dptr_;
+
+    DType * cx_ptr = NULL;
+    DType * cy_ptr = NULL;
+    if (param_.mode == rnn_enum::kLstm)
+      cx_ptr = (in_data[rnn_enum::kStateCell].get<gpu, 3, DType>(s)).dptr_;
+    // out_data has no cell state entry unless state_outputs is set, so guard the lookup
+    if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs)
+      cy_ptr = (out_data[rnn_enum::kStateCellOut].get<gpu, 3, DType>(s)).dptr_;
+
+    CHECK_EQ(x.CheckContiguous(), true);
+    CHECK_EQ(w.CheckContiguous(), true);
+    CHECK_EQ(hx.CheckContiguous(), true);
+    CHECK_EQ(y.CheckContiguous(), true);
+
+    if (!init_cudnn_) Init(s, in_data, out_data);
+    // Temp space in DType elements: workspace first, then the reserve space when training.
+    // NOTE(review): cuDNN reads the reserve space again in Backward - confirm it survives.
+    int temp_size = workspace_size_ + (ctx.is_train ? reserve_space_size_ : 0);
+    Tensor<gpu, 1, DType> temp_space =
+      ctx.requested[rnn_enum::kTempSpace].get_space_typed<gpu, 1, DType>(
+                              mshadow::Shape1(temp_size), s);
+    if (ctx.is_train) {
+      CHECK_EQ(cudnnRNNForwardTraining(s->dnn_handle_,
+                                      rnn_desc_,
+                                      param_.seq_length_,
+                                      x_desc_vec_.data(),
+                                      x.dptr_,
+                                      hx_desc_,
+                                      hx.dptr_,
+                                      cx_desc_,
+                                      cx_ptr,
+                                      w_desc_,
+                                      w.dptr_,
+                                      y_desc_vec_.data(),
+                                      y.dptr_,
+                                      hy_desc_,
+                                      hy_ptr,
+                                      cy_desc_,
+                                      cy_ptr,
+                                      temp_space.dptr_,
+                                      workspace_byte_,
+                                      temp_space.dptr_ + workspace_size_,
+                                      reserve_space_byte_), CUDNN_STATUS_SUCCESS);
+    } else {
+      // inference mode
+      CHECK_EQ(cudnnRNNForwardInference(s->dnn_handle_,
+                                      rnn_desc_,
+                                      param_.seq_length_,
+                                      x_desc_vec_.data(),
+                                      x.dptr_,
+                                      hx_desc_,
+                                      hx.dptr_,
+                                      cx_desc_,
+                                      cx_ptr,
+                                      w_desc_,
+                                      w.dptr_,
+                                      y_desc_vec_.data(),
+                                      y.dptr_,
+                                      hy_desc_,
+                                      hy_ptr,
+                                      cy_desc_,
+                                      cy_ptr,
+                                      temp_space.dptr_,
+                                      workspace_byte_), CUDNN_STATUS_SUCCESS);
+    }
+  }
+
+  /*!
+   * \brief cuDNN backward pass: data/state gradients first, then the weight gradient.
+   *  Gradients of the state outputs are only consumed when state_outputs is set.
+   */
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    size_t in_expected = param_.lstm_q_ ? 4 : 3;
+    size_t out_expected = param_.state_outputs ? (param_.lstm_q_ ? 3 : 2) : 1;
+
+    CHECK_EQ(in_data.size(), in_expected);
+    CHECK_EQ(out_data.size(), out_expected);
+    CHECK_EQ(in_grad.size(), in_expected);
+    CHECK_EQ(out_grad.size(), out_expected);
+    Stream<gpu> *s = ctx.get_stream<gpu>();
+    // get input + output tensors
+    Tensor<gpu, 3, DType> x = in_data[rnn_enum::kData].get<gpu, 3, DType>(s);
+    Tensor<gpu, 3, DType> dx = in_grad[rnn_enum::kData].get<gpu, 3, DType>(s);
+    Tensor<gpu, 1, DType> w = in_data[rnn_enum::kParams].get<gpu, 1, DType>(s);
+    Tensor<gpu, 1, DType> dw = in_grad[rnn_enum::kParams].get<gpu, 1, DType>(s);
+    Tensor<gpu, 3, DType> hx = in_data[rnn_enum::kState].get<gpu, 3, DType>(s);
+    Tensor<gpu, 3, DType> dhx = in_grad[rnn_enum::kState].get<gpu, 3, DType>(s);
+    Tensor<gpu, 3, DType> y = out_data[rnn_enum::kOut].get<gpu, 3, DType>(s);
+    Tensor<gpu, 3, DType> dy = out_grad[rnn_enum::kOut].get<gpu, 3, DType>(s);
+
+    // only need kStateOut grad output_states is true
+    void * dhy_ptr = NULL;
+    if (param_.state_outputs)
+      dhy_ptr = out_grad[rnn_enum::kStateOut].get<gpu, 3, DType>(s).dptr_;
+
+    // Deal with lstm
+    void * dcx_ptr = NULL;
+    void * dcy_ptr = NULL;
+    void * cx_ptr = NULL;
+
+    if (param_.mode == rnn_enum::kLstm) {
+      cx_ptr = (in_data[rnn_enum::kStateCell].get<gpu, 3, DType>(s)).dptr_;
+      dcx_ptr = (in_grad[rnn_enum::kStateCell].get<gpu, 3, DType>(s)).dptr_;
+    }
+    if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs)
+      dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get<gpu, 3, DType>(s)).dptr_;
+
+    CHECK_EQ(x.CheckContiguous(), true);
+    CHECK_EQ(w.CheckContiguous(), true);
+    CHECK_EQ(hx.CheckContiguous(), true);
+    CHECK_EQ(y.CheckContiguous(), true);
+
+    if (!init_cudnn_) Init(s, in_data, out_data);
+    // Both cuDNN calls below always receive the reserve space, so always allocate room for it
+    int temp_size = workspace_size_ + reserve_space_size_;
+    Tensor<gpu, 1, DType> temp_space =
+      ctx.requested[rnn_enum::kTempSpace].get_space_typed<gpu, 1, DType>(
+                              mshadow::Shape1(temp_size), s);
+    CHECK_EQ(cudnnRNNBackwardData(s->dnn_handle_,
+                                rnn_desc_,
+                                param_.seq_length_,
+                                y_desc_vec_.data(),
+                                y.dptr_,
+                                dy_desc_vec_.data(),
+                                dy.dptr_,
+                                dhy_desc_,
+                                dhy_ptr,
+                                dcy_desc_,
+                                dcy_ptr,
+                                w_desc_,
+                                w.dptr_,
+                                hx_desc_,
+                                hx.dptr_,
+                                cx_desc_,
+                                cx_ptr,
+                                dx_desc_vec_.data(),
+                                dx.dptr_,
+                                dhx_desc_,
+                                dhx.dptr_,
+                                dcx_desc_,
+                                dcx_ptr,
+                                temp_space.dptr_,
+                                workspace_byte_,
+                                temp_space.dptr_ + workspace_size_,
+                                reserve_space_byte_), CUDNN_STATUS_SUCCESS);
+    // cudnnRNNBackwardWeights adds into dw, so clear it unless accumulation was requested
+    if (req[rnn_enum::kParams] != kAddTo)
+      dw = mshadow::expr::ScalarExp<DType>(0.0f);
+    CHECK_EQ(cudnnRNNBackwardWeights(s->dnn_handle_,
+                                    rnn_desc_,
+                                    param_.seq_length_,
+                                    x_desc_vec_.data(),
+                                    x.dptr_,
+                                    hx_desc_,
+                                    hx.dptr_,
+                                    y_desc_vec_.data(),
+                                    y.dptr_,
+                                    temp_space.dptr_,
+                                    workspace_byte_,
+                                    dw_desc_,
+                                    dw.dptr_,
+                                    temp_space.dptr_ + workspace_size_,
+                                    reserve_space_byte_), CUDNN_STATUS_SUCCESS);
+  }
+
+ private:
+  /*!
+   * \brief One-time creation of every cuDNN descriptor, sized from the first input batch.
+   */
+  inline void Init(mshadow::Stream<gpu> *s,
+                   const std::vector<TBlob> &in_data,
+                   const std::vector<TBlob> &out_data) {
+    using namespace mshadow;
+    #if CUDNN_MAJOR == 5
+    format_ = CUDNN_TENSOR_NCHW;
+    #endif
+    size_t in_expected = param_.lstm_q_ ? 4 : 3;
+    size_t out_expected = param_.state_outputs ? (param_.lstm_q_ ? 3 : 2) : 1;
+
+    CHECK_EQ(in_data.size(), in_expected);
+    CHECK_EQ(out_data.size(), out_expected);
+    if (!init_cudnn_) {
+      init_cudnn_ = true;
+      // get input + output tensors
+      Tensor<gpu, 3, DType> x = in_data[rnn_enum::kData].get<gpu, 3, DType>(s);
+      Tensor<gpu, 1, DType> w = in_data[rnn_enum::kParams].get<gpu, 1, DType>(s);
+      param_.seq_length_ = x.shape_[0];
+      param_.batch_size_ = x.shape_[1];
+      param_.input_size_ = x.shape_[2];
+
+      // Tensor Descriptors
+      std::vector<cudnnTensorDescriptor_t> x_vec(param_.seq_length_);
+      std::vector<cudnnTensorDescriptor_t> y_vec(param_.seq_length_);
+      std::vector<cudnnTensorDescriptor_t> dx_vec(param_.seq_length_);
+      std::vector<cudnnTensorDescriptor_t> dy_vec(param_.seq_length_);
+      int dimA[3];
+      int strideA[3];
+      for (int i = 0; i < param_.seq_length_; i++) {
+        CHECK_EQ(cudnnCreateTensorDescriptor(&x_vec[i]), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnCreateTensorDescriptor(&y_vec[i]), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnCreateTensorDescriptor(&dx_vec[i]), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnCreateTensorDescriptor(&dy_vec[i]), CUDNN_STATUS_SUCCESS);
+
+        dimA[0] = param_.batch_size_;
+        dimA[1] = param_.input_size_;
+        dimA[2] = 1;
+        strideA[0] = dimA[2] * dimA[1];
+        strideA[1] = dimA[2];
+        strideA[2] = 1;
+
+        CHECK_EQ(cudnnSetTensorNdDescriptor(x_vec[i],
+                                  dtype_,
+                                  3,
+                                  dimA,
+                                  strideA), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnSetTensorNdDescriptor(dx_vec[i],
+                                  dtype_,
+                                  3,
+                                  dimA,
+                                  strideA), CUDNN_STATUS_SUCCESS);
+        dimA[0] = param_.batch_size_;
+        dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size;
+        dimA[2] = 1;
+        strideA[0] = dimA[2] * dimA[1];
+        strideA[1] = dimA[2];
+        strideA[2] = 1;
+
+        CHECK_EQ(cudnnSetTensorNdDescriptor(y_vec[i],
+                                  dtype_,
+                                  3,
+                                  dimA,
+                                  strideA), CUDNN_STATUS_SUCCESS);
+        CHECK_EQ(cudnnSetTensorNdDescriptor(dy_vec[i],
+                                  dtype_,
+                                  3,
+                                  dimA,
+                                  strideA), CUDNN_STATUS_SUCCESS);
+      }
+      x_desc_vec_ = x_vec;
+      y_desc_vec_ = y_vec;
+      dx_desc_vec_ = dx_vec;
+      dy_desc_vec_ = dy_vec;
+
+      // set the state tensors
+      dimA[0] = param_.num_layers * (param_.bidirectional ? 2 : 1);
+      dimA[1] = param_.batch_size_;
+      dimA[2] = param_.state_size;
+      strideA[0] = dimA[2] * dimA[1];
+      strideA[1] = dimA[2];
+      strideA[2] = 1;
+
+      CHECK_EQ(cudnnCreateTensorDescriptor(&hx_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnCreateTensorDescriptor(&cx_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnCreateTensorDescriptor(&hy_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnCreateTensorDescriptor(&cy_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnCreateTensorDescriptor(&dhx_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnCreateTensorDescriptor(&dcx_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnCreateTensorDescriptor(&dhy_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnCreateTensorDescriptor(&dcy_desc_), CUDNN_STATUS_SUCCESS);
+
+      CHECK_EQ(cudnnSetTensorNdDescriptor(hx_desc_,
+                                          dtype_,
+                                          3,
+                                          dimA,
+                                          strideA), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnSetTensorNdDescriptor(cx_desc_,
+                                          dtype_,
+                                          3,
+                                          dimA,
+                                          strideA), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnSetTensorNdDescriptor(hy_desc_,
+                                          dtype_,
+                                          3,
+                                          dimA,
+                                          strideA), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnSetTensorNdDescriptor(cy_desc_,
+                                          dtype_,
+                                          3,
+                                          dimA,
+                                          strideA), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnSetTensorNdDescriptor(dhx_desc_,
+                                          dtype_,
+                                          3,
+                                          dimA,
+                                          strideA), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnSetTensorNdDescriptor(dcx_desc_,
+                                          dtype_,
+                                          3,
+                                          dimA,
+                                          strideA), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnSetTensorNdDescriptor(dhy_desc_,
+                                          dtype_,
+                                          3,
+                                          dimA,
+                                          strideA), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnSetTensorNdDescriptor(dcy_desc_,
+                                          dtype_,
+                                          3,
+                                          dimA,
+                                          strideA), CUDNN_STATUS_SUCCESS);
+
+      // Create Dropout descriptors
+      CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_,
+                                        &dropout_byte_), CUDNN_STATUS_SUCCESS);
+      dropout_size_ = dropout_byte_ / sizeof(DType);
+      CHECK_EQ(cudaMalloc(&dropout_states_, dropout_byte_), cudaSuccess);
+      CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_,
+                                        s->dnn_handle_,
+                                        param_.p,  // fraction dropped, not a keep probability
+                                        dropout_states_,
+                                        dropout_byte_,
+                                        seed_), CUDNN_STATUS_SUCCESS);
+      // RNN descriptors
+      CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_,
+                                    param_.state_size,
+                                    param_.num_layers,
+                                    dropout_desc_,
+                                    input_mode_,
+                                    direction_,
+                                    mode_,
+                                    dtype_), CUDNN_STATUS_SUCCESS);
+      // Get temp space sizes
+      CHECK_EQ(cudnnGetRNNWorkspaceSize(s->dnn_handle_,
+                                        rnn_desc_,
+                                        param_.seq_length_,
+                                        x_desc_vec_.data(),
+                                        &workspace_byte_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnGetRNNTrainingReserveSize(s->dnn_handle_,
+                                        rnn_desc_,
+                                        param_.seq_length_,
+                                        x_desc_vec_.data(),
+                                        &reserve_space_byte_), CUDNN_STATUS_SUCCESS);
+      // round up to whole DType elements so the typed buffers cover every requested byte
+      workspace_size_ = (workspace_byte_ + sizeof(DType) - 1) / sizeof(DType);
+      reserve_space_size_ = (reserve_space_byte_ + sizeof(DType) - 1) / sizeof(DType);
+
+      // check that number of params are correct
+      size_t cudnn_param_size;
+      CHECK_EQ(cudnnGetRNNParamsSize(s->dnn_handle_,
+                                    rnn_desc_,
+                                    x_desc_vec_[0],
+                                    &cudnn_param_size,
+                                    dtype_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(w.shape_[0] * sizeof(DType), cudnn_param_size);
+
+      // Set param descriptors
+      CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnCreateFilterDescriptor(&dw_desc_), CUDNN_STATUS_SUCCESS);
+      int dim_w[3] = {1, 1, 1};
+      dim_w[0] = w.shape_[0];
+      CHECK_EQ(cudnnSetFilterNdDescriptor(w_desc_,
+                                          dtype_,
+                                          format_,
+                                          3,
+                                          dim_w), CUDNN_STATUS_SUCCESS);
+      CHECK_EQ(cudnnSetFilterNdDescriptor(dw_desc_,
+                                          dtype_,
+                                          format_,
+                                          3,
+                                          dim_w), CUDNN_STATUS_SUCCESS);
+    }
+  }
+
+  cudnnDataType_t dtype_;
+  bool init_cudnn_;  // true once Init() has started creating the descriptors below
+  cudnnRNNDescriptor_t rnn_desc_;
+  cudnnRNNMode_t mode_;
+  cudnnDirectionMode_t direction_;
+  cudnnRNNInputMode_t input_mode_;
+  cudnnDropoutDescriptor_t dropout_desc_;
+  void *dropout_states_;  // device buffer of dropout_byte_ bytes obtained via cudaMalloc
+  uint64_t seed_ = 1337ull;  // fixed seed passed to cudnnSetDropoutDescriptor
+  size_t workspace_byte_, reserve_space_byte_, dropout_byte_;  // byte sizes reported by cuDNN
+  int workspace_size_, reserve_space_size_, dropout_size_;  // those sizes as DType element counts
+
+  std::vector<cudnnTensorDescriptor_t> x_desc_vec_, y_desc_vec_, dx_desc_vec_, dy_desc_vec_;
+  cudnnTensorDescriptor_t hx_desc_, cx_desc_;
+  cudnnTensorDescriptor_t hy_desc_, cy_desc_;
+  cudnnTensorDescriptor_t dhx_desc_, dcx_desc_;
+  cudnnTensorDescriptor_t dhy_desc_, dcy_desc_;
+
+  cudnnFilterDescriptor_t w_desc_, dw_desc_;  // describe the flat parameter vector and its gradient
+
+  #if CUDNN_MAJOR == 5
+  cudnnTensorFormat_t format_;
+  #endif
+  RNNParam param_;
+};
+#endif  // __CUDACC__ && CUDNN
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_CUDNN_RNN_INL_H_
diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h
new file mode 100644
index 000000000000..a70138adb7ce
--- /dev/null
+++ b/src/operator/rnn-inl.h
@@ -0,0 +1,315 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file rnn-inl.h
+ * \brief
+ * \author Sebastian Bodenstein
+*/
+#ifndef MXNET_OPERATOR_RNN_INL_H_
+#define MXNET_OPERATOR_RNN_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "./operator_common.h"
+
+namespace mxnet {
+namespace op {
+
+namespace rnn_enum {
+  enum RNNOpInputs {kData, kParams, kState, kStateCell};  // indices into in_data / in_grad
+  enum RNNOpOutputs {kOut, kStateOut, kStateCellOut};  // indices into out_data / out_grad
+  enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru};  // values of RNNParam::mode
+  enum RNNOpResource {kTempSpace};  // index into ctx.requested
+}
+
+/*!
+ * \brief Number of scalar parameters in one RNN layer for a single direction.
+ * \param inputSize width of the input fed to the layer
+ * \param hiddenSize width of the layer's hidden state
+ * \param mode a rnn_enum::RNNModeType value; any other value counts like a plain RNN
+ * \return hiddenSize * (hiddenSize + inputSize + 2), scaled by the per-mode multiplier
+ */
+inline int rnn_single_param_size(int inputSize,
+                                int hiddenSize,
+                                int mode) {
+  // multiplier on the base size: 4 for LSTM, 3 for GRU, 1 for the plain RNN modes
+  int multiplier = 1;
+  if (mode == rnn_enum::kLstm) {
+    multiplier = 4;
+  } else if (mode == rnn_enum::kGru) {
+    multiplier = 3;
+  }
+  // base size: hiddenSize rows of (recurrent + input weights, plus 2 further terms)
+  int size = hiddenSize * (hiddenSize + inputSize + 2);
+  size *= multiplier;
+  return size;
+}
+
+/*!
+ * \brief Length of the flat parameter vector for the whole layer stack.
+ *  The first layer reads inputSize features; every later layer reads the previous output.
+ */
+inline int rnn_param_size(int layerNum,
+                          int inputSize,
+                          int hiddenSize,
+                          bool bidirectional,
+                          int mode) {
+  const int directions = bidirectional ? 2 : 1;
+  // layers above the first take directions * hiddenSize inputs
+  const int first = rnn_single_param_size(inputSize, hiddenSize, mode);
+  const int upper = rnn_single_param_size(directions * hiddenSize, hiddenSize, mode);
+  // every direction carries its own copy of each layer's parameters
+  return directions * (first + (layerNum - 1) * upper);
+}
+
+struct RNNParam : public dmlc::Parameter<RNNParam> {
+  uint32_t state_size;
+  uint32_t num_layers;
+  bool bidirectional, state_outputs;
+  int mode;  // one of rnn_enum::RNNModeType
+  float p, pkeep_;  // p: dropout fraction; pkeep_ is not declared as a field below
+  int seq_length_, batch_size_, input_size_;  // filled in by CuDNNRNNOp::Init from the data shape
+  bool lstm_q_;  // whether type is lstm
+
+  DMLC_DECLARE_PARAMETER(RNNParam) {
+    DMLC_DECLARE_FIELD(state_size)
+    .describe("size of the state for each layer");
+
+    DMLC_DECLARE_FIELD(num_layers)
+    .describe("number of stacked layers");
+
+    DMLC_DECLARE_FIELD(bidirectional).set_default(false)
+    .describe("whether to use bidirectional recurrent layers");
+
+    DMLC_DECLARE_FIELD(mode)
+    .add_enum("rnn_relu", rnn_enum::kRnnRelu)
+    .add_enum("rnn_tanh", rnn_enum::kRnnTanh)
+    .add_enum("lstm", rnn_enum::kLstm)
+    .add_enum("gru", rnn_enum::kGru)
+    .describe("the type of RNN to compute");
+
+    DMLC_DECLARE_FIELD(p).set_default(0.)
+    .set_range(0, 1)
+    .describe("Fraction of the input that gets dropped out at training time");
+
+    DMLC_DECLARE_FIELD(state_outputs).set_default(false)
+    .describe("Whether to have the states as symbol outputs.");
+  }
+};
+
+/*!
+ * \brief Placeholder for a generic (non-cuDNN) RNN operator.
+ *  Forward and Backward are empty until an mshadow implementation is added.
+ */
+template<typename xpu, typename DType>
+class RNNOp : public Operator {
+ public:
+  explicit RNNOp(RNNParam p) : param_(p) {
+  }
+
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    // TODO(sbodenstein): add MShadow implementation
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    // TODO(sbodenstein): add MShadow implementation
+  }
+
+ private:
+  RNNParam param_;
+};  // class RNNOp
+
+template<typename xpu>
+Operator* CreateOp(RNNParam param, int dtype);
+
+#if DMLC_USE_CXX11
+class RNNProp : public OperatorProperty {
+ public:
+  std::vector<std::string> ListArguments() const override {
+    // data, flat parameter vector and initial state are always present;
+    // the initial cell state is appended for LSTM only
+    std::vector<std::string> names = {"data", "parameters", "state"};
+    if (param_.mode == rnn_enum::kLstm) names.push_back("state_cell");
+    return names;
+  }
+
+  std::vector<std::string> ListOutputs() const override {
+    // "output" is always produced; the final states are outputs only on request,
+    // and the cell state additionally requires LSTM mode
+    std::vector<std::string> names(1, "output");
+    if (param_.state_outputs) {
+      names.push_back("state");
+      if (param_.mode == rnn_enum::kLstm) names.push_back("state_cell");
+    }
+    return names;
+  }
+
+  int NumOutputs() const override {
+    // output only, or output + state (+ cell state for LSTM)
+    if (!param_.state_outputs) return 1;
+    return (param_.mode == rnn_enum::kLstm) ? 3 : 2;
+  }
+
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.Init(kwargs);  // populate RNNParam from the keyword arguments
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();  // current parameters as a name -> value string map
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    const bool is_lstm = (param_.mode == rnn_enum::kLstm);
+    if (is_lstm) {
+      CHECK_EQ(in_shape->size(), 4) << "Input:[data, parameters, state, cell_state]";
+    } else {
+      CHECK_EQ(in_shape->size(), 3) << "Input:[data, parameters, state]";
+    }
+    // Every other shape is derived from the data shape, so defer until it is known.
+    const TShape &dshape = (*in_shape)[rnn_enum::kData];
+    if (dshape.ndim() == 0) return false;
+    CHECK_EQ(dshape.ndim(), 3) \
+        << "Input data should be rank-3 tensor of dim [sequence length, batch size, input size]";
+    // data layout: [sequence len, batch, input dimension]
+    const int batch_size = dshape[1];
+    const int input_size = dshape[2];
+    const int numDirections = param_.bidirectional ? 2 : 1;
+    const int total_layers = numDirections * param_.num_layers;  // double for bidirectional
+
+    // initial state (and LSTM cell state): [total layers, batch, state size]
+    SHAPE_ASSIGN_CHECK(*in_shape,
+                       rnn_enum::kState,
+                       Shape3(total_layers, batch_size, param_.state_size));
+    if (is_lstm) {
+      SHAPE_ASSIGN_CHECK(*in_shape,
+                        rnn_enum::kStateCell,
+                        Shape3(total_layers, batch_size, param_.state_size));
+    }
+
+    const int param_size = rnn_param_size(param_.num_layers,
+                                          input_size,
+                                          param_.state_size,
+                                          param_.bidirectional,
+                                          param_.mode);
+    SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(param_size));
+
+    // output: [sequence len, batch, directions * state size]
+    TShape oshape = dshape;
+    oshape[2] = numDirections * param_.state_size;
+    out_shape->clear();
+    out_shape->push_back(oshape);
+    if (param_.state_outputs) {
+      // state outputs use the same [total layers, batch, state size] shape as the inputs
+      TShape state_shape = dshape;
+      state_shape[0] = total_layers;
+      state_shape[1] = batch_size;
+      state_shape[2] = param_.state_size;
+      out_shape->push_back(state_shape);
+      if (is_lstm) out_shape->push_back(state_shape);
+    }
+    return true;
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_GE(in_type->size(), 1);
+    // the first input fixes the type; every other input must agree or gets filled in
+    const int dtype = (*in_type)[0];
+    CHECK_NE(dtype, -1) << "First input must have specified type";
+    for (index_t i = 0; i < in_type->size(); ++i) {
+      int &arg_type = (*in_type)[i];
+      if (arg_type == -1) {
+        arg_type = dtype;
+        continue;
+      }
+      CHECK_EQ(arg_type, dtype) << "This layer requires uniform type. "
+                                << "Expected " << dtype << " v.s. given "
+                                << arg_type << " at " << ListArguments()[i];
+    }
+
+    // one entry per output: output, then state and (LSTM only) cell state if exposed
+    out_type->clear();
+    out_type->push_back(dtype);
+    if (param_.state_outputs) {
+      out_type->push_back(dtype);
+      if (param_.mode == rnn_enum::kLstm) out_type->push_back(dtype);
+    }
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    RNNProp *clone = new RNNProp();  // the duplicate carries the same parameters
+    clone->param_ = param_;
+    return clone;
+  }
+
+  std::string TypeString() const override {
+    return "RNN";  // same name the operator is registered under in rnn.cc
+  }
+
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    const bool has_states = param_.state_outputs;
+    const bool is_lstm = (param_.mode == rnn_enum::kLstm);
+    // always needed: the three base inputs plus the sequence output and its gradient
+    std::vector<int> deps = {in_data[rnn_enum::kData], in_data[rnn_enum::kParams],
+        in_data[rnn_enum::kState], out_data[rnn_enum::kOut], out_grad[rnn_enum::kOut]};
+    if (has_states) {
+      deps.push_back(out_data[rnn_enum::kStateOut]);
+      deps.push_back(out_grad[rnn_enum::kStateOut]);
+    }
+    if (is_lstm) deps.push_back(in_data[rnn_enum::kStateCell]);
+    // the cell state output and its gradient exist only when states are exposed
+    if (is_lstm && has_states) {
+      deps.push_back(out_data[rnn_enum::kStateCellOut]);
+      deps.push_back(out_grad[rnn_enum::kStateCellOut]);
+    }
+    return deps;
+  }
+
+  std::vector<ResourceRequest> ForwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return {ResourceRequest::kTempSpace};  // read as ctx.requested[rnn_enum::kTempSpace]
+  }
+
+  std::vector<ResourceRequest> BackwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return {ResourceRequest::kTempSpace};  // read as ctx.requested[rnn_enum::kTempSpace]
+  }
+
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented";  // presumably callers use CreateOperatorEx instead
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
+
+ private:
+  RNNParam param_;
+};  // class RNNProp
+#endif  // DMLC_USE_CXX11
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_RNN_INL_H_
diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc
new file mode 100644
index 000000000000..3067c8e986c1
--- /dev/null
+++ b/src/operator/rnn.cc
@@ -0,0 +1,42 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file rnn.cc
+ * \brief
+ * \author Sebastian Bodenstein
+*/
+
+#include "./rnn-inl.h"
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<cpu>(RNNParam param, int dtype) {
+  LOG(FATAL) << "RNN is only available for gpu at the moment.";
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new RNNOp<cpu, DType>(param);
+  });
+  return op;
+}
+
+Operator *RNNProp::CreateOperatorEx(Context ctx,
+                                  std::vector<TShape> *in_shape,
+                                  std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferType(in_type, &out_type, &aux_type));
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
+}
+
+DMLC_REGISTER_PARAMETER(RNNParam);
+
+MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp)
+.describe("Apply a recurrent layer to input.")
+.add_argument("data", "Symbol", "Input data to RNN")
+.add_argument("parameters", "Symbol", "Vector of all RNN trainable parameters")
+.add_argument("state", "Symbol", "initial hidden state of the RNN")
+.add_argument("state_cell", "Symbol", "initial cell state for LSTM networks (only for LSTM)")
+.add_arguments(RNNParam::__FIELDS__());
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/rnn.cu b/src/operator/rnn.cu
new file mode 100644
index 000000000000..bf914026019d
--- /dev/null
+++ b/src/operator/rnn.cu
@@ -0,0 +1,30 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file rnn.cu
+ * \brief
+ * \author Sebastian Bodenstein
+*/
+
+#include "./rnn-inl.h"
+#include <algorithm>
+#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5
+#include "./cudnn_rnn-inl.h"
+#endif  // MXNET_USE_CUDNN && CUDNN_MAJOR
+
+namespace mxnet {
+namespace op {
+template<>
+Operator* CreateOp<gpu>(RNNParam param, int dtype) {
+  Operator *op = NULL;
+#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new CuDNNRNNOp<DType>(param);
+  })
+#else
+  LOG(FATAL) << "RNN is only available for cuDNN at the moment.";
+#endif  // MXNET_USE_CUDNN && CUDNN_MAJOR
+  return op;
+}
+
+}  // namespace op
+}  // namespace mxnet

From fc0bb3920e60fbe70508e7c74822e4722c7e501f Mon Sep 17 00:00:00 2001
From: Jian Guo <precedenceguo@users.noreply.github.com>
Date: Tue, 26 Jul 2016 11:04:05 +0800
Subject: [PATCH 126/126] extend Module to fix_param_names that saves gradient
 computation and update faster rcnn example (#2794)

* extend Module to fix_param_names that saves gradient computation and update faster rcnn example

* update faster rcnn example to full module api usage

* fix lint

* fix faster rcnn training script

* faster rcnn example remove incompatible old solver
---
 example/rcnn/README.md                     | 100 +++++-----
 example/rcnn/{tools/fast-rcnn => }/demo.py |  32 ++-
 example/rcnn/rcnn/module.py                |  32 +--
 example/rcnn/rcnn/solver.py                | 102 ----------
 example/rcnn/rcnn/tester.py                |   2 +-
 example/rcnn/test.py                       |  12 ++
 example/rcnn/tools/fast-rcnn/__init__.py   |   0
 example/rcnn/tools/fast-rcnn/test.py       |  57 ------
 example/rcnn/tools/fast-rcnn/train.py      | 101 ----------
 example/rcnn/tools/test_final.py           |  61 ------
 example/rcnn/tools/test_net.py             |  38 ----
 example/rcnn/tools/test_rcnn.py            |  34 ++--
 example/rcnn/tools/test_rpn.py             |  10 +-
 example/rcnn/tools/train_alternate.py      | 216 ---------------------
 example/rcnn/tools/train_net.py            |  64 ------
 example/rcnn/tools/train_rcnn.py           |  34 ++--
 example/rcnn/tools/train_rpn.py            |  19 +-
 example/rcnn/train_alternate.py            | 104 ++++++++++
 python/mxnet/module/executor_group.py      |  11 +-
 python/mxnet/module/module.py              |   8 +-
 20 files changed, 266 insertions(+), 771 deletions(-)
 rename example/rcnn/{tools/fast-rcnn => }/demo.py (69%)
 delete mode 100644 example/rcnn/rcnn/solver.py
 create mode 100644 example/rcnn/test.py
 delete mode 100644 example/rcnn/tools/fast-rcnn/__init__.py
 delete mode 100644 example/rcnn/tools/fast-rcnn/test.py
 delete mode 100644 example/rcnn/tools/fast-rcnn/train.py
 delete mode 100644 example/rcnn/tools/test_final.py
 delete mode 100644 example/rcnn/tools/test_net.py
 delete mode 100644 example/rcnn/tools/train_alternate.py
 delete mode 100644 example/rcnn/tools/train_net.py
 create mode 100644 example/rcnn/train_alternate.py

diff --git a/example/rcnn/README.md b/example/rcnn/README.md
index 93d897cdfcf4..60f5527cb907 100644
--- a/example/rcnn/README.md
+++ b/example/rcnn/README.md
@@ -17,59 +17,63 @@ Faster R-CNN utilize an alternate optimization training process between RPN
 and Fast R-CNN. Fast R-CNN weights are used to initiate RPN for training.
 
 ## Getting Started
+* Install the Python packages `easydict`, `cv2`, and `matplotlib`. MXNet itself requires `numpy`.
+* Install MXNet at a version no earlier than commit 8a3424e, preferably the latest master.
+  Follow the instructions at http://mxnet.readthedocs.io/en/latest/how_to/build.html. Install the python interface.
+* Try out detection by running `python demo.py --prefix final --epoch 0 --image myimage.jpg --gpu 0`.
+  This assumes you have downloaded the pretrained network, placed the extracted file `final-0000.params` in this folder, and have an image named `myimage.jpg` there.
 
-* Install a forked MXNet at [MXNet-detection](https://github.com/precedenceguo/mxnet/tree/detection).
-Follow the instructions at http://mxnet.readthedocs.io/en/latest/how_to/build.html. Install the python interface.
-Note that the link refers to `detection` branch of the fork. Use `git clone -b detection https://github.com/precedenceguo/mxnet.git`
-to clone or `git checkout detection` if you checked out the master.
-* Download data and place them to `data` folder according to `Data Folder Structure`.
-  You might want to create a symbolic link to VOCdevkit folder
-```
-Pascal VOCdevkit
-http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
-http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
-http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCdevkit_08-Jun-2007.tar
-```
-* Data Folder Structure (suppose root is `data`)
-```
-demo
-rpn_data (created by rpn)
-selective_search_data (can be omitted)
-cache (created by imdb)
--- name + source + roidb.pkl (create by imdb)
--- name (created by detection and evaluation)
-VOCdevkit
--- VOC + year (JPEG images and annotations)
--- results (created by evaluation)
----- VOC + year
------- main
--------- comp4_det_val_aeroplane.txt
-```
+## Training and Testing Faster R-CNN
+* Install additional python package `scipy`.
+* Download the Pascal VOC data and place it in the `data` folder according to `Data Folder Structure`.
+  You might want to create a symbolic link to VOCdevkit folder by `ln -s /path/to/your/VOCdevkit data/VOCdevkit`.
 * Download VGG16 pretrained model, use `mxnet/tools/caffe_converter` to convert it,
-  rename to `vgg16-symbol.json` and `vgg16-0001.params` and place it in `model` folder
+  rename to `vgg16-symbol.json` and `vgg16-0001.params` and place it in `model` folder.
+  `model` folder will be used to place model checkpoints along the training process.
+* Start training by running `python train_alternate.py` after VOCdevkit is ready.
+  A typical command would be `python train_alternate.py --gpus 0`. This will train the network on the VOC07 trainval.
+  More control of training process can be found in the argparse help accessed by `python train_alternate.py -h`.
+* Start testing by running `python test.py` after completing the training process.
+  A typical command would be `python test.py --has_rpn --prefix model/final --epoch 8`. This will test the network on the VOC07 test.
+  Adding a `--vis` will turn on visualization and `-h` will show help as in the training process.
 
-## Training
-* Start training by run `python -m tools.train_alternate`. Variable args can be found by run
-`python -m tools.train_alternate --help`.
+## Training and Testing Fast R-CNN
+* Download the Pascal VOC data and place it in the `data` folder according to `Data Folder Structure`.
+  You might want to create a symbolic link to VOCdevkit folder by `ln -s /path/to/your/VOCdevkit data/VOCdevkit`.
+* Download the precomputed selective search data and place it in the `data` folder according to `Data Folder Structure`.
+* Download VGG16 pretrained model, use `mxnet/tools/caffe_converter` to convert it,
+  rename to `vgg16-symbol.json` and `vgg16-0001.params` and place it in `model` folder.
+  `model` folder will be used to place model checkpoints along the training process.
+* Start training by running `python -m tools.train_rcnn --proposal ss` to use the selective search proposal.
+* Start testing by running `python -m tools.test_rcnn --proposal ss`.
+
+## Information
+* Download link to trained model
+  Baidu Yun: http://pan.baidu.com/s/1boRhGvH (ixiw) or Dropbox: https://www.dropbox.com/s/jrr83q0ai2ckltq/final-0000.params.tar.gz?dl=0
+* Download link to Pascal VOC and precomputed selective search proposals
+
+  ```
+  Pascal VOCdevkit
+  http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
+  http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
+  http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCdevkit_08-Jun-2007.tar
+  selective_search_data (by Ross Girshick)
+  Download link accessible at https://github.com/rbgirshick/fast-rcnn/blob/master/data/scripts/fetch_selective_search_data.sh
+  ```
 
-## Testing
-* Start testing by run `python -m tools.test_final`. Variable args can be found by run
-`python -m tools.test_final --help`.
+* Data Folder Structure (create a `data` folder if there is none)
 
-## Contributing Guide
-You are more than welcome to add new features to this implementation or fix any potential bugs. 
-Here are some topics to look at.
-* MXNet features superior and robust distributed training. This implementation 
-has not yet fully ultilized this power.
-* New approximate end to end training is available from Faster R-CNN python 
-implementation whose link can be found in Disclaimer. This implementation 
-does not support this feature.
-* MXNet has efficient data loading module which renders data IO irrelevant 
-in performance. This implementation has not used this module.
-* More object detection dataset is available online. The dataset module is designed 
-as simple and scalable. Welcome to add more dataset support to this implementation.
-* During inference, some operations are only conducted in cpu. Reimplement them may bring 
-better performance in testing time.
+  ```
+  VOCdevkit
+  -- VOC + year (JPEG images and annotations)
+  -- results (will be created by evaluation)
+  ---- VOC + year
+  ------ main
+  -------- comp4_det_val_aeroplane.txt
+  selective_search_data
+  rpn_data (will be created by rpn)
+  cache (will be created by imdb)
+  ```
 
 ## Disclaimer
 This repository used code from [MXNet](https://github.com/dmlc/mxnet),
diff --git a/example/rcnn/tools/fast-rcnn/demo.py b/example/rcnn/demo.py
similarity index 69%
rename from example/rcnn/tools/fast-rcnn/demo.py
rename to example/rcnn/demo.py
index 87e04da0feea..fb110849663b 100644
--- a/example/rcnn/tools/fast-rcnn/demo.py
+++ b/example/rcnn/demo.py
@@ -2,7 +2,6 @@
 import os
 import numpy as np
 import cv2
-import scipy.io as sio
 
 import mxnet as mx
 
@@ -10,14 +9,14 @@
 from helper.processing.nms import nms
 from rcnn.config import config
 from rcnn.detector import Detector
-from rcnn.symbol import get_vgg_rcnn_test
+from rcnn.symbol import get_vgg_test
 from rcnn.tester import vis_all_detection
 from utils.load_model import load_param
 
 
 def get_net(prefix, epoch, ctx):
     args, auxs = load_param(prefix, epoch, convert=True, ctx=ctx)
-    sym = get_vgg_rcnn_test()
+    sym = get_vgg_test()
     detector = Detector(sym, ctx, args, auxs)
     return detector
 
@@ -37,16 +36,14 @@ def demo_net(detector, image_name):
     :param image_name: image name
     :return: None
     """
-    # load demo data
-    im = cv2.imread(image_name + '.jpg')
-    im_array, im_scale = resize(im, config.TEST.SCALES[0], config.TRAIN.MAX_SIZE)
+    config.TEST.HAS_RPN = True
+    assert os.path.exists(image_name), image_name + ' not found'
+    im = cv2.imread(image_name)
+    im_array, im_scale = resize(im, config.SCALES[0], config.MAX_SIZE)
     im_array = transform(im_array, config.PIXEL_MEANS)
-    roi_array = sio.loadmat(image_name + '_boxes.mat')['boxes']
-    batch_index_array = np.zeros((roi_array.shape[0], 1))
-    projected_rois = roi_array * im_scale
-    roi_array = np.hstack((batch_index_array, projected_rois))
+    im_info = np.array([[im_array.shape[2], im_array.shape[3], im_scale]], dtype=np.float32)
 
-    scores, boxes = detector.im_detect(im_array, roi_array)
+    scores, boxes = detector.im_detect(im_array, im_info)
 
     all_boxes = [[] for _ in CLASSES]
     CONF_THRESH = 0.8
@@ -67,11 +64,10 @@ def demo_net(detector, image_name):
 
 
 def parse_args():
-    parser = argparse.ArgumentParser(description='Demonstrate a Fast R-CNN network')
-    parser.add_argument('--prefix', dest='prefix', help='new model prefix',
-                        default=os.path.join(os.getcwd(), 'model', 'frcnn'), type=str)
-    parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model',
-                        default=9, type=int)
+    parser = argparse.ArgumentParser(description='Demonstrate a Faster R-CNN network')
+    parser.add_argument('--image', dest='image', help='custom image', type=str)
+    parser.add_argument('--prefix', dest='prefix', help='saved model prefix', type=str)
+    parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model', type=int)
     parser.add_argument('--gpu', dest='gpu_id', help='GPU device to test with',
                         default=0, type=int)
     args = parser.parse_args()
@@ -81,5 +77,5 @@ def parse_args():
     args = parse_args()
     ctx = mx.gpu(args.gpu_id)
     detector = get_net(args.prefix, args.epoch, ctx)
-    demo_net(detector, os.path.join(os.getcwd(), 'data', 'demo', '000004'))
-    demo_net(detector, os.path.join(os.getcwd(), 'data', 'demo', '001551'))
+    demo_net(detector, args.image)
+    demo_net(detector, args.image)
diff --git a/example/rcnn/rcnn/module.py b/example/rcnn/rcnn/module.py
index 23fb4ce7dad8..6b5aef1d3d51 100644
--- a/example/rcnn/rcnn/module.py
+++ b/example/rcnn/rcnn/module.py
@@ -9,7 +9,6 @@
 from mxnet.initializer import Uniform
 from mxnet.module.base_module import BaseModule
 from mxnet.module.module import Module
-# import numpy as np
 
 class MutableModule(BaseModule):
     """A mutable module is a module that supports variable input data.
@@ -24,10 +23,11 @@ class MutableModule(BaseModule):
     work_load_list : list of number
     max_data_shapes : list of (name, shape) tuple, designating inputs whose shape vary
     max_label_shapes : list of (name, shape) tuple, designating inputs whose shape vary
+    fixed_param_prefix : list of str, indicating fixed parameters
     """
     def __init__(self, symbol, data_names, label_names,
                  logger=logging, context=ctx.cpu(), work_load_list=None,
-                 max_data_shapes=None, max_label_shapes=None):
+                 max_data_shapes=None, max_label_shapes=None, fixed_param_prefix=None):
         super(MutableModule, self).__init__(logger=logger)
         self._symbol = symbol
         self._data_names = data_names
@@ -38,11 +38,21 @@ def __init__(self, symbol, data_names, label_names,
         self._curr_module = None
         self._max_data_shapes = max_data_shapes
         self._max_label_shapes = max_label_shapes
+        self._fixed_param_prefix = fixed_param_prefix
+
         if self._max_data_shapes is None:
             self._max_data_shapes = []
         if self._max_label_shapes is None:
             self._max_label_shapes = []
-        # self._monitor_weight = None
+        if self._fixed_param_prefix is None:
+            self._fixed_param_prefix = []
+
+        fixed_param_names = list()
+        for name in self._symbol.list_arguments():
+            for prefix in self._fixed_param_prefix:
+                if prefix in name:
+                    fixed_param_names.append(name)
+        self._fixed_param_names = fixed_param_names
 
     def _reset_bind(self):
         self.binded = False
@@ -121,7 +131,8 @@ def bind(self, data_shapes, label_shapes=None, for_training=True,
                 max_label_shapes.append((name, shape))
 
         module = Module(self._symbol, self._data_names, self._label_names, logger=self.logger,
-                        context=self._context, work_load_list=self._work_load_list)
+                        context=self._context, work_load_list=self._work_load_list,
+                        fixed_param_names=self._fixed_param_names)
         module.bind(max_data_shapes, max_label_shapes, for_training, inputs_need_grad,
                     force_rebind=False, shared_module=None)
         self._curr_module = module
@@ -154,7 +165,8 @@ def forward(self, data_batch, is_train=None):
         if shape_changed:
             module = Module(self._symbol, self._data_names, self._label_names,
                             logger=self.logger, context=self._context,
-                            work_load_list=self._work_load_list)
+                            work_load_list=self._work_load_list,
+                            fixed_param_names=self._fixed_param_names)
             module.bind(data_batch.provide_data, data_batch.provide_label, self._curr_module.for_training,
                         self._curr_module.inputs_need_grad, force_rebind=False,
                         shared_module=self._curr_module)
@@ -162,10 +174,6 @@ def forward(self, data_batch, is_train=None):
 
         self._curr_module.forward(data_batch, is_train=is_train)
 
-        # arg_params = self._curr_module._arg_params
-        # if self._monitor_weight is not None:
-        #     print 'diff', np.sum(np.abs(arg_params['conv4_2_weight'].asnumpy() - self._monitor_weight))
-
     def backward(self, out_grads=None):
         assert self.binded and self.params_initialized
         self._curr_module.backward(out_grads=out_grads)
@@ -174,12 +182,6 @@ def update(self):
         assert self.binded and self.params_initialized and self.optimizer_initialized
         self._curr_module.update()
 
-        # arg_params = self._curr_module._arg_params
-        # if self._monitor_weight is not None:
-        #     self.get_params()
-        #     print 'diff2', np.sum(np.abs(arg_params['conv4_2_weight'].asnumpy() - self._monitor_weight))
-        # self._monitor_weight = arg_params['conv4_2_weight'].asnumpy()
-
     def get_outputs(self, merge_multi_context=True):
         assert self.binded and self.params_initialized
         return self._curr_module.get_outputs(merge_multi_context=merge_multi_context)
diff --git a/example/rcnn/rcnn/solver.py b/example/rcnn/rcnn/solver.py
deleted file mode 100644
index d82bdbfd0f15..000000000000
--- a/example/rcnn/rcnn/solver.py
+++ /dev/null
@@ -1,102 +0,0 @@
-import mxnet as mx
-import logging
-import metric
-
-from callback import Speedometer
-from config import config
-
-
-class Solver(object):
-    def __init__(self, prefix,
-                 symbol, ctx=None,
-                 begin_epoch=0, num_epoch=None,
-                 kv_store='local',
-                 arg_params=None, aux_params=None,
-                 optimizer='sgd',
-                 mutable_data_shape=False, max_data_shape=None, max_label_shape=None, **kwargs):
-        self.prefix = prefix
-        self.symbol = symbol
-        self.ctx = ctx
-        if self.ctx is None:
-            self.ctx = [mx.cpu()]
-        self.begin_epoch = begin_epoch
-        self.num_epoch = num_epoch
-        self.kv_store = kv_store
-        self.arg_params = arg_params
-        self.aux_params = aux_params
-        self.optimizer = optimizer
-        self.updater = None
-        self.mutable_data_shape = mutable_data_shape
-        self.max_data_shape = max_data_shape
-        self.max_label_shape = max_label_shape
-        self.kwargs = kwargs.copy()
-
-        self.check_params()
-        self.arg_names = None
-        self.param_names = None
-        self.aux_names = None
-
-    def get_params(self, grad_req, data_shapes):
-        arg_names = self.symbol.list_arguments()
-        self.arg_names = arg_names
-
-        arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(**dict(data_shapes))
-        if grad_req != 'null':
-            param_names = []
-            for name, shape in zip(arg_names, arg_shapes):
-                if not (name.endswith('data') or name.endswith('rois') or
-                        name.endswith('im_info') or name.endswith('gt_boxes') or
-                        name.endswith('inside_weight') or name.endswith('outside_weight') or
-                        name.endswith('label') or name.endswith('target') or
-                        name.startswith('conv1') or name.startswith('conv2')):
-                    if not (config.TRAIN.FINETUNE and name.startswith('conv')):
-                        param_names.append(name)
-            self.param_names = param_names
-
-        aux_names = self.symbol.list_auxiliary_states()
-        self.aux_names = aux_names
-
-    def check_params(self):
-        arg_names = set(self.symbol.list_arguments())
-        self.arg_params = {k: v for k, v in self.arg_params.items() if k in arg_names}
-        aux_names = set(self.symbol.list_arguments())
-        self.aux_params = {k: v for k, v in self.aux_params.items() if k in aux_names}
-
-    def fit(self, train_data,
-            grad_req='write',
-            frequent=20,
-            logger=None):
-        (kvstore, update_on_kvstore) = mx.model._create_kvstore(self.kv_store, len(self.ctx), self.arg_params)
-        if logger is None:
-            logger = logging
-
-        batch_end_callback = Speedometer(train_data.batch_size, frequent=frequent)
-        epoch_end_callback = mx.callback.do_checkpoint(self.prefix)
-
-        self.get_params(grad_req, train_data.provide_data + train_data.provide_label)
-
-        if config.TRAIN.HAS_RPN is True:
-            eval_metric = metric.AccuracyMetric(use_ignore=True, ignore=-1)
-            cls_metric = metric.LogLossMetric(use_ignore=True, ignore=-1)
-        else:
-            eval_metric = metric.AccuracyMetric()
-            cls_metric = metric.LogLossMetric()
-        bbox_metric = metric.SmoothL1LossMetric()
-        eval_metrics = mx.metric.CompositeEvalMetric()
-        for child_metric in [eval_metric, cls_metric, bbox_metric]:
-            eval_metrics.add(child_metric)
-        mutable_data_shape = self.mutable_data_shape
-        max_data_shape = self.max_data_shape
-        max_label_shape = self.max_label_shape
-
-        self.optimizer = mx.optimizer.create(self.optimizer,
-                                             rescale_grad=(1.0 / config.TRAIN.BATCH_SIZE), **self.kwargs)
-        mx.model._train_multi_device(self.symbol, self.ctx, self.arg_names, self.param_names, self.aux_names,
-                                     self.arg_params, self.aux_params, self.begin_epoch, self.num_epoch,
-                                     epoch_size=None, optimizer=self.optimizer,
-                                     kvstore=kvstore, update_on_kvstore=update_on_kvstore,
-                                     train_data=train_data, eval_data=None, eval_metric=eval_metrics,
-                                     epoch_end_callback=epoch_end_callback, batch_end_callback=batch_end_callback,
-                                     logger=logger, work_load_list=None, monitor=None,
-                                     mutable_data_shape=mutable_data_shape, max_data_shape=max_data_shape,
-                                     max_label_shape=max_label_shape)
diff --git a/example/rcnn/rcnn/tester.py b/example/rcnn/rcnn/tester.py
index 1789c5a96855..0dc253e3878b 100644
--- a/example/rcnn/rcnn/tester.py
+++ b/example/rcnn/rcnn/tester.py
@@ -69,7 +69,7 @@ def pred_eval(detector, test_data, imdb, vis=False):
             # visualize the testing scale
             for box in boxes_this_image:
                 if isinstance(box, np.ndarray):
-                    box *= scale
+                    box[:, :4] *= scale
             vis_all_detection(databatch.data['data'], boxes_this_image,
                               imdb_classes=imdb.classes)
         i += 1
diff --git a/example/rcnn/test.py b/example/rcnn/test.py
new file mode 100644
index 000000000000..74ffc40673c2
--- /dev/null
+++ b/example/rcnn/test.py
@@ -0,0 +1,12 @@
+import argparse
+import os
+
+import mxnet as mx
+
+from tools.test_rcnn import test_rcnn
+from tools.test_rcnn import parse_args
+
+if __name__ == '__main__':
+    args = parse_args()
+    ctx = mx.gpu(args.gpu_id)
+    test_rcnn(args.image_set, args.year, args.root_path, args.devkit_path, args.prefix, args.epoch, ctx, args.vis, args.has_rpn)
diff --git a/example/rcnn/tools/fast-rcnn/__init__.py b/example/rcnn/tools/fast-rcnn/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/example/rcnn/tools/fast-rcnn/test.py b/example/rcnn/tools/fast-rcnn/test.py
deleted file mode 100644
index a2613e4602f8..000000000000
--- a/example/rcnn/tools/fast-rcnn/test.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import argparse
-import logging
-import os
-
-import mxnet as mx
-
-from rcnn.loader import ROIIter
-from rcnn.detector import Detector
-from rcnn.symbol import get_vgg_rcnn_test
-from rcnn.tester import pred_eval
-from utils.load_data import load_test_ss_roidb
-from utils.load_model import load_param
-
-
-def test_net(imageset, year, root_path, devkit_path, prefix, epoch, ctx, vis):
-    # set up logger
-    logger = logging.getLogger()
-    logger.setLevel(logging.INFO)
-
-    # load testing data
-    voc, roidb = load_test_ss_roidb(imageset, year, root_path, devkit_path)
-    test_data = ROIIter(roidb, batch_size=1, shuffle=False, mode='test')
-
-    # load model
-    args, auxs = load_param(prefix, epoch, convert=True, ctx=ctx)
-
-    # load symbol
-    sym = get_vgg_rcnn_test()
-
-    # detect
-    detector = Detector(sym, ctx, args, auxs)
-    pred_eval(detector, test_data, voc, vis=vis)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Test a Fast R-CNN network')
-    parser.add_argument('--image_set', dest='image_set', help='can be test',
-                        default='test', type=str)
-    parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012',
-                        default='2007', type=str)
-    parser.add_argument('--root_path', dest='root_path', help='output data folder',
-                        default=os.path.join(os.getcwd(), 'data'), type=str)
-    parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path',
-                        default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str)
-    parser.add_argument('--prefix', dest='prefix', help='model to test with', type=str)
-    parser.add_argument('--epoch', dest='epoch', help='model to test with',
-                        default=8, type=int)
-    parser.add_argument('--gpu', dest='gpu_id', help='GPU device to test with',
-                        default=0, type=int)
-    parser.add_argument('--vis', dest='vis', help='turn on visualization', action='store_true')
-    args = parser.parse_args()
-    return args
-
-if __name__ == '__main__':
-    args = parse_args()
-    ctx = mx.gpu(args.gpu_id)
-    test_net(args.image_set, args.year, args.root_path, args.devkit_path, args.prefix, args.epoch, ctx, args.vis)
diff --git a/example/rcnn/tools/fast-rcnn/train.py b/example/rcnn/tools/fast-rcnn/train.py
deleted file mode 100644
index b1d0b9837f16..000000000000
--- a/example/rcnn/tools/fast-rcnn/train.py
+++ /dev/null
@@ -1,101 +0,0 @@
-import argparse
-import logging
-import os
-
-import mxnet as mx
-
-from rcnn.config import config
-from rcnn.loader import ROIIter
-from rcnn.solver import Solver
-from rcnn.symbol import get_vgg_rcnn
-from utils.load_data import load_ss_roidb
-from utils.load_model import load_param
-from utils.save_model import save_checkpoint
-
-
-def train_net(image_set, year, root_path, devkit_path, pretrained, epoch,
-              prefix, ctx, begin_epoch, end_epoch, frequent, kv_store, work_load_list=None, resume=False):
-    # set up logger
-    logger = logging.getLogger()
-    logger.setLevel(logging.INFO)
-
-    # load symbol
-    sym = get_vgg_rcnn()
-
-    # setup multi-gpu
-    config.TRAIN.BATCH_IMAGES *= len(ctx)
-    config.TRAIN.BATCH_SIZE *= len(ctx)
-
-    # load training data
-    voc, roidb, means, stds = load_ss_roidb(image_set, year, root_path, devkit_path, flip=True)
-    train_data = ROIIter(roidb, batch_size=config.TRAIN.BATCH_IMAGES, shuffle=True, mode='train',
-                         ctx=ctx, work_load_list=work_load_list)
-
-    # infer max shape
-    max_data_shape = [('data', (1, 3, 1000, 1000))]
-
-    # load pretrained
-    args, auxs = load_param(pretrained, epoch, convert=True)
-
-    # initialize params
-    if not resume:
-        arg_shape, _, _ = sym.infer_shape(data=(1, 3, 224, 224), rois=(1, 5))
-        arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
-        args['cls_score_weight'] = mx.random.normal(mean=0, stdvar=0.01, shape=arg_shape_dict['cls_score_weight'])
-        args['cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['cls_score_bias'])
-        args['bbox_pred_weight'] = mx.random.normal(mean=0, stdvar=0.001, shape=arg_shape_dict['bbox_pred_weight'])
-        args['bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['bbox_pred_bias'])
-
-    # train
-    solver = Solver(prefix, sym, ctx, begin_epoch, end_epoch, kv_store, args, auxs, momentum=0.9, wd=0.0005,
-                    learning_rate=1e-3, lr_scheduler=mx.lr_scheduler.FactorScheduler(30000, 0.1),
-                    mutable_data_shape=True, max_data_shape=max_data_shape)
-    solver.fit(train_data, frequent=frequent)
-
-    # edit params and save
-    for epoch in range(begin_epoch + 1, end_epoch + 1):
-        arg_params, aux_params = load_param(pretrained, epoch, convert=True)
-        arg_params['bbox_pred_weight'] = (arg_params['bbox_pred_weight'].T * mx.nd.array(stds)).T
-        arg_params['bbox_pred_bias'] = arg_params['bbox_pred_bias'] * mx.nd.array(stds) + \
-                                       mx.nd.array(means)
-        save_checkpoint(prefix, epoch, arg_params, aux_params)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Train a Region Proposal Network')
-    parser.add_argument('--image_set', dest='image_set', help='can be trainval or train',
-                        default='trainval', type=str)
-    parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012',
-                        default='2007', type=str)
-    parser.add_argument('--root_path', dest='root_path', help='output data folder',
-                        default=os.path.join(os.getcwd(), 'data'), type=str)
-    parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path',
-                        default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str)
-    parser.add_argument('--pretrained', dest='pretrained', help='pretrained model prefix',
-                        default=os.path.join(os.getcwd(), 'model', 'vgg16'), type=str)
-    parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model',
-                        default=1, type=int)
-    parser.add_argument('--prefix', dest='prefix', help='new model prefix',
-                        default=os.path.join(os.getcwd(), 'model', 'rcnn'), type=str)
-    parser.add_argument('--gpus', dest='gpu_ids', help='GPU device to train with',
-                        default='0', type=str)
-    parser.add_argument('--begin_epoch', dest='begin_epoch', help='begin epoch of training',
-                        default=0, type=int)
-    parser.add_argument('--end_epoch', dest='end_epoch', help='end epoch of training',
-                        default=8, type=int)
-    parser.add_argument('--frequent', dest='frequent', help='frequency of logging',
-                        default=20, type=int)
-    parser.add_argument('--kv_store', dest='kv_store', help='the kv-store type',
-                        default='device', type=str)
-    parser.add_argument('--work_load_list', dest='work_load_list', help='work load for different devices',
-                        default=None, type=list)
-    parser.add_argument('--resume', dest='resume', help='continue training', action='store_true')
-    args = parser.parse_args()
-    return args
-
-if __name__ == '__main__':
-    args = parse_args()
-    ctx = [mx.gpu(int(i)) for i in args.gpu_ids.split(',')]
-    train_net(args.image_set, args.year, args.root_path, args.devkit_path, args.pretrained, args.epoch,
-              args.prefix, ctx, args.begin_epoch, args.end_epoch, args.frequent,
-              args.kv_store, args.work_load_list, args.resume)
diff --git a/example/rcnn/tools/test_final.py b/example/rcnn/tools/test_final.py
deleted file mode 100644
index 78fe07852ac3..000000000000
--- a/example/rcnn/tools/test_final.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import argparse
-import logging
-import os
-
-import mxnet as mx
-
-from rcnn.config import config
-from rcnn.loader import ROIIter
-from rcnn.detector import Detector
-from rcnn.symbol import get_vgg_test
-from rcnn.tester import pred_eval
-from utils.load_data import load_gt_roidb
-from utils.load_model import load_param
-
-
-def test_net(imageset, year, root_path, devkit_path, prefix, epoch, ctx, vis):
-    # set config
-    config.TEST.HAS_RPN = True
-
-    # set up logger
-    logger = logging.getLogger()
-    logger.setLevel(logging.INFO)
-
-    # load testing data
-    voc, roidb = load_gt_roidb(imageset, year, root_path, devkit_path)
-    test_data = ROIIter(roidb, batch_size=1, shuffle=False, mode='test')
-
-    # load model
-    args, auxs = load_param(prefix, epoch, convert=True, ctx=ctx)
-
-    # load symbol
-    sym = get_vgg_test()
-
-    # detect
-    detector = Detector(sym, ctx, args, auxs)
-    pred_eval(detector, test_data, voc, vis=vis)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Test a Faster R-CNN network')
-    parser.add_argument('--image_set', dest='image_set', help='can be test',
-                        default='test', type=str)
-    parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012',
-                        default='2007', type=str)
-    parser.add_argument('--root_path', dest='root_path', help='output data folder',
-                        default=os.path.join(os.getcwd(), 'data'), type=str)
-    parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path',
-                        default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str)
-    parser.add_argument('--prefix', dest='prefix', help='model to test with', type=str)
-    parser.add_argument('--epoch', dest='epoch', help='model to test with',
-                        default=8, type=int)
-    parser.add_argument('--gpu', dest='gpu_id', help='GPU device to test with',
-                        default=0, type=int)
-    parser.add_argument('--vis', dest='vis', help='turn on visualization', action='store_true')
-    args = parser.parse_args()
-    return args
-
-if __name__ == '__main__':
-    args = parse_args()
-    ctx = mx.gpu(args.gpu_id)
-    test_net(args.image_set, args.year, args.root_path, args.devkit_path, args.prefix, args.epoch, ctx, args.vis)
diff --git a/example/rcnn/tools/test_net.py b/example/rcnn/tools/test_net.py
deleted file mode 100644
index fd7ceafd9571..000000000000
--- a/example/rcnn/tools/test_net.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import logging
-from load_data import load_test_roidb
-from rcnn.data_iter import ROIIter
-from rcnn.symbol import get_symbol_vgg_test
-from load_model import load_param
-from rcnn.detector import Detector
-from rcnn.tester import pred_eval
-
-
-def test_net(imageset, year, root_path, devkit_path, prefix, epoch, ctx):
-    """
-    wrapper for detector
-    :param imageset: image set to test on
-    :param year: year of image set
-    :param root_path: 'data' folder path
-    :param devkit_path: 'VOCdevkit' folder path
-    :param prefix: new model prefix
-    :param epoch: new model epoch
-    :param ctx: context to evaluate in
-    :return: None
-    """
-    # set up logger
-    logger = logging.getLogger()
-    logger.setLevel(logging.INFO)
-
-    # load testing data
-    voc, roidb = load_test_roidb(imageset, year, root_path, devkit_path)
-    test_data = ROIIter(roidb, batch_size=1, shuffle=False, mode='test')
-
-    # load model
-    args, auxs = load_param(prefix, epoch, convert=True, ctx=ctx)
-
-    # load symbol
-    sym = get_symbol_vgg_test()
-
-    # detect
-    detector = Detector(sym, ctx, args, auxs)
-    pred_eval(detector, test_data, voc, vis=False)
diff --git a/example/rcnn/tools/test_rcnn.py b/example/rcnn/tools/test_rcnn.py
index 901828bfea7c..fdbc92c97acf 100644
--- a/example/rcnn/tools/test_rcnn.py
+++ b/example/rcnn/tools/test_rcnn.py
@@ -1,32 +1,35 @@
 import argparse
-import logging
 import os
 
 import mxnet as mx
 
+from rcnn.config import config
 from rcnn.loader import ROIIter
 from rcnn.detector import Detector
-from rcnn.symbol import get_vgg_rcnn_test
+from rcnn.symbol import get_vgg_test, get_vgg_rcnn_test
 from rcnn.tester import pred_eval
-from utils.load_data import load_test_rpn_roidb
+from utils.load_data import load_gt_roidb, load_test_ss_roidb, load_test_rpn_roidb
 from utils.load_model import load_param
 
 
-def test_net(imageset, year, root_path, devkit_path, prefix, epoch, ctx, vis):
-    # set up logger
-    logger = logging.getLogger()
-    logger.setLevel(logging.INFO)
+def test_rcnn(imageset, year, root_path, devkit_path, prefix, epoch, ctx, vis=False, has_rpn=True, proposal='rpn'):
+    # pick the test symbol and roidb: RPN proposals on the fly (has_rpn) or precomputed 'ss'/'rpn' proposals
+    if has_rpn:
+        sym = get_vgg_test()
+        config.TEST.HAS_RPN = True
+        config.TEST.RPN_PRE_NMS_TOP_N = 6000
+        config.TEST.RPN_POST_NMS_TOP_N = 300
+        voc, roidb = load_gt_roidb(imageset, year, root_path, devkit_path)
+    else:  # look the loader up by name instead of eval(): proposal comes straight from --proposal
+        sym = get_vgg_rcnn_test()
+        voc, roidb = {'ss': load_test_ss_roidb, 'rpn': load_test_rpn_roidb}[proposal](imageset, year, root_path, devkit_path)
 
-    # load testing data
-    voc, roidb = load_test_rpn_roidb(imageset, year, root_path, devkit_path)
+    # get test data iter
     test_data = ROIIter(roidb, batch_size=1, shuffle=False, mode='test')
 
     # load model
     args, auxs = load_param(prefix, epoch, convert=True, ctx=ctx)
 
-    # load symbol
-    sym = get_vgg_rcnn_test()
-
     # detect
     detector = Detector(sym, ctx, args, auxs)
     pred_eval(detector, test_data, voc, vis=vis)
@@ -48,10 +51,15 @@ def parse_args():
     parser.add_argument('--gpu', dest='gpu_id', help='GPU device to test with',
                         default=0, type=int)
     parser.add_argument('--vis', dest='vis', help='turn on visualization', action='store_true')
+    parser.add_argument('--has_rpn', dest='has_rpn', help='generate proposals on the fly',
+                        action='store_true')
+    parser.add_argument('--proposal', dest='proposal', help='can be ss for selective search or rpn',
+                        default='rpn', type=str)
     args = parser.parse_args()
     return args
 
 if __name__ == '__main__':
     args = parse_args()
     ctx = mx.gpu(args.gpu_id)
-    test_net(args.image_set, args.year, args.root_path, args.devkit_path, args.prefix, args.epoch, ctx, args.vis)
+    test_rcnn(args.image_set, args.year, args.root_path, args.devkit_path, args.prefix, args.epoch, ctx, args.vis,
+              args.has_rpn, args.proposal)
diff --git a/example/rcnn/tools/test_rpn.py b/example/rcnn/tools/test_rpn.py
index 1108cdcd9517..b93c1753a42c 100644
--- a/example/rcnn/tools/test_rpn.py
+++ b/example/rcnn/tools/test_rpn.py
@@ -10,13 +10,13 @@
 from utils.load_data import load_gt_roidb
 from utils.load_model import load_param
 
+# rpn generate proposal config
+config.TEST.HAS_RPN = True
+config.TEST.RPN_PRE_NMS_TOP_N = -1
+config.TEST.RPN_POST_NMS_TOP_N = 2000
 
-def test_rpn(image_set, year, root_path, devkit_path, prefix, epoch, ctx, vis):
-    # set config
-    config.TEST.HAS_RPN = True
-    config.TEST.RPN_PRE_NMS_TOP_N = -1
-    config.TEST.RPN_POST_NMS_TOP_N = 2000
 
+def test_rpn(image_set, year, root_path, devkit_path, prefix, epoch, ctx, vis=False):
     # load symbol
     sym = get_vgg_rpn_test()
 
diff --git a/example/rcnn/tools/train_alternate.py b/example/rcnn/tools/train_alternate.py
deleted file mode 100644
index e0d475c9396a..000000000000
--- a/example/rcnn/tools/train_alternate.py
+++ /dev/null
@@ -1,216 +0,0 @@
-import argparse
-import logging
-import os
-
-import mxnet as mx
-
-from rcnn.config import config
-from rcnn.loader import AnchorLoader, ROIIter
-from rcnn.solver import Solver
-from rcnn.symbol import get_vgg_rpn, get_vgg_rpn_test, get_vgg_rcnn
-from utils.load_data import load_gt_roidb, load_rpn_roidb
-from utils.load_model import load_checkpoint, load_param
-from utils.save_model import save_checkpoint
-from utils.combine_model import combine_model
-
-
-def train_rpn(image_set, year, root_path, devkit_path, pretrained, epoch,
-              prefix, ctx, begin_epoch, end_epoch, frequent, kv_store, work_load_list=None):
-    # load symbol
-    sym = get_vgg_rpn()
-    feat_sym = get_vgg_rpn().get_internals()['rpn_cls_score_output']
-
-    # setup multi-gpu
-    config.TRAIN.BATCH_IMAGES *= len(ctx)
-    config.TRAIN.BATCH_SIZE *= len(ctx)
-
-    # load training data
-    voc, roidb = load_gt_roidb(image_set, year, root_path, devkit_path, flip=True)
-    train_data = AnchorLoader(feat_sym, roidb, batch_size=config.TRAIN.BATCH_SIZE, shuffle=True, mode='train',
-                              ctx=ctx, work_load_list=work_load_list)
-
-    # infer max shape
-    max_data_shape = [('data', (1, 3, 1000, 1000))]
-    max_data_shape_dict = {k: v for k, v in max_data_shape}
-    _, feat_shape, _ = feat_sym.infer_shape(**max_data_shape_dict)
-    from rcnn.minibatch import assign_anchor
-    import numpy as np
-    label = assign_anchor(feat_shape[0], np.zeros((0, 5)), [[1000, 1000, 1.0]])
-    max_label_shape = [('label', label['label'].shape),
-                       ('bbox_target', label['bbox_target'].shape),
-                       ('bbox_inside_weight', label['bbox_inside_weight'].shape),
-                       ('bbox_outside_weight', label['bbox_outside_weight'].shape)]
-    print 'providing maximum shape', max_data_shape, max_label_shape
-
-    # load pretrained
-    args, auxs = load_param(pretrained, epoch, convert=True)
-
-    # initialize params
-    arg_shape, _, _ = sym.infer_shape(data=(1, 3, 224, 224))
-    arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
-    args['rpn_conv_3x3_weight'] = mx.random.normal(mean=0, stdvar=0.01, shape=arg_shape_dict['rpn_conv_3x3_weight'])
-    args['rpn_conv_3x3_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_conv_3x3_bias'])
-    args['rpn_cls_score_weight'] = mx.random.normal(mean=0, stdvar=0.01, shape=arg_shape_dict['rpn_cls_score_weight'])
-    args['rpn_cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_cls_score_bias'])
-    args['rpn_bbox_pred_weight'] = mx.random.normal(mean=0, stdvar=0.01, shape=arg_shape_dict['rpn_bbox_pred_weight'])
-    args['rpn_bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_bbox_pred_bias'])
-
-    # train
-    solver = Solver(prefix, sym, ctx, begin_epoch, end_epoch, kv_store, args, auxs, momentum=0.9, wd=0.0005,
-                    learning_rate=1e-3, lr_scheduler=mx.lr_scheduler.FactorScheduler(60000, 0.1),
-                    mutable_data_shape=True, max_data_shape=max_data_shape, max_label_shape=max_label_shape)
-    solver.fit(train_data, frequent=frequent)
-
-
-def test_rpn(image_set, year, root_path, devkit_path, trained, epoch, ctx):
-    from rcnn.rpn.generate import Detector, generate_detections
-
-    # load symbol
-    sym = get_vgg_rpn_test()
-
-    # load testing data
-    voc, roidb = load_gt_roidb(image_set, year, root_path, devkit_path)
-    test_data = ROIIter(roidb, batch_size=1, shuffle=False, mode='test')
-
-    # load trained
-    args, auxs = load_param(trained, epoch, convert=True, ctx=ctx[0])
-
-    # start testing
-    detector = Detector(sym, ctx[0], args, auxs)
-    imdb_boxes = generate_detections(detector, test_data, voc, vis=False)
-    voc.evaluate_recall(roidb, candidate_boxes=imdb_boxes)
-
-
-def train_rcnn(image_set, year, root_path, devkit_path, pretrained, epoch,
-               prefix, ctx, begin_epoch, end_epoch, frequent, kv_store, work_load_list=None):
-    # load symbol
-    sym = get_vgg_rcnn()
-
-    # setup multi-gpu
-    config.TRAIN.BATCH_IMAGES *= len(ctx)
-    config.TRAIN.BATCH_SIZE *= len(ctx)
-
-    # load training data
-    voc, roidb, means, stds = load_rpn_roidb(image_set, year, root_path, devkit_path, flip=True)
-    train_data = ROIIter(roidb, batch_size=config.TRAIN.BATCH_IMAGES, shuffle=True, mode='train',
-                         ctx=ctx, work_load_list=work_load_list)
-
-    # infer max shape
-    max_data_shape = [('data', (1, 3, 1000, 1000))]
-
-    # load pretrained
-    args, auxs = load_param(pretrained, epoch, convert=True)
-
-    # initialize params
-    arg_shape, _, _ = sym.infer_shape(data=(1, 3, 224, 224), rois=(1, 5))
-    arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
-    args['cls_score_weight'] = mx.random.normal(mean=0, stdvar=0.01, shape=arg_shape_dict['cls_score_weight'])
-    args['cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['cls_score_bias'])
-    args['bbox_pred_weight'] = mx.random.normal(mean=0, stdvar=0.001, shape=arg_shape_dict['bbox_pred_weight'])
-    args['bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['bbox_pred_bias'])
-
-    # train
-    solver = Solver(prefix, sym, ctx, begin_epoch, end_epoch, kv_store, args, auxs, momentum=0.9, wd=0.0005,
-                    learning_rate=1e-3, lr_scheduler=mx.lr_scheduler.FactorScheduler(30000, 0.1),
-                    mutable_data_shape=True, max_data_shape=max_data_shape)
-    solver.fit(train_data, frequent=frequent)
-
-    # edit params and save
-    for epoch in range(begin_epoch + 1, end_epoch + 1):
-        arg_params, aux_params = load_checkpoint(prefix, epoch)
-        arg_params['bbox_pred_weight'] = (arg_params['bbox_pred_weight'].T * mx.nd.array(stds)).T
-        arg_params['bbox_pred_bias'] = arg_params['bbox_pred_bias'] * mx.nd.array(stds) + \
-                                       mx.nd.array(means)
-        save_checkpoint(prefix, epoch, arg_params, aux_params)
-
-
-def alternate_train(image_set, year, root_path, devkit_path, pretrained, epoch,
-                    ctx, begin_epoch, rpn_epoch, rcnn_epoch, frequent, kv_store, work_load_list=None):
-    # set up logger
-    logger = logging.getLogger()
-    logger.setLevel(logging.INFO)
-    config.TRAIN.BG_THRESH_LO = 0.0
-
-    logging.info('########## TRAIN RPN WITH IMAGENET INIT')
-    config.TRAIN.HAS_RPN = True
-    config.TRAIN.BATCH_SIZE = 1
-    train_rpn(image_set, year, root_path, devkit_path, pretrained, epoch,
-              'model/rpn1', ctx, begin_epoch, rpn_epoch, frequent, kv_store, work_load_list)
-
-    logging.info('########## GENERATE RPN DETECTION')
-    config.TEST.HAS_RPN = True
-    config.TEST.RPN_PRE_NMS_TOP_N = -1
-    config.TEST.RPN_POST_NMS_TOP_N = 2000
-    test_rpn(image_set, year, root_path, devkit_path, 'model/rpn1', rpn_epoch, ctx)
-
-    logging.info('########## TRAIN RCNN WITH IMAGENET INIT AND RPN DETECTION')
-    config.TRAIN.HAS_RPN = False
-    config.TRAIN.BATCH_SIZE = 128
-    train_rcnn(image_set, year, root_path, devkit_path, pretrained, epoch,
-               'model/rcnn1', ctx, begin_epoch, rcnn_epoch, frequent, kv_store, work_load_list)
-
-    logging.info('########## TRAIN RPN WITH RCNN INIT')
-    config.TRAIN.HAS_RPN = True
-    config.TRAIN.BATCH_SIZE = 1
-    config.TRAIN.FINETUNE = True
-    train_rpn(image_set, year, root_path, devkit_path, 'model/rcnn1', rcnn_epoch,
-              'model/rpn2', ctx, begin_epoch, rpn_epoch, frequent, kv_store, work_load_list)
-
-    logging.info('########## GENERATE RPN DETECTION')
-    config.TEST.HAS_RPN = True
-    config.TEST.RPN_PRE_NMS_TOP_N = -1
-    config.TEST.RPN_POST_NMS_TOP_N = 2000
-    test_rpn(image_set, year, root_path, devkit_path, 'model/rpn2', rpn_epoch, ctx)
-
-    logger.info('########## COMBINE RPN2 WITH RCNN1')
-    combine_model('model/rpn2', rpn_epoch, 'model/rcnn1', rcnn_epoch, 'model/rcnn2', 0)
-
-    logger.info('########## TRAIN RCNN WITH RPN INIT AND DETECTION')
-    config.TRAIN.HAS_RPN = False
-    config.TRAIN.BATCH_SIZE = 128
-    train_rcnn(image_set, year, root_path, devkit_path, 'model/rcnn2', 0,
-               'model/rcnn2', ctx, begin_epoch, rcnn_epoch, frequent, kv_store, work_load_list)
-
-    logger.info('########## COMBINE RPN2 WITH RCNN2')
-    combine_model('model/rpn2', rpn_epoch, 'model/rcnn2', rcnn_epoch, 'model/final', 0)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Train Faster R-CNN Network')
-    parser.add_argument('--image_set', dest='image_set', help='can be trainval or train',
-                        default='trainval', type=str)
-    parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012',
-                        default='2007', type=str)
-    parser.add_argument('--root_path', dest='root_path', help='output data folder',
-                        default=os.path.join(os.getcwd(), 'data'), type=str)
-    parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path',
-                        default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str)
-    parser.add_argument('--pretrained', dest='pretrained', help='pretrained model prefix',
-                        default=os.path.join(os.getcwd(), 'model', 'vgg16'), type=str)
-    parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model',
-                        default=1, type=int)
-    parser.add_argument('--prefix', dest='prefix', help='new model prefix',
-                        default=os.path.join(os.getcwd(), 'model', 'rcnn'), type=str)
-    parser.add_argument('--gpus', dest='gpu_ids', help='GPU device to train with',
-                        default='0', type=str)
-    parser.add_argument('--begin_epoch', dest='begin_epoch', help='begin epoch of training',
-                        default=0, type=int)
-    parser.add_argument('--rpn_epoch', dest='rpn_epoch', help='end epoch of rpn training',
-                        default=8, type=int)
-    parser.add_argument('--rcnn_epoch', dest='rcnn_epoch', help='end epoch of rcnn training',
-                        default=8, type=int)
-    parser.add_argument('--frequent', dest='frequent', help='frequency of logging',
-                        default=20, type=int)
-    parser.add_argument('--kv_store', dest='kv_store', help='the kv-store type',
-                        default='device', type=str)
-    parser.add_argument('--work_load_list', dest='work_load_list', help='work load for different devices',
-                        default=None, type=list)
-    args = parser.parse_args()
-    return args
-
-if __name__ == '__main__':
-    args = parse_args()
-    ctx = [mx.gpu(int(i)) for i in args.gpu_ids.split(',')]
-    alternate_train(args.image_set, args.year, args.root_path, args.devkit_path, args.pretrained, args.epoch,
-                    ctx, args.begin_epoch, args.rpn_epoch, args.rcnn_epoch, args.frequent,
-                    args.kv_store, args.work_load_list)
diff --git a/example/rcnn/tools/train_net.py b/example/rcnn/tools/train_net.py
deleted file mode 100644
index 0214b11326f4..000000000000
--- a/example/rcnn/tools/train_net.py
+++ /dev/null
@@ -1,64 +0,0 @@
-import mxnet as mx
-import logging
-from rcnn.config import config
-from load_data import load_train_roidb
-from rcnn.data_iter import ROIIter
-from rcnn.symbol import get_symbol_vgg
-from load_model import load_checkpoint, load_param
-from rcnn.solver import Solver
-from save_model import save_checkpoint
-
-
-def train_net(image_set, year, root_path, devkit_path, pretrained, epoch,
-              prefix, ctx, begin_epoch, end_epoch, frequent):
-    """
-    wrapper for solver
-    :param image_set: image set to train on
-    :param year: year of image set
-    :param root_path: 'data' folder
-    :param devkit_path: 'VOCdevkit' folder
-    :param pretrained: prefix of pretrained model
-    :param epoch: epoch of pretrained model
-    :param prefix: prefix of new model
-    :param ctx: context to train in
-    :param begin_epoch: begin epoch number
-    :param end_epoch: end epoch number
-    :param frequent: frequency to print
-    :return: None
-    """
-    # set up logger
-    logger = logging.getLogger()
-    logger.setLevel(logging.INFO)
-
-    # load training data
-    voc, roidb, means, stds = load_train_roidb(image_set, year, root_path, devkit_path, flip=True)
-    train_data = ROIIter(roidb, batch_size=config.TRAIN.BATCH_IMAGES, shuffle=True, mode='train')
-
-    # load pretrained
-    args, auxs = load_param(pretrained, epoch, convert=True, ctx=ctx)
-    del args['fc8_bias']
-    del args['fc8_weight']
-
-    # load symbol
-    sym = get_symbol_vgg()
-
-    # initialize params
-    arg_shape, _, _ = sym.infer_shape(data=(1, 3, 224, 224), rois=(1, 5))
-    arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
-    args['cls_score_weight'] = mx.random.normal(loc=0, scale=0.01, shape=arg_shape_dict['cls_score_weight'], ctx=ctx)
-    args['cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['cls_score_bias'], ctx=ctx)
-    args['bbox_pred_weight'] = mx.random.normal(loc=0, scale=0.001, shape=arg_shape_dict['bbox_pred_weight'], ctx=ctx)
-    args['bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['bbox_pred_bias'], ctx=ctx)
-
-    # train
-    solver = Solver(prefix, sym, ctx, begin_epoch, end_epoch, args, auxs, momentum=0.9, wd=0.0005,
-                    learning_rate=0.001, lr_scheduler=mx.lr_scheduler.FactorScheduler(30000, 0.1))
-    solver.fit(train_data, frequent=frequent)
-
-    # edit params and save
-    for epoch in range(begin_epoch + 1, end_epoch + 1):
-        arg_params, aux_params = load_checkpoint(prefix, epoch)
-        arg_params['bbox_pred_weight'] = (arg_params['bbox_pred_weight'].T * mx.nd.array(stds, ctx=ctx)).T
-        arg_params['bbox_pred_bias'] = arg_params['bbox_pred_bias'] * mx.nd.array(stds, ctx=ctx) + \
-                                       mx.nd.array(means, ctx=ctx)
-        save_checkpoint(prefix, epoch, arg_params, aux_params)
diff --git a/example/rcnn/tools/train_rcnn.py b/example/rcnn/tools/train_rcnn.py
index c1c9790149bc..432c6a950a79 100644
--- a/example/rcnn/tools/train_rcnn.py
+++ b/example/rcnn/tools/train_rcnn.py
@@ -10,16 +10,14 @@
 from rcnn.metric import AccuracyMetric, LogLossMetric, SmoothL1LossMetric
 from rcnn.module import MutableModule
 from rcnn.symbol import get_vgg_rcnn
-from utils.load_data import load_rpn_roidb
+from utils.load_data import load_ss_roidb, load_rpn_roidb
 from utils.load_model import load_checkpoint, load_param
 from utils.save_model import save_checkpoint
 
-config.TRAIN.BG_THRESH_LO = 0.0
-config.TRAIN.ASPECT_GROUPING = False
 
-
-def train_net(image_set, year, root_path, devkit_path, pretrained, epoch,
-              prefix, ctx, begin_epoch, end_epoch, frequent, kv_store, work_load_list=None, resume=False):
+def train_rcnn(image_set, year, root_path, devkit_path, pretrained, epoch,
+               prefix, ctx, begin_epoch, end_epoch, frequent, kv_store,
+               work_load_list=None, resume=False, proposal='rpn'):
     # set up logger
     logger = logging.getLogger()
     logger.setLevel(logging.INFO)
@@ -32,7 +30,7 @@ def train_net(image_set, year, root_path, devkit_path, pretrained, epoch,
     config.TRAIN.BATCH_SIZE *= len(ctx)
 
     # load training data
-    voc, roidb, means, stds = load_rpn_roidb(image_set, year, root_path, devkit_path, flip=True)
+    voc, roidb, means, stds = {'ss': load_ss_roidb, 'rpn': load_rpn_roidb}[proposal](image_set, year, root_path, devkit_path, flip=True)  # no eval() on --proposal
     train_data = ROIIter(roidb, batch_size=config.TRAIN.BATCH_IMAGES, shuffle=True, mode='train',
                          ctx=ctx, work_load_list=work_load_list)
 
@@ -53,12 +51,10 @@ def train_net(image_set, year, root_path, devkit_path, pretrained, epoch,
         args['bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['bbox_pred_bias'])
 
     # prepare training
-    fixed_params_names = []
-    for name in args.keys():
-        if config.TRAIN.FINETUNE and name.startswith('conv'):
-            fixed_params_names.append(name)
-        elif name.startswith('conv1') or name.startswith('conv2'):
-            fixed_params_names.append(name)
+    if config.TRAIN.FINETUNE:
+        fixed_param_prefix = ['conv1', 'conv2', 'conv3', 'conv4', 'conv5']
+    else:
+        fixed_param_prefix = ['conv1', 'conv2']
     data_names = [k[0] for k in train_data.provide_data]
     label_names = [k[0] for k in train_data.provide_label]
     batch_end_callback = Speedometer(train_data.batch_size, frequent=frequent)
@@ -82,7 +78,7 @@ def train_net(image_set, year, root_path, devkit_path, pretrained, epoch,
     # train
     mod = MutableModule(sym, data_names=data_names, label_names=label_names,
                         logger=logger, context=ctx, work_load_list=work_load_list,
-                        max_data_shapes=max_data_shape)
+                        max_data_shapes=max_data_shape, fixed_param_prefix=fixed_param_prefix)
     mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback,
             batch_end_callback=batch_end_callback, kvstore=kv_store,
             optimizer='sgd', optimizer_params=optimizer_params,
@@ -98,7 +94,7 @@ def train_net(image_set, year, root_path, devkit_path, pretrained, epoch,
 
 
 def parse_args():
-    parser = argparse.ArgumentParser(description='Train a Region Proposal Network')
+    parser = argparse.ArgumentParser(description='Train a Fast R-CNN Network')
     parser.add_argument('--image_set', dest='image_set', help='can be trainval or train',
                         default='trainval', type=str)
     parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012',
@@ -127,6 +123,8 @@ def parse_args():
                         default=None, type=list)
     parser.add_argument('--finetune', dest='finetune', help='second round finetune', action='store_true')
     parser.add_argument('--resume', dest='resume', help='continue training', action='store_true')
+    parser.add_argument('--proposal', dest='proposal', help='can be ss for selective search or rpn',
+                        default='rpn', type=str)
     args = parser.parse_args()
     return args
 
@@ -135,6 +133,6 @@ def parse_args():
     ctx = [mx.gpu(int(i)) for i in args.gpu_ids.split(',')]
     if args.finetune:
         config.TRAIN.FINETUNE = True
-    train_net(args.image_set, args.year, args.root_path, args.devkit_path, args.pretrained, args.epoch,
-              args.prefix, ctx, args.begin_epoch, args.end_epoch, args.frequent,
-              args.kv_store, args.work_load_list, args.resume)
+    train_rcnn(args.image_set, args.year, args.root_path, args.devkit_path, args.pretrained, args.epoch,
+               args.prefix, ctx, args.begin_epoch, args.end_epoch, args.frequent,
+               args.kv_store, args.work_load_list, args.resume, args.proposal)
diff --git a/example/rcnn/tools/train_rpn.py b/example/rcnn/tools/train_rpn.py
index b2c100b29095..1b3f489b490b 100644
--- a/example/rcnn/tools/train_rpn.py
+++ b/example/rcnn/tools/train_rpn.py
@@ -13,12 +13,12 @@
 from utils.load_data import load_gt_roidb
 from utils.load_model import load_param
 
+# rpn config
 config.TRAIN.HAS_RPN = True
 config.TRAIN.BATCH_SIZE = 1
-config.TRAIN.ASPECT_GROUPING = False
 
 
-def train_net(image_set, year, root_path, devkit_path, pretrained, epoch,
+def train_rpn(image_set, year, root_path, devkit_path, pretrained, epoch,
               prefix, ctx, begin_epoch, end_epoch, frequent, kv_store, work_load_list=None, resume=False):
     # set up logger
     logger = logging.getLogger()
@@ -66,12 +66,10 @@ def train_net(image_set, year, root_path, devkit_path, pretrained, epoch,
         args['rpn_bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_bbox_pred_bias'])
 
     # prepare training
-    fixed_params_names = []
-    for name in args.keys():
-        if config.TRAIN.FINETUNE and name.startswith('conv'):
-            fixed_params_names.append(name)
-        elif name.startswith('conv1') or name.startswith('conv2'):
-            fixed_params_names.append(name)
+    if config.TRAIN.FINETUNE:
+        fixed_param_prefix = ['conv1', 'conv2', 'conv3', 'conv4', 'conv5']
+    else:
+        fixed_param_prefix = ['conv1', 'conv2']
     data_names = [k[0] for k in train_data.provide_data]
     label_names = [k[0] for k in train_data.provide_label]
     batch_end_callback = Speedometer(train_data.batch_size, frequent=frequent)
@@ -95,7 +93,8 @@ def train_net(image_set, year, root_path, devkit_path, pretrained, epoch,
     # train
     mod = MutableModule(sym, data_names=data_names, label_names=label_names,
                         logger=logger, context=ctx, work_load_list=work_load_list,
-                        max_data_shapes=max_data_shape, max_label_shapes=max_label_shape)
+                        max_data_shapes=max_data_shape, max_label_shapes=max_label_shape,
+                        fixed_param_prefix=fixed_param_prefix)
     mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback,
             batch_end_callback=batch_end_callback, kvstore=kv_store,
             optimizer='sgd', optimizer_params=optimizer_params,
@@ -140,6 +139,6 @@ def parse_args():
     ctx = [mx.gpu(int(i)) for i in args.gpu_ids.split(',')]
     if args.finetune:
         config.TRAIN.FINETUNE = True
-    train_net(args.image_set, args.year, args.root_path, args.devkit_path, args.pretrained, args.epoch,
+    train_rpn(args.image_set, args.year, args.root_path, args.devkit_path, args.pretrained, args.epoch,
               args.prefix, ctx, args.begin_epoch, args.end_epoch, args.frequent,
               args.kv_store, args.work_load_list, args.resume)
diff --git a/example/rcnn/train_alternate.py b/example/rcnn/train_alternate.py
new file mode 100644
index 000000000000..5e3ba7f07780
--- /dev/null
+++ b/example/rcnn/train_alternate.py
@@ -0,0 +1,104 @@
+import argparse
+import logging
+import os
+
+import mxnet as mx
+
+from rcnn.config import config
+from rcnn.loader import AnchorLoader, ROIIter
+from tools.train_rpn import train_rpn
+from tools.train_rcnn import train_rcnn
+from tools.test_rpn import test_rpn
+from utils.combine_model import combine_model
+
+
+def alternate_train(image_set, test_image_set, year, root_path, devkit_path, pretrained, epoch,
+                    ctx, begin_epoch, rpn_epoch, rcnn_epoch, frequent, kv_store, work_load_list=None):
+    # NOTE(review): test_image_set is accepted but never read in this function; the logger below is the root logger
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    config.TRAIN.BG_THRESH_LO = 0.0
+
+    logging.info('########## TRAIN RPN WITH IMAGENET INIT')
+    config.TRAIN.HAS_RPN = True
+    config.TRAIN.BATCH_SIZE = 1
+    train_rpn(image_set, year, root_path, devkit_path, pretrained, epoch,
+              'model/rpn1', ctx, begin_epoch, rpn_epoch, frequent, kv_store, work_load_list)  # new-model prefix 'model/rpn1'
+
+    logging.info('########## GENERATE RPN DETECTION')
+    config.TEST.HAS_RPN = True
+    config.TEST.RPN_PRE_NMS_TOP_N = -1
+    config.TEST.RPN_POST_NMS_TOP_N = 2000
+    test_rpn(image_set, year, root_path, devkit_path, 'model/rpn1', rpn_epoch, ctx[0])  # runs on the first context only
+
+    logging.info('########## TRAIN RCNN WITH IMAGENET INIT AND RPN DETECTION')
+    config.TRAIN.HAS_RPN = False
+    config.TRAIN.BATCH_SIZE = 128
+    train_rcnn(image_set, year, root_path, devkit_path, pretrained, epoch,
+               'model/rcnn1', ctx, begin_epoch, rcnn_epoch, frequent, kv_store, work_load_list)  # new-model prefix 'model/rcnn1'
+
+    logging.info('########## TRAIN RPN WITH RCNN INIT')
+    config.TRAIN.HAS_RPN = True
+    config.TRAIN.BATCH_SIZE = 1
+    config.TRAIN.FINETUNE = True  # never reset below, so the final train_rcnn call also runs with FINETUNE set
+    train_rpn(image_set, year, root_path, devkit_path, 'model/rcnn1', rcnn_epoch,
+              'model/rpn2', ctx, begin_epoch, rpn_epoch, frequent, kv_store, work_load_list)  # initialised from 'model/rcnn1' at rcnn_epoch
+
+    logging.info('########## GENERATE RPN DETECTION')
+    config.TEST.HAS_RPN = True
+    config.TEST.RPN_PRE_NMS_TOP_N = -1
+    config.TEST.RPN_POST_NMS_TOP_N = 2000
+    test_rpn(image_set, year, root_path, devkit_path, 'model/rpn2', rpn_epoch, ctx[0])  # runs on the first context only
+
+    logger.info('########## COMBINE RPN2 WITH RCNN1')
+    combine_model('model/rpn2', rpn_epoch, 'model/rcnn1', rcnn_epoch, 'model/rcnn2', 0)  # presumably writes 'model/rcnn2' epoch 0 - confirm in utils.combine_model
+
+    logger.info('########## TRAIN RCNN WITH RPN INIT AND DETECTION')
+    config.TRAIN.HAS_RPN = False
+    config.TRAIN.BATCH_SIZE = 128
+    train_rcnn(image_set, year, root_path, devkit_path, 'model/rcnn2', 0,
+               'model/rcnn2', ctx, begin_epoch, rcnn_epoch, frequent, kv_store, work_load_list)  # loads 'model/rcnn2' epoch 0, same prefix for output
+
+    logger.info('########## COMBINE RPN2 WITH RCNN2')
+    combine_model('model/rpn2', rpn_epoch, 'model/rcnn2', rcnn_epoch, 'model/final', 0)  # presumably the final combined model - confirm
+
+
+def parse_args():
+    """Parse CLI options; --work_load_list is a comma-separated list of numbers (e.g. 1,2)."""
+    parser = argparse.ArgumentParser(description='Train Faster R-CNN Network')
+    parser.add_argument('--image_set', dest='image_set', help='can be trainval or train',
+                        default='trainval', type=str)
+    parser.add_argument('--test_image_set', dest='test_image_set', help='can be test or val',
+                        default='test', type=str)
+    parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012',
+                        default='2007', type=str)
+    parser.add_argument('--root_path', dest='root_path', help='output data folder',
+                        default=os.path.join(os.getcwd(), 'data'), type=str)
+    parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path',
+                        default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str)
+    parser.add_argument('--pretrained', dest='pretrained', help='pretrained model prefix',
+                        default=os.path.join(os.getcwd(), 'model', 'vgg16'), type=str)
+    parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model',
+                        default=1, type=int)
+    parser.add_argument('--gpus', dest='gpu_ids', help='GPU device to train with',
+                        default='0', type=str)
+    parser.add_argument('--begin_epoch', dest='begin_epoch', help='begin epoch of training',
+                        default=0, type=int)
+    parser.add_argument('--rpn_epoch', dest='rpn_epoch', help='end epoch of rpn training',
+                        default=8, type=int)
+    parser.add_argument('--rcnn_epoch', dest='rcnn_epoch', help='end epoch of rcnn training',
+                        default=8, type=int)
+    parser.add_argument('--frequent', dest='frequent', help='frequency of logging',
+                        default=20, type=int)
+    parser.add_argument('--kv_store', dest='kv_store', help='the kv-store type',
+                        default='device', type=str)
+    parser.add_argument('--work_load_list', dest='work_load_list', help='work load for different devices',
+                        default=None, type=lambda s: [float(w) for w in s.split(',')])  # type=list split '1,2' into characters
+    return parser.parse_args()
+
+if __name__ == '__main__':
+    cli = parse_args()
+    devices = [mx.gpu(int(gpu_id)) for gpu_id in cli.gpu_ids.split(',')]  # one GPU context per comma-separated id
+    alternate_train(cli.image_set, cli.test_image_set, cli.year, cli.root_path, cli.devkit_path,
+                    cli.pretrained, cli.epoch, devices, cli.begin_epoch, cli.rpn_epoch, cli.rcnn_epoch,
+                    cli.frequent, cli.kv_store, cli.work_load_list)
diff --git a/python/mxnet/module/executor_group.py b/python/mxnet/module/executor_group.py
index 094553cfc77d..62e815e487ec 100644
--- a/python/mxnet/module/executor_group.py
+++ b/python/mxnet/module/executor_group.py
@@ -57,10 +57,13 @@ class DataParallelExecutorGroup(object):
         of the data/label inputs.
     logger : Logger
         Default is `logging`.
+    fixed_param_names : list of str
+        Names of parameters to keep fixed during training. No gradient space is allocated
+        for parameters in this list, and no gradients are computed for them.
     """
     def __init__(self, symbol, contexts, workload, data_shapes, label_shapes, param_names,
                  for_training, inputs_need_grad, shared_group=None, input_types=None,
-                 logger=logging):
+                 logger=logging, fixed_param_names=None):
         self.param_names = param_names
         self.arg_names = symbol.list_arguments()
         self.aux_names = symbol.list_auxiliary_states()
@@ -75,6 +78,10 @@ def __init__(self, symbol, contexts, workload, data_shapes, label_shapes, param_
         self.input_types = input_types
         self.logger = logger
 
+        self.fixed_param_names = fixed_param_names
+        if self.fixed_param_names is None:
+            self.fixed_param_names = []
+
         if shared_group is not None:
             self.shared_data_arrays = shared_group.shared_data_arrays
         else:
@@ -335,7 +342,7 @@ def _bind_ith_exec(self, i, data_shapes, label_shapes, shared_group):
         grad_req = {}
         for name in self.arg_names:
             if self.for_training:
-                if name in self.param_names:
+                if name in self.param_names and name not in self.fixed_param_names:
                     grad_req[name] = 'write'
                 elif name in data_names:
                     grad_req[name] = 'write' if self.inputs_need_grad else 'null'
diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py
index 06141d945ef8..36f92f084881 100644
--- a/python/mxnet/module/module.py
+++ b/python/mxnet/module/module.py
@@ -33,9 +33,11 @@ class Module(BaseModule):
         Default is `cpu()`.
     work_load_list : list of number
         Default `None`, indicating uniform workload.
+    fixed_param_names : list of str
+        Default `None`, indicating no network parameters are fixed.
     """
     def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',),
-                 logger=logging, context=ctx.cpu(), work_load_list=None):
+                 logger=logging, context=ctx.cpu(), work_load_list=None, fixed_param_names=None):
         super(Module, self).__init__(logger=logger)
 
         if isinstance(context, ctx.Context):
@@ -54,6 +56,7 @@ def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',),
         arg_names = symbol.list_arguments()
         input_names = data_names + label_names
         self._param_names = [x for x in arg_names if x not in input_names]
+        self._fixed_param_names = fixed_param_names
         self._aux_names = symbol.list_auxiliary_states()
         self._data_names = data_names
         self._label_names = label_names
@@ -255,7 +258,8 @@ def bind(self, data_shapes, label_shapes=None, for_training=True,
                                                      self._work_load_list, data_shapes,
                                                      label_shapes, self._param_names,
                                                      for_training, inputs_need_grad,
-                                                     shared_group, logger=self.logger)
+                                                     shared_group, logger=self.logger,
+                                                     fixed_param_names=self._fixed_param_names)
         if shared_module is not None:
             self.params_initialized = True
             self._arg_params = shared_module._arg_params