From 841c6e73d6b79b0b3c2b4fb7831ee968c80a51ea Mon Sep 17 00:00:00 2001
From: charlie <charliehaley@gmail.com>
Date: Mon, 4 Jan 2016 10:34:45 -0600
Subject: [PATCH 01/32] 5x Speedup of Preprocessing.py - lost ability to pass
 lambda function though

---
 example/kaggle-ndsb2/Preprocessing.py | 43 +++++++++++++++------------
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/example/kaggle-ndsb2/Preprocessing.py b/example/kaggle-ndsb2/Preprocessing.py
index fb55b4634066..6fa23441a3a0 100644
--- a/example/kaggle-ndsb2/Preprocessing.py
+++ b/example/kaggle-ndsb2/Preprocessing.py
@@ -10,6 +10,7 @@
 import numpy as np
 import dicom
 from skimage import io, transform
+from joblib import Parallel, delayed
 
 def mkdir(fname):
    try:
@@ -53,29 +54,33 @@ def write_label_csv(fname, frames, label_map):
    fo.close()
 
 
+def get_data(lst):
+   data = []
+   result=[]
+   for path in lst:
+       f = dicom.read_file(path)
+       img = crop_resize(f.pixel_array.astype(float) / np.max(f.pixel_array),64)
+       dst_path = path.rsplit(".", 1)[0] + ".64x64.jpg"
+       scipy.misc.imsave(dst_path, img)
+       result.append(dst_path)
+       data.append(img)
+   data = np.array(data, dtype=np.uint8)
+   data = data.reshape(data.size)
+   data = np.array(data,dtype=np.str_)
+   data = data.reshape(data.size)
+   return [data,result]
+
+
 def write_data_csv(fname, frames, preproc):
    """Write data to csv file"""
    fdata = open(fname, "w")
-   dwriter = csv.writer(fdata)
-   counter = 0
-   result = []
-   for lst in frames:
-       data = []
-       for path in lst:
-           f = dicom.read_file(path)
-           img = preproc(f.pixel_array.astype(float) / np.max(f.pixel_array))
-           dst_path = path.rsplit(".", 1)[0] + ".64x64.jpg"
-           scipy.misc.imsave(dst_path, img)
-           result.append(dst_path)
-           data.append(img)
-       data = np.array(data, dtype=np.uint8)
-       data = data.reshape(data.size)
-       dwriter.writerow(data)
-       counter += 1
-       if counter % 100 == 0:
-           print("%d slices processed" % counter)
-   print("All finished, %d slices in total" % counter)
+   dr=Parallel()(delayed(get_data)(lst) for lst in frames)
+   data,result=zip(*dr)
+   for entry in data:
+      fdata.write(','.join(entry)+'\r\n')
+   print("All finished, %d slices in total" % len(data))
    fdata.close()
+   result=np.ravel(result)
    return result
 
 

From 8655b4f4c909282658f1d83a031d298865ca9070 Mon Sep 17 00:00:00 2001
From: charlie <charliehaley@gmail.com>
Date: Mon, 4 Jan 2016 11:10:32 -0600
Subject: [PATCH 02/32] imported dill for passing lambda function

---
 example/kaggle-ndsb2/Preprocessing.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/example/kaggle-ndsb2/Preprocessing.py b/example/kaggle-ndsb2/Preprocessing.py
index 6fa23441a3a0..fd4f0e3c91f7 100644
--- a/example/kaggle-ndsb2/Preprocessing.py
+++ b/example/kaggle-ndsb2/Preprocessing.py
@@ -11,6 +11,7 @@
 import dicom
 from skimage import io, transform
 from joblib import Parallel, delayed
+import dill
 
 def mkdir(fname):
    try:
@@ -54,12 +55,12 @@ def write_label_csv(fname, frames, label_map):
    fo.close()
 
 
-def get_data(lst):
+def get_data(lst,preproc):
    data = []
    result=[]
    for path in lst:
        f = dicom.read_file(path)
-       img = crop_resize(f.pixel_array.astype(float) / np.max(f.pixel_array),64)
+       img = preproc(f.pixel_array.astype(float) / np.max(f.pixel_array))
        dst_path = path.rsplit(".", 1)[0] + ".64x64.jpg"
        scipy.misc.imsave(dst_path, img)
        result.append(dst_path)
@@ -74,7 +75,7 @@ def get_data(lst):
 def write_data_csv(fname, frames, preproc):
    """Write data to csv file"""
    fdata = open(fname, "w")
-   dr=Parallel()(delayed(get_data)(lst) for lst in frames)
+   dr=Parallel()(delayed(get_data)(lst,preproc) for lst in frames)
    data,result=zip(*dr)
    for entry in data:
       fdata.write(','.join(entry)+'\r\n')

From e70338b060e96408d9be25f042f7957797826137 Mon Sep 17 00:00:00 2001
From: Ruixiang Zhang <sodabeta7@gmail.com>
Date: Wed, 6 Jan 2016 16:45:47 +0800
Subject: [PATCH 03/32] fix bug

---
 tools/accnn/accnn.py          |  3 +-
 tools/accnn/rank_selection.py |  9 +++---
 tools/accnn/utils.py          | 55 +++++++++++------------------------
 3 files changed, 24 insertions(+), 43 deletions(-)

diff --git a/tools/accnn/accnn.py b/tools/accnn/accnn.py
index a5e3c8fdd5bf..22aad24a83f0 100644
--- a/tools/accnn/accnn.py
+++ b/tools/accnn/accnn.py
@@ -12,7 +12,7 @@
 parser.add_argument('-m', '--model',  help='the model to speed up')
 parser.add_argument('-g', '--gpus', default='0', help='the gpus will be used, e.g "0,1,2,3"')
 parser.add_argument('--load-epoch',type=int, default=1, help="load the model on an epoch using the model-prefix")
-parser.add_argument('--save-model', help='output model prefix')
+parser.add_argument('--save-model', type=str, default='new-model', help='output model prefix')
 parser.add_argument('--config', default=None, help='specify the config file')
 parser.add_argument('--ratio', type=float, default=2, help='speed up ratio')
 args = parser.parse_args()
@@ -25,6 +25,7 @@
   config['conv_params'] = rank_selection.get_ranksel(model, args.ratio)
   config['fc_params'] = {}
   json.dump(config, open('config-rksel-%.1f.json'%(args.ratio), 'w'), indent=2)
+  args.config = config
 
 new_model = model
 Args = collections.namedtuple('ConvArgs', 'layer K')
diff --git a/tools/accnn/rank_selection.py b/tools/accnn/rank_selection.py
index 57e3bcc8acd1..ee0fd98acb9d 100644
--- a/tools/accnn/rank_selection.py
+++ b/tools/accnn/rank_selection.py
@@ -33,12 +33,13 @@ def get_ranksel(model, ratio):
   for node in nodes:
     if node['op'] == 'Convolution':        
       input_nodes = [nodes[int(j[0])] for j in node['inputs']]
-      data = [input_node['name'] for input_node in input_nodes\
+      data = [input_node for input_node in input_nodes\
                                   if not input_node['name'].startswith(node['name'])][0]      
-      if utils.is_input(node):
+
+      if utils.is_input(data):
         ishape = (3, 224, 224)
       else:
-        ishape = out_shape_dic[data + '_output'][1:]
+        ishape = out_shape_dic[data['name'] + '_output'][1:]
       C.append(calc_complexity(ishape, node))
       D.append(int(node['param']['num_filter']))
       S.append(calc_eigenvalue(model, node))
@@ -81,6 +82,6 @@ def get_ranksel(model, ratio):
   res = [0]*n
   nowc = target_c
   for i in xrange(n-1,-1,-1):    
-    res[i] = dpc[i][nowc][0]
+    res[i] = dpc[i][nowc][0] + 1
     nowc = dpc[i][nowc][1]
   return dict(zip(conv_names, res))
diff --git a/tools/accnn/utils.py b/tools/accnn/utils.py
index a57a384b1fab..6ac13dab4a05 100644
--- a/tools/accnn/utils.py
+++ b/tools/accnn/utils.py
@@ -1,6 +1,7 @@
 import mxnet as mx
 import copy
 import json
+import ast
 
 def load_model(args):
   devs = mx.cpu() if args.gpus == None else [mx.gpu(int(i)) for i in args.gpus.split(',')]  
@@ -14,10 +15,7 @@ def topsort(nodes):
     if node.has_key('inputs'):
       for j in node['inputs']:
         deg[i] += 1
-        g[j[0]].append(i)
-        if node['name'] == '':
-          print node
-          print '!!!',j[0]
+        g[j[0]].append(i)        
   from collections import deque
   q = deque([i for i in xrange(n) if deg[i]==0])
   res = []  
@@ -38,7 +36,18 @@ def topsort(nodes):
 def is_input(node):
   name = node['name']
   return len(node['inputs']) == 0 and ('weight' not in name) and ('bias' not in name) and ('label' not in name)
-  
+
+def sym_factory(node, data):
+  name = node['name']
+  params = {}
+  if 'param' in node:    
+    for k, v in node['param'].iteritems():
+      try:
+        params[k] = ast.literal_eval(v)
+      except ValueError, e:
+        params[k] = v
+  return getattr(mx.symbol, node['op'])(data=data, name=name, **params)
+
 def replace_conv_layer(layer_name, old_model, sym_handle, arg_handle):
   conf = json.loads(old_model.symbol.tojson())
   sym_dict = {}
@@ -57,45 +66,15 @@ def replace_conv_layer(layer_name, old_model, sym_handle, arg_handle):
       try:
         data=sym_dict[datas[0]]
       except Exception, e:
-        print 'can not find symbol %s'%(datas[0])      
+        print 'can not find symbol %s'%(datas[0])
         raise e    
       if node['name'] == layer_name:
         sym = sym_handle(data, node)          
       else:
-        if node['op'] == 'Convolution':           
-          kernel = eval(node['param']['kernel'])
-          pad = eval(node['param']['pad'])
-          num_filter = int(node['param']['num_filter'])
-          name = node['name']
-          sym = mx.symbol.Convolution(data=data, kernel=kernel, pad=pad, num_filter=num_filter, name=name)        
-        elif node['op'] == 'Activation':
-          sym = mx.symbol.Activation(data=data, act_type=node['param']['act_type'], name=node['name'])
-        elif node['op'] == 'Pooling':
-          kernel = eval(node['param']['kernel'])
-          pad = eval(node['param']['pad'])
-          pool_type = node['param']['pool_type']
-          stride = eval(node['param']['stride'])
-          sym = mx.symbol.Pooling(data=data, kernel=kernel, pad=pad, pool_type=pool_type, stride=stride, name=node['name'])
-        elif node['op'] == 'Dropout':
-          p = float(node['param']['p'])
-          sym = mx.symbol.Dropout(data=data, p=p, name=node['name'])
-        elif node['op'] == 'FullyConnected':
-          no_bias = True if node['param']['no_bias']=='True' else False
-          num_hidden = int(node['param']['num_hidden'])
-          sym = mx.symbol.FullyConnected(data=data, num_hidden=num_hidden, no_bias=no_bias, name=node['name'])
-        elif node['op'] == 'Flatten':        
-          sym = mx.symbol.Flatten(data=data, name=node['name'])
-        elif node['op'] == 'SoftmaxOutput':        
-          sym = mx.symbol.SoftmaxOutput(data=data, name='softmax')
-          res_sym = sym      
-        elif node['op'] == 'Reshape':
-          target_shape = eval(node['param']['target_shape'])
-          sym = mx.symbol.Reshape(data=data, target_shape=target_shape)
-          res_sym = sym
-        else:
-          raise Exception("Invalid symbol")
+        sym = sym_factory(node, data)        
     if sym:
       sym_dict[node['name']] = sym
+      res_sym = sym
 
   arg_params = copy.deepcopy(old_model.arg_params)
   if layer_name:  

From d9aeb99cf0f76e0d9b5f787045acefbbd9a53be8 Mon Sep 17 00:00:00 2001
From: Alexander Skidanov <skidanov.alexander@gmail.com>
Date: Tue, 5 Jan 2016 21:50:24 -0800
Subject: [PATCH 04/32] Fixing ConcatOp when one of the inputs doesn't have
 gradient

Fixes #1130
---
 src/operator/channel_op_common.h       | 11 +++++-----
 src/operator/concat-inl.h              | 19 ++++++++++++++--
 tests/python/unittest/test_operator.py | 30 ++++++++++++++++++++------
 3 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/src/operator/channel_op_common.h b/src/operator/channel_op_common.h
index 7646dab26365..6c1281c80891 100644
--- a/src/operator/channel_op_common.h
+++ b/src/operator/channel_op_common.h
@@ -64,7 +64,8 @@ inline void Concatenate(const std::vector<mshadow::Tensor<xpu, dim> > &input,
 template<typename xpu, int dim>
 void Split(const mshadow::Tensor<xpu, dim> &input,
            std::vector<mshadow::Tensor<xpu, dim> > *output,
-           const int dimension) {
+           const int dimension,
+           std::vector<bool> mask = std::vector<bool>(31, true)) {
   using mshadow::expr::concat;
   using mshadow::expr::slice;
   std::vector<mshadow::Tensor<xpu, dim> > out = *output;
@@ -74,7 +75,7 @@ void Split(const mshadow::Tensor<xpu, dim> &input,
     case 0: {
       for (index_t i = 0; i < size; ++i) {
         index_t end = begin + out[i].size(0);
-        out[i] = slice<0>(input, begin, end);
+        if (mask[i]) out[i] = slice<0>(input, begin, end);
         begin = end;
       }
       break;
@@ -82,7 +83,7 @@ void Split(const mshadow::Tensor<xpu, dim> &input,
     case 1: {
       for (index_t i = 0; i < size; ++i) {
         index_t end = begin + out[i].size(1);
-        out[i] = slice<1>(input, begin, end);
+        if (mask[i]) out[i] = slice<1>(input, begin, end);
         begin = end;
       }
       break;
@@ -90,7 +91,7 @@ void Split(const mshadow::Tensor<xpu, dim> &input,
     case 2: {
       for (index_t i = 0; i < size; ++i) {
         index_t end = begin + out[i].size(2);
-        out[i] = slice<2>(input, begin, end);
+        if (mask[i]) out[i] = slice<2>(input, begin, end);
         begin = end;
       }
       break;
@@ -98,7 +99,7 @@ void Split(const mshadow::Tensor<xpu, dim> &input,
     case 3: {
       for (index_t i = 0; i < size; ++i) {
         index_t end = begin + out[i].size(3);
-        out[i] = slice<3>(input, begin, end);
+        if (mask[i]) out[i] = slice<3>(input, begin, end);
         begin = end;
       }
       break;
diff --git a/src/operator/concat-inl.h b/src/operator/concat-inl.h
index 6315e9968a99..50b577ffa6ac 100644
--- a/src/operator/concat-inl.h
+++ b/src/operator/concat-inl.h
@@ -99,6 +99,7 @@ class ConcatOp : public Operator {
     Stream<xpu> *s = ctx.get_stream<xpu>();
     std::vector<Tensor<xpu, 4> > grad_in(size_);
     Tensor<xpu, 4> grad;
+    std::vector<bool> mask(size_, true);
     if (out_grad[concat_enum::kOut].ndim() < 4) {
       uint32_t dim = 0;
       for (int i = 0; i < size_; ++i) {
@@ -107,8 +108,15 @@ class ConcatOp : public Operator {
           dshape = Shape4(in_grad[i].shape_[0], in_grad[i].shape_[1], 1, 1);
         else
           dshape = Shape4(in_grad[i].shape_[0], in_grad[i].shape_[1], in_grad[i].shape_[2], 1);
-        grad_in[i] = in_grad[i].get_with_shape<xpu, 4, real_t>(dshape, s);
         dim += in_grad[i].shape_[dimension_];
+        if (req[i] == kNullOp) {
+          // Input doesn't need a gradient, don't propagate any
+          mask[i] = false;
+          // set the dimension so that Split knows how much to advance
+          grad_in[i].shape_[dimension_] = dshape[dimension_];
+          continue;
+        }
+        grad_in[i] = in_grad[i].get_with_shape<xpu, 4, real_t>(dshape, s);
         CHECK_EQ(req[i], kWriteTo);
       }
       Shape<4> dshape_out;
@@ -122,12 +130,19 @@ class ConcatOp : public Operator {
       grad = out_grad[concat_enum::kOut].get_with_shape<xpu, 4, real_t>(dshape_out, s);
     } else {
       for (int i = 0; i < size_; ++i) {
+        if (req[i] == kNullOp) {
+          // Input doesn't need a gradient, don't propagate any
+          mask[i] = false;
+          // set the dimension so that Split knows how much to advance
+          grad_in[i].shape_[dimension_] = in_grad[i].shape_[dimension_];
+          continue;
+        }
         grad_in[i] = in_grad[i].get<xpu, 4, real_t>(s);
         CHECK_EQ(req[i], kWriteTo);
       }
       grad = out_grad[concat_enum::kOut].get<xpu, 4, real_t>(s);
     }
-    Split(grad, &grad_in, dimension_);
+    Split(grad, &grad_in, dimension_, mask);
   }
 
  private:
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index decbb08bbec7..190b6919315d 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -69,7 +69,9 @@ def check_slice_channel(dim, num):
     exe.backward(o_nd)
     assert reldiff(grad_nd[0].asnumpy(), np.hstack([ins[i] + i for i in range(num)])) < 1e-5
 
-def check_concat_with_shape(shapes, dimension):
+def check_concat_with_shape(shapes, dimension, skip_second):
+    # if skip_second is True, second argument will not have gradient.
+    # it is to test #1130
     n = len(shapes)
     # forward
     target_dim = 0
@@ -83,12 +85,19 @@ def check_concat_with_shape(shapes, dimension):
         arr[i][:] = shapes[i][dimension]
     arr_np = [np.copy(narray.asnumpy()) for narray in arr]
     arr_grad = [mx.nd.empty(shape) for shape in shapes]
+    dict_grad = {}
+    arg_names = out.list_arguments()
+
+    for name, g in zip(arg_names, arr_grad):
+        if not skip_second or name != 'arg1':
+            dict_grad[name] = g
+    
     args = out.list_arguments()
     arg_shapes, out_shapes, aux_shapes = out.infer_shape(**dict(zip(args, shapes)))
     out_grad = mx.nd.empty(out_shapes[0])
     exec1 = out.bind(mx.Context('cpu'),
                      args=arr,
-                     args_grad=arr_grad)
+                     args_grad=dict_grad)
     exec1.forward()
     out1 = exec1.outputs[0]
     ret = np.concatenate([narray.asnumpy() for narray in arr], axis=dimension)
@@ -97,8 +106,12 @@ def check_concat_with_shape(shapes, dimension):
     out1.copyto(out_grad)
     out_grad[:] += 1
     exec1.backward([out_grad])
-    for grad, np_grad in zip(arr_grad, arr_np):
-        assert same(grad.asnumpy(), np_grad + 1)
+
+    for i, name in enumerate(arg_names):
+        if not skip_second or name != 'arg1':
+            grad = dict_grad[name]
+            np_grad = arr_np[i]
+            assert same(grad.asnumpy(), np_grad + 1)
 
 def test_concat():
     for dimension in range(4):
@@ -116,7 +129,8 @@ def test_concat():
                         shapes.append((merge[i], a))
                     elif dimension == 1:
                         shapes.append((a, merge[i]))
-                    check_concat_with_shape(shapes,dimension)
+                    check_concat_with_shape(shapes,dimension,True)
+                    check_concat_with_shape(shapes,dimension,False)
         #test 3D
         if dimension<3:
             for dim in range(2, 6):
@@ -128,7 +142,8 @@ def test_concat():
                         shapes.append((a,merge[i],b))
                     elif dimension ==2:
                         shapes.append((a,b,merge[i]))
-                check_concat_with_shape(shapes,dimension)
+                check_concat_with_shape(shapes,dimension,True)
+                check_concat_with_shape(shapes,dimension,False)
         # test 4D
         for dim in range(2, 6):
             shapes = []
@@ -141,7 +156,8 @@ def test_concat():
                     shapes.append((a,b,merge[i],c))
                 elif dimension ==3:
                     shapes.append((a,b,c,merge[i]))
-            check_concat_with_shape(shapes,dimension)
+            check_concat_with_shape(shapes,dimension,True)
+            check_concat_with_shape(shapes,dimension,False)
 
 def test_slice_channel():
     check_slice_channel(2, 4)

From d249c90536d1eed00bde0c55bddfc407876c5b76 Mon Sep 17 00:00:00 2001
From: sxjscience <sxjscience001@gmail.com>
Date: Thu, 7 Jan 2016 20:04:11 +0800
Subject: [PATCH 05/32] Update Submodule

---
 mshadow | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mshadow b/mshadow
index 01ce2c5d5214..29863c50443d 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit 01ce2c5d5214847b59ef4980e29c08179ab1d518
+Subproject commit 29863c50443d8338cdfcfb5836a4dd854b2b0e75

From ea8e575c96c0eefbc1c1a185d2eb4b22391dc16f Mon Sep 17 00:00:00 2001
From: charlie <charliehaley@gmail.com>
Date: Thu, 7 Jan 2016 16:20:58 -0600
Subject: [PATCH 06/32] style fixes - spacing

---
 example/kaggle-ndsb2/Preprocessing.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/example/kaggle-ndsb2/Preprocessing.py b/example/kaggle-ndsb2/Preprocessing.py
index fd4f0e3c91f7..ee32a7775a73 100644
--- a/example/kaggle-ndsb2/Preprocessing.py
+++ b/example/kaggle-ndsb2/Preprocessing.py
@@ -57,7 +57,7 @@ def write_label_csv(fname, frames, label_map):
 
 def get_data(lst,preproc):
    data = []
-   result=[]
+   result = []
    for path in lst:
        f = dicom.read_file(path)
        img = preproc(f.pixel_array.astype(float) / np.max(f.pixel_array))
@@ -75,13 +75,13 @@ def get_data(lst,preproc):
 def write_data_csv(fname, frames, preproc):
    """Write data to csv file"""
    fdata = open(fname, "w")
-   dr=Parallel()(delayed(get_data)(lst,preproc) for lst in frames)
-   data,result=zip(*dr)
+   dr = Parallel()(delayed(get_data)(lst,preproc) for lst in frames)
+   data,result = zip(*dr)
    for entry in data:
       fdata.write(','.join(entry)+'\r\n')
    print("All finished, %d slices in total" % len(data))
    fdata.close()
-   result=np.ravel(result)
+   result = np.ravel(result)
    return result
 
 

From 592b78e531d62c32935238996287af52a11a8225 Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Mon, 28 Dec 2015 23:13:08 -0800
Subject: [PATCH 07/32] ndarray type support.

---
 Makefile                           |   8 ++
 include/mxnet/c_api.h              |  32 ++++-
 include/mxnet/ndarray.h            |  36 +++--
 include/mxnet/resource.h           |   8 +-
 python/mxnet/base.py               |   1 +
 python/mxnet/ndarray.py            |  85 +++++++++---
 src/c_api/c_api.cc                 |  32 ++++-
 src/common/tblob_op_registry.cc    |   3 +-
 src/ndarray/ndarray.cc             |  27 ++--
 src/ndarray/ndarray_function-inl.h | 182 +++++++++++++++++--------
 src/ndarray/ndarray_function.cc    |  13 +-
 src/ndarray/ndarray_function.cu    |  60 ++++++---
 src/ndarray/ndarray_function.h     |   6 +-
 src/ndarray/unary_function-inl.h   |  72 ++++++++--
 src/operator/dropout-inl.h         |   2 +-
 src/operator/leaky_relu-inl.h      |   2 +-
 src/operator/mshadow_op.h          | 209 +++++++++++++++++------------
 17 files changed, 548 insertions(+), 230 deletions(-)

diff --git a/Makefile b/Makefile
index 08356bb02f79..b236f7606996 100644
--- a/Makefile
+++ b/Makefile
@@ -198,10 +198,18 @@ rpkg:	roxygen
 	cp -rf dmlc-core/include/* R-package/inst/include/
 	R CMD build --no-build-vignettes R-package
 
+ifneq ($(EXTRA_OPERATORS),)
 clean:
 	$(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~
 	cd $(DMLC_CORE); make clean; cd -
 	cd $(PS_PATH); make clean; cd -
+	$(RM) -r $(EXTRA_OPERATORS)/build
+else
+clean:
+	$(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~
+	cd $(DMLC_CORE); make clean; cd -
+	cd $(PS_PATH); make clean; cd -
+endif
 
 clean_all: clean
 
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 6087fde01ea5..e31461cfe12d 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -147,6 +147,26 @@ MXNET_DLL int MXNDArrayCreate(const mx_uint *shape,
                               int dev_id,
                               int delay_alloc,
                               NDArrayHandle *out);
+
+/*!
+ * \brief create a NDArray with specified shape and data type
+ * \param shape the pointer to the shape
+ * \param ndim the dimension of the shape
+ * \param dev_type device type, specify device we want to take
+ * \param dev_id the device id of the specific device
+ * \param delay_alloc whether to delay allocation until
+ *    the narray is first mutated
+ * \param dtype data type of created array
+ * \param out the returning handle
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXNDArrayCreateEx(const mx_uint *shape,
+                              mx_uint ndim,
+                              int dev_type,
+                              int dev_id,
+                              int delay_alloc,
+                              int dtype,
+                              NDArrayHandle *out);
 /*!
  * \brief create a NDArray handle that is loaded from raw bytes.
  * \param buf the head of the raw bytes
@@ -205,7 +225,7 @@ MXNET_DLL int MXNDArrayLoad(const char* fname,
  * \param size the memory size we want to copy from.
  */
 MXNET_DLL int MXNDArraySyncCopyFromCPU(NDArrayHandle handle,
-                                       const mx_float *data,
+                                       const void *data,
                                        size_t size);
 /*!
  * \brief Perform a synchronize copyto a continugous CPU memory region.
@@ -219,7 +239,7 @@ MXNET_DLL int MXNDArraySyncCopyFromCPU(NDArrayHandle handle,
  * \param size the memory size we want to copy into.
  */
 MXNET_DLL int MXNDArraySyncCopyToCPU(NDArrayHandle handle,
-                                     mx_float *data,
+                                     void *data,
                                      size_t size);
 /*!
  * \brief Wait until all the pending writes with respect NDArray are finished.
@@ -277,6 +297,14 @@ MXNET_DLL int MXNDArrayGetShape(NDArrayHandle handle,
  */
 MXNET_DLL int MXNDArrayGetData(NDArrayHandle handle,
                                mx_float **out_pdata);
+/*!
+ * \brief get the type of the data in NDArray
+ * \param handle the handle to the narray
+ * \param out_dtype pointer holder to get type of data
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXNDArrayGetDType(NDArrayHandle handle,
+                               int *out_dtype);
 /*!
  * \brief get the context of the NDArray
  * \param handle the handle to the narray
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index f9473aea6b17..f493e32c06cf 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -38,8 +38,9 @@ class MXNET_API NDArray {
    * \param delay_alloc whether delay the allocation
    */
   NDArray(const TShape &shape, Context ctx,
-          bool delay_alloc = false)
-      : ptr_(std::make_shared<Chunk>(shape.Size(), ctx, delay_alloc)), shape_(shape), offset_(0) {
+          bool delay_alloc = false, int dtype = mshadow::default_type_flag)
+      : ptr_(std::make_shared<Chunk>(shape.Size(), ctx, delay_alloc, dtype)),
+        shape_(shape), offset_(0), dtype_(dtype) {
   }
   /*!
    * \brief constructing a static NDArray that shares data with TBlob
@@ -61,8 +62,11 @@ class MXNET_API NDArray {
    * \return the data TBlob
    */
   inline TBlob data() const {
-    return TBlob(static_cast<real_t*>(ptr_->shandle.dptr) + offset_, \
-                 shape_, ptr_->shandle.ctx.dev_mask());
+    MSHADOW_TYPE_SWITCH(dtype_, DType, {
+      return TBlob(static_cast<DType*>(ptr_->shandle.dptr)
+        + offset_, shape_, ptr_->shandle.ctx.dev_mask());
+    });
+    return TBlob();
   }
   /*!
    * \return the context of NDArray, this function is only valid when the NDArray is not empty
@@ -70,6 +74,12 @@ class MXNET_API NDArray {
   inline Context ctx() const {
     return ptr_->shandle.ctx;
   }
+  /*!
+   * \return the data type of NDArray, this function is only valid when the NDArray is not empty
+   */
+  inline int dtype() const {
+    return dtype_;
+  }
   /*! \return whether this ndarray is not initialized */
   inline bool is_none() const {
     return ptr_.get() == nullptr;
@@ -191,9 +201,10 @@ class MXNET_API NDArray {
    *  not wrapped by NDArray(thus dependency not being tracked).
    *
    * \param data the data source to copy from.
-   * \param size the memory size we want to copy from.
+   * \param size the size of the source array, in units of sizeof(DType), not raw bytes.
+   * \param dtype the data type of source array.
    */
-  void SyncCopyFromCPU(const real_t *data, size_t size) const;
+  void SyncCopyFromCPU(const void *data, size_t size) const;
   /*!
    * \brief Do a synchronize copy to a continugous CPU memory region.
    *
@@ -202,9 +213,10 @@ class MXNET_API NDArray {
    *  not wrapped by NDArray(thus dependency not being tracked).
    *
    * \param data the data source to copyinto.
-   * \param size the memory size we want to copy into.
+   * \param size the memory size we want to copy into, in units of sizeof(DType), not raw bytes.
+   * \param dtype the data type of target array.
    */
-  void SyncCopyToCPU(real_t *data, size_t size) const;
+  void SyncCopyToCPU(void *data, size_t size) const;
   /*!
    * \brief Slice a NDArray
    * \param begin begin index in first dim
@@ -291,13 +303,13 @@ class MXNET_API NDArray {
         shandle.ctx = Context::GPU(dev_id);
       }
       shandle.dptr = data.dptr_;
-      shandle.size = data.shape_.Size() * sizeof(real_t);
+      shandle.size = data.shape_.Size() * mshadow::mshadow_sizeof(data.type_flag_);
     }
     /*! \brief construct a new chunk */
-    Chunk(uint64_t size, Context ctx, bool delay_alloc_)
+    Chunk(uint64_t size, Context ctx, bool delay_alloc_, int dtype)
         : static_data(false), delay_alloc(true) {
       var = Engine::Get()->NewVariable();
-      shandle.size = size * sizeof(real_t);
+      shandle.size = size * mshadow::mshadow_sizeof(dtype);
       shandle.ctx = ctx;
       if (!delay_alloc_) this->CheckAndAlloc();
     }
@@ -326,6 +338,8 @@ class MXNET_API NDArray {
   TShape shape_;
   /*! \brief offset in chunk */
   size_t offset_;
+  /*! \brief type of data */
+  int dtype_;
 };
 
 /*!
diff --git a/include/mxnet/resource.h b/include/mxnet/resource.h
index ded89adcb254..ad6a6f11fd95 100644
--- a/include/mxnet/resource.h
+++ b/include/mxnet/resource.h
@@ -61,12 +61,12 @@ struct Resource {
    * \return the mshadow random number generator requested.
    * \tparam xpu the device type of random number generator.
    */
-  template<typename xpu>
-  inline mshadow::Random<xpu>* get_random(
+  template<typename xpu, typename DType>
+  inline mshadow::Random<xpu, DType>* get_random(
       mshadow::Stream<xpu> *stream) const {
     CHECK_EQ(req.type, ResourceRequest::kRandom);
-    mshadow::Random<xpu> *ret =
-        static_cast<mshadow::Random<xpu>*>(ptr_);
+    mshadow::Random<xpu, DType> *ret =
+        static_cast<mshadow::Random<xpu, DType>*>(ptr_);
     ret->set_stream(stream);
     return ret;
   }
diff --git a/python/mxnet/base.py b/python/mxnet/base.py
index d36b9fa6d7d6..feb2fb6cafba 100644
--- a/python/mxnet/base.py
+++ b/python/mxnet/base.py
@@ -46,6 +46,7 @@ def _load_lib():
 mx_uint = ctypes.c_uint
 mx_float = ctypes.c_float
 mx_float_p = ctypes.POINTER(mx_float)
+mx_real_t = np.float32
 NDArrayHandle = ctypes.c_void_p
 FunctionHandle = ctypes.c_void_p
 SymbolCreatorHandle = ctypes.c_void_p
diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py
index 235d50b4534d..a50a1406c2f1 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray.py
@@ -7,12 +7,28 @@
 import sys
 import numpy as np
 from .base import _LIB, string_types, numeric_types
-from .base import c_array, py_str, c_str
-from .base import mx_uint, mx_float, mx_float_p, NDArrayHandle, FunctionHandle
+from .base import c_array, py_str, c_str, mx_real_t
+from .base import mx_uint, mx_float, NDArrayHandle, FunctionHandle
 from .base import ctypes2buffer
 from .base import check_call, ctypes2docstring
 from .context import Context
 
+_DTYPE_NP_TO_MX = {
+    np.float32 : 0,
+    np.float64 : 1,
+    np.float16 : 2,
+    np.uint8   : 3,
+    np.int32   : 4
+}
+
+_DTYPE_MX_TO_NP = {
+    0 : np.float32,
+    1 : np.float64,
+    2 : np.float16,
+    3 : np.uint8,
+    4 : np.int32
+}
+
 def _new_empty_handle():
     """Return a new empty handle.
 
@@ -26,7 +42,7 @@ def _new_empty_handle():
     check_call(_LIB.MXNDArrayCreateNone(ctypes.byref(hdl)))
     return hdl
 
-def _new_alloc_handle(shape, ctx, delay_alloc):
+def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t):
     """Return a new handle with specified shape and context.
 
     Empty handle is only used to hold results
@@ -36,12 +52,13 @@ def _new_alloc_handle(shape, ctx, delay_alloc):
     a new empty ndarray handle
     """
     hdl = NDArrayHandle()
-    check_call(_LIB.MXNDArrayCreate(
+    check_call(_LIB.MXNDArrayCreateEx(
         c_array(mx_uint, shape),
         mx_uint(len(shape)),
         ctypes.c_int(ctx.device_typeid),
         ctypes.c_int(ctx.device_id),
         ctypes.c_int(int(delay_alloc)),
+        ctypes.c_int(int(_DTYPE_NP_TO_MX[np.dtype(dtype).type])),
         ctypes.byref(hdl)))
     return hdl
 
@@ -230,16 +247,16 @@ def _sync_copyfrom(self, source_array):
         """
         if not isinstance(source_array, np.ndarray):
             try:
-                source_array = np.array(source_array, dtype=np.float32)
+                source_array = np.array(source_array, dtype=self.dtype)
             except:
                 raise TypeError('array must be an array_like data,' +
                                 'type %s is not supported' % str(type(array)))
-        source_array = np.ascontiguousarray(source_array, dtype=np.float32)
+        source_array = np.ascontiguousarray(source_array, dtype=self.dtype)
         if source_array.shape != self.shape:
             raise ValueError('array shape do not match the shape of NDArray')
         check_call(_LIB.MXNDArraySyncCopyFromCPU(
             self.handle,
-            source_array.ctypes.data_as(mx_float_p),
+            source_array.ctypes.data_as(ctypes.c_void_p),
             ctypes.c_size_t(source_array.size)))
 
     def _slice(self, start, stop):
@@ -307,6 +324,19 @@ def context(self):
             self.handle, ctypes.byref(dev_typeid), ctypes.byref(dev_id)))
         return Context(Context.devtype2str[dev_typeid.value], dev_id.value)
 
+    @property
+    def dtype(self):
+        """Get the element data type of the current NDArray.
+
+        Returns
+        -------
+        A numpy scalar type (e.g. numpy.float32) looked up in _DTYPE_MX_TO_NP.
+        """
+        mx_dtype = ctypes.c_int()
+        check_call(_LIB.MXNDArrayGetDType(
+            self.handle, ctypes.byref(mx_dtype)))
+        return _DTYPE_MX_TO_NP[mx_dtype.value]
+
     def asnumpy(self):
         """Return a copied numpy array of current array.
 
@@ -315,10 +345,10 @@ def asnumpy(self):
         array : numpy.ndarray
             A copy of array content.
         """
-        data = np.empty(self.shape, dtype=np.float32)
+        data = np.empty(self.shape, dtype=self.dtype)
         check_call(_LIB.MXNDArraySyncCopyToCPU(
             self.handle,
-            data.ctypes.data_as(mx_float_p),
+            data.ctypes.data_as(ctypes.c_void_p),
             ctypes.c_size_t(data.size)))
         return data
 
@@ -336,6 +366,23 @@ def asscalar(self):
             raise ValueError("The current array is not a scalar")
         return self.asnumpy()[0]
 
+    def astype(self, dtype):
+        """Return a copy of the current array as a new NDArray of the given type.
+
+        Parameters
+        ----------
+        dtype : numpy.dtype or string
+            Desired type of result array.
+
+        Returns
+        -------
+        array : NDArray
+            A new NDArray on the same context, filled via copyto from this one.
+        """
+        res = empty(self.shape, ctx=self.context, dtype=dtype)
+        self.copyto(res)
+        return res
+
     def copyto(self, other):
         """Copy the content of current array to other.
 
@@ -360,7 +407,7 @@ def copyto(self, other):
                 return
             return NDArray._copyto(self, out=other)
         elif isinstance(other, Context):
-            hret = NDArray(_new_alloc_handle(self.shape, other, True))
+            hret = NDArray(_new_alloc_handle(self.shape, other, True, self.dtype))
             return NDArray._copyto(self, out=hret)
         else:
             raise TypeError('copyto do not support type ' + str(type(other)))
@@ -388,7 +435,7 @@ def onehot_encode(indices, out):
     # pylint: enable= no-member, protected-access
 
 
-def empty(shape, ctx=None):
+def empty(shape, ctx=None, dtype=mx_real_t):
     """Create an empty uninitialized new NDArray, with specified shape.
 
     Parameters
@@ -408,9 +455,9 @@ def empty(shape, ctx=None):
         shape = (shape, )
     if ctx is None:
         ctx = Context.default_ctx
-    return NDArray(handle=_new_alloc_handle(shape, ctx, False))
+    return NDArray(handle=_new_alloc_handle(shape, ctx, False, dtype))
 
-def zeros(shape, ctx=None):
+def zeros(shape, ctx=None, dtype=mx_real_t):
     """Create a new NDArray filled with 0, with specified shape.
 
     Parameters
@@ -425,11 +472,11 @@ def zeros(shape, ctx=None):
     out: Array
         The created NDArray.
     """
-    arr = empty(shape, ctx)
+    arr = empty(shape, ctx, dtype)
     arr[:] = 0.0
     return arr
 
-def ones(shape, ctx=None):
+def ones(shape, ctx=None, dtype=mx_real_t):
     """Create a new NDArray filled with 1, with specified shape.
 
     Parameters
@@ -444,12 +491,12 @@ def ones(shape, ctx=None):
     out: Array
         The created NDArray.
     """
-    arr = empty(shape, ctx)
+    arr = empty(shape, ctx, dtype)
     arr[:] = 1.0
     return arr
 
 
-def array(source_array, ctx=None):
+def array(source_array, ctx=None, dtype=mx_real_t):
     """Create a new NDArray that copies content from source_array.
 
     Parameters
@@ -468,10 +515,10 @@ def array(source_array, ctx=None):
 
     if not isinstance(source_array, np.ndarray):
         try:
-            source_array = np.array(source_array, dtype=np.float32)
+            source_array = np.array(source_array, dtype=dtype)
         except:
             raise TypeError('source_array must be array like object')
-    arr = empty(source_array.shape, ctx)
+    arr = empty(source_array.shape, ctx, dtype)
     arr[:] = source_array
     return arr
 
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index b8a03b1c276f..714950776ec5 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -128,6 +128,22 @@ int MXNDArrayCreate(const mx_uint *shape,
   API_END();
 }
 
+int MXNDArrayCreateEx(const mx_uint *shape,
+                    mx_uint ndim,
+                    int dev_type,
+                    int dev_id,
+                    int delay_alloc,
+                    int dtype,
+                    NDArrayHandle *out) {
+  API_BEGIN();
+  *out = new NDArray(
+      TShape(shape, shape + ndim),
+      Context::Create(static_cast<Context::DeviceType>(dev_type), dev_id),
+      delay_alloc != 0,
+      dtype);
+  API_END();
+}
+
 int MXNDArrayLoadFromRawBytes(const void *buf,
                               size_t size,
                               NDArrayHandle *out) {
@@ -156,7 +172,7 @@ int MXNDArraySaveRawBytes(NDArrayHandle handle,
 }
 
 int MXNDArraySyncCopyFromCPU(NDArrayHandle handle,
-                             const mx_float *data,
+                             const void *data,
                              size_t size) {
   API_BEGIN();
   static_cast<NDArray*>(handle)->SyncCopyFromCPU(data, size);
@@ -164,7 +180,7 @@ int MXNDArraySyncCopyFromCPU(NDArrayHandle handle,
 }
 
 int MXNDArraySyncCopyToCPU(NDArrayHandle handle,
-                           mx_float *data,
+                           void *data,
                            size_t size) {
   API_BEGIN();
   static_cast<NDArray*>(handle)->SyncCopyToCPU(data, size);
@@ -292,6 +308,18 @@ int MXNDArrayGetData(NDArrayHandle handle,
   API_END();
 }
 
+int MXNDArrayGetDType(NDArrayHandle handle,
+                     int *out_dtype) {
+  API_BEGIN();
+  NDArray *arr = static_cast<NDArray*>(handle);
+  if (!arr->is_none()) {
+    *out_dtype = arr->dtype();
+  } else {
+    *out_dtype = -1;
+  }
+  API_END();
+}
+
 int MXNDArrayGetContext(NDArrayHandle handle,
                         int *out_dev_type,
                         int *out_dev_id) {
diff --git a/src/common/tblob_op_registry.cc b/src/common/tblob_op_registry.cc
index 064cc4b1cc6f..9e7de6d1171b 100644
--- a/src/common/tblob_op_registry.cc
+++ b/src/common/tblob_op_registry.cc
@@ -279,9 +279,10 @@ void TBlobOpRegEntryImpl::RegisterUnary() {
     if (unary_infer_ != nullptr) dshape = unary_infer_(dshape);
 
     if (out->is_none()) {
-      *out = NDArray(dshape, src.ctx(), true);
+      *out = NDArray(dshape, src.ctx(), true, src.dtype());
     } else {
       CHECK(out->ctx() == src.ctx()) << "target context mismatch";
+      CHECK(out->dtype() == src.dtype()) << "target data type mismatch";
       CHECK(out->shape() == dshape) << "target shape mismatch "
       << out->shape() << " vs. " << dshape;
     }
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index e844f3f5a525..3b33ea28283d 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -34,7 +34,7 @@ void BinaryOp(const NDArray &lhs,
   }
   // if out is none, allocate space
   if (out->is_none()) {
-    *out = NDArray(OP::GetShape(lhs.shape(), rhs.shape()), lhs.ctx(), true);
+    *out = NDArray(OP::GetShape(lhs.shape(), rhs.shape()), lhs.ctx(), true, lhs.dtype());
   } else {
     // no check if both of them are on cpu
     if (lhs.ctx().dev_mask() != cpu::kDevMask ||
@@ -118,7 +118,7 @@ void ScalarOp(const NDArray &lhs,
               const real_t &rhs,
               NDArray *out) {
   if (out->is_none()) {
-    *out = NDArray(lhs.shape(), lhs.ctx(), true);
+    *out = NDArray(lhs.shape(), lhs.ctx(), true, lhs.dtype());
   } else {
     CHECK(out->ctx() == lhs.ctx()) << "target context mismatch";
     CHECK(out->shape() == lhs.shape()) << "target shape mismatch";
@@ -276,7 +276,7 @@ void ClipOp(const NDArray &src,
             const real_t &a_min, const real_t &a_max,
             NDArray *out) {
   if (out->is_none()) {
-    *out = NDArray(src.shape(), src.ctx(), true);
+    *out = NDArray(src.shape(), src.ctx(), true, src.dtype());
   } else {
     CHECK(out->ctx() == src.ctx()) << "target context mismatch";
     CHECK(out->shape() == src.shape()) << "target shape mismatch";
@@ -466,12 +466,9 @@ void NDArray::Save(dmlc::Stream *strm) const {
   }
   // save type flag
   int32_t type_flag = save_data.type_flag_;
-  CHECK(type_flag == mshadow::DataType<real_t>::kFlag)
-      << "Only support float NDArray so far";
   strm->Write(&type_flag, sizeof(type_flag));
   CHECK(save_data.CheckContiguous());
-  // save data: need to change this after more type mask is supported
-  size_t type_size = sizeof(real_t);
+  size_t type_size = mshadow::mshadow_sizeof(type_flag);
   strm->Write(save_data.dptr_, type_size * shape_.Size());
 }
 
@@ -488,12 +485,10 @@ bool NDArray::Load(dmlc::Stream *strm) {
   // load type flag
   int32_t type_flag;
   if (strm->Read(&type_flag, sizeof(type_flag)) != sizeof(type_flag)) return false;
-  CHECK(type_flag == mshadow::DataType<real_t>::kFlag)
-      << "Only support float NDArray so far, type_flag=" << type_flag;
   // load data into CPU
-  NDArray temp(shape, Context::CPU());
+  NDArray temp(shape, Context::CPU(), false, type_flag);
   TBlob load_data = temp.data();
-  size_t type_size = sizeof(real_t);
+  size_t type_size = mshadow::mshadow_sizeof(type_flag);
   size_t nread = type_size * shape.Size();
 
   if (strm->Read(load_data.dptr_, nread) != nread) return false;
@@ -536,19 +531,19 @@ void NDArray::Load(dmlc::Stream* fi,
 }
 
 NDArray NDArray::Copy(Context ctx) const {
-  NDArray ret(shape(), ctx, true);
+  NDArray ret(shape(), ctx, true, dtype_);
   CopyFromTo(*this, &ret);
   return ret;
 }
 
-void NDArray::SyncCopyFromCPU(const real_t *data, size_t size) const {
+void NDArray::SyncCopyFromCPU(const void *data, size_t size) const {
   this->WaitToWrite();
   TShape dshape = this->shape();
   CHECK_EQ(dshape.Size(), size)
       << "Memory size do not match";
   Context ctx = this->ctx();
   TBlob dst = this->data();
-  TBlob src((real_t*)data, dshape, cpu::kDevMask); // NOLINT(*)
+  TBlob src((void*)data, dshape, cpu::kDevMask, this->dtype_); // NOLINT(*)
 
   RunContext run_ctx;
   run_ctx.stream = nullptr;
@@ -568,14 +563,14 @@ void NDArray::SyncCopyFromCPU(const real_t *data, size_t size) const {
   }
 }
 
-void NDArray::SyncCopyToCPU(real_t *data, size_t size) const {
+void NDArray::SyncCopyToCPU(void *data, size_t size) const {
   this->WaitToRead();
   TShape dshape = this->shape();
   CHECK_EQ(dshape.Size(), size)
       << "Memory size do not match";
   Context ctx = this->ctx();
   TBlob src = this->data();
-  TBlob dst(data, dshape, cpu::kDevMask); // NOLINT(*)
+  TBlob dst(data, dshape, cpu::kDevMask, this->dtype_); // NOLINT(*)
 
   RunContext run_ctx;
   run_ctx.stream = nullptr;
diff --git a/src/ndarray/ndarray_function-inl.h b/src/ndarray/ndarray_function-inl.h
index 20f9eb8c65a0..22a4843922df 100644
--- a/src/ndarray/ndarray_function-inl.h
+++ b/src/ndarray/ndarray_function-inl.h
@@ -40,9 +40,15 @@ inline void EvalBinary_(const TBlob &lhs, const TBlob &rhs,
                         TBlob *ret, RunContext ctx) {
   using namespace mshadow::expr;
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  ret->FlatTo2D<xpu, real_t>(s)
-      = F<typename OP::mshadow_op>(lhs.FlatTo2D<xpu, real_t>(s),
-                                   rhs.FlatTo2D<xpu, real_t>(s));
+  CHECK_EQ(ret->type_flag_, lhs.type_flag_)
+    << "Only support input/output with the same data type";
+  CHECK_EQ(ret->type_flag_, rhs.type_flag_)
+    << "Only support input/output with the same data type";
+  MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
+    ret->FlatTo2D<xpu, DType>(s)
+      = F<typename OP::mshadow_op>(lhs.FlatTo2D<xpu, DType>(s),
+                                   rhs.FlatTo2D<xpu, DType>(s));
+  });
 }
 
 template<typename xpu, typename OP>
@@ -50,9 +56,14 @@ inline void EvalDot_(const TBlob &lhs, const TBlob &rhs,
                      TBlob *ret, RunContext ctx) {
   using namespace mshadow::expr;
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  ret->FlatTo2D<xpu, real_t>(s)
-    = dot(lhs.FlatTo2D<xpu, real_t>(s),
-          rhs.FlatTo2D<xpu, real_t>(s));
+  CHECK_EQ(ret->type_flag_, lhs.type_flag_)
+    << "Only support input/output with the same data type";
+  CHECK_EQ(ret->type_flag_, rhs.type_flag_)
+    << "Only support input/output with the same data type";
+  MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
+    ret->FlatTo2D<xpu, DType>(s) = dot(lhs.FlatTo2D<xpu, DType>(s),
+                                       rhs.FlatTo2D<xpu, DType>(s));
+  });
 }
 
 template<typename xpu, typename OP>
@@ -60,9 +71,16 @@ inline void EvalOneHot_(const TBlob &index, const TBlob &rhs,
                         TBlob *ret, RunContext ctx) {
   using namespace mshadow::expr;
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  ret->get<xpu, 2, real_t>(s)
-      = one_hot_encode(index.get<xpu, 1, real_t>(s),
-                       rhs.shape_[1]);
+  // TODO(eric): support mixed type encoding, i.e. int index and float rhs.
+  CHECK_EQ(ret->type_flag_, mshadow::default_type_flag)
+    << "one_hot_encode only support float32 as input/output";
+  CHECK_EQ(rhs.type_flag_, mshadow::default_type_flag)
+    << "one_hot_encode only support float32 as input/output";
+  CHECK_EQ(index.type_flag_, mshadow::default_type_flag)
+    << "one_hot_encode only support float32 as input/output";
+  ret->get<xpu, 2, real_t>(s) =
+    one_hot_encode(index.get<xpu, 1, real_t>(s),
+                   rhs.shape_[1]);
 }
 
 template<typename xpu, typename OP>
@@ -70,6 +88,13 @@ inline void EvalMatChooseRowElem_(const TBlob &lhs, const TBlob &rhs,
                                   TBlob *ret, RunContext ctx) {
   using namespace mshadow::expr;
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  // TODO(eric): support mixed type choose, i.e. int index and float rhs.
+  CHECK_EQ(ret->type_flag_, mshadow::default_type_flag)
+    << "mat_choose_row_element only support float32 as input/output";
+  CHECK_EQ(rhs.type_flag_, mshadow::default_type_flag)
+    << "mat_choose_row_element only support float32 as input/output";
+  CHECK_EQ(lhs.type_flag_, mshadow::default_type_flag)
+    << "mat_choose_row_element only support float32 as input/output";
   ret->get<xpu, 1, real_t>(s)
       = mat_choose_row_element(lhs.get<xpu, 2, real_t>(s),
                                rhs.get<xpu, 1, real_t>(s));
@@ -80,26 +105,35 @@ inline void EvalScalar_(const TBlob &lhs, const real_t &rhs,
                         TBlob *ret, RunContext ctx) {
   using namespace mshadow::expr;
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  CHECK_EQ(ret->type_flag_, lhs.type_flag_)
+    << "Only support input/output with the same data type";
   if (reverse) {
-    ret->FlatTo2D<xpu, real_t>(s)
-      = F<typename OP::mshadow_op>(rhs, lhs.FlatTo2D<xpu, real_t>(s));
+    MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
+      ret->FlatTo2D<xpu, DType>(s)
+        = F<typename OP::mshadow_op>(scalar(DType(rhs)), lhs.FlatTo2D<xpu, DType>(s));
+    });
   } else {
-    ret->FlatTo2D<xpu, real_t>(s)
-      = F<typename OP::mshadow_op>(lhs.FlatTo2D<xpu, real_t>(s), rhs);
+    MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
+      ret->FlatTo2D<xpu, DType>(s)
+        = F<typename OP::mshadow_op>(lhs.FlatTo2D<xpu, DType>(s), scalar(DType(rhs)));
+    });
   }
 }
 
-
 template<>
 void EvalClip<DEVICE>(const TBlob &src, const real_t &a_min, const real_t &a_max,
                       TBlob *ret, RunContext ctx) {
   typedef DEVICE xpu;
   using namespace mshadow::expr;
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  ret->FlatTo2D<xpu, real_t>(s)
-    = F<ClipMax::mshadow_op>(
-        F<ClipMin::mshadow_op>(src.FlatTo2D<xpu, real_t>(s), a_min),
-        a_max);
+  CHECK_EQ(ret->type_flag_, src.type_flag_)
+    << "Only support input/output with the same data type";
+  MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
+    ret->FlatTo2D<xpu, DType>(s)
+      = F<ClipMax::mshadow_op>(
+          F<ClipMin::mshadow_op>(src.FlatTo2D<xpu, DType>(s), scalar(DType(a_min))),
+          scalar(DType(a_max)));
+  });
 }
 
 template<>
@@ -111,9 +145,24 @@ void EvalRandom<DEVICE, UniformDistribution>(
     RunContext ctx) {
   typedef DEVICE xpu;
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  mshadow::Tensor<xpu, 2, real_t> tmp = ret->FlatTo2D<xpu, real_t>(s);
-  mshadow::Random<xpu> *prnd = resource.get_random<xpu>(s);
-  prnd->SampleUniform(&tmp, a, b);
+  switch (ret->type_flag_) {
+  case mshadow::kFloat32:
+    {
+      mshadow::Random<xpu, float> *prnd = resource.get_random<xpu, float>(s);
+      mshadow::Tensor<xpu, 2, float> tmp = ret->FlatTo2D<xpu, float>(s);
+      prnd->SampleUniform(&tmp, float(a), float(b));  // NOLINT(*)
+      break;
+    }
+  case mshadow::kFloat64:
+    {
+      mshadow::Random<xpu, double> *prnd = resource.get_random<xpu, double>(s);
+      mshadow::Tensor<xpu, 2, double> tmp = ret->FlatTo2D<xpu, double>(s);
+      prnd->SampleUniform(&tmp, double(a), double(b));  // NOLINT(*)
+      break;
+    }
+  default:
+    LOG(FATAL) << "Random only support float32 and float64";
+  }
 }
 
 template<>
@@ -125,15 +174,32 @@ void EvalRandom<DEVICE, GaussianDistribution>(
     RunContext ctx) {
   typedef DEVICE xpu;
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  mshadow::Tensor<xpu, 2, real_t> tmp = ret->FlatTo2D<xpu, real_t>(s);
-  mshadow::Random<xpu> *prnd = resource.get_random<xpu>(s);
-  prnd->SampleGaussian(&tmp, mu, sigma);
+  switch (ret->type_flag_) {
+  case mshadow::kFloat32:
+    {
+      mshadow::Random<xpu, float> *prnd = resource.get_random<xpu, float>(s);
+      mshadow::Tensor<xpu, 2, float> tmp = ret->FlatTo2D<xpu, float>(s);
+      prnd->SampleGaussian(&tmp, float(mu), float(sigma));  // NOLINT(*)
+      break;
+    }
+  case mshadow::kFloat64:
+    {
+      mshadow::Random<xpu, double> *prnd = resource.get_random<xpu, double>(s);
+      mshadow::Tensor<xpu, 2, double> tmp = ret->FlatTo2D<xpu, double>(s);
+      prnd->SampleGaussian(&tmp, double(mu), double(sigma));  // NOLINT(*)
+      break;
+    }
+  default:
+    LOG(FATAL) << "Random only support float32 and float64";
+  }
 }
 
 template<>
 void Eval<DEVICE>(const real_t &rhs, TBlob *ret, RunContext ctx) {
   mshadow::Stream<DEVICE> *s = ctx.get_stream<DEVICE>();
-  ret->FlatTo2D<DEVICE, real_t>(s) = rhs;
+  MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
+    ret->FlatTo2D<DEVICE, DType>(s) = DType(rhs);
+  });
 }
 
 template<>
@@ -144,39 +210,45 @@ void ElementwiseSum<DEVICE>(const std::vector<TBlob> source,
   using namespace mshadow;
   using namespace mshadow::expr;
   Stream<xpu> *s = ctx.get_stream<xpu>();
-  Tensor<xpu, 2> out = dst->FlatTo2D<xpu, real_t>(s);
+  for (size_t i = 1; i < source.size(); ++i) {
+    CHECK_EQ(source[i].type_flag_, dst->type_flag_)
+      << "Only support input/output with the same data type";
+  }
+  MSHADOW_TYPE_SWITCH(dst->type_flag_, DType, {
+    Tensor<xpu, 2, DType> out = dst->FlatTo2D<xpu, DType>(s);
 
-  switch (source.size()) {
-    case 2: {
-      Tensor<xpu, 2> in_0 = source[0].FlatTo2D<xpu, real_t>(s);
-      Tensor<xpu, 2> in_1 = source[1].FlatTo2D<xpu, real_t>(s);
-      out = in_0 + in_1;
-      break;
-    }
-    case 3: {
-      Tensor<xpu, 2> in_0 = source[0].FlatTo2D<xpu, real_t>(s);
-      Tensor<xpu, 2> in_1 = source[1].FlatTo2D<xpu, real_t>(s);
-      Tensor<xpu, 2> in_2 = source[2].FlatTo2D<xpu, real_t>(s);
-      out = in_0 + in_1 + in_2;
-      break;
-    }
-    case 4: {
-      Tensor<xpu, 2> in_0 = source[0].FlatTo2D<xpu, real_t>(s);
-      Tensor<xpu, 2> in_1 = source[1].FlatTo2D<xpu, real_t>(s);
-      Tensor<xpu, 2> in_2 = source[2].FlatTo2D<xpu, real_t>(s);
-      Tensor<xpu, 2> in_3 = source[3].FlatTo2D<xpu, real_t>(s);
-      out = in_0 + in_1 + in_2 + in_3;
-      break;
-    }
-    default: {
-      Tensor<xpu, 2> in_0 = source[0].FlatTo2D<xpu, real_t>(s);
-      out = F<mshadow::op::identity>(in_0);
-      for (size_t i = 1; i < source.size(); ++i) {
-        out += source[i].FlatTo2D<xpu, real_t>(s);
+    switch (source.size()) {
+      case 2: {
+        Tensor<xpu, 2, DType> in_0 = source[0].FlatTo2D<xpu, DType>(s);
+        Tensor<xpu, 2, DType> in_1 = source[1].FlatTo2D<xpu, DType>(s);
+        out = in_0 + in_1;
+        break;
+      }
+      case 3: {
+        Tensor<xpu, 2, DType> in_0 = source[0].FlatTo2D<xpu, DType>(s);
+        Tensor<xpu, 2, DType> in_1 = source[1].FlatTo2D<xpu, DType>(s);
+        Tensor<xpu, 2, DType> in_2 = source[2].FlatTo2D<xpu, DType>(s);
+        out = in_0 + in_1 + in_2;
+        break;
+      }
+      case 4: {
+        Tensor<xpu, 2, DType> in_0 = source[0].FlatTo2D<xpu, DType>(s);
+        Tensor<xpu, 2, DType> in_1 = source[1].FlatTo2D<xpu, DType>(s);
+        Tensor<xpu, 2, DType> in_2 = source[2].FlatTo2D<xpu, DType>(s);
+        Tensor<xpu, 2, DType> in_3 = source[3].FlatTo2D<xpu, DType>(s);
+        out = in_0 + in_1 + in_2 + in_3;
+        break;
+      }
+      default: {
+        Tensor<xpu, 2, DType> in_0 = source[0].FlatTo2D<xpu, DType>(s);
+        out = F<mshadow::op::identity>(in_0);
+        for (size_t i = 1; i < source.size(); ++i) {
+          out += source[i].FlatTo2D<xpu, DType>(s);
+        }
+        break;
       }
-      break;
     }
-  }
+  });
 }
 
 // declarations
diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc
index e6dcdcde91b3..cf6b180714ee 100644
--- a/src/ndarray/ndarray_function.cc
+++ b/src/ndarray/ndarray_function.cc
@@ -14,8 +14,17 @@ template<>
 void Copy<cpu, cpu>(const TBlob &from, TBlob *to,
                     Context from_ctx, Context to_ctx,
                     RunContext ctx) {
-  mshadow::Copy(to->FlatTo2D<cpu, real_t>(),
-                from.FlatTo2D<cpu, real_t>());
+  MSHADOW_TYPE_SWITCH(to->type_flag_, DType, {
+    if (to->type_flag_ == from.type_flag_) {
+        mshadow::Copy(to->FlatTo2D<cpu, DType>(),
+                      from.FlatTo2D<cpu, DType>());
+    } else {
+        MSHADOW_TYPE_SWITCH(from.type_flag_, SrcDType, {
+            to->FlatTo2D<cpu, DType>() =
+                mshadow::expr::tcast<DType>(from.FlatTo2D<cpu, SrcDType>());
+        })
+    }
+  })
 }
 }  // namespace ndarray
 }  // namespace mxnet
diff --git a/src/ndarray/ndarray_function.cu b/src/ndarray/ndarray_function.cu
index 3d17454c48ae..db667195cd5c 100644
--- a/src/ndarray/ndarray_function.cu
+++ b/src/ndarray/ndarray_function.cu
@@ -9,18 +9,26 @@ template<>
 void Copy<cpu, gpu>(const TBlob &from, TBlob *to,
                     Context from_ctx, Context to_ctx,
                     RunContext ctx) {
-  mshadow::Copy(to->FlatTo2D<gpu, real_t>(),
-                from.FlatTo2D<cpu, real_t>(),
-                static_cast<mshadow::Stream<gpu>*>(ctx.stream));
+  CHECK_EQ(to->type_flag_, from.type_flag_)
+    << "Source and target must have the same data type when copying across devices.";
+  MSHADOW_TYPE_SWITCH(to->type_flag_, DType, {
+    mshadow::Copy(to->FlatTo2D<gpu, DType>(),
+                  from.FlatTo2D<cpu, DType>(),
+                  static_cast<mshadow::Stream<gpu>*>(ctx.stream));
+  });
 }
 
 template<>
 void Copy<gpu, cpu>(const TBlob &from, TBlob *to,
                     Context from_ctx, Context to_ctx,
                     RunContext ctx) {
-  mshadow::Copy(to->FlatTo2D<cpu, real_t>(),
-                from.FlatTo2D<gpu, real_t>(),
-                static_cast<mshadow::Stream<gpu>*>(ctx.stream));
+  CHECK_EQ(to->type_flag_, from.type_flag_)
+    << "Source and target must have the same data type when copying across devices.";
+  MSHADOW_TYPE_SWITCH(to->type_flag_, DType, {
+    mshadow::Copy(to->FlatTo2D<cpu, DType>(),
+                  from.FlatTo2D<gpu, DType>(),
+                  static_cast<mshadow::Stream<gpu>*>(ctx.stream));
+  });
 }
 
 template<>
@@ -28,20 +36,32 @@ void Copy<gpu, gpu>(const TBlob &from, TBlob *to,
                     Context from_ctx, Context to_ctx,
                     RunContext ctx) {
   if (from_ctx.dev_id == to_ctx.dev_id) {
-     mshadow::Copy(to->FlatTo2D<gpu, real_t>(),
-                   from.FlatTo2D<gpu, real_t>(),
-                   static_cast<mshadow::Stream<gpu>*>(ctx.stream));
-   } else {
-     CHECK(from.CheckContiguous() && to->CheckContiguous())
-         << "copy across only support continugous memory";
-     mshadow::Stream<gpu> *s = static_cast<mshadow::Stream<gpu>*>(ctx.stream);
-     CHECK(s != NULL) << "need stream in GPU context";
-     cudaMemcpyPeerAsync(to->dptr_,
-                         to_ctx.dev_id,
-                         from.dptr_,
-                         from_ctx.dev_id,
-                         from.shape_.Size() * sizeof(real_t),
-                         s->stream_);
+    mshadow::Stream<gpu>* s = static_cast<mshadow::Stream<gpu>*>(ctx.stream);
+    MSHADOW_TYPE_SWITCH(to->type_flag_, DType, {
+      if (to->type_flag_ == from.type_flag_) {
+        mshadow::Copy(to->FlatTo2D<gpu, DType>(s),
+                      from.FlatTo2D<gpu, DType>(s),
+                      s);
+      } else {
+        MSHADOW_TYPE_SWITCH(from.type_flag_, SrcDType, {
+          to->FlatTo2D<gpu, DType>(s) =
+            mshadow::expr::tcast<DType>(from.FlatTo2D<gpu, SrcDType>(s));
+        })
+      }
+    })
+  } else {
+    CHECK(from.CheckContiguous() && to->CheckContiguous())
+      << "copy across only support continugous memory";
+    CHECK_EQ(to->type_flag_, from.type_flag_)
+      << "Source and target must have the same data type when copying across devices.";
+    mshadow::Stream<gpu> *s = static_cast<mshadow::Stream<gpu>*>(ctx.stream);
+    CHECK(s != NULL) << "need stream in GPU context";
+    cudaMemcpyPeerAsync(to->dptr_,
+                        to_ctx.dev_id,
+                        from.dptr_,
+                        from_ctx.dev_id,
+                        from.shape_.Size() * mshadow::mshadow_sizeof(to->type_flag_),
+                        s->stream_);
   }
 }
 }  // namespace ndarray
diff --git a/src/ndarray/ndarray_function.h b/src/ndarray/ndarray_function.h
index 9f23c1a5c348..5b146f301343 100644
--- a/src/ndarray/ndarray_function.h
+++ b/src/ndarray/ndarray_function.h
@@ -43,7 +43,8 @@ struct Div : public BinaryBase {
 
 struct ClipMin : public BinaryBase {
   struct mshadow_op {
-    MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+    template<typename DType>
+    MSHADOW_XINLINE static DType Map(DType a, DType b) {
       if (a < b) {
         return b;
       } else {
@@ -55,7 +56,8 @@ struct ClipMin : public BinaryBase {
 
 struct ClipMax : public BinaryBase {
   struct mshadow_op {
-    MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+    template<typename DType>
+    MSHADOW_XINLINE static DType Map(DType a, DType b) {
       if (a > b) {
         return b;
       } else {
diff --git a/src/ndarray/unary_function-inl.h b/src/ndarray/unary_function-inl.h
index 2776db0f1ecc..c43e09e4c384 100644
--- a/src/ndarray/unary_function-inl.h
+++ b/src/ndarray/unary_function-inl.h
@@ -6,6 +6,7 @@
 #ifndef MXNET_NDARRAY_UNARY_FUNCTION_INL_H_
 #define MXNET_NDARRAY_UNARY_FUNCTION_INL_H_
 
+#include <vector>
 #include "../common/tblob_op_registry.h"
 #include "../operator/mshadow_op.h"
 #include "../operator/operator_common.h"
@@ -28,8 +29,12 @@ void UnaryForward_(const TBlob &src,
   using namespace mxnet::op;
   using namespace mshadow::expr;
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  mshadow::Tensor<xpu, 2> out = ret->FlatTo2D<xpu, real_t>(s);
-  Assign(out, req, F<OP>(src.FlatTo2D<xpu, real_t>(s)));
+  CHECK_EQ(ret->type_flag_, src.type_flag_)
+    << "Unary function only support input/output with the same type";
+  MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, {
+    mshadow::Tensor<xpu, 2, DType> out = ret->FlatTo2D<xpu, DType>(s);
+    Assign(out, req, F<OP>(src.FlatTo2D<xpu, DType>(s)));
+  });
 }
 
 // backward function that takes input value of the op
@@ -42,10 +47,16 @@ void UnaryBackwardUseIn_(const arg::OutGrad& out_grad,
   using namespace mxnet::op;
   using namespace mshadow::expr;
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  mshadow::Tensor<xpu, 2> igrad = in_grad->FlatTo2D<xpu, real_t>(s);
-  Assign(igrad, req,
-         (F<OP>(in_data0.data.FlatTo2D<xpu, real_t>(s)) *
-         out_grad.data.FlatTo2D<xpu, real_t>()));
+  CHECK_EQ(in_grad->type_flag_, out_grad.data.type_flag_)
+    << "Unary function only support input/output with the same type";
+  CHECK_EQ(in_grad->type_flag_, in_data0.data.type_flag_)
+    << "Unary function only support input/output with the same type";
+  MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, {
+    mshadow::Tensor<xpu, 2, DType> igrad = in_grad->FlatTo2D<xpu, DType>(s);
+    Assign(igrad, req,
+           (F<OP>(in_data0.data.FlatTo2D<xpu, DType>(s)) *
+           out_grad.data.FlatTo2D<xpu, DType>()));
+  });
 }
 
 // backward function that takes output value of the op
@@ -58,10 +69,16 @@ void UnaryBackwardUseOut_(const arg::OutGrad& out_grad,
   using namespace mxnet::op;
   using namespace mshadow::expr;
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  mshadow::Tensor<xpu, 2> igrad = in_grad->FlatTo2D<xpu, real_t>(s);
-  Assign(igrad, req,
-         (F<OP>(out_value.data.FlatTo2D<xpu, real_t>(s)) *
-         out_grad.data.FlatTo2D<xpu, real_t>()));
+  CHECK_EQ(in_grad->type_flag_, out_grad.data.type_flag_)
+    << "Unary function only support input/output with the same type";
+  CHECK_EQ(in_grad->type_flag_, out_value.data.type_flag_)
+    << "Unary function only support input/output with the same type";
+  MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, {
+    mshadow::Tensor<xpu, 2, DType> igrad = in_grad->FlatTo2D<xpu, DType>(s);
+    Assign(igrad, req,
+           (F<OP>(out_value.data.FlatTo2D<xpu, DType>(s)) *
+           out_grad.data.FlatTo2D<xpu, DType>()));
+  });
 }
 
 // return a shape of scalar
@@ -94,6 +111,35 @@ void Reduce(const TBlob &src,
       src.get_with_shape<xpu, 2, real_t>(mshadow::Shape2(1, src.shape_.Size()), s);
   out = mshadow::expr::reduce_except_dim<0, Reducer>(in);
 }
+
+template<typename xpu, typename Reducer, bool get_mask>
+void ReduceChannel(const TBlob &src,
+                   TBlob *ret,
+                   OpReqType req,
+                   RunContext ctx) {
+  using namespace mxnet::op;
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  Tensor<xpu, 2> out = ret->get_with_shape<xpu, 2, real_t>(
+    Shape2(src.shape_[0], src.Size()/src.shape_[0]/src.shape_[1]),
+    s);
+  Tensor<xpu, 3> in = src.get_with_shape<xpu, 3, real_t>(
+    Shape3(src.shape_[0], src.shape_[1], src.Size()/src.shape_[0]/src.shape_[1]),
+    s);
+  out = reduce_with_axis<Reducer, get_mask>(in, 1);
+}
+
+// return a shape of ReduceChannel output
+inline TShape ReduceChannelShape(const TShape& ishape) {
+  std::vector<mshadow::index_t> shape;
+  shape.push_back(ishape[0]);
+  for (index_t i = 2; i < ishape.ndim(); ++i) {
+    shape.push_back(ishape[i]);
+  }
+  return TShape(shape.begin(), shape.end());
+}
+
 // Register all unary operations here
 // The true means inplace can be enabled.
 // abs
@@ -176,6 +222,12 @@ MXNET_REGISTER_TBLOB_FUN(min, XPU)
 MXNET_REGISTER_TBLOB_FUN(sum, XPU)
 .set_function(XPU::kDevMask, Reduce<XPU, mshadow::red::sum>, false, false)
 .set_shape_infer(ScalarShape)
+.describe("Take sum of the src."
+          "The result will be ndarray of shape (1,) on the same device.");
+// argmax channel
+MXNET_REGISTER_TBLOB_FUN(argmax_channel, XPU)
+.set_function(XPU::kDevMask, ReduceChannel<XPU, mshadow::red::maximum, true>, false, false)
+.set_shape_infer(ReduceChannelShape)
 .describe("Take sum of the src."
           "The result will be ndarray of shape (1,) on the same device.");
 }  // namespace ndarray
diff --git a/src/operator/dropout-inl.h b/src/operator/dropout-inl.h
index 1d117bf24c3d..229001eaa66b 100644
--- a/src/operator/dropout-inl.h
+++ b/src/operator/dropout-inl.h
@@ -58,7 +58,7 @@ class DropoutOp : public Operator {
     Tensor<xpu, 2> out = out_data[dropout::kOut].FlatTo2D<xpu, real_t>(s);
     if (ctx.is_train) {
       Tensor<xpu, 2> mask = out_data[dropout::kMask].FlatTo2D<xpu, real_t>(s);
-      Random<xpu> *prnd = ctx.requested[dropout::kRandom].get_random<xpu>(s);
+      Random<xpu> *prnd = ctx.requested[dropout::kRandom].get_random<xpu, real_t>(s);
       mask = F<mshadow_op::threshold>(prnd->uniform(mask.shape_), pkeep_) * (1.0f / pkeep_);
       Assign(out, req[dropout::kOut], data * mask);
     } else {
diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h
index e37c0ea247b3..656d7e6d7dca 100644
--- a/src/operator/leaky_relu-inl.h
+++ b/src/operator/leaky_relu-inl.h
@@ -105,7 +105,7 @@ class LeakyReLUOp : public Operator {
       }
       case leakyrelu::kRReLU: {
         if (ctx.is_train) {
-          Random<xpu>* prnd = ctx.requested[leakyrelu::kRandom].get_random<xpu>(s);
+          Random<xpu>* prnd = ctx.requested[leakyrelu::kRandom].get_random<xpu, real_t>(s);
           mask = prnd->uniform(mask.shape_);
           mask = mask * (param_.upper_bound - param_.lower_bound) + param_.lower_bound;
           Assign(out, req[leakyrelu::kOut], F<mshadow_op::xelu>(data, mask));
diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h
index 71e91deaecf7..fa7dc0474ed5 100644
--- a/src/operator/mshadow_op.h
+++ b/src/operator/mshadow_op.h
@@ -14,260 +14,301 @@ namespace op {
 namespace mshadow_op {
 /*! \brief identity Operation */
 struct identity {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return a;
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(a);
   }
 };
 
 struct identity_grad {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return 1.0f;
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(1.0f);
   }
 };
 
 
 struct negation {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return -a;
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(-a);
   }
 };
 
 /*! \brief sigmoid unit */
 struct sigmoid {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return 1.0f / (1.0f + expf(-a));
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(1.0f / (1.0f + expf(-a)));
   }
 };
 struct sigmoid_grad {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return a * (1.0f - a);
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(a * (1.0f - a));
   }
 };
 /*! \brief Rectified Linear Operation */
 struct relu {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return a > 0.0f ? a : 0.0f;
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(a > 0.0f ? a : 0.0f);
   }
 };
 struct relu_grad {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return a > 0.0f ? 1.0f : 0.0f;
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(a > 0.0f ? 1.0f : 0.0f);
   }
 };
 
 /*! \brief Leaky ReLU Operation */
 struct xelu {
-  MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-    return a > 0.0f ? a : a * b;
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a, DType b) {
+    return DType(a > 0.0f ? a : a * b);
   }
 };
 
 struct xelu_grad {
-  MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-    return a > 0.0f ? 1.0f : b;
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a, DType b) {
+    return DType(a > 0.0f ? 1.0f : b);
   }
 };
 
 /*! \brief Exponential Linear Unit */
 struct elu {
-  MSHADOW_XINLINE static real_t Map(real_t x, real_t a) {
-    return x > 0.0f ? x : a * (expf(x) - 1.0f);
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType x, DType a) {
+    return DType(x > 0.0f ? x : a * (expf(x) - 1.0f));
   }
 };
 
 struct elu_grad {
-  MSHADOW_XINLINE static real_t Map(real_t x, real_t a) {
-    return x > 0.0f ? 1.0f : a * expf(x);
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType x, DType a) {
+    return DType(x > 0.0f ? 1.0f : a * expf(x));
   }
 };
 
 struct tanh {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return tanhf( a );
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(tanhf( a ));
   }
 };
 
 struct tanh_grad {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return 1.0f - a * a;
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(1.0f - a * a);
   }
 };
 
 /*! \brief SoftReLU, also known as softplus activation. */
 struct softrelu {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return log1pf(expf(a));
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(log1pf(expf(a)));
   }
 };
 struct softrelu_grad {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return 1.0f - expf(-a);
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(1.0f - expf(-a));
   }
 };
 
 struct exp {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return expf(a);
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(expf(a));
   }
 };
 
 struct log {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return logf(a);
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(logf(a));
   }
 };
 
 struct log_grad {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return 1.0f / a;
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(1.0f / a);
   }
 };
 
 struct cos {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return cosf(a);
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(cosf(a));
   }
 };
 
 struct cos_grad {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return -sinf(a);
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(-sinf(a));
   }
 };
 
 struct sin {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return sinf(a);
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(sinf(a));
   }
 };
 
 struct sin_grad {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return cosf(a);
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(cosf(a));
   }
 };
 struct square {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return a * a;
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(a * a);
   }
 };
 
 struct square_grad {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return 2.0f * a;
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(2.0f * a);
   }
 };
 
 /*! \brief used for generate Bernoulli mask */
 struct threshold {
-  MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-    return a < b ? 1.0f : 0.0f;
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a, DType b) {
+    return DType(a < b ? 1.0f : 0.0f);
   }
 };
 
 /*! \brief used for generate element of abs */
 struct abs {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return fabsf(a);
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(fabsf(a));
   }
 };
 
 /*! \brief used for generate element of power */
 struct sign {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    if (a < 0.0f) return -1.0f;
-    if (a > 0.0f) return 1.0f;
-    return 0.0f;
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    if (a < 0.0f) return DType(-1.0f);
+    if (a > 0.0f) return DType(1.0f);
+    return DType(0.0f);
   }
 };
 struct sign_grad {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return 0.0f;
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(0.0f);
   }
 };
 /*! \brief used for generate element of power */
 struct power {
-  MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-    return powf( a, b );
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a, DType b) {
+    return DType(powf( a, b ));
   }
 };
 
 /*! \brief used for generate element of maximum */
 struct maximum {
-  MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-    return a > b ? a : b;
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a, DType b) {
+    return DType(a > b ? a : b);
   }
 };
 
 struct maximum_grad {
-  MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-    return a > b ? 1 : 0;
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a, DType b) {
+    return DType(a > b ? 1 : 0);
   }
 };
 
 /*! \brief used for generate element of minimum */
 struct minimum {
-  MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-    return a < b ? a : b;
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a, DType b) {
+    return DType(a < b ? a : b);
   }
 };
 struct minimum_grad  {
-  MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-    return a < b ? 1 : 0;
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a, DType b) {
+    return DType(a < b ? 1 : 0);
   }
 };
 
 /*!\ \brief used for generate element sqrt */
 struct square_root {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return sqrt(a);
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(sqrtf(a));
   }
 };
 
 struct square_root_grad {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return 0.5f / a;
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(0.5f / a);
   }
 };
 
 /*!\ \brief used for generate element sqrt */
 struct reciprocal_square_root {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return 1.0/sqrt(a);
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(1.0/sqrtf(a));
   }
 };
 
 struct reciprocal_square_root_grad {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return -(1.0 / (2.0 * a * sqrt(a)));
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(-(1.0 / (2.0 * a * sqrtf(a))));
   }
 };
 
 /*! \brief used for generate element of round */
 struct round {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return roundf(a);
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(roundf(a));
   }
 };
 
 /*! \brief used for generate element of ceil */
 struct ceil {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return ceilf(a);
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(ceilf(a));
   }
 };
 
 /*! \brief used for generate element of floor */
 struct floor {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return floorf(a);
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return DType(floorf(a));
   }
 };
 
 /*! \brief used for generate gradient of MAE loss*/
 struct minus_sign {
-  MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-    return a-b > 0.0f ? 1.0f : -1.0f;
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a, DType b) {
+    return DType(a-b > 0.0f ? 1.0f : -1.0f);
   }
 };
 

From f29723e6d800f7144080a54aa86154f8d4b76b64 Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Sat, 2 Jan 2016 20:58:24 -0800
Subject: [PATCH 08/32] restore softmax grad scaling

---
 include/mxnet/ndarray.h           | 3 +--
 src/operator/softmax_output-inl.h | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index f493e32c06cf..cdfdb4f01b1d 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -36,6 +36,7 @@ class MXNET_API NDArray {
    * \param shape the shape of array
    * \param ctx context of NDArray
    * \param delay_alloc whether delay the allocation
+   * \param dtype data type of this ndarray
    */
   NDArray(const TShape &shape, Context ctx,
           bool delay_alloc = false, int dtype = mshadow::default_type_flag)
@@ -202,7 +203,6 @@ class MXNET_API NDArray {
    *
    * \param data the data source to copy from.
    * \param size the size of the source array, in sizeof(DType) not raw btyes.
-   * \param dtype the data type of source array.
    */
   void SyncCopyFromCPU(const void *data, size_t size) const;
   /*!
@@ -214,7 +214,6 @@ class MXNET_API NDArray {
    *
    * \param data the data source to copyinto.
    * \param size the memory size we want to copy into, in sizeof(DType) not raw btyes.
-   * \param dtype the data type of target array.
    */
   void SyncCopyToCPU(void *data, size_t size) const;
   /*!
diff --git a/src/operator/softmax_output-inl.h b/src/operator/softmax_output-inl.h
index fb026df72e55..194de5cd0bfb 100644
--- a/src/operator/softmax_output-inl.h
+++ b/src/operator/softmax_output-inl.h
@@ -101,7 +101,7 @@ class SoftmaxOutputOp : public Operator {
       } else {
           SoftmaxGrad(grad, out, label);
       }
-      grad *= param_.grad_scale;
+      grad *= param_.grad_scale/s3[2];
     } else {
       Tensor<xpu, 1> label = in_data[softmaxout_enum::kLabel].get<xpu, 1, real_t>(s);
       Tensor<xpu, 2> out = out_data[softmaxout_enum::kOut].FlatTo2D<xpu, real_t>(s);

From 5da8558d87cbaa7a611094133b2e8628ce7b7179 Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Mon, 4 Jan 2016 23:25:05 -0800
Subject: [PATCH 09/32] symbolic backend type support

---
 include/mxnet/operator.h            | 55 +++++++++++++++++
 src/operator/operator_common.h      | 33 +++++++++++
 src/symbol/graph_executor.cc        | 23 +++++++-
 src/symbol/graph_executor.h         |  6 +-
 src/symbol/graph_memory_allocator.h | 21 ++++---
 src/symbol/static_graph.cc          | 91 +++++++++++++++++++++++++++++
 src/symbol/static_graph.h           | 14 +++++
 7 files changed, 230 insertions(+), 13 deletions(-)

diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h
index 4f15a63198af..8050854c674f 100644
--- a/include/mxnet/operator.h
+++ b/include/mxnet/operator.h
@@ -239,6 +239,43 @@ class OperatorProperty {
   virtual bool InferShape(std::vector<TShape> *in_shape,
                           std::vector<TShape> *out_shape,
                           std::vector<TShape> *aux_shape) const = 0;
+  /*!
+   * \brief infer the data types of outputs and unknown input arguments
+   * \param in_type the type of input arguments of the operator
+   *     this should be of same length as the vector returned by ListArguments
+   *     in_type allows unknown elements, which are indicated by -1.
+   *     For unknown types, InferType will try to fill in the correct type in in_type
+   *     For known types, InferType will check type consistency
+   *
+   *     common practice: set the type of data input, and usually weight's type can be inferred
+   *
+   * \param out_type the type of outputs of the operator
+   *     InferType will modify the vector to fill in the output types
+   * \param aux_type the type of auxiliary states of the operator
+   *     InferType will modify the vector to fill in the auxiliary state types
+   * \return true if the type inference is successful, false if there is not enough information.
+   * \throws dmlc::Error if the known arg_types are inconsistent.
+   */
+  virtual bool InferType(std::vector<int> *in_type,
+                          std::vector<int> *out_type,
+                          std::vector<int> *aux_type) {
+    CHECK_LE(in_type->size(), this->ListArguments().size());
+    int n_in = this->ListArguments().size();
+    for (unsigned i = 0; i < in_type->size(); ++i) {
+      CHECK_EQ(in_type->at(i), mshadow::default_type_flag);
+    }
+    in_type->clear();
+    for (int i = 0; i < n_in; ++i ) in_type->push_back(mshadow::default_type_flag);
+
+    int n_out = this->ListOutputs().size();
+    out_type->clear();
+    for (int i = 0; i < n_out; ++i ) out_type->push_back(mshadow::default_type_flag);
+
+    int n_aux = this->ListAuxiliaryStates().size();
+    aux_type->clear();
+    for (int i = 0; i < n_aux; ++i ) aux_type->push_back(mshadow::default_type_flag);
+    return true;
+  }
   /*!
    * \brief Copy this OperatorProperty.
    * \return a pointer to the copied OperatorProperty
@@ -248,6 +285,24 @@ class OperatorProperty {
    * \brief Create a Operator on specific context
    */
   virtual Operator* CreateOperator(Context ctx) const = 0;
+  /*!
+   * \brief Create an Operator on a specific context, given the input/output/aux types
+   */
+  virtual Operator* CreateOperatorEx(Context ctx,
+                                     std::vector<int> in_type,
+                                     std::vector<int> out_type,
+                                     std::vector<int> aux_type) {
+    for (unsigned i = 0; i < in_type.size(); ++i) {
+      CHECK_EQ(in_type[i], mshadow::default_type_flag);
+    }
+    for (unsigned i = 0; i < out_type.size(); ++i) {
+      CHECK_EQ(out_type[i], mshadow::default_type_flag);
+    }
+    for (unsigned i = 0; i < aux_type.size(); ++i) {
+      CHECK_EQ(aux_type[i], mshadow::default_type_flag);
+    }
+    return this->CreateOperator(ctx);
+  }
   /*!
    * \brief return the type string of the Operator
    *  subclasses override this function.
diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h
index 02de218dafbd..32a7365cef71 100644
--- a/src/operator/operator_common.h
+++ b/src/operator/operator_common.h
@@ -55,6 +55,17 @@ struct InferShapeError {
     : msg(msg), index(index) {}
 };
 
+/*! \brief exception thrown on InferType error */
+struct InferTypeError {
+  /*! \brief analyze message */
+  std::string msg;
+  /*! \brief corresponding input index */
+  int index;
+  // constructor
+  InferTypeError(std::string msg, int index)
+    : msg(msg), index(index) {}
+};
+
 /*!
  * \brief macro assign shape to out if out is unknown otherwise check consistency
  *  Use macro so we can see the error file more clearly
@@ -77,6 +88,28 @@ struct InferShapeError {
     }                                                                   \
   }
 
+/*!
+ * \brief macro assign type to out if out is unknown (-1) otherwise check consistency
+ *  Use macro so we can see the error file more clearly
+ * \param type_array the type array to store the result
+ * \param index the index in the array
+ * \param type the inferred type
+ */
+#define TYPE_ASSIGN_CHECK(type_array, index, type)                      \
+  {                                                                     \
+    auto &out = (type_array)[index];                                    \
+    if (out == -1) {                                                    \
+      out = type;                                                       \
+    } else {                                                            \
+      if (out != type) {                                                \
+        std::ostringstream os;                                          \
+        os << "Type inconsistent, Provided " <<  '='<< out << ','       \
+           << " inferred type=" << type;                                \
+        throw ::mxnet::op::InferTypeError(os.str(), index);             \
+      }                                                                 \
+    }                                                                   \
+  }
+
 // helper macro to implement bind dispatch
 #if MXNET_USE_CUDA
 #define DO_BIND_DISPATCH(Method, ...)                                \
diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc
index 2a1b4884d43e..cfca3a617abd 100644
--- a/src/symbol/graph_executor.cc
+++ b/src/symbol/graph_executor.cc
@@ -529,6 +529,22 @@ void GraphExecutor::InitDataEntryInfo(const std::vector<NDArray> &in_args,
       op_nodes_[i].outputs[j].shape = out_shapes[i][j];
     }
   }
+  // type inference
+  std::vector<std::vector<int> > out_types(op_nodes_.size());
+  std::vector<std::vector<int> > aux_types(op_nodes_.size());
+  for (size_t i = 0; i < out_types.size(); ++i) {
+    out_types[i].resize(op_nodes_[i].outputs.size(), -1);
+  }
+  for (size_t i = 0; i < graph_.arg_nodes.size(); ++i) {
+    out_types[graph_.arg_nodes[i]][0] = in_args[i].dtype();
+  }
+  CHECK(graph_.InferNodeTypes(topo_order_, &out_types, &aux_types))
+      << "Type inference cannot be complete in bind";
+  for (size_t i = 0; i < out_types.size(); ++i) {
+    for (size_t j = 0; j < out_types[i].size(); ++j) {
+      op_nodes_[i].outputs[j].type_flag = out_types[i][j];
+    }
+  }
   // bind aux args
   size_t aux_ndarray_idx = 0;
   for (auto i : topo_order_) {
@@ -536,6 +552,7 @@ void GraphExecutor::InitDataEntryInfo(const std::vector<NDArray> &in_args,
     for (size_t j = 0; j < aux_shapes[i].size(); ++j) {
       DataEntryInfo &info = op_nodes_[i].aux_states[j];
       info.shape = aux_shapes[i][j];
+      info.type_flag = aux_types[i][j];
       info.type = kBindByExternal;
       if (mirror_source_map_.count(i) == 0) {
         if (graph_.nodes[i].backward_source_id == -1) {
@@ -614,7 +631,7 @@ void GraphExecutor::InitDataEntryMemory() {
       }
       if (out->type == kNotInitialized) {
         out->storage_id = allocator.Request(
-            op_nodes_[nid].ctx, out->shape, nid);
+            op_nodes_[nid].ctx, out->type_flag, out->shape, nid);
         out->type = kInternalAllocated;
       }
     }
@@ -639,7 +656,7 @@ void GraphExecutor::InitDataEntryMemory() {
     }
   }
   // one pass complete, allocate real memory
-  this->total_allocated_reals_ = allocator.InitStorages();
+  this->total_allocated_bytes_ = allocator.InitStorages();
   // get the real data NDArray into the DataEntryInfo
   for (size_t i = 0; i < topo_order_.size(); ++i) {
     uint32_t nid = topo_order_[i];
@@ -825,7 +842,7 @@ void GraphExecutor::Print(std::ostream &os) const {
       os << '\n';
     }
   }
-  os << "Total " << (total_allocated_reals_ >> 18UL) <<" MB allocated\n";
+  os << "Total " << (total_allocated_bytes_ >> 20UL) <<" MB allocated\n";
   os << "Total " << total_allocated_temp_ <<" TempSpace resource requested\n";
 }
 
diff --git a/src/symbol/graph_executor.h b/src/symbol/graph_executor.h
index ba218e330c3f..61d873538566 100644
--- a/src/symbol/graph_executor.h
+++ b/src/symbol/graph_executor.h
@@ -87,6 +87,8 @@ class GraphExecutor : public Executor {
     DataEntryType type;
     // shape of this entry
     TShape shape;
+    // data type of this entry
+    int type_flag;
     // storage id from allocator if it is internal allocation.
     GraphStorageAllocator::StorageID storage_id;
     // reference count on how many times this entry is being used.
@@ -216,8 +218,8 @@ class GraphExecutor : public Executor {
   std::vector<uint32_t> topo_order_;
   // whether to enable inplace space
   bool enable_inplace_allocation_;
-  // total allocated space in #reals
-  size_t total_allocated_reals_;
+  // total allocated space in bytes
+  size_t total_allocated_bytes_;
   // total allocated temp space
   size_t total_allocated_temp_;
   // number of forward nodes in the graph
diff --git a/src/symbol/graph_memory_allocator.h b/src/symbol/graph_memory_allocator.h
index dba317fbd376..d0b910708ff2 100644
--- a/src/symbol/graph_memory_allocator.h
+++ b/src/symbol/graph_memory_allocator.h
@@ -46,7 +46,7 @@ class GraphStorageAllocator {
    * \param shape shape of the NDArray we want
    * \param node_id the node that is requesting the memory, used as hint.
    */
-  StorageID Request(Context ctx, TShape shape, uint32_t node_id);
+  StorageID Request(Context ctx, int type_flag, TShape shape, uint32_t node_id);
   /*!
    * \brief Release a memory.
    * \param id the storage ID of the memory.
@@ -72,6 +72,8 @@ class GraphStorageAllocator {
     StorageID id;
     /*! \brief the context of the storage */
     Context ctx;
+    /*! \brief the data type enum of the storage */
+    int type_flag;
     /*! \brief maximum size of the storage that is requested */
     size_t max_size;
     /*! \brief node index that released it last time */
@@ -86,7 +88,7 @@ class GraphStorageAllocator {
    * \param ctx the context of the graph
    * \param shape shape of the NDArray we want
    */
-  StorageID Alloc(Context ctx, size_t size);
+  StorageID Alloc(Context ctx, int type_flag, size_t size);
   /*!
    * \brief Initialize the colors of graph nodes.
    * \param topo_order the topological order in the graph.
@@ -137,21 +139,22 @@ void GraphStorageAllocator::InitColor(const std::vector<uint32_t>& topo_order) {
 }
 
 GraphStorageAllocator::StorageID
-GraphStorageAllocator::Alloc(Context ctx, size_t size) {
+GraphStorageAllocator::Alloc(Context ctx, int type_flag, size_t size) {
   StorageID id = static_cast<StorageID>(data_.size());
   std::unique_ptr<StorageEntry> ptr(new StorageEntry());
   ptr->id = id;
   ptr->ctx = ctx;
+  ptr->type_flag = type_flag;
   ptr->max_size = size;
   data_.push_back(std::move(ptr));
   return id;
 }
 
 GraphStorageAllocator::StorageID
-GraphStorageAllocator::Request(Context ctx, TShape shape, uint32_t node_id) {
+GraphStorageAllocator::Request(Context ctx, int type_flag, TShape shape, uint32_t node_id) {
   // search memory block in [size / match_range_, size * match_range_)
   size_t size = shape.Size();
-  if (match_range_ == 0) return this->Alloc(ctx, size);
+  if (match_range_ == 0) return this->Alloc(ctx, type_flag, size);
   auto begin = free_.lower_bound(size / match_range_);
   auto mid = free_.lower_bound(size);
   auto end = free_.upper_bound(size * match_range_);
@@ -160,6 +163,7 @@ GraphStorageAllocator::Request(Context ctx, TShape shape, uint32_t node_id) {
   for (auto it = mid; it != end; ++it) {
     StorageEntry *e = it->second;
     if (e->ctx != ctx) continue;
+    if (e->type_flag != type_flag) continue;
     if (node_color_[e->released_by_node] != node_color_[node_id]) continue;
     // Use exect matching strategy
     e->max_size = std::max(size, e->max_size);
@@ -172,6 +176,7 @@ GraphStorageAllocator::Request(Context ctx, TShape shape, uint32_t node_id) {
     --it;
     StorageEntry *e = it->second;
     if (e->ctx != ctx) continue;
+    if (e->type_flag != type_flag) continue;
     if (node_color_[e->released_by_node] != node_color_[node_id]) continue;
     // Use exect matching strategy
     e->max_size = std::max(size, e->max_size);
@@ -180,7 +185,7 @@ GraphStorageAllocator::Request(Context ctx, TShape shape, uint32_t node_id) {
     return e->id;
   }
   // cannot find anything return a new one.
-  return this->Alloc(ctx, size);
+  return this->Alloc(ctx, type_flag, size);
 }
 
 void GraphStorageAllocator::Release(StorageID id, uint32_t node_id) {
@@ -195,8 +200,8 @@ size_t GraphStorageAllocator::InitStorages() {
   for (size_t i = 0; i < data_.size(); ++i) {
     StorageEntry *e = data_[i].get();
     TShape shape = mshadow::Shape1(e->max_size);
-    e->data = NDArray(shape, e->ctx);
-    total += e->max_size;
+    e->data = NDArray(shape, e->ctx, false, e->type_flag);
+    total += e->max_size * mshadow::mshadow_sizeof(e->type_flag);
   }
   return total;
 }
diff --git a/src/symbol/static_graph.cc b/src/symbol/static_graph.cc
index 0ad7de4d6c55..f5f9c9db78c7 100644
--- a/src/symbol/static_graph.cc
+++ b/src/symbol/static_graph.cc
@@ -161,6 +161,97 @@ bool StaticGraph::InferNodeShapes(const std::vector<uint32_t> &topo_order,
   return true;
 }
 
+bool StaticGraph::InferNodeTypes(const std::vector<uint32_t> &topo_order,
+                                  std::vector<std::vector<int> > *node_out_types,
+                                  std::vector<std::vector<int> > *node_aux_types) const {
+  for (uint32_t nid : topo_order) {
+    const Node& node = nodes[nid];
+    if (node.is_forward()) {
+      std::vector<int> in_type;
+      for (const DataEntry& e : node.inputs) {
+        in_type.push_back((*node_out_types)[e.source_id][e.index]);
+      }
+      try {
+        if (!node.op->InferType(&in_type,
+                                 &(*node_out_types)[nid],
+                                 &(*node_aux_types)[nid])) return false;
+      } catch (const op::InferTypeError &err) {
+        // error handling
+        const std::string &op_name = node.name;
+        std::string arg_name = node.op->ListArguments()[err.index];
+        std::ostringstream os;
+        os << "InferType Error in "
+           << op_name << "\'s" << ' ' << arg_name << " argument\n";
+        auto &source = nodes[node.inputs[err.index].source_id];
+        if (source.is_variable()) {
+          os << "Corresponding keyword of symbol: " << source.name << '\n' << err.msg;
+        }
+        throw dmlc::Error(os.str());
+      }
+      for (size_t i = 0; i < node.inputs.size(); ++i) {
+        const DataEntry& e = node.inputs[i];
+        (*node_out_types)[e.source_id][e.index] = in_type[i];
+      }
+    } else if (nodes[nid].is_backward()) {
+      // simply use types from forward pass to assign backward type
+      const Node& forward = nodes[node.backward_source_id];
+      CHECK(forward.is_forward());
+      std::vector<int>& in_grad_types = (*node_out_types)[nid];
+      CHECK(in_grad_types.size() == forward.inputs.size());
+      // assign the input type to output gradients
+      for (size_t i = 0; i < forward.inputs.size(); ++i) {
+        const DataEntry &e = forward.inputs[i];
+        try {
+          TYPE_ASSIGN_CHECK(in_grad_types, i, (*node_out_types)[e.source_id][e.index]);
+        } catch (const op::InferTypeError &err) {
+          const std::string &op_name = forward.name;
+          std::string arg_name = forward.op->ListArguments()[e.index];
+          std::ostringstream os;
+          os << "InferType Error in "
+             << op_name << "\'s" << ' ' << arg_name << " gradient argument\n"
+             << err.msg;
+          throw dmlc::Error(os.str());
+        }
+      }
+      // consistent check for input types
+      auto& out_data_types = (*node_out_types)[node.backward_source_id];
+      // use BackwardInputs to select entries corresponding to node.inputs
+      auto in_type = forward.op->BackwardInputs(
+          out_data_types, in_grad_types, out_data_types);
+      for (size_t i = 0; i < node.inputs.size(); ++i) {
+        const DataEntry& e = node.inputs[i];
+        try {
+          TYPE_ASSIGN_CHECK((*node_out_types)[e.source_id], e.index, in_type[i]);
+        } catch (const op::InferTypeError &err) {
+          const std::string &op_name = nodes[e.source_id].name;
+          std::ostringstream os;
+          os << "InferType Error in "
+             << op_name << "\'s" << " gradient values\n"
+             << err.msg;
+          throw dmlc::Error(os.str());
+        }
+      }
+
+      // set the auxiliary state types.
+      auto& source_aux_types = (*node_aux_types)[node.backward_source_id];
+      for (size_t i = 0; i < source_aux_types.size(); ++i) {
+        try {
+          (*node_aux_types)[nid].push_back(source_aux_types[i]);
+        } catch (const op::InferTypeError &err) {
+          const std::string &op_name = nodes[nid].name;
+          std::ostringstream os;
+          os << "InferType Error in "
+             << op_name << "\'s" << " aux states\n"
+             << err.msg;
+          throw dmlc::Error(os.str());
+        }
+      }
+    }
+  }
+  // TODO(bing) assign type for head gradient
+  return true;
+}
+
 bool StaticGraph::InferShape(std::vector<TShape> *in_shape,
                              std::vector<TShape> *out_shape,
                              std::vector<TShape> *aux_shape) const {
diff --git a/src/symbol/static_graph.h b/src/symbol/static_graph.h
index 639a47d8a4a7..a4b433153033 100644
--- a/src/symbol/static_graph.h
+++ b/src/symbol/static_graph.h
@@ -199,6 +199,20 @@ class StaticGraph {
   bool InferNodeShapes(const std::vector<uint32_t> &topo_order,
                        std::vector<std::vector<TShape> > *node_out_shapes,
                        std::vector<std::vector<TShape> > *node_aux_shapes) const;
+  /*!
+   * \brief infer the node types in the computation graph.
+   *
+   *  When calling this function, user can set the known type information into the right position.
+   *  Unknown types are indicated by -1.
+   *
+   * \param topo_order The topological order of node index, as created by TopoSort.
+   * \param node_out_types The types of each output of the nodes in the graph.
+   * \param node_aux_types The types of each auxiliary state of the nodes in the graph.
+   * \return if the type inference is successful, return true, else return false.
+   */
+  bool InferNodeTypes(const std::vector<uint32_t> &topo_order,
+                       std::vector<std::vector<int> > *node_out_types,
+                       std::vector<std::vector<int> > *node_aux_types) const;
   /*!
    * \brief infer the shapes of outputs and unknown input arguments
    * \param in_shape the shape of input arguments of the operator

From a0b74a97c28089b028f340cfd7ba3fc913f0e3d1 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 8 Jan 2016 08:06:24 -0800
Subject: [PATCH 10/32] [IGNORE] add ipynb and input text

---
 .gitignore | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index c794bbaeef7e..89c5c7af2b56 100644
--- a/.gitignore
+++ b/.gitignore
@@ -82,12 +82,12 @@ R-package/inst/*
 *.bin
 
 # ipython notebook
-example/notebooks/.ipynb_checkpoints/*
 *_pb2.py
+*.ipynb_checkpoints*
+input.txt*
 
 # Jetbrain
 .idea
 
 # ctags
-tags
-
+tags
\ No newline at end of file

From 1bead5bd25ecf53155c0e6c041f0311b67170e89 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 8 Jan 2016 08:09:11 -0800
Subject: [PATCH 11/32] [OP] Fix load of scalar binary op

---
 Makefile                                        | 3 ++-
 src/operator/elementwise_binary_scalar_op-inl.h | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 08356bb02f79..d09e92a9b685 100644
--- a/Makefile
+++ b/Makefile
@@ -145,6 +145,8 @@ $(EXTRA_OPERATORS)/build/%_gpu.o: $(EXTRA_OPERATORS)/%.cu
 	$(NVCC) $(NVCCFLAGS) -Xcompiler "$(CFLAGS) -Isrc/operator" -M -MT $(EXTRA_OPERATORS)/build/$*_gpu.o $< >$(EXTRA_OPERATORS)/build/$*_gpu.d
 	$(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS) -Isrc/operator" $<
 
+# NOTE: to statically link libmxnet.a we need the option
+# -Wl,--whole-archive -lmxnet -Wl,--no-whole-archive
 lib/libmxnet.a: $(ALL_DEP)
 	@mkdir -p $(@D)
 	ar crv $@ $(filter %.o, $?)
@@ -153,7 +155,6 @@ lib/libmxnet.so: $(ALL_DEP)
 	@mkdir -p $(@D)
 	$(CXX) $(CFLAGS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
 
-# ps-lite
 $(PS_PATH)/build/libps.a:
 	$(MAKE) CXX=$(CXX) DEPS_PATH=$(DEPS_PATH) -C $(PS_PATH) ps
 	ln -fs $(PS_PATH)/tracker .
diff --git a/src/operator/elementwise_binary_scalar_op-inl.h b/src/operator/elementwise_binary_scalar_op-inl.h
index 3a35cfba2232..dc1d60b485cd 100644
--- a/src/operator/elementwise_binary_scalar_op-inl.h
+++ b/src/operator/elementwise_binary_scalar_op-inl.h
@@ -252,7 +252,7 @@ class ElementwiseBinaryScalarOpProp : public OperatorProperty {
     param_.Init(kwargs);
   }
   std::map<std::string, std::string> GetParams() const override {
-    return std::map<std::string, std::string>();
+    return param_.__DICT__();
   }
 
   bool InferShape(std::vector<TShape> *in_shape,

From d2888515145fcf50ada7942aa6419dcf2a8fc56b Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Tue, 5 Jan 2016 23:30:59 -0800
Subject: [PATCH 12/32] python front end symbolic type support and cast op

---
 include/mxnet/c_api.h                 |  29 +++++
 include/mxnet/operator.h              |  21 +---
 include/mxnet/symbolic.h              |  32 ++++++
 python/mxnet/symbol.py                | 109 ++++++++++++++++--
 src/c_api/c_api.cc                    |  44 ++++++++
 src/operator/cast-inl.h               | 154 ++++++++++++++++++++++++++
 src/operator/cast.cc                  |  31 ++++++
 src/operator/cast.cu                  |  18 +++
 src/symbol/graph_executor.cc          |   4 +
 src/symbol/static_graph.cc            |  51 +++++++++
 src/symbol/static_graph.h             |  20 ++++
 src/symbol/symbol.cc                  |  36 +++++-
 tests/python/unittest/test_ndarray.py |  51 +++++----
 tests/python/unittest/test_symbol.py  |  13 ++-
 14 files changed, 561 insertions(+), 52 deletions(-)
 create mode 100644 src/operator/cast-inl.h
 create mode 100644 src/operator/cast.cc
 create mode 100644 src/operator/cast.cu

diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index e31461cfe12d..ba182570180a 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -646,6 +646,35 @@ MXNET_DLL int MXSymbolInferShape(SymbolHandle sym,
                                  const mx_uint **aux_shape_ndim,
                                  const mx_uint ***aux_shape_data,
                                  int *complete);
+/*!
+ * \brief infer type of unknown input types given the known one.
+ *  The types are passed as a flat array, arg_type_data, with one type flag per argument.
+ *  The call will be treated as a kwargs call if keys != nullptr or num_args==0, otherwise it is positional.
+ *
+ * \param sym symbol handle
+ * \param num_args number of input arguments.
+ * \param keys the key of keyword args (optional)
+ * \param arg_type_data the type flag of each input argument
+ * \param in_type_size sizeof the returning array of in_types
+ * \param in_type_data returning array of pointers to head of the input type.
+ * \param out_type_size sizeof the returning array of out_types
+ * \param out_type_data returning array of pointers to head of the output type.
+ * \param aux_type_size sizeof the returning array of aux_types
+ * \param aux_type_data returning array of pointers to head of the auxiliary type.
+ * \param complete whether infer type completes or more information is needed.
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXSymbolInferType(SymbolHandle sym,
+                                mx_uint num_args,
+                                const char** keys,
+                                const int *arg_type_data,
+                                mx_uint *in_type_size,
+                                const int **in_type_data,
+                                mx_uint *out_type_size,
+                                const int **out_type_data,
+                                mx_uint *aux_type_size,
+                                const int **aux_type_data,
+                                int *complete);
 //--------------------------------------------
 // Part 4: Executor interface
 //--------------------------------------------
diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h
index 8050854c674f..0722e8f5750f 100644
--- a/include/mxnet/operator.h
+++ b/include/mxnet/operator.h
@@ -262,7 +262,8 @@ class OperatorProperty {
     CHECK_LE(in_type->size(), this->ListArguments().size());
     int n_in = this->ListArguments().size();
     for (unsigned i = 0; i < in_type->size(); ++i) {
-      CHECK_EQ(in_type->at(i), mshadow::default_type_flag);
+      CHECK(in_type->at(i) == mshadow::default_type_flag ||
+            in_type->at(i) == -1) << "Unsupported data type " << in_type->at(i);
     }
     in_type->clear();
     for (int i = 0; i < n_in; ++i ) in_type->push_back(mshadow::default_type_flag);
@@ -285,24 +286,6 @@ class OperatorProperty {
    * \brief Create a Operator on specific context
    */
   virtual Operator* CreateOperator(Context ctx) const = 0;
-  /*!
-   * \brief Create a Operator on specific context
-   */
-  virtual Operator* CreateOperatorEx(Context ctx,
-                                     std::vector<int> in_type,
-                                     std::vector<int> out_type,
-                                     std::vector<int> aux_type) {
-    for (unsigned i = 0; i < in_type.size(); ++i) {
-      CHECK_EQ(in_type[i], mshadow::default_type_flag);
-    }
-    for (unsigned i = 0; i < out_type.size(); ++i) {
-      CHECK_EQ(out_type[i], mshadow::default_type_flag);
-    }
-    for (unsigned i = 0; i < aux_type.size(); ++i) {
-      CHECK_EQ(aux_type[i], mshadow::default_type_flag);
-    }
-    return this->CreateOperator(ctx);
-  }
   /*!
    * \brief return the type string of the Operator
    *  subclasses override this function.
diff --git a/include/mxnet/symbolic.h b/include/mxnet/symbolic.h
index e61e886aab32..0d9acfebc8e7 100644
--- a/include/mxnet/symbolic.h
+++ b/include/mxnet/symbolic.h
@@ -162,6 +162,38 @@ class Symbol {
                   std::vector<TShape> *arg_shapes,
                   std::vector<TShape> *out_shapes,
                   std::vector<TShape> *aux_shapes) const;
+
+  /*!
+   * \brief infer the types of outputs and unknown input arguments
+   * \param arg_types the type of input arguments of the operator
+   *     this should be of same length as the vector returned by ListArguments
+   *     arg_types allows unknown elements, which are marked by -1.
+   *     For unknown types, Infertype will try to fill in the correct type in arg_types
+   *     For known types, Infertype will check type consistency
+   *
+   *     common practice: set the type of data input, and usually weight's type can be infered
+   *
+   * \param out_types Use to store the infered types of outputs.
+   * \param aux_types Use to store the infered types of auxiliary states
+   * \return true if the type inference is successful, false if there is not enough information.
+   * \throws dmlc::Error if the known arg_types are inconsistent.
+   */
+  bool InferType(std::vector<int> *arg_types,
+                  std::vector<int> *out_types,
+                  std::vector<int> *aux_types) const;
+  /*!
+   * \brief infer the types by providing types of known arguments.
+   * \param known_arg_types map of argument name to type of arguments with known types.
+   * \param arg_types used to store infered types of arguments.
+   * \param out_types used to store infered types of outputs.
+   * \param aux_types Use to store the infered types of auxiliary states
+   * \return true if the type inference is successful, false if there is not enough information.
+   * \throws dmlc::Error if the known arg_types are inconsistent.
+   */
+  bool InferType(const std::unordered_map<std::string, int> &known_arg_types,
+                  std::vector<int> *arg_types,
+                  std::vector<int> *out_types,
+                  std::vector<int> *aux_types) const;
   /*!
    * \brief interface for json serialization.
    * \param writer the JSON writer write json.
diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py
index 331e489f9c95..8d0912ffaab7 100644
--- a/python/mxnet/symbol.py
+++ b/python/mxnet/symbol.py
@@ -1,5 +1,5 @@
 # coding: utf-8
-# pylint: disable=invalid-name, protected-access, too-many-arguments
+# pylint: disable=invalid-name, protected-access, too-many-arguments, too-many-lines
 """Symbolic configuration API of mxnet."""
 from __future__ import absolute_import
 
@@ -7,14 +7,15 @@
 import ctypes
 from numbers import Number
 import sys
+import numpy
 from .base import _LIB
-from .base import c_array, c_str, mx_uint, py_str, string_types
+from .base import c_array, c_str, mx_uint, py_str, string_types, mx_real_t
 from .base import NDArrayHandle, ExecutorHandle, SymbolHandle
 from .base import check_call, ctypes2docstring
 from .name import NameManager
 from .attribute import AttrScope
 from .context import Context
-from .ndarray import NDArray, zeros
+from .ndarray import NDArray, zeros, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP
 from .executor import Executor
 
 
@@ -299,6 +300,87 @@ def list_auxiliary_states(self):
             self.handle, ctypes.byref(size), ctypes.byref(sarr)))
         return [py_str(sarr[i]) for i in range(size.value)]
 
+    def infer_type(self, *args, **kwargs):
+        """Infer the type of outputs and arguments of given known types of arguments.
+
+        User can either pass in the known types in positional way or keyword argument way.
+        Tuple of Nones is returned if there is not enough information passed in.
+        An error will be raised if there is inconsistency found in the known types passed in.
+
+        Parameters
+        ----------
+        *args :
+            Provide type of arguments in a positional way.
+            Unknown type can be marked as None
+
+        **kwargs :
+            Provide keyword arguments of known types.
+
+        Returns
+        -------
+        arg_types : list of numpy.dtype or None
+            List of types of arguments.
+            The order is in the same order as list_arguments()
+        out_types : list of numpy.dtype or None
+            List of types of outputs.
+            The order is in the same order as list_outputs()
+        aux_types : list of numpy.dtype or None
+            List of types of auxiliary states.
+            The order is in the same order as list_auxiliary_states()
+        """
+        # pylint: disable=too-many-locals
+        if len(args) != 0 and len(kwargs) != 0:
+            raise ValueError('Can only specify known argument \
+                    types either by positional or kwargs way.')
+        sdata = []
+        if len(args) != 0:
+            keys = None
+            for s in args:
+                if s is not None:
+                    s = numpy.dtype(s).type
+                    if s not in _DTYPE_NP_TO_MX:
+                        raise TypeError('Argument need to be one of '+str(_DTYPE_NP_TO_MX))
+                    sdata.append(_DTYPE_NP_TO_MX[s])
+                else:
+                    sdata.append(-1)
+        else:
+            keys = []
+            for k, v in kwargs.items():
+                v = numpy.dtype(v).type
+                if v in _DTYPE_NP_TO_MX:
+                    keys.append(c_str(k))
+                    sdata.append(_DTYPE_NP_TO_MX[v])
+        arg_type_size = mx_uint()
+        arg_type_data = ctypes.POINTER(ctypes.c_int)()
+        out_type_size = mx_uint()
+        out_type_data = ctypes.POINTER(ctypes.c_int)()
+        aux_type_size = mx_uint()
+        aux_type_data = ctypes.POINTER(ctypes.c_int)()
+        complete = ctypes.c_int()
+        check_call(_LIB.MXSymbolInferType(
+            self.handle,
+            mx_uint(len(sdata)),
+            c_array(ctypes.c_char_p, keys),
+            c_array(ctypes.c_int, sdata),
+            ctypes.byref(arg_type_size),
+            ctypes.byref(arg_type_data),
+            ctypes.byref(out_type_size),
+            ctypes.byref(out_type_data),
+            ctypes.byref(aux_type_size),
+            ctypes.byref(aux_type_data),
+            ctypes.byref(complete)))
+        if complete.value != 0:
+            arg_types = [
+                _DTYPE_MX_TO_NP[arg_type_data[i]] for i in range(arg_type_size.value)]
+            out_types = [
+                _DTYPE_MX_TO_NP[out_type_data[i]] for i in range(out_type_size.value)]
+            aux_types = [
+                _DTYPE_MX_TO_NP[aux_type_data[i]] for i in range(aux_type_size.value)]
+            return (arg_types, out_types, aux_types)
+        else:
+            return (None, None, None)
+        # pylint: enable=too-many-locals
+
     def infer_shape(self, *args, **kwargs):
         """Infer the shape of outputs and arguments of given known shapes of arguments.
 
@@ -491,8 +573,9 @@ def _get_ndarray_inputs(arg_key, args, arg_names, allow_missing):
             raise TypeError('Only Accept list of NDArrays or dict of str to NDArray')
         return c_array(NDArrayHandle, arg_handles), arg_arrays
 
-    def simple_bind(self, ctx, grad_req='write', **kwargs):
+    def simple_bind(self, ctx, grad_req='write', type_dict=None, **kwargs):
         """Bind current symbol to get an executor, allocate all the ndarrays needed.
+        Allows specifying data types.
 
         This function will ask user to pass in ndarray of position
         they like to bind to, and it will automatically allocate the ndarray
@@ -508,6 +591,8 @@ def simple_bind(self, ctx, grad_req='write', **kwargs):
             - 'write' means everytime gradient is write to specified args_grad NDArray.
             - 'add' means everytime gradient is add to the specified NDArray.
             - 'null' means no action is taken, the gradient may not be calculated.
+        type_dict  : dict of str->numpy.dtype
+            Input type dictionary, name->dtype
         kwargs : dict of str->shape
             Input shape dictionary, name->shape
 
@@ -516,21 +601,27 @@ def simple_bind(self, ctx, grad_req='write', **kwargs):
         executor : mxnet.Executor
             The generated Executor
         """
+        # pylint: disable=too-many-locals
+        if type_dict is None:
+            type_dict = {k: mx_real_t for k in self.list_arguments()}
         arg_shapes, _, aux_shapes = self.infer_shape(**kwargs)
-        if arg_shapes == None:
+        arg_types, _, aux_types = self.infer_type(**type_dict)
+        if arg_shapes == None or arg_types == None:
             raise ValueError("Input node is not complete")
         # alloc space
-        arg_ndarrays = [zeros(shape, ctx) for shape in arg_shapes]
+        arg_ndarrays = [zeros(shape, ctx, dtype=dtype)for dtype, shape in zip(arg_types,
+                                                                              arg_shapes)]
 
         if grad_req != 'null':
             grad_ndarrays = {}
-            for name, shape in zip(self.list_arguments(), arg_shapes):
+            for name, shape, dtype in zip(self.list_arguments(), arg_shapes, arg_types):
                 if not (name.endswith('data') or name.endswith('label')):
-                    grad_ndarrays[name] = zeros(shape, ctx)
+                    grad_ndarrays[name] = zeros(shape, ctx, dtype=dtype)
         else:
             grad_ndarrays = None
 
-        aux_ndarrays = [zeros(shape, ctx) for shape in aux_shapes]
+        aux_ndarrays = [zeros(shape, ctx, dtype=dtype) for shape, dtype in zip(aux_shapes,
+                                                                               aux_types)]
         executor = self.bind(ctx, arg_ndarrays, grad_ndarrays, grad_req, aux_ndarrays)
         return executor
 
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 714950776ec5..9018e02fd866 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -41,6 +41,8 @@ struct MXAPIThreadLocalEntry {
   std::vector<void *> ret_handles;
   /*! \brief result holder for returning shapes */
   std::vector<TShape> arg_shapes, out_shapes, aux_shapes;
+  /*! \brief result holder for returning type flags */
+  std::vector<int> arg_types, out_types, aux_types;
   /*! \brief result holder for returning shape dimensions */
   std::vector<mx_uint> arg_shape_ndim, out_shape_ndim, aux_shape_ndim;
   /*! \brief result holder for returning shape pointer */
@@ -731,6 +733,48 @@ int MXSymbolInferShape(SymbolHandle sym,
   API_END();
 }
 
+int MXSymbolInferType(SymbolHandle sym,
+                      mx_uint num_args,
+                      const char** keys,
+                      const int *arg_type_data,
+                      mx_uint *in_type_size,
+                      const int **in_type_data,
+                      mx_uint *out_type_size,
+                      const int **out_type_data,
+                      mx_uint *aux_type_size,
+                      const int **aux_type_data,
+                      int *complete) {
+  Symbol *s = static_cast<Symbol*>(sym);
+  MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
+  bool succ;
+  API_BEGIN();
+  if (keys == nullptr && num_args != 0) {
+    ret->arg_types.clear();
+    for (mx_uint i = 0; i < num_args; ++i) {
+      ret->arg_types.push_back(arg_type_data[i]);
+    }
+    succ = s->InferType(&(ret->arg_types), &(ret->out_types), &(ret->aux_types));
+  } else {
+    std::unordered_map<std::string, int> kwargs;
+    for (mx_uint i = 0; i < num_args; ++i) {
+      kwargs[keys[i]] = arg_type_data[i];
+    }
+    succ = s->InferType(kwargs, &(ret->arg_types), &(ret->out_types), &(ret->aux_types));
+  }
+  if (succ) {
+    *in_type_size = static_cast<mx_uint>(ret->arg_types.size());
+    *in_type_data = dmlc::BeginPtr(ret->arg_types);
+    *out_type_size = static_cast<mx_uint>(ret->out_types.size());
+    *out_type_data = dmlc::BeginPtr(ret->out_types);
+    *aux_type_size = static_cast<mx_uint>(ret->aux_types.size());
+    *aux_type_data = dmlc::BeginPtr(ret->aux_types);
+    *complete = 1;
+  } else {
+    *complete = 0;
+  }
+  API_END();
+}
+
 int MXExecutorPrint(ExecutorHandle handle, const char **out_str) {
   Executor *exec = static_cast<Executor*>(handle);
   MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
diff --git a/src/operator/cast-inl.h b/src/operator/cast-inl.h
new file mode 100644
index 000000000000..b463f65c4f67
--- /dev/null
+++ b/src/operator/cast-inl.h
@@ -0,0 +1,154 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file cast-inl.h
+ * \brief cast operator
+ * \author Junyuan Xie
+*/
+#ifndef MXNET_OPERATOR_CAST_INL_H_
+#define MXNET_OPERATOR_CAST_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+#include <utility>
+#include "./operator_common.h"
+
+namespace mxnet {
+namespace op {
+// Declare enumeration of input order to make code more intuitive.
+// These enums are only visible within this header
+namespace cast {
+enum CastOpInputs {kData};
+enum CastOpOutputs {kOut};
+}  // cast
+
+struct CastParam : public dmlc::Parameter<CastParam> {
+  // use int for enumeration
+  int dtype;
+  DMLC_DECLARE_PARAMETER(CastParam) {
+    DMLC_DECLARE_FIELD(dtype)
+    .add_enum("float32", mshadow::kFloat32)
+    .add_enum("float64", mshadow::kFloat64)
+    .add_enum("float16", mshadow::kFloat16)
+    .add_enum("uint8", mshadow::kUint8)
+    .add_enum("int32", mshadow::kInt32)
+    .describe("Target data type.");
+  }
+};
+
+/**
+ * \brief This is the implementation of cast operator.
+ * \tparam xpu The device that the op will be executed on.
+ */
+template<typename xpu>
+class CastOp : public Operator {
+ public:
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 1);
+    CHECK_EQ(out_data.size(), 1);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    MSHADOW_TYPE_SWITCH(in_data[cast::kData].type_flag_, SrcDType, {
+      MSHADOW_TYPE_SWITCH(out_data[cast::kOut].type_flag_, DstDType, {
+        Tensor<xpu, 2, SrcDType> data = in_data[cast::kData].FlatTo2D<xpu, SrcDType>(s);
+        Tensor<xpu, 2, DstDType> out = out_data[cast::kOut].FlatTo2D<xpu, DstDType>(s);
+        Assign(out, req[cast::kOut], tcast<DstDType>(data));
+      })
+    })
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1);
+    CHECK_EQ(in_grad.size(), 1);
+    CHECK_EQ(req.size(), 1);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    MSHADOW_TYPE_SWITCH(in_grad[cast::kData].type_flag_, SrcDType, {
+      MSHADOW_TYPE_SWITCH(out_grad[cast::kOut].type_flag_, DstDType, {
+        Tensor<xpu, 2, DstDType> m_out_grad = out_grad[cast::kOut].FlatTo2D<xpu, DstDType>(s);
+        Tensor<xpu, 2, SrcDType> m_in_grad = in_grad[cast::kData].FlatTo2D<xpu, SrcDType>(s);
+        Assign(m_in_grad, req[cast::kData], tcast<SrcDType>(m_out_grad));
+      })
+    })
+  }
+};  // class CastOp
+
+// Declare Factory function, used for dispatch specialization
+template<typename xpu>
+Operator* CreateOp(CastParam param);
+
+#if DMLC_USE_CXX11
+class CastProp : public OperatorProperty {
+ public:
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.Init(kwargs);
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 1) << "Input:[data]";
+    const TShape &dshape = in_shape->at(cast::kData);
+    if (dshape.ndim() == 0) return false;
+    out_shape->clear();
+    out_shape->push_back(dshape);
+    return true;
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {  // override the base default
+    CHECK_EQ(in_type->size(), 1);  // Cast takes exactly one input: data
+    out_type->clear();
+    out_type->push_back(param_.dtype);  // output type is the requested cast target
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new CastProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "Cast";
+  }
+
+  // declare dependency and inplace optimization options
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return {out_grad[cast::kOut]};
+  }
+
+  Operator* CreateOperator(Context ctx) const override;
+
+ private:
+  CastParam param_;
+};
+#endif  // DMLC_USE_CXX11
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_CAST_INL_H_
diff --git a/src/operator/cast.cc b/src/operator/cast.cc
new file mode 100644
index 000000000000..c0e3de576f33
--- /dev/null
+++ b/src/operator/cast.cc
@@ -0,0 +1,31 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file cast.cc
+ * \brief cast op
+ * \author Junyuan Xie
+*/
+#include "./cast-inl.h"
+#include "./mshadow_op.h"
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<cpu>(CastParam param) {
+  return new CastOp<cpu>();
+}
+
+// DO_BIND_DISPATCH comes from operator_common.h
+Operator *CastProp::CreateOperator(Context ctx) const {
+  DO_BIND_DISPATCH(CreateOp, param_);
+}
+
+DMLC_REGISTER_PARAMETER(CastParam);
+
+MXNET_REGISTER_OP_PROPERTY(Cast, CastProp)
+.describe("Cast array to a different data type.")
+.add_argument("data", "Symbol", "Input data to cast function.")
+.add_arguments(CastParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
+
diff --git a/src/operator/cast.cu b/src/operator/cast.cu
new file mode 100644
index 000000000000..f3b40af0c146
--- /dev/null
+++ b/src/operator/cast.cu
@@ -0,0 +1,18 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file cast.cu
+ * \brief
+ * \author Junyuan Xie
+*/
+#include "./cast-inl.h"
+#include "./mshadow_op.h"
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<gpu>(CastParam param) {
+  return new CastOp<gpu>();
+}
+}  // op
+}  // namespace mxnet
+
diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc
index cfca3a617abd..c00691bb6daf 100644
--- a/src/symbol/graph_executor.cc
+++ b/src/symbol/graph_executor.cc
@@ -571,6 +571,10 @@ void GraphExecutor::InitDataEntryInfo(const std::vector<NDArray> &in_args,
           << "Incorrect NDArray shape"
           << " Input: " << info.data.data().shape_
           << " Desired: " << info.shape;
+      CHECK_EQ(info.data.dtype(), info.type_flag)
+          << "Incorrect NDArray type"
+          << " Input: " << info.data.dtype()
+          << " Desired: " << info.type_flag;
     }
   }
 }
diff --git a/src/symbol/static_graph.cc b/src/symbol/static_graph.cc
index f5f9c9db78c7..b9e6d9cc1fc5 100644
--- a/src/symbol/static_graph.cc
+++ b/src/symbol/static_graph.cc
@@ -303,6 +303,57 @@ bool StaticGraph::InferShape(std::vector<TShape> *in_shape,
   return true;
 }
 
+bool StaticGraph::InferType(std::vector<int> *in_type,
+                             std::vector<int> *out_type,
+                             std::vector<int> *aux_type) const {
+  std::vector<std::vector<int> > node_out_types(nodes.size());
+  std::vector<std::vector<int> > node_aux_types(nodes.size());
+  for (size_t i = 0; i < nodes.size(); ++i) {
+    int nout = 1;
+    if (nodes[i].is_forward()) {
+      nout = nodes[i].op->NumOutputs();
+    } else if (nodes[i].is_backward()) {
+      nout = static_cast<int>(nodes[nodes[i].backward_source_id].inputs.size());
+    }
+    node_out_types[i].resize(nout, -1);
+  }
+  CHECK(in_type->size() == arg_nodes.size())
+        << "Wrong number of inputs to infer type";
+  for (size_t i = 0; i < arg_nodes.size(); ++i) {
+    node_out_types[arg_nodes[i]][0] = (*in_type)[i];
+  }
+  if (!InferNodeTypes(this->TopoSort(),
+                       &node_out_types,
+                       &node_aux_types)) return false;
+  for (size_t i = 0; i < arg_nodes.size(); ++i) {
+    (*in_type)[i] = node_out_types[arg_nodes[i]][0];
+  }
+  out_type->resize(heads.size());
+  for (size_t i = 0; i < heads.size(); ++i) {
+    const DataEntry &e = heads[i];
+    (*out_type)[i] = node_out_types[e.source_id][e.index];
+  }
+
+  // set back auxiliary nodes.
+  aux_type->clear();
+  std::vector<uint32_t> head_nodes;
+  for (const auto& head : heads) {
+    head_nodes.push_back(head.source_id);
+  }
+  std::vector<uint32_t> fwd_nodes = PostDFSOrder(head_nodes, std::unordered_set<uint32_t>());
+  uint32_t counter = 0;
+  for (uint32_t nid : fwd_nodes) {
+    // backward consistency check.
+    CHECK(nid == counter++);
+    if (node_aux_types[nid].size() > 0) {
+      for (auto const &type : node_aux_types[nid]) {
+        aux_type->push_back(type);
+      }
+    }
+  }
+  return true;
+}
+
 StaticGraph::Node StaticGraph::CreateSumNode(
     const std::vector<DataEntry> &grad_source) {
   // find multiple gradients, need aggregate
diff --git a/src/symbol/static_graph.h b/src/symbol/static_graph.h
index a4b433153033..b16a233b59b7 100644
--- a/src/symbol/static_graph.h
+++ b/src/symbol/static_graph.h
@@ -232,6 +232,26 @@ class StaticGraph {
   bool InferShape(std::vector<TShape>* in_shape,
                   std::vector<TShape>* out_shape,
                   std::vector<TShape>* aux_shape) const;
+
+  /*!
+   * \brief infer the types of outputs and unknown input arguments
+   * \param in_type the type of input arguments of the operator
+   *     this should be of same length as the vector returned by ListArguments
+   *     in_type allows unknown elements, which are marked by -1.
+   *     For unknown types, Infertype will try to fill in the correct type in in_type
+   *     For known types, Infertype will check type consistency
+   *
+   *     common practice: set the type of data input, and usually weight's type can be infered
+   *
+   * \param out_type the type of outputs of the operator
+   *     Infertype will modify the vector to fill in the output types
+   * \param aux_type the type of auxiliary states of the operator
+   *     Infertype will modify the vector to fill in the auxiliary state types
+   * \return if the type inference is successful, return true, else return false.
+   */
+  bool InferType(std::vector<int>* in_type,
+                  std::vector<int>* out_type,
+                  std::vector<int>* aux_type) const;
   /*!
    * \brief Add a full backward pass in the static graph.
    *  This function will add gradient nodes for each heads,
diff --git a/src/symbol/symbol.cc b/src/symbol/symbol.cc
index e451c246c350..5745adc0e24c 100644
--- a/src/symbol/symbol.cc
+++ b/src/symbol/symbol.cc
@@ -563,11 +563,45 @@ bool Symbol::InferShape(const std::unordered_map<std::string, TShape>& known_arg
     std::vector<std::string> keys(known_arg_shapes.size());
     std::transform(known_arg_shapes.begin(), known_arg_shapes.end(), keys.begin(),
                    [](decltype(*known_arg_shapes.begin())& kv)->std::string { return kv.first; });
-    KeywordArgumentMismatch("Symbol.InterShape", keys, ListArguments());
+    KeywordArgumentMismatch("Symbol.InferShape", keys, ListArguments());
   }
   return g.InferShape(arg_shapes, out_shapes, aux_shapes);
 }
 
+bool Symbol::InferType(std::vector<int> *arg_types,
+                        std::vector<int> *out_types,
+                        std::vector<int> *aux_types) const {
+  StaticGraph g;
+  this->ToStaticGraph(&g);
+  return g.InferType(arg_types, out_types, aux_types);
+}
+
+bool Symbol::InferType(const std::unordered_map<std::string, int>& known_arg_types,
+                        std::vector<int> *arg_types,
+                        std::vector<int> *out_types,
+                        std::vector<int> *aux_types) const {
+  StaticGraph g;
+  this->ToStaticGraph(&g);
+  arg_types->clear();
+  arg_types->resize(g.arg_nodes.size(), -1);
+  size_t nmatched = 0;
+  for (size_t i = 0; i < g.arg_nodes.size(); ++i) {
+    const std::string& name = g.nodes[g.arg_nodes[i]].name;
+    auto it = known_arg_types.find(name);
+    if (it != known_arg_types.end()) {
+      arg_types->at(i) = it->second;
+      ++nmatched;
+    }
+  }
+  if (nmatched != known_arg_types.size()) {
+    std::vector<std::string> keys(known_arg_types.size());
+    std::transform(known_arg_types.begin(), known_arg_types.end(), keys.begin(),
+                   [](decltype(*known_arg_types.begin())& kv)->std::string { return kv.first; });
+    KeywordArgumentMismatch("Symbol.InferType", keys, ListArguments());
+  }
+  return g.InferType(arg_types, out_types, aux_types);
+}
+
 
 void Symbol::Save(dmlc::JSONWriter *writer) const {
   StaticGraph g;
diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py
index fca0093a09c9..ba6eae389ad6 100644
--- a/tests/python/unittest/test_ndarray.py
+++ b/tests/python/unittest/test_ndarray.py
@@ -6,7 +6,7 @@
 def reldiff(a, b):
     diff = np.sum(np.abs(a - b))
     norm = np.sum(np.abs(a))
-    reldiff = diff  / norm
+    reldiff = diff  / (norm + 1e-8)
     return reldiff
 
 
@@ -14,28 +14,32 @@ def same(a, b):
     return np.sum(a != b) == 0
 
 
-def check_with_uniform(uf, arg_shapes, dim=None, npuf=None, rmin=-10):
+def check_with_uniform(uf, arg_shapes, dim=None, npuf=None, rmin=-10, type_list=[np.float32]):
     """check function consistency with uniform random numbers"""
     if isinstance(arg_shapes, int):
         assert dim
         shape = tuple(np.random.randint(1, int(1000**(1.0/dim)), size=dim))
         arg_shapes = [shape] * arg_shapes
-    ndarray_arg = []
-    numpy_arg = []
-    for s in arg_shapes:
-        npy = np.random.uniform(rmin, 10, s)
-        narr = mx.nd.array(npy)
-        ndarray_arg.append(narr)
-        numpy_arg.append(npy)
-    out1 = uf(*ndarray_arg)
-    if npuf is None:
-        out2 = uf(*numpy_arg)
-    else:
-        out2 = npuf(*numpy_arg)
-    assert out1.shape == out2.shape
-    if isinstance(out1, mx.nd.NDArray):
-        out1 = out1.asnumpy()
-    assert reldiff(out1, out2) < 1e-6
+    for dtype in type_list:
+        ndarray_arg = []
+        numpy_arg = []
+        for s in arg_shapes:
+            npy = np.random.uniform(rmin, 10, s).astype(dtype)
+            narr = mx.nd.array(npy, dtype=dtype)
+            ndarray_arg.append(narr)
+            numpy_arg.append(npy)
+        out1 = uf(*ndarray_arg)
+        if npuf is None:
+            out2 = uf(*numpy_arg).astype(dtype)
+        else:
+            out2 = npuf(*numpy_arg).astype(dtype)
+        assert out1.shape == out2.shape
+        if isinstance(out1, mx.nd.NDArray):
+            out1 = out1.asnumpy()
+        if dtype == np.float16:
+            assert reldiff(out1, out2) < 1e-3
+        else:
+            assert reldiff(out1, out2) < 1e-6
 
 
 def random_ndarray(dim):
@@ -47,12 +51,15 @@ def test_ndarray_elementwise():
     np.random.seed(0)
     nrepeat = 10
     maxdim = 4
+    all_type = [np.float32, np.float64, np.float16, np.uint8, np.int32]
+    real_type = [np.float32, np.float64, np.float16]
     for repeat in range(nrepeat):
         for dim in range(1, maxdim):
-            check_with_uniform(lambda x, y: x + y, 2, dim)
-            check_with_uniform(lambda x, y: x - y, 2, dim)
-            check_with_uniform(lambda x, y: x * y, 2, dim)
-            check_with_uniform(lambda x, y: x / y, 2, dim)
+            check_with_uniform(lambda x, y: x + y, 2, dim, type_list=all_type)
+            check_with_uniform(lambda x, y: x - y, 2, dim, type_list=all_type)
+            check_with_uniform(lambda x, y: x * y, 2, dim, type_list=all_type)
+            check_with_uniform(lambda x, y: x / y, 2, dim, type_list=real_type)
+            check_with_uniform(lambda x, y: x / y, 2, dim, rmin=1, type_list=all_type)
             check_with_uniform(mx.nd.sqrt, 2, dim, np.sqrt, rmin=0)
             check_with_uniform(mx.nd.square, 2, dim, np.square, rmin=0)
             check_with_uniform(lambda x: mx.nd.norm(x).asscalar(), 1, dim, np.linalg.norm)
diff --git a/tests/python/unittest/test_symbol.py b/tests/python/unittest/test_symbol.py
index d561c82d2c1e..ae6ec238d40b 100644
--- a/tests/python/unittest/test_symbol.py
+++ b/tests/python/unittest/test_symbol.py
@@ -1,6 +1,7 @@
 import copy
 import os
 import mxnet as mx
+import numpy as np
 from common import models
 import pickle as pkl
 
@@ -11,7 +12,6 @@ def test_symbol_basic():
         m.list_arguments()
         m.list_outputs()
 
-
 def test_symbol_compose():
     data = mx.symbol.Variable('data')
     net1 = mx.symbol.FullyConnected(data=data, name='fc1', num_hidden=10)
@@ -68,8 +68,19 @@ def test_symbol_saveload():
     assert sym.tojson() == data2.tojson()
     os.remove(fname)
 
+def test_symbol_infer_type():
+    data = mx.symbol.Variable('data')
+    f32data = mx.symbol.Cast(data=data, dtype='float32')
+    fc1  = mx.symbol.FullyConnected(data = f32data, name='fc1', num_hidden=128)
+    mlp  = mx.symbol.SoftmaxOutput(data = fc1, name = 'softmax')
+
+    arg, out, aux = mlp.infer_type(data=np.float16)
+    assert arg == [np.float16, np.float32, np.float32, np.float32]
+    assert out == [np.float32]
+    assert aux == []
 
 if __name__ == '__main__':
+    test_symbol_infer_type()
     test_symbol_internal()
     test_symbol_basic()
     test_symbol_compose()

From b9301fcae2a6a384ee23fab8fabe928df55b6d2b Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Sat, 9 Jan 2016 12:22:21 -0800
Subject: [PATCH 13/32] fix issue 1224

---
 include/mxnet/ndarray.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index cdfdb4f01b1d..15cadd39c873 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -51,7 +51,8 @@ class MXNET_API NDArray {
    * \param dev_id the device id this tensor sits at
    */
   NDArray(const TBlob &data, int dev_id)
-      : ptr_(std::make_shared<Chunk>(data, dev_id)), shape_(data.shape_), offset_(0) {
+      : ptr_(std::make_shared<Chunk>(data, dev_id)), shape_(data.shape_), offset_(0),
+        dtype_(data.type_flag_) {
   }
   /*!
    * \return the shape of current NDArray

From b2b8cde45f6e102ee3968a14fde5f08c70a21a60 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sat, 9 Jan 2016 12:45:46 -0800
Subject: [PATCH 14/32] [CONTRIBUTOR] Add Alexander Skidanov

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 1567cfcf2568..6bccba0908e3 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -92,3 +92,4 @@ List of Contributors
 * [Junru Shao](https://github.com/yzgysjr)
 * [Xiao Liu](https://github.com/skylook)
 * [Lowik CHANUSSOT](https://github.com/Nzeuwik)
+* [Alexander Skidanov](https://github.com/SkidanovAlex)

From aec57147bfcfbfb3bde1fe25e9d3e97bfa1db731 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sat, 9 Jan 2016 12:47:13 -0800
Subject: [PATCH 15/32] Update CONTRIBUTORS.md

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 6bccba0908e3..5fad779426c5 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -93,3 +93,4 @@ List of Contributors
 * [Xiao Liu](https://github.com/skylook)
 * [Lowik CHANUSSOT](https://github.com/Nzeuwik)
 * [Alexander Skidanov](https://github.com/SkidanovAlex)
+* [Ruixiang Zhang](https://github.com/sodabeta7)

From 1f19c8615bb8f2f1d159d9e939a17ce28c582f6b Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sat, 9 Jan 2016 12:48:51 -0800
Subject: [PATCH 16/32] [CONTRIBUTOR] Add Lodewic van Twillert

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 5fad779426c5..abfbfa2c140a 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -94,3 +94,4 @@ List of Contributors
 * [Lowik CHANUSSOT](https://github.com/Nzeuwik)
 * [Alexander Skidanov](https://github.com/SkidanovAlex)
 * [Ruixiang Zhang](https://github.com/sodabeta7)
+* [Lodewic van Twillert](https://github.com/Lodewic)

From 6b86893939eebee1b869da63975bd45e74cab01f Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sat, 9 Jan 2016 12:52:21 -0800
Subject: [PATCH 17/32] [CONTRIBUTOR] Add Aditya Kumar

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index abfbfa2c140a..2ca5a99b32e1 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -95,3 +95,4 @@ List of Contributors
 * [Alexander Skidanov](https://github.com/SkidanovAlex)
 * [Ruixiang Zhang](https://github.com/sodabeta7)
 * [Lodewic van Twillert](https://github.com/Lodewic)
+* [https://github.com/hiraditya](https://github.com/hiraditya)

From 3638d8bc14b19bf3a7d7a0802d13628e1151327f Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sat, 9 Jan 2016 12:52:34 -0800
Subject: [PATCH 18/32] Update CONTRIBUTORS.md

---
 CONTRIBUTORS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 2ca5a99b32e1..0c0384e3c4c9 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -95,4 +95,4 @@ List of Contributors
 * [Alexander Skidanov](https://github.com/SkidanovAlex)
 * [Ruixiang Zhang](https://github.com/sodabeta7)
 * [Lodewic van Twillert](https://github.com/Lodewic)
-* [https://github.com/hiraditya](https://github.com/hiraditya)
+* [Aditya Kumar](https://github.com/hiraditya)

From 797352eaf785297e1eeccb184a6600433d4c8246 Mon Sep 17 00:00:00 2001
From: freesouls <declanxu@126.com>
Date: Sun, 10 Jan 2016 15:48:22 +0800
Subject: [PATCH 19/32] add resnet example for cifar10

---
 .../symbol_resnet-28-small.py                 | 100 ++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 example/image-classification/symbol_resnet-28-small.py

diff --git a/example/image-classification/symbol_resnet-28-small.py b/example/image-classification/symbol_resnet-28-small.py
new file mode 100644
index 000000000000..06b9c85aeafa
--- /dev/null
+++ b/example/image-classification/symbol_resnet-28-small.py
@@ -0,0 +1,100 @@
+'''
+Deep Residual Learning for Image Recognition, http://arxiv.org/abs/1512.03385
+an example of a deep residual network for cifar10
+
+commands & setups:
+set following parameters in example/image-classification/train_model.py
+    momentum = 0.9,
+    wd = 0.0001,
+    initializer = mx.init.Xavier(rnd_type="gaussian", factor_type="in", magnitude=2.0)
+set n=3(3 for 20 layers, n=9 for 56 layers) in the get_symbol function in example/image-classification/symbol_resnet-28-small.py
+
+#first train the network with lr=0.1 for 80 epoch
+python example/image-classification/train_cifar10.py --network resnet-28-small --num-examples 50000 --lr 0.1 --num-epoch 80 --model-prefix cifar10/resnet 
+
+#second train the network with lr=0.01 from epoch 81 to epoch 120, with lr=0.001 from epoch 121 to epoch 160
+python example/image-classification/train_cifar10.py --network resnet-28-small --num-examples 50000 --model-prefix cifar10/resnet --load-epoch 80 --lr 0.01 --lr-factor 0.1 --lr-factor-epoch 40 --num-epoch 200 
+#in the paper, the authors train cifar10 for 160 epochs; I set num-epoch to 200 because I want to see whether it is useful to keep training with lr=0.0001
+
+#since it needs 160 epochs, please be patient
+#and I use batch-size of 128, train the models on one GPU
+accuracy:
+for 20 layers resnet, accuracy=0.905+, 0.9125 in the paper
+for 32 layers resnet, accuracy=0.908+, 0.9239 in the paper
+for 56 layers resnet, accuracy=0.915+, 0.9303 in the paper
+
+although the numbers are a little lower than in the paper, they do obey the rule: the deeper, the better
+
+differences to the paper on cifar10 network setup
+1. in the paper, the authors use identity shortcuts when dealing with increasing dimensions, while I use 1*1 convolutions to deal with it
+2. in the paper, 4 pixels are padded on each side and a 32*32 crop is randomly sampled from the padded image, while I use the dataset provided by mxnet, so the input is 28*28; as a result, the output map sizes for the 3 different groups of 2n layers are 28*28, 14*14, 7*7, instead of 32*32, 16*16, 8*8 in the paper.
+
+the above two reasons might explain why the accuracy is a bit lower than in the paper, I suppose.
+Of course, there might be other reasons (for example, the true network architecture may be different from my script, since my script is just my understanding of the paper); if you find out, please tell me at declanxu@gmail.com or declanxu@126.com, thanks
+
+'''
+import mxnet as mx
+import find_mxnet
+
+def conv_factory(data, num_filter, kernel, stride, pad, act_type = 'relu', conv_type = 0):
+    if conv_type == 0:
+        conv = mx.symbol.Convolution(data = data, num_filter = num_filter, kernel = kernel, stride = stride, pad = pad)
+        bn = mx.symbol.BatchNorm(data=conv)
+        act = mx.symbol.Activation(data = bn, act_type=act_type)
+        return act
+    elif conv_type == 1:
+        conv = mx.symbol.Convolution(data = data, num_filter = num_filter, kernel = kernel, stride = stride, pad = pad)
+        bn = mx.symbol.BatchNorm(data=conv)
+        return bn
+
+def residual_factory(data, num_filter, dim_match):
+    if dim_match == True: # if dimension match
+        identity_data = data
+        conv1 = conv_factory(data=data, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1), act_type='relu', conv_type=0)
+        
+        conv2 = conv_factory(data=conv1, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1), conv_type=1)
+        new_data = identity_data + conv2
+        act = mx.symbol.Activation(data=new_data, act_type='relu')
+        return act
+    else:        
+        conv1 = conv_factory(data=data, num_filter=num_filter, kernel=(3,3), stride=(2,2), pad=(1,1), act_type='relu', conv_type=0)
+        conv2 = conv_factory(data=conv1, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1), conv_type=1)
+
+        # adopt project method in the paper when dimension increased
+        project_data = conv_factory(data=data, num_filter=num_filter, kernel=(1,1), stride=(2,2), pad=(0,0), conv_type=1)
+        new_data = project_data + conv2
+        act = mx.symbol.Activation(data=new_data, act_type='relu')
+        return act
+
+def residual_net(data, n):
+    #first 2n layers
+    for i in range(n):
+        data = residual_factory(data=data, num_filter=16, dim_match=True)
+    
+    #second 2n layers
+    for i in range(n):
+        if i==0:
+            data = residual_factory(data=data, num_filter=32, dim_match=False)
+        else:
+            data = residual_factory(data=data, num_filter=32, dim_match=True)
+    
+    #third 2n layers
+    for i in range(n):
+        if i==0:
+            data = residual_factory(data=data, num_filter=64, dim_match=False)
+        else:
+            data = residual_factory(data=data, num_filter=64, dim_match=True)
+     
+    return data
+
+def get_symbol(num_classes = 10):
+    conv = conv_factory(data=mx.symbol.Variable(name='data'), num_filter=16, kernel=(3,3), stride=(1,1), pad=(1,1), act_type='relu', conv_type=0)
+    n = 3 # set n = 3 means get a model with 3*6+2=20 layers, set n = 9 means 9*6+2=56 layers
+    resnet = residual_net(conv, n) # 
+    pool = mx.symbol.Pooling(data=resnet, kernel=(7,7), pool_type='avg')
+    flatten = mx.symbol.Flatten(data=pool, name='flatten')
+    fc = mx.symbol.FullyConnected(data=flatten, num_hidden=num_classes,  name='fc1')
+    softmax = mx.symbol.SoftmaxOutput(data=fc, name='softmax')
+    return softmax
+
+

From 8242772bafceff781a5c12a83a7f92933552d573 Mon Sep 17 00:00:00 2001
From: Qiang Kou <qkou@umail.iu.edu>
Date: Sun, 10 Jan 2016 11:25:11 -0500
Subject: [PATCH 20/32] [R] documents update; force users to use latest version
 of DiagrammeR

---
 R-package/DESCRIPTION                    |  2 +-
 R-package/NAMESPACE                      |  3 ++
 R-package/R/mxnet_generated.R            | 64 ++++++++++++++++++++++++
 R-package/man/mx.io.ImageRecordIter.Rd   | 12 +++++
 R-package/man/mx.nd.argmax.channel.Rd    | 16 ++++++
 R-package/man/mx.symbol.Cast.Rd          | 25 +++++++++
 R-package/man/mx.symbol.Convolution.Rd   |  3 ++
 R-package/man/mx.symbol.Crop.Rd          | 31 ++++++++++++
 R-package/man/mx.symbol.Softmax.Rd       |  6 +++
 R-package/man/mx.symbol.SoftmaxOutput.Rd |  6 +++
 R-package/man/mx.symbol.UpSampling.Rd    |  3 ++
 11 files changed, 170 insertions(+), 1 deletion(-)
 create mode 100644 R-package/man/mx.nd.argmax.channel.Rd
 create mode 100644 R-package/man/mx.symbol.Cast.Rd
 create mode 100644 R-package/man/mx.symbol.Crop.Rd

diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index 00021fdefbd3..30c74450525e 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -14,7 +14,7 @@ BugReports: https://github.com/dmlc/mxnet/issues
 Imports:
     methods,
     Rcpp (>= 0.12.1),
-    DiagrammeR,
+    DiagrammeR (>= 0.8.1),
     data.table,
     jsonlite,
     magrittr,
diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index 69085bb83d1f..eb0013244c55 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -44,6 +44,7 @@ export(mx.model.FeedForward.create)
 export(mx.model.load)
 export(mx.model.save)
 export(mx.nd.abs)
+export(mx.nd.argmax.channel)
 export(mx.nd.array)
 export(mx.nd.ceil)
 export(mx.nd.choose.element.0index)
@@ -77,8 +78,10 @@ export(mx.simple.bind)
 export(mx.symbol.Activation)
 export(mx.symbol.BatchNorm)
 export(mx.symbol.BlockGrad)
+export(mx.symbol.Cast)
 export(mx.symbol.Concat)
 export(mx.symbol.Convolution)
+export(mx.symbol.Crop)
 export(mx.symbol.Deconvolution)
 export(mx.symbol.Dropout)
 export(mx.symbol.ElementWiseSum)
diff --git a/R-package/R/mxnet_generated.R b/R-package/R/mxnet_generated.R
index 854eb12c00d3..a22f15c3617d 100644
--- a/R-package/R/mxnet_generated.R
+++ b/R-package/R/mxnet_generated.R
@@ -12,6 +12,16 @@
 #' @name mx.nd.abs
 NULL
 
+#' Take sum of the src.The result will be ndarray of shape (1,) on the same device.
+#' 
+#' @param src  NDArray
+#'     Source input to the function
+#' @return out The result mx.ndarray
+#' 
+#' @export
+#' @name mx.nd.argmax.channel
+NULL
+
 #' Take ceil value of the src
 #' 
 #' @param src  NDArray
@@ -269,12 +279,20 @@ mx.io.CSVIter <- function(...) {
 #'     Augmentation Param: Maxmum image size after resizing.
 #' @param min.img.size  float, optional, default=0
 #'     Augmentation Param: Minimum image size after resizing.
+#' @param random.h  int, optional, default='0'
+#'     Augmentation Param: Maximum value of H channel in HSL color space.
+#' @param random.s  int, optional, default='0'
+#'     Augmentation Param: Maximum value of S channel in HSL color space.
+#' @param random.l  int, optional, default='0'
+#'     Augmentation Param: Maximum value of L channel in HSL color space.
 #' @param rotate  int, optional, default='-1'
 #'     Augmentation Param: Rotate angle.
 #' @param fill.value  int, optional, default='255'
 #'     Augmentation Param: Maximum value of illumination variation.
 #' @param data.shape  Shape(tuple), required
 #'     Dataset Param: Shape of each instance generated by the DataIter.
+#' @param inter.method  int, optional, default='1'
+#'     Augmentation Param: 0-NN 1-bilinear 2-cubic 3-area 4-lanczos4 9-auto 10-rand.
 #' @param mirror  boolean, optional, default=False
 #'     Augmentation Param: Whether to mirror the image.
 #' @param rand.mirror  boolean, optional, default=False
@@ -378,6 +396,21 @@ mx.symbol.BlockGrad <- function(...) {
   mx.varg.symbol.BlockGrad(list(...))
 }
 
+#' Cast array to a different data type.
+#' 
+#' @param data  Symbol
+#'     Input data to cast function.
+#' @param dtype  {'float16', 'float32', 'float64', 'int32', 'uint8'}, required
+#'     Target data type.
+#' @param name  string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#' 
+#' @export
+mx.symbol.Cast <- function(...) {
+  mx.varg.symbol.Cast(list(...))
+}
+
 #' Perform an feature concat on channel dim (dim 1) over all the inputs.
 #' 
 #' @param num.args  int, required
@@ -405,6 +438,8 @@ mx.symbol.Concat <- function(...) {
 #'     convolution kernel size: (y, x)
 #' @param stride  Shape(tuple), optional, default=(1, 1)
 #'     convolution stride: (y, x)
+#' @param dilate  Shape(tuple), optional, default=(1, 1)
+#'     convolution dilate: (y, x)
 #' @param pad  Shape(tuple), optional, default=(0, 0)
 #'     pad for convolution: (y, x)
 #' @param num.filter  int (non-negative), required
@@ -424,6 +459,25 @@ mx.symbol.Convolution <- function(...) {
   mx.varg.symbol.Convolution(list(...))
 }
 
+#' Crop the 2th and 3th dim of input data, with the corresponding size of w_h orwith widht and height of the second input symbol
+#' 
+#' @param num.args  int, required
+#'     Number of inputs for crop, if equals one, then we will use the h_wfor crop heihgt and width, else if equals two, then we will use the heightand width of the second input symbol, we name crop_like here
+#' @param offset  Shape(tuple), optional, default=(0, 0)
+#'     corp offset coordinate: (y, x)
+#' @param h.w  Shape(tuple), optional, default=(0, 0)
+#'     corp height and weight: (h, w)
+#' @param center.crop  boolean, optional, default=False
+#'     If set to true, then it will use be the center_crop,or it will crop using the shape of crop_like
+#' @param name  string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#' 
+#' @export
+mx.symbol.Crop <- function(...) {
+  mx.varg.symbol.Crop(list(...))
+}
+
 #' Apply deconvolution to input then add a bias.
 #' 
 #' @param data  Symbol
@@ -704,8 +758,12 @@ mx.symbol.SliceChannel <- function(...) {
 #'     Input data to softmax.
 #' @param grad.scale  float, optional, default=1
 #'     Scale the gradient by a float factor
+#' @param ignore.label  float, optional, default=-1
+#'     the ignore_label will not work in backward, and this onlybe used when multi_output=true
 #' @param multi.output  boolean, optional, default=False
 #'     If set to true, for a (n,k,x_1,..,x_n) dimensionalinput tensor, softmax will generate n*x_1*...*x_n output, eachhas k classes
+#' @param use.ignore  boolean, optional, default=False
+#'     If set to true, the ignore_label value will not contributorto the backward gradient
 #' @param name  string, optional
 #'     Name of the resulting symbol.
 #' @return out The result mx.symbol
@@ -736,8 +794,12 @@ mx.symbol.SoftmaxActivation <- function(...) {
 #'     Input data to softmax.
 #' @param grad.scale  float, optional, default=1
 #'     Scale the gradient by a float factor
+#' @param ignore.label  float, optional, default=-1
+#'     the ignore_label will not work in backward, and this onlybe used when multi_output=true
 #' @param multi.output  boolean, optional, default=False
 #'     If set to true, for a (n,k,x_1,..,x_n) dimensionalinput tensor, softmax will generate n*x_1*...*x_n output, eachhas k classes
+#' @param use.ignore  boolean, optional, default=False
+#'     If set to true, the ignore_label value will not contributorto the backward gradient
 #' @param name  string, optional
 #'     Name of the resulting symbol.
 #' @return out The result mx.symbol
@@ -772,6 +834,8 @@ mx.symbol.SwapAxis <- function(...) {
 #'     Input filter. Only used by nearest sample_type.
 #' @param sample.type  {'bilinear', 'nearest'}, required
 #'     upsampling method
+#' @param multi.input.mode  {'concat', 'sum'},optional, default='concat'
+#'     How to handle multiple input. concat means concatenate upsampled images along the channel dimension. sum means add all images together, only available for nearest neighbor upsampling.
 #' @param num.args  int, required
 #'     Number of inputs to be upsampled. For nearest neighbor upsampling, this can be 1-N; the size of output will be(scale*h_0,scale*w_0) and all other inputs will be upsampled to thesame size. For bilinear upsampling this must be 2; 1 input and 1 weight.
 #' @param name  string, optional
diff --git a/R-package/man/mx.io.ImageRecordIter.Rd b/R-package/man/mx.io.ImageRecordIter.Rd
index faee1f2d3c03..4e6790a270d5 100644
--- a/R-package/man/mx.io.ImageRecordIter.Rd
+++ b/R-package/man/mx.io.ImageRecordIter.Rd
@@ -82,6 +82,15 @@ Augmentation Param: Maxmum image size after resizing.}
 \item{min.img.size}{float, optional, default=0
 Augmentation Param: Minimum image size after resizing.}
 
+\item{random.h}{int, optional, default='0'
+Augmentation Param: Maximum value of H channel in HSL color space.}
+
+\item{random.s}{int, optional, default='0'
+Augmentation Param: Maximum value of S channel in HSL color space.}
+
+\item{random.l}{int, optional, default='0'
+Augmentation Param: Maximum value of L channel in HSL color space.}
+
 \item{rotate}{int, optional, default='-1'
 Augmentation Param: Rotate angle.}
 
@@ -91,6 +100,9 @@ Augmentation Param: Maximum value of illumination variation.}
 \item{data.shape}{Shape(tuple), required
 Dataset Param: Shape of each instance generated by the DataIter.}
 
+\item{inter.method}{int, optional, default='1'
+Augmentation Param: 0-NN 1-bilinear 2-cubic 3-area 4-lanczos4 9-auto 10-rand.}
+
 \item{mirror}{boolean, optional, default=False
 Augmentation Param: Whether to mirror the image.}
 
diff --git a/R-package/man/mx.nd.argmax.channel.Rd b/R-package/man/mx.nd.argmax.channel.Rd
new file mode 100644
index 000000000000..fb795b45184d
--- /dev/null
+++ b/R-package/man/mx.nd.argmax.channel.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/mxnet_generated.R
+\name{mx.nd.argmax.channel}
+\alias{mx.nd.argmax.channel}
+\title{Take sum of the src.The result will be ndarray of shape (1,) on the same device.}
+\arguments{
+\item{src}{NDArray
+Source input to the function}
+}
+\value{
+out The result mx.ndarray
+}
+\description{
+Take sum of the src.The result will be ndarray of shape (1,) on the same device.
+}
+
diff --git a/R-package/man/mx.symbol.Cast.Rd b/R-package/man/mx.symbol.Cast.Rd
new file mode 100644
index 000000000000..a07489c93d3e
--- /dev/null
+++ b/R-package/man/mx.symbol.Cast.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/mxnet_generated.R
+\name{mx.symbol.Cast}
+\alias{mx.symbol.Cast}
+\title{Cast array to a different data type.}
+\usage{
+mx.symbol.Cast(...)
+}
+\arguments{
+\item{data}{Symbol
+Input data to cast function.}
+
+\item{dtype}{{'float16', 'float32', 'float64', 'int32', 'uint8'}, required
+Target data type.}
+
+\item{name}{string, optional
+Name of the resulting symbol.}
+}
+\value{
+out The result mx.symbol
+}
+\description{
+Cast array to a different data type.
+}
+
diff --git a/R-package/man/mx.symbol.Convolution.Rd b/R-package/man/mx.symbol.Convolution.Rd
index 140be1f8ff41..189bacbbf9a2 100644
--- a/R-package/man/mx.symbol.Convolution.Rd
+++ b/R-package/man/mx.symbol.Convolution.Rd
@@ -22,6 +22,9 @@ convolution kernel size: (y, x)}
 \item{stride}{Shape(tuple), optional, default=(1, 1)
 convolution stride: (y, x)}
 
+\item{dilate}{Shape(tuple), optional, default=(1, 1)
+convolution dilate: (y, x)}
+
 \item{pad}{Shape(tuple), optional, default=(0, 0)
 pad for convolution: (y, x)}
 
diff --git a/R-package/man/mx.symbol.Crop.Rd b/R-package/man/mx.symbol.Crop.Rd
new file mode 100644
index 000000000000..d0e05cc53e7c
--- /dev/null
+++ b/R-package/man/mx.symbol.Crop.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/mxnet_generated.R
+\name{mx.symbol.Crop}
+\alias{mx.symbol.Crop}
+\title{Crop the 2th and 3th dim of input data, with the corresponding size of w_h orwith widht and height of the second input symbol}
+\usage{
+mx.symbol.Crop(...)
+}
+\arguments{
+\item{num.args}{int, required
+Number of inputs for crop, if equals one, then we will use the h_wfor crop heihgt and width, else if equals two, then we will use the heightand width of the second input symbol, we name crop_like here}
+
+\item{offset}{Shape(tuple), optional, default=(0, 0)
+corp offset coordinate: (y, x)}
+
+\item{h.w}{Shape(tuple), optional, default=(0, 0)
+corp height and weight: (h, w)}
+
+\item{center.crop}{boolean, optional, default=False
+If set to true, then it will use be the center_crop,or it will crop using the shape of crop_like}
+
+\item{name}{string, optional
+Name of the resulting symbol.}
+}
+\value{
+out The result mx.symbol
+}
+\description{
+Crop the 2th and 3th dim of input data, with the corresponding size of w_h orwith widht and height of the second input symbol
+}
+
diff --git a/R-package/man/mx.symbol.Softmax.Rd b/R-package/man/mx.symbol.Softmax.Rd
index fd554b1aceb4..2534bcf39e31 100644
--- a/R-package/man/mx.symbol.Softmax.Rd
+++ b/R-package/man/mx.symbol.Softmax.Rd
@@ -13,9 +13,15 @@ Input data to softmax.}
 \item{grad.scale}{float, optional, default=1
 Scale the gradient by a float factor}
 
+\item{ignore.label}{float, optional, default=-1
+the ignore_label will not work in backward, and this onlybe used when multi_output=true}
+
 \item{multi.output}{boolean, optional, default=False
 If set to true, for a (n,k,x_1,..,x_n) dimensionalinput tensor, softmax will generate n*x_1*...*x_n output, eachhas k classes}
 
+\item{use.ignore}{boolean, optional, default=False
+If set to true, the ignore_label value will not contributorto the backward gradient}
+
 \item{name}{string, optional
 Name of the resulting symbol.}
 }
diff --git a/R-package/man/mx.symbol.SoftmaxOutput.Rd b/R-package/man/mx.symbol.SoftmaxOutput.Rd
index 9927d0144f20..46c4a768be28 100644
--- a/R-package/man/mx.symbol.SoftmaxOutput.Rd
+++ b/R-package/man/mx.symbol.SoftmaxOutput.Rd
@@ -13,9 +13,15 @@ Input data to softmax.}
 \item{grad.scale}{float, optional, default=1
 Scale the gradient by a float factor}
 
+\item{ignore.label}{float, optional, default=-1
+the ignore_label will not work in backward, and this onlybe used when multi_output=true}
+
 \item{multi.output}{boolean, optional, default=False
 If set to true, for a (n,k,x_1,..,x_n) dimensionalinput tensor, softmax will generate n*x_1*...*x_n output, eachhas k classes}
 
+\item{use.ignore}{boolean, optional, default=False
+If set to true, the ignore_label value will not contributorto the backward gradient}
+
 \item{name}{string, optional
 Name of the resulting symbol.}
 }
diff --git a/R-package/man/mx.symbol.UpSampling.Rd b/R-package/man/mx.symbol.UpSampling.Rd
index 83ef96e93ac4..055dc71907fd 100644
--- a/R-package/man/mx.symbol.UpSampling.Rd
+++ b/R-package/man/mx.symbol.UpSampling.Rd
@@ -16,6 +16,9 @@ Input filter. Only used by nearest sample_type.}
 \item{sample.type}{{'bilinear', 'nearest'}, required
 upsampling method}
 
+\item{multi.input.mode}{{'concat', 'sum'},optional, default='concat'
+How to handle multiple input. concat means concatenate upsampled images along the channel dimension. sum means add all images together, only available for nearest neighbor upsampling.}
+
 \item{num.args}{int, required
 Number of inputs to be upsampled. For nearest neighbor upsampling, this can be 1-N; the size of output will be(scale*h_0,scale*w_0) and all other inputs will be upsampled to thesame size. For bilinear upsampling this must be 2; 1 input and 1 weight.}
 

From 10acc9ea4599c7d004d937c1734483fd622e22b3 Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Tue, 5 Jan 2016 23:30:59 -0800
Subject: [PATCH 21/32] python front end symbolic type support and cast op

---
 python/mxnet/symbol.py                | 4 ++++
 tests/python/unittest/test_ndarray.py | 1 +
 2 files changed, 5 insertions(+)

diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py
index 8d0912ffaab7..0a6c4ce8266e 100644
--- a/python/mxnet/symbol.py
+++ b/python/mxnet/symbol.py
@@ -595,6 +595,10 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, **kwargs):
             Input type dictionary, name->dtype
         kwargs : dict of str->shape
             Input shape dictionary, name->shape
+        type_dict  : dict of str->numpy.dtype
+            Input type dictionary, name->dtype
+        kwargs : dict of str->shape
+            Input shape dictionary, name->shape
 
         Returns
         -------
diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py
index ba6eae389ad6..b55d6f71be29 100644
--- a/tests/python/unittest/test_ndarray.py
+++ b/tests/python/unittest/test_ndarray.py
@@ -33,6 +33,7 @@ def check_with_uniform(uf, arg_shapes, dim=None, npuf=None, rmin=-10, type_list=
             out2 = uf(*numpy_arg).astype(dtype)
         else:
             out2 = npuf(*numpy_arg).astype(dtype)
+            
         assert out1.shape == out2.shape
         if isinstance(out1, mx.nd.NDArray):
             out1 = out1.asnumpy()

From c39f8522000adbb93d30de229d89c6590f5ee88a Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Thu, 7 Jan 2016 22:52:33 -0800
Subject: [PATCH 22/32] torch ndarray function backend

---
 Makefile                        |  46 +++-
 include/mxnet/c_api.h           |   5 +-
 include/mxnet/ndarray.h         |  42 +++-
 make/config.mk                  |   9 +
 make/osx.mk                     |   9 +
 plugin/torch/torch_base.cc      |  44 ++++
 plugin/torch/torch_base.h       | 223 ++++++++++++++++++
 plugin/torch/torch_function.cc  |  36 +++
 plugin/torch/torch_function.h   | 164 +++++++++++++
 plugin/torch/torch_module-inl.h | 395 ++++++++++++++++++++++++++++++++
 plugin/torch/torch_module.cc    |  29 +++
 plugin/torch/torch_module.cu    |  21 ++
 python/mxnet/__init__.py        |   3 +
 python/mxnet/ndarray.py         |  15 +-
 python/mxnet/torch.py           | 143 ++++++++++++
 src/c_api/c_api.cc              |  10 +-
 src/common/tblob_op_registry.cc |   5 +-
 src/ndarray/ndarray.cc          |   9 +-
 18 files changed, 1179 insertions(+), 29 deletions(-)
 create mode 100644 plugin/torch/torch_base.cc
 create mode 100644 plugin/torch/torch_base.h
 create mode 100644 plugin/torch/torch_function.cc
 create mode 100644 plugin/torch/torch_function.h
 create mode 100644 plugin/torch/torch_module-inl.h
 create mode 100644 plugin/torch/torch_module.cc
 create mode 100644 plugin/torch/torch_module.cu
 create mode 100644 python/mxnet/torch.py

diff --git a/Makefile b/Makefile
index 3b64f185cbd5..12a444006085 100644
--- a/Makefile
+++ b/Makefile
@@ -93,10 +93,10 @@ endif
 
 all: lib/libmxnet.a lib/libmxnet.so $(BIN)
 
-SRC = $(wildcard src/*.cc src/*/*.cc)
-OBJ = $(patsubst src/%.cc, build/%.o, $(SRC))
-CUSRC = $(wildcard src/*/*.cu)
-CUOBJ = $(patsubst src/%.cu, build/%_gpu.o, $(CUSRC))
+SRC = $(wildcard src/*.cc src/*/*.cc src/*/*/*.cc)
+OBJ = $(patsubst %.cc, build/%.o, $(SRC))
+CUSRC = $(wildcard src/*/*.cu src/*/*/*.cu)
+CUOBJ = $(patsubst %.cu, build/%_gpu.o, $(CUSRC))
 
 ifneq ($(EXTRA_OPERATORS),)
 	EXTRA_SRC = $(wildcard $(EXTRA_OPERATORS)/*.cc $(EXTRA_OPERATORS)/*/*.cc)
@@ -110,10 +110,23 @@ else
 	EXTRA_CUOBJ =
 endif
 
+# plugin
+ifeq ($(USE_TORCH), 1)
+	CFLAGS += -I$(TORCH_PATH)/install/include -I$(TORCH_PATH)/install/include/TH -I$(TORCH_PATH)/install/include/THC -DMXNET_USE_TORCH=1
+	LDFLAGS += -Wl,-export-dynamic -L$(TORCH_PATH)/install/lib -L$(TORCH_PATH)/install/lib/lua/5.1 -lluajit -lluaT -lTH -lTHC -lpaths -ltorch -lcutorch -lnn -lcunn
+	
+	TORCH_SRC = $(wildcard plugin/torch/*.cc)
+	PLUGIN_OBJ += $(patsubst %.cc, build/%.o, $(TORCH_SRC))
+	TORCH_CUSRC = $(wildcard plugin/torch/*.cu)
+	PLUGIN_CUOBJ += $(patsubst %.cu, build/%_gpu.o, $(TORCH_CUSRC))
+else
+	CFLAGS += -DMXNET_USE_TORCH=0
+endif
+
 LIB_DEP += $(DMLC_CORE)/libdmlc.a
-ALL_DEP = $(OBJ) $(EXTRA_OBJ) $(LIB_DEP)
+ALL_DEP = $(OBJ) $(EXTRA_OBJ) $(PLUGIN_OBJ) $(LIB_DEP)
 ifeq ($(USE_CUDA), 1)
-	ALL_DEP += $(CUOBJ) $(EXTRA_CUOBJ)
+	ALL_DEP += $(CUOBJ) $(EXTRA_CUOBJ) $(PLUGIN_CUOBJ)
 	LDFLAGS += -lcuda
 endif
 
@@ -125,16 +138,27 @@ else
 endif
 
 
-build/%.o: src/%.cc
+build/src/%.o: src/%.cc
+	@mkdir -p $(@D)
+	$(CXX) -std=c++0x $(CFLAGS) -MM -MT build/src/$*.o $< >build/src/$*.d
+	$(CXX) -std=c++0x -c $(CFLAGS) -c $< -o $@
+
+build/src/%_gpu.o: src/%.cu
+	@mkdir -p $(@D)
+	$(NVCC) $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" -M -MT build/src/$*_gpu.o $< >build/src/$*_gpu.d
+	$(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $<
+
+build/plugin/%.o: plugin/%.cc
 	@mkdir -p $(@D)
-	$(CXX) -std=c++0x $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d
+	$(CXX) -std=c++0x $(CFLAGS) -MM -MT build/plugin/$*.o $< >build/plugin/$*.d
 	$(CXX) -std=c++0x -c $(CFLAGS) -c $< -o $@
 
-build/%_gpu.o: src/%.cu
+build/plugin/%_gpu.o: plugin/%.cu
 	@mkdir -p $(@D)
-	$(NVCC) $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" -M -MT build/$*_gpu.o $< >build/$*_gpu.d
+	$(NVCC) $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" -M -MT build/plugin/$*_gpu.o $< >build/plugin/$*_gpu.d
 	$(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $<
 
+
 $(EXTRA_OPERATORS)/build/%.o: $(EXTRA_OPERATORS)/%.cc
 	@mkdir -p $(@D)
 	$(CXX) -std=c++0x $(CFLAGS) -Isrc/operator -MM -MT $(EXTRA_OPERATORS)/build/$*.o $< >$(EXTRA_OPERATORS)/build/$*.d
@@ -173,7 +197,7 @@ include tests/cpp/unittest.mk
 test: $(TEST)
 
 lint: rcpplint
-	python2 dmlc-core/scripts/lint.py mxnet ${LINT_LANG} include src scripts python predict/python
+	python2 dmlc-core/scripts/lint.py mxnet ${LINT_LANG} include src plugin scripts python predict/python 
 
 doc: doxygen
 
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index ba182570180a..1b1527bb8b0e 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -382,7 +382,10 @@ MXNET_DLL int MXFuncDescribe(FunctionHandle fun,
 MXNET_DLL int MXFuncInvoke(FunctionHandle fun,
                            NDArrayHandle *use_vars,
                            mx_float *scalar_args,
-                           NDArrayHandle *mutate_vars);
+                           NDArrayHandle *mutate_vars,
+                           int num_params,
+                           char **param_keys,
+                           char **param_vals);
 
 //--------------------------------------------
 // Part 3: symbolic configuration generation
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index 15cadd39c873..b32639d88c74 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -12,6 +12,7 @@
 #include <dmlc/type_traits.h>
 #include <dmlc/registry.h>
 #include <vector>
+#include <map>
 #include <string>
 #include <memory>
 #include "./base.h"
@@ -446,7 +447,10 @@ MXNET_API void SampleGaussian(real_t mu, real_t sigma, NDArray *out);
 /*! \brief definition of NDArray function */
 typedef std::function<void (NDArray **used_vars,
                             real_t *scalars,
-                            NDArray **mutate_vars)> NDArrayAPIFunction;
+                            NDArray **mutate_vars,
+                            int num_params,
+                            char **param_keys,
+                            char **param_vals)> NDArrayAPIFunction;
 /*! \brief mask information on how functions can be exposed */
 enum NDArrayFunctionTypeMask {
   /*! \brief all the use_vars should go before scalar */
@@ -491,7 +495,8 @@ struct NDArrayFunctionReg
    */
   inline NDArrayFunctionReg &set_function(void (*fsetvalue)(const real_t &rhs,
                                                             NDArray *out)) {
-    body = [fsetvalue] (NDArray **used_vars, real_t *s, NDArray **mutate_vars) {
+    body = [fsetvalue] (NDArray **used_vars, real_t *s, NDArray **mutate_vars,
+                        int num_params, char **param_keys, char **param_vals) {
       (*fsetvalue)(s[0], mutate_vars[0]);
     };
     num_mutate_vars = 1; num_scalars = 1;
@@ -507,8 +512,8 @@ struct NDArrayFunctionReg
   inline NDArrayFunctionReg &set_function(void (*fbinary)(const NDArray &lhs,
                                                           const NDArray &rhs,
                                                           NDArray *out)) {
-    body = [fbinary] (NDArray **used_vars,
-                      real_t *s, NDArray **mutate_vars) {
+    body = [fbinary] (NDArray **used_vars, real_t *s, NDArray **mutate_vars,
+                      int num_params, char **param_keys, char **param_vals) {
       (*fbinary)(*used_vars[0], *used_vars[1], mutate_vars[0]);
     };
     num_use_vars = 2; num_mutate_vars = 1;
@@ -526,8 +531,8 @@ struct NDArrayFunctionReg
   inline NDArrayFunctionReg &set_function(void (*fscalar)(const NDArray &lhs,
                                                           const real_t &rhs,
                                                           NDArray *out)) {
-    body = [fscalar] (NDArray **used_vars,
-                      real_t *s, NDArray **mutate_vars) {
+    body = [fscalar] (NDArray **used_vars, real_t *s, NDArray **mutate_vars,
+                      int num_params, char **param_keys, char **param_vals) {
       (*fscalar)(*used_vars[0], s[0], mutate_vars[0]);
     };
     num_use_vars = 1; num_mutate_vars = 1; num_scalars = 1;
@@ -544,8 +549,8 @@ struct NDArrayFunctionReg
    */
   inline NDArrayFunctionReg &set_function(void (*funary)(const NDArray &src,
                                                          NDArray *out)) {
-    body = [funary] (NDArray **used_vars,
-                     real_t *s, NDArray **mutate_vars) {
+    body = [funary] (NDArray **used_vars, real_t *s, NDArray **mutate_vars,
+                     int num_params, char **param_keys, char **param_vals) {
       (*funary)(*used_vars[0], mutate_vars[0]);
     };
     num_use_vars = 1; num_mutate_vars = 1;
@@ -553,6 +558,27 @@ struct NDArrayFunctionReg
     this->add_argument("src", "NDArray", "Source input to the function.");
     return *this;
   }
+  /*!
+   * \brief set the function body to a generic NDArray function that
+   *  also receives the string key/value parameters collected into a map
+   * \param fgeneric function body to set
+   * \return ref to the registered entry, used to set properties
+   */
+  inline NDArrayFunctionReg &set_function(
+    void (*fgeneric)(NDArray **used_vars,
+                     real_t *s,
+                     NDArray **mutate_vars,
+                     const std::map<std::string, std::string>& param)) {
+    body = [fgeneric] (NDArray **used_vars, real_t *s, NDArray **mutate_vars,
+                       int num_params, char **param_keys, char **param_vals) {
+      std::map<std::string, std::string> param;
+      for (int i = 0; i < num_params; ++i) {
+        param[param_keys[i]] = param_vals[i];
+      }
+      fgeneric(used_vars, s, mutate_vars, param);
+    };
+    return *this;
+  }
   /*!
    * \brief set the number of mutate variables
    * \param n number of mutate variablesx
diff --git a/make/config.mk b/make/config.mk
index 8e9f8af3a5da..e18cc7776a41 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -105,3 +105,12 @@ USE_S3 = 0
 
 # path to folders containing projects specific operators that you don't want to put in src/operators
 EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use torch integration. This requires installing torch.
+USE_TORCH = 0
+TORCH_PATH = $(HOME)/torch
diff --git a/make/osx.mk b/make/osx.mk
index 23c2c7a363e5..3995049e60b1 100644
--- a/make/osx.mk
+++ b/make/osx.mk
@@ -92,3 +92,12 @@ USE_S3 = 0
 
 # path to folders containing projects specific operators that you don't want to put in src/operators
 EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use torch integration. This requires installing torch.
+USE_TORCH = 0
+TORCH_PATH = $(HOME)/torch
diff --git a/plugin/torch/torch_base.cc b/plugin/torch/torch_base.cc
new file mode 100644
index 000000000000..722997458e4b
--- /dev/null
+++ b/plugin/torch/torch_base.cc
@@ -0,0 +1,44 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file torch_base.cc
+ * \brief torch_state
+ * \author Junyuan Xie
+*/
+#include "./torch_base.h"
+
+namespace mxnet {
+lua_State* TorchState::LuaState() {
+  thread_local lua_State* state = NULL;
+  if (!state) {
+    state = luaL_newstate();
+    luaL_openlibs(state);
+    luaL_loadstring(state,
+                    "require 'torch'\n"
+                    "require 'nn'\n"
+#if MXNET_USE_CUDA
+                    "require 'cutorch'\n"
+                    "require 'cunn'\n"
+#if MXNET_USE_CUDNN
+                    "require 'cudnn'\n"
+#endif  // MXNET_USE_CUDNN
+#endif  // MXNET_USE_CUDA
+                    "local ss = require 'threads.sharedserialize'\n"
+                    "Serialize, Deserialize = ss.save, ss.load\n");
+    int err = lua_pcall(state, 0, 0, 0);
+    CHECK_EQ(err, 0) << lua_tostring(state, -1);
+  }
+  return state;
+}
+
+template<>
+void TorchState::SetStream(mshadow::Stream<mshadow::cpu>* s) {
+  return;
+}
+
+#if MXNET_USE_CUDA
+template<>
+void TorchState::SetStream(mshadow::Stream<mshadow::gpu>* s) {
+  TorchState::CudaState()->currentStream = mshadow::Stream<gpu>::GetStream(s);
+}
+#endif  // MXNET_USE_CUDA
+}  // namespace mxnet
diff --git a/plugin/torch/torch_base.h b/plugin/torch/torch_base.h
new file mode 100644
index 000000000000..f70f8a181892
--- /dev/null
+++ b/plugin/torch/torch_base.h
@@ -0,0 +1,223 @@
+/*!
+ *  Copyright (c) 2015 by Contributors
+ * \file torch_base.h
+ * \brief Torch interface.
+ * \author Junyuan Xie
+ */
+#ifndef PLUGIN_TORCH_TORCH_BASE_H_
+#define PLUGIN_TORCH_TORCH_BASE_H_
+#include <mxnet/base.h>
+#include <vector>
+
+extern "C" {
+#include "lua.h"
+#include "luaT.h"
+#include "lualib.h"
+#include "THStorage.h"
+#include "THTensor.h"
+}
+
+#if MXNET_USE_CUDA
+extern "C" {
+#include "THCStorage.h"
+#include "THCTensor.h"
+}
+#endif  // MXNET_USE_CUDA
+
+namespace mxnet {
+
+class TorchState {
+ public:
+  static lua_State* LuaState();
+
+#if MXNET_USE_CUDA
+  static THCState* CudaState() {
+    lua_State* L = TorchState::LuaState();
+    lua_getglobal(L, "cutorch");
+    CHECK(!lua_isnil(L, -1));
+    lua_getfield(L, -1, "_state");
+    CHECK(!lua_isnil(L, -1));
+    THCState* state = reinterpret_cast<THCState*>(lua_touserdata(L, -1));
+    lua_pop(L, 2);
+    return state;
+  }
+#endif  // MXNET_USE_CUDA
+
+  template<typename xpu>
+  static void SetStream(mshadow::Stream<xpu>* s);
+
+  static int Deserialize(THCharStorage* chunk) {  // read only to the chunk
+    CHECK_NE(chunk, NULL);
+    lua_State* L = LuaState();
+    lua_getglobal(L, "Deserialize");
+    luaT_pushudata(L, chunk, "torch.CharStorage");
+    THCharStorage_retain(chunk);  // keep it because read only
+    int err = lua_pcall(L, 1, 1, 0);
+    CHECK_EQ(err, 0);
+    return 1;
+  }
+
+  static int Serialize(THCharStorage** chunk) {
+    lua_State* L = LuaState();
+    lua_getglobal(L, "Serialize");
+    lua_pushvalue(L, -2);
+    int err = lua_pcall(L, 1, 1, 0);
+    CHECK_EQ(err, 0) << "Serialize failed " << lua_tostring(L, -1);
+    THCharStorage_free(*chunk);  // free the original
+    *chunk = reinterpret_cast<THCharStorage*>(luaT_toudata(L, -1, "torch.CharStorage"));
+    THCharStorage_retain(*chunk);  // keep the chunk even when lua side deletes
+    lua_pop(L, 2);
+    return 0;
+  }
+
+  static void PrintState() {
+    lua_State* L = LuaState();
+    int i;
+    int top = lua_gettop(L);
+    LOG(INFO) << "Stack height: " << top;
+    for (i = 1; i <= top; i++) {  /* repeat for each level */
+      int t = lua_type(L, i);
+      switch (t) {
+        case LUA_TSTRING:  /* strings */
+          LOG(INFO) << i << ": '" << lua_tostring(L, i) << "'";
+          break;
+        case LUA_TBOOLEAN:  /* booleans */
+          LOG(INFO) << i << ": " << (lua_toboolean(L, i) ? "true" : "false");
+          break;
+        case LUA_TNUMBER:  /* numbers */
+          LOG(INFO) << i << ": " << lua_tonumber(L, i);
+          break;
+        default:  /* other values */
+          LOG(INFO) << i << ": " << lua_typename(L, t);
+          break;
+      }
+    }
+  }
+};
+
+typedef void* THGeneralTensor;
+typedef void* THGeneralStorage;
+
+class TorchTensor {
+ public:
+  static const char* TensorType(int dev_mask) {
+    switch (dev_mask) {
+      case cpu::kDevMask:
+        return "torch.FloatTensor";
+      case gpu::kDevMask:
+        return "torch.CudaTensor";
+      default:
+        LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+        return NULL;
+    }
+  }
+
+  static const char* ModuleType(int dev_mask) {
+    switch (dev_mask) {
+      case cpu::kDevMask:
+        return ":float()";
+      case gpu::kDevMask:
+        return ":cuda()";
+      default:
+        LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+        return NULL;
+    }
+  }
+
+  static const char* TensorType(TBlob data) {
+    return TensorType(data.dev_mask_);
+  }
+
+  static const char* ModuleType(TBlob data) {
+    return ModuleType(data.dev_mask_);  // device cast suffix, not the tensor type name
+  }
+
+  static THGeneralTensor TBlobToTHTensor(TBlob data) {
+    size_t size = data.Size();
+    THGeneralTensor tensor = NULL;
+    THLongStorage* thshape = THLongStorage_newWithSize(data.ndim());
+    for (int i = 0; i < data.ndim(); ++i) {
+      THLongStorage_set(thshape, i, data.shape_[i]);
+    }
+    CHECK_EQ(data.type_flag_, mshadow::kFloat32) << "Torch Interface only support float32";
+    switch (data.dev_mask_) {
+      case cpu::kDevMask: {
+        THFloatStorage* storage = THFloatStorage_newWithData(static_cast<real_t*>(data.dptr_),
+                                                             size);
+        THFloatStorage_clearFlag(storage, TH_STORAGE_FREEMEM);
+        tensor = (THGeneralTensor)THFloatTensor_newWithStorage(storage, 0, thshape, NULL);
+        THFloatStorage_free(storage);
+        break;
+      }
+#if MXNET_USE_CUDA
+      case gpu::kDevMask: {
+        THCState* state = TorchState::CudaState();
+        THCudaStorage* storage = THCudaStorage_newWithData(state, static_cast<real_t*>(data.dptr_),
+                                                           size);
+        // a bug in cutorch
+        THFloatStorage_clearFlag(reinterpret_cast<THFloatStorage*>(storage), TH_STORAGE_FREEMEM);
+        tensor = (THGeneralTensor)THCudaTensor_newWithStorage(state, storage, 0, thshape, NULL);
+        THCudaStorage_free(state, storage);
+        break;
+      }
+#endif
+      default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+    }
+    THLongStorage_free(thshape);
+
+    return tensor;
+  }
+
+  /*! \brief Rebind tensor to blob's memory (not freed by Torch) and drop the old storage. */
+  static void SetInternal(THGeneralTensor tensor, const TBlob& blob) {
+    real_t* dptr = static_cast<real_t*>(blob.dptr_);
+    switch (blob.dev_mask_) {
+      case cpu::kDevMask: {
+        THFloatStorage* storage = THFloatStorage_newWithData(dptr, blob.Size());
+        THFloatStorage_clearFlag(storage, TH_STORAGE_FREEMEM);
+        THFloatStorage* original = static_cast<THFloatTensor*>(tensor)->storage;
+        static_cast<THFloatTensor*>(tensor)->storage = storage;
+        THFloatStorage_free(original);
+        break;  // without this the CPU path fell through into the GPU case
+      }
+      case gpu::kDevMask: {
+#if MXNET_USE_CUDA
+        THCState* state = TorchState::CudaState();
+        THCudaStorage* storage = THCudaStorage_newWithData(state, dptr, blob.Size());
+        // TODO(min): torch bug Cuda version not implemented
+        THFloatStorage_clearFlag(reinterpret_cast<THFloatStorage*>(storage), TH_STORAGE_FREEMEM);
+        THCudaStorage* original = static_cast<THCudaTensor*>(tensor)->storage;
+        static_cast<THCudaTensor*>(tensor)->storage = storage;
+        THCudaStorage_free(state, original);
+#else
+        LOG(FATAL) << "GPU is not enabled";
+#endif
+        break;  // without this a successful GPU update hit the fatal default
+      }
+      default:
+        LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+    }
+  }
+
+  static void TBlobVectorAsTable(const std::vector<TBlob>::const_iterator begin,
+                         const std::vector<TBlob>::const_iterator end) {
+    lua_State* L = TorchState::LuaState();
+    int num = end - begin;
+    if (num > 1) {
+      lua_createtable(L, num, 0);
+      int index = 1;
+      for (std::vector<TBlob>::const_iterator it = begin; it != end; ++it) {
+        THGeneralTensor th = TorchTensor::TBlobToTHTensor(*it);
+        luaT_pushudata(L, th, TorchTensor::TensorType(*it));
+        lua_rawseti(L, -2, index++);
+      }
+    } else if (num == 0) {
+      lua_pushnil(L);
+    } else {
+      luaT_pushudata(L, TorchTensor::TBlobToTHTensor(*begin), TorchTensor::TensorType(*begin));
+    }
+  }
+};
+
+}  // namespace mxnet
+#endif  // PLUGIN_TORCH_TORCH_BASE_H_
diff --git a/plugin/torch/torch_function.cc b/plugin/torch/torch_function.cc
new file mode 100644
index 000000000000..e969de66490b
--- /dev/null
+++ b/plugin/torch/torch_function.cc
@@ -0,0 +1,36 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file torch_function.cc
+ * \brief registration of Torch-backed NDArray functions
+ * \author Junyuan Xie
+*/
+#include "./torch_function.h"
+
+namespace mxnet {
+
+// Element-wise Mathematical Operations
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_abs, abs);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_sign, sign);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_acos, acos);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_asin, asin);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_atan, atan);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_ceil, ceil);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_cos, cos);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_cosh, cosh);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_exp, exp);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_floor, floor);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_log, log);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_log1p, log1p);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_pow, pow);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_round, round);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_sin, sin);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_sinh, sinh);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_sqrt, sqrt);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_tan, tan);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_tanh, tanh);
+
+// Basic operations
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_add_scalar, add);
+
+
+}  // namespace mxnet
diff --git a/plugin/torch/torch_function.h b/plugin/torch/torch_function.h
new file mode 100644
index 000000000000..0866dc08f98f
--- /dev/null
+++ b/plugin/torch/torch_function.h
@@ -0,0 +1,164 @@
+/*!
+ *  Copyright (c) 2015 by Contributors
+ * \file torch_function.h
+ * \brief Torch interface.
+ * \author Junyuan Xie
+ */
+#ifndef PLUGIN_TORCH_TORCH_FUNCTION_H_
+#define PLUGIN_TORCH_TORCH_FUNCTION_H_
+#include "./torch_base.h"
+#include <mxnet/base.h>
+#include <mxnet/ndarray.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+#include <map>
+#include <algorithm>
+#include <vector>
+
+namespace mxnet {
+
+// Invoke torch.<OP::fname> in Lua; 'n' slots of param["format"] take outputs first, then inputs.
+template<typename xpu, typename OP>
+void TorchRunOp(std::vector<NDArray> arr_in,
+                std::vector<NDArray> arr_out,
+                const std::map<std::string, std::string>& param,
+                RunContext ctx) {
+  lua_State* L = TorchState::LuaState();
+  TorchState::SetStream(ctx.get_stream<xpu>());
+  lua_getglobal(L, "torch");
+  lua_getfield(L, -1, OP::fname);
+  lua_remove(L, -2);  // drop the torch table; lua_pcall only pops the function and its args
+  size_t idx = 0;
+  std::vector<NDArray> arr(arr_out.begin(), arr_out.end());
+  arr.insert(arr.end(), arr_in.begin(), arr_in.end());
+  std::string format = param.at("format");
+  std::istringstream args(param.at("args"));
+  for (size_t i = 0; i < format.size(); ++i) {
+    std::string val;
+    std::getline(args, val, ',');
+    switch (format[i]) {
+      case 'n': {
+        CHECK(idx < arr.size()) << "Too few NDArray arguments for Torch." << OP::fname;
+        const TBlob blob = arr[idx++].data();
+        luaT_pushudata(L, TorchTensor::TBlobToTHTensor(blob), TorchTensor::TensorType(blob));
+        break;
+      }
+      case 'i':
+        lua_pushinteger(L, std::stoi(val));
+        break;
+      case 'f':
+        lua_pushnumber(L, std::stof(val));
+        break;
+      case 's':
+        lua_pushstring(L, val.c_str());
+        break;
+      case 'b':
+        lua_pushboolean(L, std::stoi(val));
+        break;
+      default:
+        LOG(FATAL) << "Unknown argument type " << format[i] << " for Torch." << OP::fname;
+    }
+  }
+  CHECK_EQ(lua_pcall(L, format.size(), 0, 0), 0) << "Lua Error: " << lua_tostring(L, -1);
+}
+
+template<typename OP>
+void TorchOp(NDArray **u, real_t *s, NDArray **out,
+             const std::map<std::string, std::string>& param) {
+  std::vector<mshadow::TShape> shapes = OP::GetShape(u, param);
+  CHECK_EQ(shapes.size(), OP::num_outputs)
+    << "Too many output shapes for TorchOp " << OP::fname;
+  Context ctx;
+  int type_flag;
+  if (OP::num_inputs) {
+    ctx = u[0]->ctx();
+    type_flag = u[0]->dtype();
+    for (int i = 0; i < OP::num_inputs; ++i) {
+      CHECK_EQ(ctx, u[i]->ctx()) << "Context of all oprands must be the same.";
+      CHECK_EQ(type_flag, u[i]->dtype()) << "Data type of all oprands must be the same.";
+    }
+  } else {
+    CHECK(param.count("ctx")) << "Must provide keyword argument ctx for TorchOp with 0 inputs";
+    std::istringstream str_ctx(param.at("ctx"));
+    std::string dev;
+    int id;
+    char tmp;
+    str_ctx >> dev >> tmp >> id >> tmp;
+    if (dev == "cpu") {
+      ctx = Context::Create(Context::kCPU, id);
+    } else if (dev == "gpu") {
+      ctx = Context::Create(Context::kGPU, id);
+    } else {
+      LOG(FATAL) << "Unknown device type " << dev;
+    }
+
+    if (param.count("dtype")) {
+      std::stringstream str_dtype(param.at("dtype"));
+      str_dtype >> type_flag;
+    } else {
+      type_flag = mshadow::default_type_flag;
+    }
+  }
+  std::vector<NDArray> arr_in, arr_out;
+  std::vector<Engine::VarHandle> var_in, var_out, var_const;
+  for (int i = 0; i < OP::num_inputs; ++i) {
+    arr_in.push_back(*(u[i]));
+    var_in.push_back(u[i]->var());
+  }
+  for (int i = 0; i < OP::num_outputs; ++i) {
+    if (out[i]->is_none()) {
+      *(out[i]) = NDArray(shapes[i], ctx, false, type_flag);
+    }
+    arr_out.push_back(*(out[i]));
+    var_out.push_back(out[i]->var());
+  }
+  std::sort(var_in.begin(), var_in.end());
+  var_in.resize(std::unique(var_in.begin(), var_in.end()) - var_in.begin());
+  std::sort(var_out.begin(), var_out.end());
+  var_out.resize(std::unique(var_out.begin(), var_out.end()) - var_out.begin());
+  std::set_difference(var_in.begin(), var_in.end(), var_out.begin(), var_out.end(),
+                      std::inserter(var_const, var_const.begin()));
+  switch (ctx.dev_mask()) {
+    case mshadow::cpu::kDevMask: {
+      Engine::Get()->PushSync([arr_in, arr_out, param](RunContext rctx) {
+        TorchRunOp<mshadow::cpu, OP>(arr_in, arr_out, param, rctx);
+      }, ctx, var_const, var_out);
+      break;
+    }
+#if MXNET_USE_CUDA
+    case gpu::kDevMask: {
+      Engine::Get()->PushSync([arr_in, arr_out, param](RunContext rctx) {
+        TorchRunOp<mshadow::gpu, OP>(arr_in, arr_out, param, rctx);
+      }, ctx, var_const, var_out);
+      break;
+    }
+#endif
+    default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+  }
+}
+
+struct TorchUnaryOpDesc {
+  static std::vector<mshadow::TShape> GetShape(NDArray **u,
+    const std::map<std::string, std::string>& param) {
+    return {u[0]->shape()};
+  }
+  static const int num_inputs = 1;
+  static const int num_outputs = 1;
+};
+
+#define MXNET_REGISTER_TORCH_FUN(name, OP)                \
+  MXNET_REGISTER_NDARRAY_FUN(name)                        \
+  .set_function(TorchOp<OP>)                              \
+  .set_num_use_vars(OP::num_inputs)                       \
+  .set_num_mutate_vars(OP::num_outputs)                   \
+  .set_type_mask(kAcceptEmptyMutateTarget)
+
+#define MXNET_REGISTER_TORCH_UNARY_FUN(name, func)                            \
+  struct TorchUnaryOpDesc_ ## name ## _ ## func : public TorchUnaryOpDesc {   \
+    static constexpr const char* fname = #func;                               \
+  };                                                                          \
+  MXNET_REGISTER_TORCH_FUN(name, TorchUnaryOpDesc_ ## name ## _ ## func);
+
+}  // namespace mxnet
+#endif  // PLUGIN_TORCH_TORCH_FUNCTION_H_
diff --git a/plugin/torch/torch_module-inl.h b/plugin/torch/torch_module-inl.h
new file mode 100644
index 000000000000..703dc3ba1f43
--- /dev/null
+++ b/plugin/torch/torch_module-inl.h
@@ -0,0 +1,395 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file torch_module-inl.h
+ * \brief torch module operator
+ * \author Min Lin
+*/
+#ifndef PLUGIN_TORCH_TORCH_MODULE_INL_H_
+#define PLUGIN_TORCH_TORCH_MODULE_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <stdio.h>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+#include <utility>
+#include "../../src/operator/operator_common.h"
+#include "./torch_base.h"
+
+namespace mxnet {
+namespace op {
+struct TorchModuleParam : public dmlc::Parameter<TorchModuleParam> {
+  std::string lua_string;  // Lua source; evaluated with a "return " prefix to build the module
+  uint32_t num_data;  // number of leading in_data entries that are data inputs
+  uint32_t num_params;  // number of in_data entries that follow the data inputs
+  uint32_t num_outputs;  // expected size of out_data
+  DMLC_DECLARE_PARAMETER(TorchModuleParam) {
+    DMLC_DECLARE_FIELD(lua_string)
+    .describe("lua string that is called to generate the object");
+    DMLC_DECLARE_FIELD(num_data)
+    .describe("the number of input data");
+    DMLC_DECLARE_FIELD(num_params)
+    .describe("the number of parameters");
+    DMLC_DECLARE_FIELD(num_outputs)
+    .describe("the number of outputs");
+  }
+};
+
+/**
+ * \brief Operator that drives a Torch (Lua) module created from TorchModuleParam::lua_string.
+ * \tparam xpu The device that the op will be executed on.
+ */
+template<typename xpu>
+class TorchModuleOp : public Operator {
+ private:
+  TorchModuleParam param_;
+
+ protected:
+  THCharStorage* chunk_;  // module state handed to TorchState::Serialize / Deserialize
+
+ public:
+  explicit TorchModuleOp(TorchModuleParam p) : chunk_(NULL) {
+    this->param_ = p;
+    lua_State* L = TorchState::LuaState();
+    CHECK_EQ(lua_gettop(L), 0);  // the Lua stack must be empty on entry
+    std::string exec = std::string("return ") + p.lua_string
+      + TorchTensor::ModuleType(xpu::kDevMask);  // device-dependent suffix (helper not shown)
+    CHECK_EQ(luaL_loadstring(L, exec.c_str()), 0);
+    int err = lua_pcall(L, 0, 1, 0);  // evaluate; one result (the module) stays on the stack
+    CHECK_EQ(err, 0) << lua_tostring(L, -1);
+    // Get number of parameters by calling module:parameters()
+    uint32_t param_num = 0;
+    lua_getfield(L, -1, "parameters");
+    lua_pushvalue(L, -2);  // self argument
+    CHECK_EQ(lua_pcall(L, 1, LUA_MULTRET, 0), 0);  // keep every return value
+    if (lua_gettop(L) == 1) {  // nothing returned: only the module is on the stack
+      param_num = 0;
+    } else {
+      CHECK_EQ(lua_gettop(L), 3);  // module plus two returned values
+      param_num = lua_objlen(L, -2);  // length of the first returned table
+      lua_pop(L, 2);
+    }
+    CHECK_EQ(param_num, param_.num_params);  // declared num_params must match the module
+    // serialize
+    TorchState::Serialize(&chunk_);
+  }
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    lua_State* L = TorchState::LuaState();
+    CHECK_EQ(lua_gettop(L), 0);
+    CHECK_EQ(in_data.size(), param_.num_params + param_.num_data);
+    CHECK_EQ(out_data.size(), param_.num_outputs);
+    mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+    TorchState::SetStream(s);
+    // Deserialize self table
+    TorchState::Deserialize(chunk_);
+    TorchTensor::TBlobVectorAsTable(out_data.begin(), out_data.begin() + param_.num_outputs);
+    // set the output field
+    lua_setfield(L, -2, "output");
+    // set the parameters
+    if (param_.num_params != 0) {
+      // get the parameters into the stack
+      lua_getfield(L, -1, "parameters");
+      lua_pushvalue(L, -2);
+      int err = lua_pcall(L, 1, 1, 0);  // keep only the first return value
+      CHECK_EQ(err, 0);
+      // iterate the parameters table to put tblobs inside (params follow the data in in_data)
+      lua_pushnil(L);
+      std::vector<TBlob>::const_iterator it = in_data.begin() + param_.num_data;
+      while (lua_next(L, -2)) {  // key at -2, value at -1
+        CHECK(luaT_isudata(L, -1, TorchTensor::TensorType(*it)));
+        void* udata = luaT_toudata(L, -1, TorchTensor::TensorType(*it));
+        TorchTensor::SetInternal(static_cast<THGeneralTensor>(udata), *(it));
+        it++;
+        lua_pop(L, 1);  // drop the value, keep the key for the next lua_next
+      }
+      lua_pop(L, 1);  // pop the parameter table
+    }
+    // call updateOutput
+    // | self
+    lua_getfield(L, -1, "updateOutput");
+    // | self | updateOutput
+    lua_pushvalue(L, -2);
+    // | self | updateOutput | self
+    TorchTensor::TBlobVectorAsTable(in_data.begin(), in_data.begin() + param_.num_data);
+    // | self | updateOutput | self | inputs
+    int err = lua_pcall(L, 2, 0, 0);  // doesn't need the output
+    CHECK_EQ(err, 0) << lua_tostring(L, -1);
+    TorchState::Serialize(&chunk_);  // NOTE(review): req is never consulted in Forward
+    CHECK_EQ(lua_gettop(L), 0);
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    lua_State* L = TorchState::LuaState();
+    CHECK_EQ(lua_gettop(L), 0);
+    CHECK_EQ(in_data.size(), param_.num_params + param_.num_data);
+    CHECK_EQ(out_data.size(), param_.num_outputs);
+    CHECK_EQ(out_grad.size(), param_.num_outputs);
+    CHECK_EQ(in_grad.size(), param_.num_params + param_.num_data);
+    mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+    TorchState::SetStream(s);
+    TorchState::Deserialize(chunk_);
+    TorchTensor::TBlobVectorAsTable(out_data.begin(), out_data.end());
+    lua_setfield(L, -2, "output");
+    TorchTensor::TBlobVectorAsTable(in_grad.begin(), in_grad.begin() + param_.num_data);
+    lua_setfield(L, -2, "gradInput");
+    if (param_.num_params != 0) {
+      // get the parameters into the stack
+      lua_getfield(L, -1, "parameters");
+      lua_pushvalue(L, -2);
+      int err = lua_pcall(L, 1, LUA_MULTRET, 0);  // keep both returned tables
+      CHECK_EQ(err, 0) << lua_tostring(L, -1);
+      // iterate the first returned table (parameters) to put the in_data tblobs inside
+      lua_pushnil(L);
+      std::vector<TBlob>::const_iterator it = in_data.begin() + param_.num_data;
+      while (lua_next(L, -3)) {  // with the key on top, -3 is the first returned table
+        TorchTensor::SetInternal(
+          static_cast<THGeneralTensor>(luaT_toudata(L, -1, TorchTensor::TensorType(*it))),
+          *it);
+        it++;
+        lua_pop(L, 1);
+      }
+      // iterate the second returned table (grad of params) to put the in_grad tblobs inside
+      lua_pushnil(L);
+      it = in_grad.begin() + param_.num_data;;
+      while (lua_next(L, -2)) {  // with the key on top, -2 is the second returned table
+        TorchTensor::SetInternal(
+          static_cast<THGeneralTensor>(luaT_toudata(L, -1, TorchTensor::TensorType(*it))),
+          *it);
+        it++;
+        lua_pop(L, 1);
+      }
+      lua_pop(L, 2);  // pop both returned tables
+    }
+    lua_getfield(L, -1, "zeroGradParameters");  // NOTE(review): req is never consulted here
+    lua_pushvalue(L, -2);
+    CHECK_EQ(lua_pcall(L, 1, 0, 0), 0);  // module:zeroGradParameters()
+    TorchTensor::TBlobVectorAsTable(in_data.begin(), in_data.begin() + param_.num_data);
+    TorchTensor::TBlobVectorAsTable(out_grad.begin(), out_grad.end());
+    // call accGradParameters(self, input, gradOutput, 1); stack is | self | input | gradOutput
+    lua_getfield(L, -3, "accGradParameters");
+    lua_pushvalue(L, -4);
+    lua_pushvalue(L, -4);
+    lua_pushvalue(L, -4);
+    lua_pushnumber(L, 1);  // fourth argument, passed as the constant 1
+    int err = lua_pcall(L, 4, 0, 0);  // doesn't need the output
+    CHECK_EQ(err, 0) << lua_tostring(L, -1);
+    lua_getfield(L, -3, "updateGradInput");  // then updateGradInput(self, input, gradOutput)
+    lua_pushvalue(L, -4);
+    lua_pushvalue(L, -4);
+    lua_pushvalue(L, -4);
+    err = lua_pcall(L, 3, 0, 0);  // doesn't need the output
+    CHECK_EQ(err, 0) << lua_tostring(L, -1);
+    lua_pop(L, 2);  // pop the input and gradOutput values pushed above
+    TorchState::Serialize(&chunk_);
+    CHECK_EQ(lua_gettop(L), 0);
+  }
+};  // class TorchModuleOp
+
+// Declare factory function, used for dispatch specialization
+template<typename xpu>
+Operator* CreateOp(TorchModuleParam type);
+
+#if DMLC_USE_CXX11
+class TorchModuleProp : public OperatorProperty {
+ protected:
+  mutable THCharStorage* chunk_;  // NOTE(review): no initializer; null checks assume nullptr
+  void InitChunk_() const {
+    lua_State* L = TorchState::LuaState();
+    std::string exec = std::string("return ") + param_.lua_string;
+    CHECK_EQ(luaL_loadstring(L, exec.c_str()), 0);
+    int err = lua_pcall(L, 0, LUA_MULTRET, 0);
+    CHECK_EQ(lua_gettop(L), 1);  // exactly one value (module or error message) expected
+    CHECK_EQ(err, 0) << lua_tostring(L, -1);
+    lua_getfield(L, -1, "float");
+    lua_pushvalue(L, -2);
+    err = lua_pcall(L, 1, 1, 0);  // module:float()
+    CHECK_EQ(err, 0);
+    TorchState::Serialize(&chunk_);
+    lua_pop(L, 1);
+    CHECK_EQ(lua_gettop(L), 0);
+  }
+
+ public:
+  std::vector<std::string> ListArguments() const override {
+    std::vector<std::string> ret;
+    if (!chunk_) {
+      InitChunk_();
+    }
+    std::string data = "data";
+    for (uint32_t i = 0; i < param_.num_data; ++i) {
+      ret.push_back(data + "_" + std::to_string(i));  // data_0, data_1, ...
+    }
+    std::string lua_code =
+        "return function(module)\n"
+        "          local params = module:parameters()\n"
+        "          local dict = {}\n"
+        "          if params == nil then\n"
+        "             return {}\n"
+        "          end\n"
+        "          for id, p in ipairs(params) do\n"
+        "             dict[p] = string.format('param_%d', id)\n"
+        "          end\n"
+        "          for key, value in pairs(module) do\n"
+        "             if dict[value] then\n"
+        "                dict[value] = key\n"
+        "             end\n"
+        "          end\n"
+        "          local ret = {}\n"
+        "          for _, p in ipairs(params) do\n"
+        "             table.insert(ret, dict[p])\n"
+        "          end\n"
+        "          return ret\n"
+        "end\n";
+    lua_State* L = TorchState::LuaState();
+    luaL_loadstring(L, lua_code.c_str());  // NOTE(review): return value is not checked here
+    int err = lua_pcall(L, 0, 1, 0);  // return the function
+    CHECK_EQ(err, 0) << lua_tostring(L, -1);
+    TorchState::Deserialize(chunk_);
+    err = lua_pcall(L, 1, 1, 0);  // call the function
+    CHECK_EQ(err, 0) << lua_tostring(L, -1);
+    lua_pushnil(L);
+    while (lua_next(L, -2)) {  // append each returned parameter name after the data names
+      ret.push_back(lua_tostring(L, -1));
+      lua_pop(L, 1);
+    }
+    lua_pop(L, 1);
+    return ret;
+  }
+
+  virtual std::vector<std::string> ListOutputs() const {
+    std::vector<std::string> ret;
+    std::string output = "output";
+    for (uint32_t i = 0; i < param_.num_outputs; ++i) {
+      ret.push_back(output + "_" + std::to_string(i));  // output_0, output_1, ...
+    }
+    return ret;
+  }
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.Init(kwargs);
+  }
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    if (chunk_ == nullptr) {
+      this->InitChunk_();
+    }
+    lua_State* L = TorchState::LuaState();
+    CHECK_EQ(lua_gettop(L), 0);
+    TorchState::Deserialize(chunk_);
+    CHECK_EQ(in_shape->size(), param_.num_data + param_.num_params);
+    CHECK_EQ(out_shape->size(), param_.num_outputs);
+    CHECK_EQ(aux_shape->size(), 0);
+    lua_getfield(L, -1, "updateOutput");
+    lua_pushvalue(L, -2);  // self
+    if (param_.num_data == 1) {  // single input: push one float tensor of the given shape
+      THLongStorage* thshape = THLongStorage_newWithSize((*in_shape)[0].ndim());
+      for (uint32_t i = 0; i < (*in_shape)[0].ndim(); ++i) {
+        THLongStorage_set(thshape, i, (*in_shape)[0][i]);
+      }
+      THFloatTensor* in_data = THFloatTensor_newWithSize(thshape, NULL);
+      THLongStorage_free(thshape);
+      luaT_pushudata(L, in_data, TorchTensor::TensorType(mshadow::cpu::kDevMask));
+    } else if (param_.num_data > 1) {  // several inputs: push a table of float tensors
+      lua_createtable(L, param_.num_data, 0);
+      for (uint32_t data_index = 0; data_index < param_.num_data; ++data_index) {
+        THLongStorage* thshape = THLongStorage_newWithSize((*in_shape)[data_index].ndim());
+        for (uint32_t i = 0; i < (*in_shape)[data_index].ndim(); ++i) {
+          THLongStorage_set(thshape, i, (*in_shape)[data_index][i]);
+        }
+        THFloatTensor* in_data = THFloatTensor_newWithSize(thshape, NULL);
+        THLongStorage_free(thshape);
+        luaT_pushudata(L, in_data, TorchTensor::TensorType(mshadow::cpu::kDevMask));
+        lua_rawseti(L, -2, data_index);  // NOTE(review): keys start at 0; Lua arrays start at 1
+      }
+    }
+    int err = lua_pcall(L, 2, 0, 0);  // NOTE(review): num_data == 0 pushes no second argument
+    CHECK_EQ(err, 0) << lua_tostring(L, -1);
+    if (param_.num_params != 0) {
+      lua_getfield(L, -1, "parameters");
+      lua_pushvalue(L, -2);
+      int err = lua_pcall(L, 1, LUA_MULTRET, 0);
+      CHECK_EQ(err, 0);
+      CHECK_EQ(lua_gettop(L), 3);  // module plus two returned values
+      lua_pushnil(L);
+      int index = param_.num_data;  // parameter shapes are stored after the data shapes
+      while (lua_next(L, -3)) {  // with the key on top, -3 is the first returned table
+        THFloatTensor* param = reinterpret_cast<THFloatTensor*>(luaT_toudata(L, -1,
+          TorchTensor::TensorType(mshadow::cpu::kDevMask)));
+        size_t* size = param->size;
+        (*in_shape)[index++] = TShape(size, size + THFloatTensor_nDimension(param));
+        lua_pop(L, 1);
+      }
+      lua_pop(L, 2);
+    }
+    lua_getfield(L, -1, "output");
+    if (param_.num_outputs == 0) {
+    } else if (param_.num_outputs == 1) {  // output field is read as a single float tensor
+      THFloatTensor* output = reinterpret_cast<THFloatTensor*>(luaT_toudata(L, -1,
+        TorchTensor::TensorType(mshadow::cpu::kDevMask)));
+      size_t* size = output->size;
+      (*out_shape)[0] = TShape(size, size + THFloatTensor_nDimension(output));
+    } else {
+      for (uint32_t data_index = 0; data_index < param_.num_outputs; ++data_index) {
+        lua_pushnil(L);  // NOTE(review): the whole table walk restarts for every data_index
+        int index = 0;
+        while (lua_next(L, -2)) {  // NOTE(review): value is never popped before next lua_next
+          THFloatTensor* out = reinterpret_cast<THFloatTensor*>(luaT_toudata(L, -1,
+            TorchTensor::TensorType(mshadow::cpu::kDevMask)));
+          size_t* size = out->size;
+          (*out_shape)[index++] = TShape(size, size + THFloatTensor_nDimension(out));
+        }
+      }
+    }
+    lua_pop(L, 2);  // pop the output field and the module
+    CHECK_EQ(lua_gettop(L), 0);
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new TorchModuleProp();
+    ptr->param_ = param_;  // only param_ is copied; chunk_ is not
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "TorchModule";
+  }
+
+  // declare dependency: backward needs out_grad, out_data and in_data
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    std::vector<int> dep;
+    dep.insert(dep.end(), out_grad.begin(), out_grad.end());
+    dep.insert(dep.end(), out_data.begin(), out_data.end());
+    dep.insert(dep.end(), in_data.begin(), in_data.end());
+    return dep;
+  }
+
+  Operator* CreateOperator(Context ctx) const override;
+
+ private:
+  TorchModuleParam param_;
+};
+#endif  // DMLC_USE_CXX11
+}  // namespace op
+}  // namespace mxnet
+#endif  // PLUGIN_TORCH_TORCH_MODULE_INL_H_
diff --git a/plugin/torch/torch_module.cc b/plugin/torch/torch_module.cc
new file mode 100644
index 000000000000..909d34c8cfa9
--- /dev/null
+++ b/plugin/torch/torch_module.cc
@@ -0,0 +1,29 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file torch_module.cc
+ * \brief torch module operator
+ * \author Bing Xu
+*/
+#include "./torch_module-inl.h"
+#include "../../src/operator/mshadow_op.h"
+
+namespace mxnet {
+namespace op {
+template<>  // CPU specialization of the CreateOp factory
+Operator *CreateOp<cpu>(TorchModuleParam param) {
+  return new TorchModuleOp<cpu>(param);
+}
+
+// DO_BIND_DISPATCH comes from operator_common.h; it is given CreateOp and this property's param_
+Operator *TorchModuleProp::CreateOperator(Context ctx) const {
+  DO_BIND_DISPATCH(CreateOp, param_);
+}
+
+DMLC_REGISTER_PARAMETER(TorchModuleParam);
+
+MXNET_REGISTER_OP_PROPERTY(TorchModule, TorchModuleProp)
+.describe("Modules from torch.")
+.add_arguments(TorchModuleParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/plugin/torch/torch_module.cu b/plugin/torch/torch_module.cu
new file mode 100644
index 000000000000..893ebacd4fef
--- /dev/null
+++ b/plugin/torch/torch_module.cu
@@ -0,0 +1,21 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file torch_module.cu
+ * \brief torch module operator
+ * \author Bing Xu
+*/
+#include "./torch_module-inl.h"
+#include "../../src/operator/mshadow_op.h"
+extern "C" {
+#include "THCTensor.h"
+}
+
+namespace mxnet {
+namespace op {
+template<>  // GPU specialization of the CreateOp factory
+Operator *CreateOp<gpu>(TorchModuleParam param) {
+  return new TorchModuleOp<gpu>(param);
+}
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py
index 9b9667729b49..01d0438e25a0 100644
--- a/python/mxnet/__init__.py
+++ b/python/mxnet/__init__.py
@@ -41,4 +41,7 @@
 from . import monitor
 from . import monitor as mon
 
+from . import torch
+from . import torch as th
+
 __version__ = base.__version__
diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py
index a50a1406c2f1..a9f66cc2b9b9 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray.py
@@ -680,7 +680,10 @@ def binary_ndarray_function(lhs, rhs, out=None):
         check_call(_LIB.MXFuncInvoke(handle,
                                      c_array(NDArrayHandle, (lhs.handle, rhs.handle)),
                                      c_array(mx_float, ()),
-                                     c_array(NDArrayHandle, (out.handle,))))
+                                     c_array(NDArrayHandle, (out.handle,)),
+                                     ctypes.c_int(0),
+                                     c_array(ctypes.c_char_p, []),
+                                     c_array(ctypes.c_char_p, [])))
         return out
 
     def unary_ndarray_function(src, out=None):
@@ -698,7 +701,10 @@ def unary_ndarray_function(src, out=None):
                 handle, \
                 c_array(NDArrayHandle, (src.handle,)), \
                 c_array(mx_float, ()), \
-                c_array(NDArrayHandle, (out.handle,))))
+                c_array(NDArrayHandle, (out.handle,)), \
+                ctypes.c_int(0), \
+                c_array(ctypes.c_char_p, []), \
+                c_array(ctypes.c_char_p, [])))
         return out
 
     def generic_ndarray_function(*args, **kwargs):
@@ -732,7 +738,10 @@ def generic_ndarray_function(*args, **kwargs):
                 handle, \
                 c_array(NDArrayHandle, [args[i].handle for i in use_vars_range]), \
                 c_array(mx_float, [args[i] for i in scalar_range]), \
-                c_array(NDArrayHandle, [v.handle for v in mutate_vars])))
+                c_array(NDArrayHandle, [v.handle for v in mutate_vars]), \
+                ctypes.c_int(0), \
+                c_array(ctypes.c_char_p, []), \
+                c_array(ctypes.c_char_p, [])))
         if n_mutate_vars == 1:
             return mutate_vars[0]
         else:
diff --git a/python/mxnet/torch.py b/python/mxnet/torch.py
new file mode 100644
index 000000000000..ce4eb147fd3f
--- /dev/null
+++ b/python/mxnet/torch.py
@@ -0,0 +1,143 @@
+# coding: utf-8
+"""Interface for NDArray functions executed by torch backend.
+Install torch and Compile with USE_TORCH=1 to use this module"""
+from __future__ import absolute_import
+
+import ctypes
+import sys
+from .base import _LIB
+from .base import c_array, py_str
+from .base import mx_uint, mx_float, NDArrayHandle, FunctionHandle
+from .base import check_call
+from .ndarray import NDArray, _new_empty_handle
+
+# pylint: disable=too-many-locals, invalid-name
+def _make_torch_function(handle):
+    """Create a Torch function from the FunctionHandle, or return None if it is not '_th_'-prefixed."""
+    # Get the property of function
+    n_used_vars = mx_uint()
+    n_scalars = mx_uint()
+    n_mutate_vars = mx_uint()
+    type_mask = ctypes.c_int()
+    check_call(_LIB.MXFuncDescribe(
+        handle,
+        ctypes.byref(n_used_vars),
+        ctypes.byref(n_scalars),
+        ctypes.byref(n_mutate_vars),
+        ctypes.byref(type_mask)))
+    n_mutate_vars = n_mutate_vars.value
+    n_used_vars = n_used_vars.value
+    n_scalars = n_scalars.value  # not used below
+    type_mask = type_mask.value  # not used below
+
+    # Get the information from the function
+    name = ctypes.c_char_p()
+    desc = ctypes.c_char_p()
+    num_args = mx_uint()
+    arg_names = ctypes.POINTER(ctypes.c_char_p)()
+    arg_types = ctypes.POINTER(ctypes.c_char_p)()
+    arg_descs = ctypes.POINTER(ctypes.c_char_p)()
+
+    check_call(_LIB.MXFuncGetInfo(
+        handle, ctypes.byref(name), ctypes.byref(desc),
+        ctypes.byref(num_args),
+        ctypes.byref(arg_names),
+        ctypes.byref(arg_types),
+        ctypes.byref(arg_descs)))
+    func_name = py_str(name.value)
+    if not func_name.startswith('_th_'):  # only '_th_'-prefixed functions are wrapped
+        return None
+
+    doc_str = (('Interface for Torch function {name}.\n' +
+                'Invoke with\nres = mxnet.th.{name}(...)\nor\n'+
+                'mxnet.th.{name}(res, ...).\n\n' +
+                'detailed help can be found at ' +
+                'https://github.com/torch/torch7/blob/master/doc/maths.md\n').format(
+                    name=func_name[4:]))  # [4:] strips the '_th_' prefix
+
+    def generic_torch_function(*args, **kwargs):
+        """Invoke the wrapped Torch function (this docstring is replaced by doc_str below).
+
+        Parameters
+        ----------
+        *args
+            Positional NDArray, int, str and float arguments, in call order.
+
+        Returns
+        -------
+        out : NDArray
+            The output NDArray, or a list of them when there are several.
+        """
+        ndargs = []
+        arg_format = ''  # one type letter per accepted positional argument
+        value = ''  # comma-separated textual values; NDArray slots stay empty
+        for arg in args:  # arguments of any other type are silently skipped
+            if isinstance(arg, NDArray):
+                ndargs.append(arg)
+                arg_format += 'n'
+                value += ','
+            elif isinstance(arg, int):  # NOTE: also matches bool, since bool subclasses int
+                arg_format += 'i'
+                value += str(arg) + ','
+            elif isinstance(arg, str):
+                arg_format += 's'
+                value += str(arg) + ','
+            elif isinstance(arg, float):
+                arg_format += 'f'
+                value += str(arg) + ','
+            elif isinstance(arg, bool):  # NOTE(review): unreachable; the int branch wins
+                arg_format += 'b'
+                value += str(arg) + ','
+        value = value[:-1]  # drop the trailing comma
+        if len(ndargs) == n_used_vars:  # only inputs given: prepend empty output NDArrays
+            ndargs = [NDArray(_new_empty_handle()) for _ in range(n_mutate_vars)] + ndargs
+            arg_format = 'n'*n_mutate_vars + arg_format
+            value = ','*n_mutate_vars + value
+        elif len(ndargs) == n_mutate_vars + n_used_vars:  # output buffers were passed first
+            pass
+        else:
+            raise AssertionError(('Incorrect number of input NDArrays. ' +
+                                  'Need to be either %d (inputs) or %d ' +
+                                  '(output buffer) + %d (input)') %
+                                 (n_used_vars, n_mutate_vars, n_used_vars))
+
+        kwargs['format'] = arg_format  # overwrites any caller-supplied 'format' keyword
+        kwargs['args'] = value  # overwrites any caller-supplied 'args' keyword
+
+        check_call(_LIB.MXFuncInvoke( \
+                handle, \
+                c_array(NDArrayHandle, [x.handle for x in ndargs[n_mutate_vars:]]), \
+                c_array(mx_float, []), \
+                c_array(NDArrayHandle, [x.handle for x in ndargs[:n_mutate_vars]]),
+                ctypes.c_int(len(kwargs)),
+                c_array(ctypes.c_char_p, kwargs.keys()),  # NOTE(review): a view on Python 3
+                c_array(ctypes.c_char_p, kwargs.values()),))
+        if n_mutate_vars == 1:
+            return ndargs[0]
+        else:
+            return ndargs[:n_mutate_vars]
+    # End of function declaration
+    ret_function = generic_torch_function
+    ret_function.__name__ = func_name[4:]
+    ret_function.__doc__ = doc_str
+    return ret_function
+
+# pylint: enable=too-many-locals, invalid-name
+
+def _init_torch_module():
+    """List and add all the torch backed ndarray functions to current module."""
+    plist = ctypes.POINTER(FunctionHandle)()
+    size = ctypes.c_uint()
+    check_call(_LIB.MXListFunctions(ctypes.byref(size),
+                                    ctypes.byref(plist)))
+
+    module_obj = sys.modules[__name__]
+    for i in range(size.value):
+        hdl = FunctionHandle(plist[i])
+        function = _make_torch_function(hdl)
+        # _make_torch_function returns None for functions without the '_th_' prefix; skip those
+        if function is not None:
+            setattr(module_obj, function.__name__, function)
+
+# Initialize the torch module
+_init_torch_module()
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 9018e02fd866..ea45758192f1 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -383,12 +383,18 @@ int MXFuncDescribe(FunctionHandle fun,
 int MXFuncInvoke(FunctionHandle fun,
                  NDArrayHandle *use_vars,
                  mx_float *scalar_args,
-                 NDArrayHandle *mutate_vars) {
+                 NDArrayHandle *mutate_vars,
+                 int num_params,
+                 char **param_keys,
+                 char **param_vals) {
   API_BEGIN();
   auto *f = static_cast<const NDArrayFunctionReg*>(fun);
   f->body((NDArray**)(use_vars),  //  NOLINT(*)
           scalar_args,
-          (NDArray**)(mutate_vars));  //  NOLINT(*)
+          (NDArray**)(mutate_vars),  //  NOLINT(*)
+          num_params,
+          param_keys,
+          param_vals);
   API_END();
 }
 
diff --git a/src/common/tblob_op_registry.cc b/src/common/tblob_op_registry.cc
index 9e7de6d1171b..d700de361e00 100644
--- a/src/common/tblob_op_registry.cc
+++ b/src/common/tblob_op_registry.cc
@@ -272,7 +272,10 @@ void TBlobOpRegEntryImpl::RegisterUnary() {
   // The body to be registered
   auto body = [this] (NDArray **used_vars,
                       real_t *s,
-                      NDArray **mutate_vars) {
+                      NDArray **mutate_vars,
+                      int num_params,
+                      char **param_keys,
+                      char **param_vals) {
     NDArray src = *used_vars[0];
     NDArray *out = mutate_vars[0];
     TShape dshape = src.shape();
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index 3b33ea28283d..87641f76fa34 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -631,14 +631,16 @@ MXNET_REGISTER_NDARRAY_FUN(_copyto)
 
 // register random number generators
 MXNET_REGISTER_NDARRAY_FUN(_random_uniform)
-.set_body([](NDArray **u, real_t *s, NDArray **out) {
+.set_body([](NDArray **u, real_t *s, NDArray **out,
+             int num_params, char **param_keys, char **param_vals) {
     SampleUniform(s[0], s[1], out[0]);
   })
 .set_num_scalars(2)
 .set_num_mutate_vars(1);
 
 MXNET_REGISTER_NDARRAY_FUN(_random_gaussian)
-.set_body([](NDArray **u, real_t *s, NDArray **out) {
+.set_body([](NDArray **u, real_t *s, NDArray **out,
+             int num_params, char **param_keys, char **param_vals) {
     SampleGaussian(s[0], s[1], out[0]);
   })
 .set_num_scalars(2)
@@ -646,7 +648,8 @@ MXNET_REGISTER_NDARRAY_FUN(_random_gaussian)
 
 MXNET_REGISTER_NDARRAY_FUN(clip)
 .set_type_mask(kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget)
-.set_body([](NDArray **u, real_t *s, NDArray **out) {
+.set_body([](NDArray **u, real_t *s, NDArray **out,
+             int num_params, char **param_keys, char **param_vals) {
     ClipOp(*u[0], s[0], s[1], out[0]);
   })
 .set_num_use_vars(1)

From 903c6efa2a5aaef99e09ce1ed201d615103ded5d Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Sat, 9 Jan 2016 02:11:39 -0800
Subject: [PATCH 23/32] more torch ops

---
 Makefile                        |  10 ++-
 doc/tutorial/torch_howto.md     |  53 ++++++++++++++++
 example/torch/torch_function.py |  10 +++
 example/torch/torch_module.py   |  30 +++++++++
 include/mxnet/c_api.h           |  26 ++++++--
 include/mxnet/ndarray.h         |   2 +-
 plugin/torch/torch_base.h       |  13 ++--
 plugin/torch/torch_function.cc  | 105 +++++++++++++++++++++++++++++++-
 plugin/torch/torch_function.h   |  66 +++++++++++++++++---
 plugin/torch/torch_module-inl.h |   6 +-
 python/mxnet/ndarray.py         |  18 +++---
 python/mxnet/torch.py           |  42 ++++++++-----
 src/c_api/c_api.cc              |  15 +++++
 13 files changed, 345 insertions(+), 51 deletions(-)
 create mode 100644 doc/tutorial/torch_howto.md
 create mode 100644 example/torch/torch_function.py
 create mode 100644 example/torch/torch_module.py

diff --git a/Makefile b/Makefile
index 12a444006085..6e51438062db 100644
--- a/Makefile
+++ b/Makefile
@@ -93,9 +93,9 @@ endif
 
 all: lib/libmxnet.a lib/libmxnet.so $(BIN)
 
-SRC = $(wildcard src/*.cc src/*/*.cc src/*/*/*.cc)
+SRC = $(wildcard src/*.cc src/*/*.cc)
 OBJ = $(patsubst %.cc, build/%.o, $(SRC))
-CUSRC = $(wildcard src/*/*.cu src/*/*/*.cu)
+CUSRC = $(wildcard src/*/*.cu)
 CUOBJ = $(patsubst %.cu, build/%_gpu.o, $(CUSRC))
 
 ifneq ($(EXTRA_OPERATORS),)
@@ -113,7 +113,10 @@ endif
 # plugin
 ifeq ($(USE_TORCH), 1)
 	CFLAGS += -I$(TORCH_PATH)/install/include -I$(TORCH_PATH)/install/include/TH -I$(TORCH_PATH)/install/include/THC -DMXNET_USE_TORCH=1
-	LDFLAGS += -Wl,-export-dynamic -L$(TORCH_PATH)/install/lib -L$(TORCH_PATH)/install/lib/lua/5.1 -lluajit -lluaT -lTH -lTHC -lpaths -ltorch -lcutorch -lnn -lcunn
+	LDFLAGS += -L$(TORCH_PATH)/install/lib -L$(TORCH_PATH)/install/lib/lua/5.1 -lluajit -lluaT -lTH -lTHC -lpaths -ltorch -lnn
+	ifeq ($(USE_CUDA), 1)
+		LDFLAGS += -lcutorch -lcunn
+	endif
 	
 	TORCH_SRC = $(wildcard plugin/torch/*.cc)
 	PLUGIN_OBJ += $(patsubst %.cc, build/%.o, $(TORCH_SRC))
@@ -240,6 +243,7 @@ clean_all: clean
 
 -include build/*.d
 -include build/*/*.d
+-include build/*/*/*.d
 ifneq ($(EXTRA_OPERATORS),)
 	-include $(EXTRA_OPERATORS)/build/*.d
 endif
diff --git a/doc/tutorial/torch_howto.md b/doc/tutorial/torch_howto.md
new file mode 100644
index 000000000000..0b8288399634
--- /dev/null
+++ b/doc/tutorial/torch_howto.md
@@ -0,0 +1,53 @@
+# How to use MXNet as an (almost) fully functional Torch front-end
+
+This tutorial demonstrates how to use MXNet as a front-end to two of Torch's major functionalities: tensor mathematics and neural network modules. It covers:
+
+* 1) Compile MXNet with Torch support.
+
+* 2) Call Torch's tensor mathematical functions with MXNet.NDArray.
+
+* 3) Embed Torch's neural network modules (layers) into MXNet's symbolic graph.
+
+## Compile with Torch
+* First install Torch following [official guide](http://torch.ch/docs/getting-started.html).
+* Then, in `config.mk` (if you haven't already, copy `make/config.mk` (Linux) or `make/osx.mk` (Mac) into the MXNet root folder as `config.mk`), set `USE_TORCH = 1`
+and `TORCH_PATH = /path/to/torch`. By default Torch should be installed in your home folder (so `TORCH_PATH = $(HOME)/torch`).
+* Run `make clean && make` to build with torch support.
+
+## Tensor Mathematics
+The `mxnet.th` module supports calling Torch's tensor mathematical functions on `mxnet.nd.NDArray` objects. For example ([full code](https://github.com/dmlc/mxnet/blob/master/example/torch/torch_function.py)):
+```Python
+import mxnet as mx
+x = mx.th.randn(2, 2, ctx=mx.cpu(0))
+print x.asnumpy()
+y = mx.th.abs(x)
+print y.asnumpy()
+
+x = mx.th.randn(2, 2, ctx=mx.cpu(0))
+print x.asnumpy()
+mx.th.abs(x, x) # in-place
+print x.asnumpy()
+```
+Help can be found with `help(mx.th)`.
+We have already added support for the most common functions listed on [Torch's doc page](https://github.com/torch/torch7/blob/master/doc/maths.md).
+If the function you need is not supported, you can easily register it in `mxnet_root/plugin/torch/torch_function.cc` by following the existing registrations.
+
+## Torch Modules (Layers)
+Torch's neural network modules are also supported by MXNet through the `mxnet.symbol.TorchModule` symbol.
+For example, the following code defines a 3-layer DNN for classifying MNIST digits ([full code](https://github.com/dmlc/mxnet/blob/master/example/torch/torch_module.py)):
+```Python
+data = mx.symbol.Variable('data')
+fc1 = mx.symbol.TorchModule(data_0=data, lua_string='nn.Linear(784, 128)', num_data=1, num_params=2, num_outputs=1, name='fc1')
+act1 = mx.symbol.TorchModule(data_0=fc1, lua_string='nn.ReLU(false)', num_data=1, num_params=0, num_outputs=1, name='relu1')
+fc2 = mx.symbol.TorchModule(data_0=act1, lua_string='nn.Linear(128, 64)', num_data=1, num_params=2, num_outputs=1, name='fc2')
+act2 = mx.symbol.TorchModule(data_0=fc2, lua_string='nn.ReLU(false)', num_data=1, num_params=0, num_outputs=1, name='relu2')
+fc3 = mx.symbol.TorchModule(data_0=act2, lua_string='nn.Linear(64, 10)', num_data=1, num_params=2, num_outputs=1, name='fc3')
+mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax')
+```
+Let's break it down. First, `data = mx.symbol.Variable('data')` defines a Variable as a placeholder for the input.
+Then it is fed through Torch's nn modules with `fc1 = mx.symbol.TorchModule(data_0=data, lua_string='nn.Linear(784, 128)', num_data=1, num_params=2, num_outputs=1, name='fc1')`.
+Note that we used `mx.symbol.SoftmaxOutput` instead of a Torch module because Torch implements loss layers in a separate class, `nn.Criterion`, which is not supported yet.
+The inputs to an nn module are named data_i for i = 0 ... num_data-1. `lua_string` is a single Lua statement that creates the module object.
+For Torch's built-in modules this is simply `nn.module_name(arguments)`.
+If you are using a custom module, place it in a .lua script file and load it with `require 'module_file.lua'` if your script returns a torch.nn object, or `(require 'module_file.lua')()` if your script returns a torch.nn class.
+
diff --git a/example/torch/torch_function.py b/example/torch/torch_function.py
new file mode 100644
index 000000000000..ffca595e5141
--- /dev/null
+++ b/example/torch/torch_function.py
@@ -0,0 +1,10 @@
+import mxnet as mx
+x = mx.th.randn(2, 2, ctx=mx.cpu(0))
+print x.asnumpy()
+y = mx.th.abs(x)
+print y.asnumpy()
+
+x = mx.th.randn(2, 2, ctx=mx.cpu(0))
+print x.asnumpy()
+mx.th.abs(x, x) # in-place
+print x.asnumpy()
\ No newline at end of file
diff --git a/example/torch/torch_module.py b/example/torch/torch_module.py
new file mode 100644
index 000000000000..4fef11b6c9ef
--- /dev/null
+++ b/example/torch/torch_module.py
@@ -0,0 +1,30 @@
+# pylint: skip-file
+from data import mnist_iterator
+import mxnet as mx
+import numpy as np
+import logging
+
+# define mlp
+
+data = mx.symbol.Variable('data')
+fc1 = mx.symbol.TorchModule(data_0=data, lua_string='nn.Linear(784, 128)', num_data=1, num_params=2, num_outputs=1, name='fc1')
+act1 = mx.symbol.TorchModule(data_0=fc1, lua_string='nn.ReLU(false)', num_data=1, num_params=0, num_outputs=1, name='relu1')
+fc2 = mx.symbol.TorchModule(data_0=act1, lua_string='nn.Linear(128, 64)', num_data=1, num_params=2, num_outputs=1, name='fc2')
+act2 = mx.symbol.TorchModule(data_0=fc2, lua_string='nn.ReLU(false)', num_data=1, num_params=0, num_outputs=1, name='relu2')
+fc3 = mx.symbol.TorchModule(data_0=act2, lua_string='nn.Linear(64, 10)', num_data=1, num_params=2, num_outputs=1, name='fc3')
+mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax')
+
+# data
+
+train, val = mnist_iterator(batch_size=100, input_shape = (784,))
+
+# train
+
+logging.basicConfig(level=logging.DEBUG)
+
+model = mx.model.FeedForward(
+    ctx = mx.gpu(0), symbol = mlp, num_epoch = 20,
+    learning_rate = 0.1, momentum = 0.9, wd = 0.00001)
+
+model.fit(X=train, eval_data=val)
+
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 1b1527bb8b0e..3a4b1d68a641 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -382,11 +382,27 @@ MXNET_DLL int MXFuncDescribe(FunctionHandle fun,
 MXNET_DLL int MXFuncInvoke(FunctionHandle fun,
                            NDArrayHandle *use_vars,
                            mx_float *scalar_args,
-                           NDArrayHandle *mutate_vars,
-                           int num_params,
-                           char **param_keys,
-                           char **param_vals);
-
+                           NDArrayHandle *mutate_vars);
+/*!
+ * \brief invoke a function, the array size of passed in arguments
+ *   must match the counts in the function's description (see MXFuncDescribe)
+ * \param fun the function
+ * \param use_vars the normal arguments passed to function
+ * \param scalar_args the scalar arguments
+ * \param mutate_vars the mutate arguments
+ * \param num_params number of keyword parameters
+ * \param param_keys keys for keyword parameters
+ * \param param_vals values for keyword parameters
+ * \return 0 when success, -1 when failure happens
+ * \sa MXFuncDescribeArgs
+ */
+MXNET_DLL int MXFuncInvokeEx(FunctionHandle fun,
+                             NDArrayHandle *use_vars,
+                             mx_float *scalar_args,
+                             NDArrayHandle *mutate_vars,
+                             int num_params,
+                             char **param_keys,
+                             char **param_vals);
 //--------------------------------------------
 // Part 3: symbolic configuration generation
 //--------------------------------------------
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index b32639d88c74..fa19f93caf86 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -561,7 +561,7 @@ struct NDArrayFunctionReg
   /*!
    * \brief set the function body to a unary NDArray function
    *  this will also auto set the parameters correctly
-   * \param funary function body to set
+   * \param fgeneric function body to set
    * \return ref to the registered entry, used to set properties
    */
   inline NDArrayFunctionReg &set_function(
diff --git a/plugin/torch/torch_base.h b/plugin/torch/torch_base.h
index f70f8a181892..0263f5f0b1fe 100644
--- a/plugin/torch/torch_base.h
+++ b/plugin/torch/torch_base.h
@@ -161,7 +161,8 @@ class TorchTensor {
         break;
       }
 #endif
-      default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+      default:
+        LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
     }
     THLongStorage_free(thshape);
 
@@ -178,9 +179,10 @@ class TorchTensor {
         THFloatStorage* original = static_cast<THFloatTensor*>(tensor)->storage;
         static_cast<THFloatTensor*>(tensor)->storage = storage;
         THFloatStorage_free(original);
+        break;
       }
-      case gpu::kDevMask: {
 #if MXNET_USE_CUDA
+      case gpu::kDevMask: {
         THCState* state = TorchState::CudaState();
         THCudaStorage* storage = THCudaStorage_newWithData(state,
                                                            static_cast<real_t*>(blob.dptr_),
@@ -190,12 +192,11 @@ class TorchTensor {
         THCudaStorage* original = static_cast<THCudaTensor*>(tensor)->storage;
         static_cast<THCudaTensor*>(tensor)->storage = storage;
         THCudaStorage_free(state, original);
-#else
-        LOG(FATAL) << "GPU is not enabled";
-#endif
+        break;
       }
+#endif
       default:
-        LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+        LOG(FATAL) << "Unknown device type " << blob.dev_mask_;
     }
   }
 
diff --git a/plugin/torch/torch_function.cc b/plugin/torch/torch_function.cc
index e969de66490b..b47ab56f68d2 100644
--- a/plugin/torch/torch_function.cc
+++ b/plugin/torch/torch_function.cc
@@ -8,6 +8,16 @@
 
 namespace mxnet {
 
+// Construction or extraction functions
+MXNET_REGISTER_TORCH_CONSTRUCTOR_FUN(_th_eye, eye);
+MXNET_REGISTER_TORCH_CONSTRUCTOR_FUN(_th_ones, ones);
+MXNET_REGISTER_TORCH_CONSTRUCTOR_FUN(_th_rand, rand);
+MXNET_REGISTER_TORCH_CONSTRUCTOR_FUN(_th_randn, randn);
+MXNET_REGISTER_TORCH_CONSTRUCTOR_FUN(_th_randperm, randperm);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_tril, tril);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_triu, triu);
+MXNET_REGISTER_TORCH_CONSTRUCTOR_FUN(_th_zeros, zeros);
+
 // Element-wise Mathematical Operations
 MXNET_REGISTER_TORCH_UNARY_FUN(_th_abs, abs);
 MXNET_REGISTER_TORCH_UNARY_FUN(_th_sign, sign);
@@ -21,7 +31,9 @@ MXNET_REGISTER_TORCH_UNARY_FUN(_th_exp, exp);
 MXNET_REGISTER_TORCH_UNARY_FUN(_th_floor, floor);
 MXNET_REGISTER_TORCH_UNARY_FUN(_th_log, log);
 MXNET_REGISTER_TORCH_UNARY_FUN(_th_log1p, log1p);
-MXNET_REGISTER_TORCH_UNARY_FUN(_th_pow, pow);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_pow, pow)
+.add_argument("n", "float", "pow(x, n) returns x^n, element-wise. "
+  "pow(n, x) returns n^x, element-wise.");
 MXNET_REGISTER_TORCH_UNARY_FUN(_th_round, round);
 MXNET_REGISTER_TORCH_UNARY_FUN(_th_sin, sin);
 MXNET_REGISTER_TORCH_UNARY_FUN(_th_sinh, sinh);
@@ -30,7 +42,96 @@ MXNET_REGISTER_TORCH_UNARY_FUN(_th_tan, tan);
 MXNET_REGISTER_TORCH_UNARY_FUN(_th_tanh, tanh);
 
 // Basic operations
-MXNET_REGISTER_TORCH_UNARY_FUN(_th_add_scalar, add);
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_add_scalar, add)
+.add_argument("value", "float", "Add value to all elements in x");
+MXNET_REGISTER_TORCH_BINARY_FUN_WITH_ARG(_th_add, add);
+MXNET_REGISTER_TORCH_BINARY_FUN(_th_add_axpy, add);
+
+// MXNET_REGISTER_TORCH_UNARY_FUN(_th_csub_scalar, csub);
+// MXNET_REGISTER_TORCH_BINARY_FUN_WITH_ARG(_th_csub, csub);
+
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_mul_scalar, mul)
+.add_argument("value", "float", "Multiply value to all elements in x");
+MXNET_REGISTER_TORCH_BINARY_FUN_WITH_ARG(_th_cmul, cmul);
+
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_clamp, clamp);
+MXNET_REGISTER_TORCH_BINARY_FUN_WITH_ARG(_th_cpow, cpow);
+MXNET_REGISTER_TORCH_TENARY_FUN(_th_addcmul, addcmul);
+
+MXNET_REGISTER_TORCH_UNARY_FUN(_th_div_scalar, div)
+.add_argument("value", "float", "Divide all elements in x by value");
+MXNET_REGISTER_TORCH_BINARY_FUN_WITH_ARG(_th_cdiv, cdiv);
+MXNET_REGISTER_TORCH_TENARY_FUN(_th_addcdiv, addcdiv);
+
+MXNET_REGISTER_TORCH_TENARY_FUN(_th_addmv, addmv);
+MXNET_REGISTER_TORCH_TENARY_FUN(_th_addr, addr);
+MXNET_REGISTER_TORCH_TENARY_FUN(_th_addmm, addmm);
+MXNET_REGISTER_TORCH_TENARY_FUN(_th_addbmm, addbmm);
+MXNET_REGISTER_TORCH_TENARY_FUN(_th_baddbmm, baddbmm);
+
+struct TorchMMShape {
+  static std::vector<mshadow::TShape> GetShape(NDArray **u,
+    const std::map<std::string, std::string>& param) {
+    CHECK_EQ(u[0]->shape().ndim(), 2);
+    CHECK_EQ(u[1]->shape().ndim(), 2);
+    CHECK_EQ(u[0]->shape()[1], u[1]->shape()[0]);
+    index_t shape[] = {u[0]->shape()[0], u[1]->shape()[1]};
+    mshadow::TShape tshape(shape, shape+2);
+    return {tshape};
+  }
+  static constexpr const char* fname = "mm";
+  static const int num_inputs = 2;
+  static const int num_outputs = 1;
+};
+MXNET_REGISTER_TORCH_FUN(_th_mm, TorchMMShape);
+
+struct TorchMVShape {
+  static std::vector<mshadow::TShape> GetShape(NDArray **u,
+    const std::map<std::string, std::string>& param) {
+    CHECK_EQ(u[0]->shape().ndim(), 2);
+    CHECK_EQ(u[1]->shape().ndim(), 1);
+    CHECK_EQ(u[0]->shape()[1], u[1]->shape()[0]);
+    index_t shape[] = {u[0]->shape()[0]};
+    mshadow::TShape tshape(shape, shape+1);
+    return {tshape};
+  }
+  static constexpr const char* fname = "mv";
+  static const int num_inputs = 2;
+  static const int num_outputs = 1;
+};
+MXNET_REGISTER_TORCH_FUN(_th_mv, TorchMVShape);
+
+
+struct TorchBMMShape {  // Output-shape inference for the Torch "bmm" function registered below.
+  static std::vector<mshadow::TShape> GetShape(NDArray **u,
+    const std::map<std::string, std::string>& param) {
+    CHECK_EQ(u[0]->shape().ndim(), 3);  // first input: (b, n, m)
+    CHECK_EQ(u[1]->shape().ndim(), 3);  // second input: (b, m, p)
+    CHECK_EQ(u[0]->shape()[0], u[1]->shape()[0]);  // both inputs share the batch size b
+    CHECK_EQ(u[0]->shape()[2], u[1]->shape()[1]);  // inner dimensions m must agree
+    index_t shape[] = {u[0]->shape()[0], u[0]->shape()[1], u[1]->shape()[2]};
+    mshadow::TShape tshape(shape, shape+3);  // keep the batch dimension: result is (b, n, p)
+    return {tshape};
+  }
+  static constexpr const char* fname = "bmm";
+  static const int num_inputs = 2;
+  static const int num_outputs = 1;
+};
+MXNET_REGISTER_TORCH_FUN(_th_bmm, TorchBMMShape);
 
+struct TorchGERShape {
+  static std::vector<mshadow::TShape> GetShape(NDArray **u,
+    const std::map<std::string, std::string>& param) {
+    CHECK_EQ(u[0]->shape().ndim(), 1);
+    CHECK_EQ(u[1]->shape().ndim(), 1);
+    index_t shape[] = {u[0]->shape()[0], u[1]->shape()[0]};
+    mshadow::TShape tshape(shape, shape+2);
+    return {tshape};
+  }
+  static constexpr const char* fname = "ger";
+  static const int num_inputs = 2;
+  static const int num_outputs = 1;
+};
+MXNET_REGISTER_TORCH_FUN(_th_ger, TorchGERShape);
 
 }  // namespace mxnet
diff --git a/plugin/torch/torch_function.h b/plugin/torch/torch_function.h
index 0866dc08f98f..9eca84698583 100644
--- a/plugin/torch/torch_function.h
+++ b/plugin/torch/torch_function.h
@@ -80,11 +80,11 @@ void TorchOp(NDArray **u, real_t *s, NDArray **out,
     }
   } else {
     CHECK(param.count("ctx")) << "Must provide keyword argument ctx for TorchOp with 0 inputs";
-    std::istringstream str_ctx(param.at("ctx"));
-    std::string dev;
+    std::string str_ctx(param.at("ctx"));
     int id;
-    char tmp;
-    str_ctx >> dev >> tmp >> id >> tmp;
+    char tmp[4];
+    sscanf(str_ctx.c_str(), "%3s(%d)", tmp, &id);
+    std::string dev(tmp);
     if (dev == "cpu") {
       ctx = Context::Create(Context::kCPU, id);
     } else if (dev == "gpu") {
@@ -138,12 +138,31 @@ void TorchOp(NDArray **u, real_t *s, NDArray **out,
   }
 }
 
-struct TorchUnaryOpDesc {
+struct TorchFirstShape {
   static std::vector<mshadow::TShape> GetShape(NDArray **u,
     const std::map<std::string, std::string>& param) {
     return {u[0]->shape()};
   }
-  static const int num_inputs = 1;
+};
+
+struct TorchConstructorShape {
+  static std::vector<mshadow::TShape> GetShape(NDArray **u,
+    const std::map<std::string, std::string>& param) {
+    std::vector<index_t> shape;
+    std::string format = param.at("format");
+    std::istringstream args(param.at("args"));
+    std::string val;
+    std::getline(args, val, ',');
+    CHECK_LE(format.size(), 5) << "Only support up to 4 dimensions.";
+    for (size_t i = 1; i < format.size(); ++i) {
+      CHECK_EQ(format[i], 'i') << "Only take integer arguments.";
+      std::getline(args, val, ',');
+      shape.push_back(std::stoi(val));
+    }
+    mshadow::TShape tshape(shape.begin(), shape.end());
+    return {tshape};
+  }
+  static const int num_inputs = 0;
   static const int num_outputs = 1;
 };
 
@@ -155,10 +174,41 @@ struct TorchUnaryOpDesc {
   .set_type_mask(kAcceptEmptyMutateTarget)
 
 #define MXNET_REGISTER_TORCH_UNARY_FUN(name, func)                            \
-  struct TorchUnaryOpDesc_ ## name ## _ ## func : public TorchUnaryOpDesc {   \
+  struct TorchUnaryOpDesc_ ## name ## _ ## func : public TorchFirstShape {    \
     static constexpr const char* fname = #func;                               \
+    static const int num_inputs = 1;                                          \
+    static const int num_outputs = 1;                                         \
   };                                                                          \
-  MXNET_REGISTER_TORCH_FUN(name, TorchUnaryOpDesc_ ## name ## _ ## func);
+  MXNET_REGISTER_TORCH_FUN(name, TorchUnaryOpDesc_ ## name ## _ ## func)      \
+  .add_argument("x", "NDArray", "Input NDArray")
+
+#define MXNET_REGISTER_TORCH_BINARY_FUN(name, func)                           \
+  struct TorchBinaryOpDesc_ ## name ## _ ## func : public TorchFirstShape {   \
+    static constexpr const char* fname = #func;                               \
+    static const int num_inputs = 2;                                          \
+    static const int num_outputs = 1;                                         \
+  };                                                                          \
+  MXNET_REGISTER_TORCH_FUN(name, TorchBinaryOpDesc_ ## name ## _ ## func)
+
+#define MXNET_REGISTER_TORCH_BINARY_FUN_WITH_ARG(name, func)                  \
+  MXNET_REGISTER_TORCH_BINARY_FUN(name, func)                                 \
+  .add_argument("x1", "NDArray", "First Input NDArray")                       \
+  .add_argument("x2", "NDArray", "Second Input NDArray")
+
+#define MXNET_REGISTER_TORCH_TENARY_FUN(name, func)                           \
+  struct TorchTenaryOpDesc_ ## name ## _ ## func : public TorchFirstShape {   \
+    static constexpr const char* fname = #func;                               \
+    static const int num_inputs = 3;                                          \
+    static const int num_outputs = 1;                                         \
+  };                                                                          \
+  MXNET_REGISTER_TORCH_FUN(name, TorchTenaryOpDesc_ ## name ## _ ## func)
+
+#define MXNET_REGISTER_TORCH_CONSTRUCTOR_FUN(name, func)                                  \
+  struct TorchConstructorOpDesc_ ## name ## _ ## func : public TorchConstructorShape {    \
+    static constexpr const char* fname = #func;                                           \
+  };                                                                                      \
+  MXNET_REGISTER_TORCH_FUN(name, TorchConstructorOpDesc_ ## name ## _ ## func)
+
 
 }  // namespace mxnet
 #endif  // PLUGIN_TORCH_TORCH_FUNCTION_H_
diff --git a/plugin/torch/torch_module-inl.h b/plugin/torch/torch_module-inl.h
index 703dc3ba1f43..1e7cb74a7cf7 100644
--- a/plugin/torch/torch_module-inl.h
+++ b/plugin/torch/torch_module-inl.h
@@ -332,7 +332,7 @@ class TorchModuleProp : public OperatorProperty {
       while (lua_next(L, -3)) {
         THFloatTensor* param = reinterpret_cast<THFloatTensor*>(luaT_toudata(L, -1,
           TorchTensor::TensorType(mshadow::cpu::kDevMask)));
-        size_t* size = param->size;
+        long int* size = param->size;  // NOLINT(*)
         (*in_shape)[index++] = TShape(size, size + THFloatTensor_nDimension(param));
         lua_pop(L, 1);
       }
@@ -343,7 +343,7 @@ class TorchModuleProp : public OperatorProperty {
     } else if (param_.num_outputs == 1) {
       THFloatTensor* output = reinterpret_cast<THFloatTensor*>(luaT_toudata(L, -1,
         TorchTensor::TensorType(mshadow::cpu::kDevMask)));
-      size_t* size = output->size;
+      long int* size = output->size;  // NOLINT(*)
       (*out_shape)[0] = TShape(size, size + THFloatTensor_nDimension(output));
     } else {
       for (uint32_t data_index = 0; data_index < param_.num_outputs; ++data_index) {
@@ -352,7 +352,7 @@ class TorchModuleProp : public OperatorProperty {
         while (lua_next(L, -2)) {
           THFloatTensor* out = reinterpret_cast<THFloatTensor*>(luaT_toudata(L, -1,
             TorchTensor::TensorType(mshadow::cpu::kDevMask)));
-          size_t* size = out->size;
+          long int* size = out->size;  // NOLINT(*)
           (*out_shape)[index++] = TShape(size, size + THFloatTensor_nDimension(out));
         }
       }
diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py
index a9f66cc2b9b9..364acfbbb2d8 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray.py
@@ -677,13 +677,13 @@ def binary_ndarray_function(lhs, rhs, out=None):
             if not accept_empty_mutate:
                 raise TypeError('argument out is required to call %s' % func_name)
             out = NDArray(_new_empty_handle())
-        check_call(_LIB.MXFuncInvoke(handle,
-                                     c_array(NDArrayHandle, (lhs.handle, rhs.handle)),
-                                     c_array(mx_float, ()),
-                                     c_array(NDArrayHandle, (out.handle,)),
-                                     ctypes.c_int(0),
-                                     c_array(ctypes.c_char_p, []),
-                                     c_array(ctypes.c_char_p, [])))
+        check_call(_LIB.MXFuncInvokeEx(handle,
+                                       c_array(NDArrayHandle, (lhs.handle, rhs.handle)),
+                                       c_array(mx_float, ()),
+                                       c_array(NDArrayHandle, (out.handle,)),
+                                       ctypes.c_int(0),
+                                       c_array(ctypes.c_char_p, []),
+                                       c_array(ctypes.c_char_p, [])))
         return out
 
     def unary_ndarray_function(src, out=None):
@@ -697,7 +697,7 @@ def unary_ndarray_function(src, out=None):
             if not accept_empty_mutate:
                 raise TypeError('argument out is required to call %s' % func_name)
             out = NDArray(_new_empty_handle())
-        check_call(_LIB.MXFuncInvoke( \
+        check_call(_LIB.MXFuncInvokeEx( \
                 handle, \
                 c_array(NDArrayHandle, (src.handle,)), \
                 c_array(mx_float, ()), \
@@ -734,7 +734,7 @@ def generic_ndarray_function(*args, **kwargs):
                     NDArray(_new_empty_handle()) for i in range(n_mutate_vars))
             else:
                 raise TypeError('argument out is required to call %s' % func_name)
-        check_call(_LIB.MXFuncInvoke( \
+        check_call(_LIB.MXFuncInvokeEx( \
                 handle, \
                 c_array(NDArrayHandle, [args[i].handle for i in use_vars_range]), \
                 c_array(mx_float, [args[i] for i in scalar_range]), \
diff --git a/python/mxnet/torch.py b/python/mxnet/torch.py
index ce4eb147fd3f..32f73f51e9d6 100644
--- a/python/mxnet/torch.py
+++ b/python/mxnet/torch.py
@@ -6,11 +6,16 @@
 import ctypes
 import sys
 from .base import _LIB
-from .base import c_array, py_str
+from .base import c_array, py_str, ctypes2docstring
 from .base import mx_uint, mx_float, NDArrayHandle, FunctionHandle
 from .base import check_call
 from .ndarray import NDArray, _new_empty_handle
 
+try:
+    _LUAJIT = ctypes.CDLL("libluajit.so", mode=ctypes.RTLD_GLOBAL)
+except OSError:
+    pass
+
 # pylint: disable=too-many-locals, invalid-name
 def _make_torch_function(handle):
     """Create a Torch function from the FunctionHandle."""
@@ -47,13 +52,19 @@ def _make_torch_function(handle):
     func_name = py_str(name.value)
     if not func_name.startswith('_th_'):
         return None
-
+    param_str = ctypes2docstring(num_args, arg_names, arg_types, arg_descs)
+    if n_mutate_vars > 1:
+        res = ','.join(['res%d '%i for i in range(n_mutate_vars)])
+    else:
+        res = 'res '
     doc_str = (('Interface for Torch function {name}.\n' +
-                'Invoke with\nres = mxnet.th.{name}(...)\nor\n'+
-                'mxnet.th.{name}(res, ...).\n\n' +
-                'detailed help can be found at ' +
+                'Invoke with\n{res}= mxnet.th.{name}(Parameters)\nor\n'+
+                'mxnet.th.{name}({res}, Parameters).\n\n' +
+                '{param_str}\n' +
+                'Reference: ' +
                 'https://github.com/torch/torch7/blob/master/doc/maths.md\n').format(
-                    name=func_name[4:]))
+                    name=func_name[4:], param_str=param_str,
+                    res=res))
 
     def generic_torch_function(*args, **kwargs):
         """Invoke this function by passing in parameters
@@ -104,14 +115,17 @@ def generic_torch_function(*args, **kwargs):
         kwargs['format'] = arg_format
         kwargs['args'] = value
 
-        check_call(_LIB.MXFuncInvoke( \
-                handle, \
-                c_array(NDArrayHandle, [x.handle for x in ndargs[n_mutate_vars:]]), \
-                c_array(mx_float, []), \
-                c_array(NDArrayHandle, [x.handle for x in ndargs[:n_mutate_vars]]),
-                ctypes.c_int(len(kwargs)),
-                c_array(ctypes.c_char_p, kwargs.keys()),
-                c_array(ctypes.c_char_p, kwargs.values()),))
+        for k in kwargs:
+            kwargs[k] = str(kwargs[k])
+
+        check_call(_LIB.MXFuncInvokeEx( \
+                   handle, \
+                   c_array(NDArrayHandle, [x.handle for x in ndargs[n_mutate_vars:]]), \
+                   c_array(mx_float, []), \
+                   c_array(NDArrayHandle, [x.handle for x in ndargs[:n_mutate_vars]]),
+                   ctypes.c_int(len(kwargs)),
+                   c_array(ctypes.c_char_p, kwargs.keys()),
+                   c_array(ctypes.c_char_p, kwargs.values()),))
         if n_mutate_vars == 1:
             return ndargs[0]
         else:
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index ea45758192f1..407a1a3696ae 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -381,6 +381,21 @@ int MXFuncDescribe(FunctionHandle fun,
 }
 
 int MXFuncInvoke(FunctionHandle fun,
+                 NDArrayHandle *use_vars,
+                 mx_float *scalar_args,
+                 NDArrayHandle *mutate_vars) {
+  API_BEGIN();
+  auto *f = static_cast<const NDArrayFunctionReg*>(fun);
+  f->body((NDArray**)(use_vars),  //  NOLINT(*)
+          scalar_args,
+          (NDArray**)(mutate_vars),  //  NOLINT(*)
+          0,
+          NULL,
+          NULL);
+  API_END();
+}
+
+int MXFuncInvokeEx(FunctionHandle fun,
                  NDArrayHandle *use_vars,
                  mx_float *scalar_args,
                  NDArrayHandle *mutate_vars,

From f5c1b8d2f6fafe858bd55216b03a978b7b5615a1 Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Sun, 10 Jan 2016 20:52:14 -0800
Subject: [PATCH 24/32] torch criterion

---
 Makefile                           |   6 +-
 example/torch/data.py              |  32 +++++
 example/torch/torch_function.py    |   6 +-
 example/torch/torch_module.py      |  18 ++-
 plugin/torch/torch_base.h          |  85 +++++++++++-
 plugin/torch/torch_criterion-inl.h | 209 +++++++++++++++++++++++++++++
 plugin/torch/torch_criterion.cc    |  29 ++++
 plugin/torch/torch_criterion.cu    |  18 +++
 plugin/torch/torch_module-inl.h    |  19 ++-
 plugin/torch/torch_module.cu       |   3 -
 python/mxnet/executor.py           |   2 +-
 python/mxnet/metric.py             |  11 ++
 12 files changed, 420 insertions(+), 18 deletions(-)
 create mode 100644 example/torch/data.py
 create mode 100644 plugin/torch/torch_criterion-inl.h
 create mode 100644 plugin/torch/torch_criterion.cc
 create mode 100644 plugin/torch/torch_criterion.cu

diff --git a/Makefile b/Makefile
index 6e51438062db..cc68e734924a 100644
--- a/Makefile
+++ b/Makefile
@@ -113,7 +113,7 @@ endif
 # plugin
 ifeq ($(USE_TORCH), 1)
 	CFLAGS += -I$(TORCH_PATH)/install/include -I$(TORCH_PATH)/install/include/TH -I$(TORCH_PATH)/install/include/THC -DMXNET_USE_TORCH=1
-	LDFLAGS += -L$(TORCH_PATH)/install/lib -L$(TORCH_PATH)/install/lib/lua/5.1 -lluajit -lluaT -lTH -lTHC -lpaths -ltorch -lnn
+	LDFLAGS += -L$(TORCH_PATH)/install/lib -lluajit -lluaT -lTH -lTHC -L$(TORCH_PATH)/install/lib/lua/5.1 -lpaths -ltorch -lnn
 	ifeq ($(USE_CUDA), 1)
 		LDFLAGS += -lcutorch -lcunn
 	endif
@@ -156,12 +156,12 @@ build/plugin/%.o: plugin/%.cc
 	$(CXX) -std=c++0x $(CFLAGS) -MM -MT build/plugin/$*.o $< >build/plugin/$*.d
 	$(CXX) -std=c++0x -c $(CFLAGS) -c $< -o $@
 
+# An nvcc bug causes the command below to generate "generic/xxx.h" dependencies from torch headers, so it is disabled.
+# $(NVCC) $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" -M -MT build/plugin/$*_gpu.o $< >build/plugin/$*_gpu.d
 build/plugin/%_gpu.o: plugin/%.cu
 	@mkdir -p $(@D)
-	$(NVCC) $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" -M -MT build/plugin/$*_gpu.o $< >build/plugin/$*_gpu.d
 	$(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $<
 
-
 $(EXTRA_OPERATORS)/build/%.o: $(EXTRA_OPERATORS)/%.cc
 	@mkdir -p $(@D)
 	$(CXX) -std=c++0x $(CFLAGS) -Isrc/operator -MM -MT $(EXTRA_OPERATORS)/build/$*.o $< >$(EXTRA_OPERATORS)/build/$*.d
diff --git a/example/torch/data.py b/example/torch/data.py
new file mode 100644
index 000000000000..d39821f52145
--- /dev/null
+++ b/example/torch/data.py
@@ -0,0 +1,32 @@
+# pylint: skip-file
+""" data iterator for mnist """
+import sys
+import os
+# code to automatically download dataset
+curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+sys.path.append(os.path.join(curr_path, "../../tests/python/common"))
+import get_data
+import mxnet as mx
+
+def mnist_iterator(batch_size, input_shape):
+    """return train and val iterators for mnist"""
+    # download data
+    get_data.GetMNIST_ubyte()
+    flat = False if len(input_shape) == 3 else True
+
+    train_dataiter = mx.io.MNISTIter(
+        image="data/train-images-idx3-ubyte",
+        label="data/train-labels-idx1-ubyte",
+        input_shape=input_shape,
+        batch_size=batch_size,
+        shuffle=True,
+        flat=flat)
+
+    val_dataiter = mx.io.MNISTIter(
+        image="data/t10k-images-idx3-ubyte",
+        label="data/t10k-labels-idx1-ubyte",
+        input_shape=input_shape,
+        batch_size=batch_size,
+        flat=flat)
+
+    return (train_dataiter, val_dataiter)
diff --git a/example/torch/torch_function.py b/example/torch/torch_function.py
index ffca595e5141..446ab1f00267 100644
--- a/example/torch/torch_function.py
+++ b/example/torch/torch_function.py
@@ -7,4 +7,8 @@
 x = mx.th.randn(2, 2, ctx=mx.cpu(0))
 print x.asnumpy()
 mx.th.abs(x, x) # in-place
-print x.asnumpy()
\ No newline at end of file
+print x.asnumpy()
+
+x = mx.th.ones(2, 2, ctx=mx.cpu(0))
+y = mx.th.ones(2, 2, ctx=mx.cpu(0))*2
+print mx.th.cdiv(x,y).asnumpy()
diff --git a/example/torch/torch_module.py b/example/torch/torch_module.py
index 4fef11b6c9ef..9ac0e35037c1 100644
--- a/example/torch/torch_module.py
+++ b/example/torch/torch_module.py
@@ -13,6 +13,15 @@
 act2 = mx.symbol.TorchModule(data_0=fc2, lua_string='nn.ReLU(false)', num_data=1, num_params=0, num_outputs=1, name='relu2')
 fc3 = mx.symbol.TorchModule(data_0=act2, lua_string='nn.Linear(64, 10)', num_data=1, num_params=2, num_outputs=1, name='fc3')
 mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax')
+# logsoftmax = mx.symbol.TorchModule(data_0=fc3, lua_string='nn.LogSoftMax()', num_data=1, num_params=0, num_outputs=1, name='logsoftmax')
+# label = mx.symbol.Variable('softmax_label') + 1
+# mlp = mx.symbol.TorchCriterion(data=logsoftmax, label=label, lua_string='nn.ClassNLLCriterion()', name='softmax')
+
+# exe = mlp.simple_bind(mx.cpu(0), 'write', {'data': np.float32}, data=(128,784))
+# exe.forward(is_train=True)
+# exe.backward()
+
+# exit(0)
 
 # data
 
@@ -22,9 +31,14 @@
 
 logging.basicConfig(level=logging.DEBUG)
 
+mon = mx.monitor.Monitor(1, None, pattern='.*', sort=True)
+
 model = mx.model.FeedForward(
-    ctx = mx.gpu(0), symbol = mlp, num_epoch = 20,
+    ctx = mx.cpu(0), symbol = mlp, num_epoch = 20,
     learning_rate = 0.1, momentum = 0.9, wd = 0.00001)
 
-model.fit(X=train, eval_data=val)
+model.fit(X=train, eval_data=val,
+    #eval_metric=mx.metric.Torch())
+    monitor = mon,
+    batch_end_callback = [mx.callback.Speedometer(100, 1), mx.callback.log_train_metric(1)])
 
diff --git a/plugin/torch/torch_base.h b/plugin/torch/torch_base.h
index 0263f5f0b1fe..e667ff323a92 100644
--- a/plugin/torch/torch_base.h
+++ b/plugin/torch/torch_base.h
@@ -10,17 +10,17 @@
 #include <vector>
 
 extern "C" {
-#include "lua.h"
-#include "luaT.h"
-#include "lualib.h"
-#include "THStorage.h"
-#include "THTensor.h"
+#include <lua.h>
+#include <luaT.h>
+#include <lualib.h>
+#include <THStorage.h>
+#include <THTensor.h>
 }
 
 #if MXNET_USE_CUDA
 extern "C" {
-#include "THCStorage.h"
-#include "THCTensor.h"
+#include <THCStorage.h>
+#include <THCTensor.h>
 }
 #endif  // MXNET_USE_CUDA
 
@@ -169,6 +169,77 @@ class TorchTensor {
     return tensor;
   }
 
+  static TBlob THTensorToTBlob(THGeneralTensor* handle) {
+    using namespace mshadow;
+    lua_State* L = TorchState::LuaState();
+    TBlob res;
+    lua_getfield(L, -1, "contiguous");
+    lua_pushvalue(L, -2);
+    int err = lua_pcall(L, 1, 1, 0);
+    CHECK_EQ(err, 0);
+    if (luaT_isudata(L, -1, TorchTensor::TensorType(cpu::kDevMask))) {
+      THFloatTensor* tensor = static_cast<THFloatTensor*>(luaT_toudata(L, -1,
+        TorchTensor::TensorType(cpu::kDevMask)));
+      *handle = static_cast<THGeneralTensor>(tensor);
+      THFloatStorage* storage = tensor->storage;
+      TShape shape(tensor->size, tensor->size + tensor->nDimension);
+      res = TBlob(storage->data, shape, cpu::kDevMask);
+#if MXNET_USE_CUDA
+    } else if (luaT_isudata(L, -1, TorchTensor::TensorType(gpu::kDevMask))) {
+      THCudaTensor* tensor = static_cast<THCudaTensor*>(luaT_toudata(L, -1,
+        TorchTensor::TensorType(gpu::kDevMask)));
+      *handle = static_cast<THGeneralTensor>(tensor);
+      THCudaStorage* storage = tensor->storage;
+      TShape shape(tensor->size, tensor->size + tensor->nDimension);
+      res = TBlob(storage->data, shape, gpu::kDevMask);
+#endif
+    } else {
+      LOG(FATAL) << "Unsupported Torch Tensor type " << luaT_typename(L, -1);
+    }
+    lua_pop(L, 2);
+    return res;
+  }
+
+  static void THTensorFree(THGeneralTensor handle, int dev_mask) {
+    switch (dev_mask) {
+      case cpu::kDevMask: {
+        THFloatTensor* original = static_cast<THFloatTensor*>(handle);
+        THFloatTensor_free(original);
+        break;
+      }
+#if MXNET_USE_CUDA
+      case gpu::kDevMask: {
+        THCState* state = TorchState::CudaState();
+        THCudaTensor* original = static_cast<THCudaTensor*>(handle);
+        THCudaTensor_free(state, original);
+        break;
+      }
+#endif
+      default:
+        LOG(FATAL) << "Unknown device type " << dev_mask;
+    }
+  }
+
+  static void FreeInternal(THGeneralTensor tensor, int dev_mask) {
+    switch (dev_mask) {
+      case cpu::kDevMask: {
+        THFloatStorage* original = static_cast<THFloatTensor*>(tensor)->storage;
+        THFloatStorage_free(original);
+        break;
+      }
+#if MXNET_USE_CUDA
+      case gpu::kDevMask: {
+        THCState* state = TorchState::CudaState();
+        THCudaStorage* original = static_cast<THCudaTensor*>(tensor)->storage;
+        THCudaStorage_free(state, original);
+        break;
+      }
+#endif
+      default:
+        LOG(FATAL) << "Unknown device type " << dev_mask;
+    }
+  }
+
   static void SetInternal(THGeneralTensor tensor, const TBlob& blob) {
     size_t size = blob.Size();
     switch (blob.dev_mask_) {
diff --git a/plugin/torch/torch_criterion-inl.h b/plugin/torch/torch_criterion-inl.h
new file mode 100644
index 000000000000..cc61266dbcca
--- /dev/null
+++ b/plugin/torch/torch_criterion-inl.h
@@ -0,0 +1,209 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file torch_criterion-inl.h
+ * \brief torch criterion operator
+ * \author Min Lin
+*/
+#ifndef PLUGIN_TORCH_TORCH_CRITERION_INL_H_
+#define PLUGIN_TORCH_TORCH_CRITERION_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <stdio.h>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+#include <utility>
+#include "../../src/operator/operator_common.h"
+#include "./torch_base.h"
+
+namespace mxnet {
+namespace op {
+struct TorchCriterionParam : public dmlc::Parameter<TorchCriterionParam> {
+  std::string lua_string;
+  TShape label_shape;
+  float grad_scale;
+  DMLC_DECLARE_PARAMETER(TorchCriterionParam) {
+    DMLC_DECLARE_FIELD(lua_string)
+    .describe("lua string that is called to generate the torch criterion object");
+    DMLC_DECLARE_FIELD(label_shape)
+    .set_default(TShape())
+    .enforce_nonzero()
+    .describe("Shape of label (without batch size).");
+    DMLC_DECLARE_FIELD(grad_scale)
+    .set_default(1.0f)
+    .describe("Scale the gradient by a float factor (a.k.a weight of this loss).");
+  }
+};
+
+/**
+ * \brief This is the implementation of the torch criterion operator.
+ * \tparam xpu The device that the op will be executed on.
+ */
+template<typename xpu>
+class TorchCriterionOp : public Operator {
+ private:
+  TorchCriterionParam param_;
+
+ protected:
+  THCharStorage* chunk_;
+
+ public:
+  explicit TorchCriterionOp(TorchCriterionParam p) : chunk_(NULL) {
+    this->param_ = p;
+    lua_State* L = TorchState::LuaState();
+    CHECK_EQ(lua_gettop(L), 0);
+    std::string exec = std::string("return ") + p.lua_string
+      + TorchTensor::ModuleType(xpu::kDevMask);
+    CHECK_EQ(luaL_loadstring(L, exec.c_str()), 0);
+    int err = lua_pcall(L, 0, 1, 0);
+    CHECK_EQ(err, 0) << lua_tostring(L, -1);
+    // serialize
+    TorchState::Serialize(&chunk_);
+  }
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    lua_State* L = TorchState::LuaState();
+    CHECK_EQ(lua_gettop(L), 0);
+    CHECK_EQ(in_data.size(), 2);
+    CHECK_EQ(out_data.size(), 1);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    TorchState::SetStream(s);
+    // Deserialize self table
+    TorchState::Deserialize(chunk_);
+    // call forward
+    // | self
+    lua_getfield(L, -1, "forward");
+    // | self | forward
+    lua_pushvalue(L, -2);
+    // | self | forward | self
+    for (index_t i = 0; i < in_data.size(); ++i) {
+      THGeneralTensor th = TorchTensor::TBlobToTHTensor(in_data[i]);
+      luaT_pushudata(L, th, TorchTensor::TensorType(in_data[i]));
+    }
+    // | self | forward | self | pred | label
+    int err = lua_pcall(L, 3, 1, 0);
+    CHECK_EQ(err, 0) << lua_tostring(L, -1);
+    CHECK(lua_isnumber(L, -1)) << "Criterion must return a number";
+    real_t loss = static_cast<real_t>(lua_tonumber(L, -1));
+    lua_pop(L, 1);
+    Tensor<xpu, 2> out = out_data[0].FlatTo2D<xpu, real_t>(s);
+    Assign(out, req[0], loss*param_.grad_scale);
+    TorchState::Serialize(&chunk_);
+    CHECK_EQ(lua_gettop(L), 0);
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    lua_State* L = TorchState::LuaState();
+    CHECK_EQ(lua_gettop(L), 0);
+    CHECK_EQ(in_data.size(), 2);
+    CHECK_EQ(out_data.size(), 1);
+    CHECK_EQ(req[0], kWriteTo) << "Torch Criterion only supports write to in_grad";
+    CHECK_EQ(req[1], kNullOp) << "Torch Criterion cannot back prop to label";
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    TorchState::SetStream(s);
+    TorchState::Deserialize(chunk_);
+    THGeneralTensor th = TorchTensor::TBlobToTHTensor(in_grad[0]);
+    luaT_pushudata(L, th, TorchTensor::TensorType(in_grad[0]));
+    lua_setfield(L, -2, "gradInput");
+    lua_getfield(L, -1, "backward");
+    // | self | backward
+    lua_pushvalue(L, -2);
+    // | self | backward | self
+    for (index_t i = 0; i < in_data.size(); ++i) {
+      th = TorchTensor::TBlobToTHTensor(in_data[i]);
+      luaT_pushudata(L, th, TorchTensor::TensorType(in_data[i]));
+    }
+    // | self | backward | self | pred | label
+    int err = lua_pcall(L, 3, 0, 0);
+    CHECK_EQ(err, 0) << lua_tostring(L, -1);
+    Tensor<xpu, 2> grad = in_grad[0].FlatTo2D<xpu, real_t>(s);
+    grad *= param_.grad_scale * in_grad[0].shape_[0];
+    TorchState::Serialize(&chunk_);
+    CHECK_EQ(lua_gettop(L), 0);
+  }
+};  // class TorchCriterionOp
+
+// Declare factory function, used for dispatch specialization
+template<typename xpu>
+Operator* CreateOp(TorchCriterionParam type);
+
+#if DMLC_USE_CXX11
+class TorchCriterionProp : public OperatorProperty {
+ public:
+  std::vector<std::string> ListArguments() const override {
+    return {"data", "label"};
+  }
+
+  virtual std::vector<std::string> ListOutputs() const {
+    return {"output"};
+  }
+
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.Init(kwargs);
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 2);
+    const TShape &dshape = in_shape->at(0);
+    if (dshape.ndim() == 0) return false;
+    std::vector<index_t> lshape;
+    lshape.push_back(dshape[0]);
+    lshape.insert(lshape.end(), param_.label_shape.data(),
+      param_.label_shape.data() +  param_.label_shape.ndim());
+    TShape shape(lshape.begin(), lshape.end());
+    SHAPE_ASSIGN_CHECK(*in_shape, 1, shape);
+    out_shape->clear();
+    out_shape->push_back(Shape1(dshape[0]));
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new TorchCriterionProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "TorchCriterion";
+  }
+
+  // declare dependency and inplace optimization options
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    std::vector<int> dep;
+    dep.insert(dep.end(), in_data.begin(), in_data.end());
+    return dep;
+  }
+
+  Operator* CreateOperator(Context ctx) const override;
+
+ private:
+  TorchCriterionParam param_;
+};
+#endif  // DMLC_USE_CXX11
+}  // namespace op
+}  // namespace mxnet
+#endif  // PLUGIN_TORCH_TORCH_CRITERION_INL_H_
diff --git a/plugin/torch/torch_criterion.cc b/plugin/torch/torch_criterion.cc
new file mode 100644
index 000000000000..a54be46a936d
--- /dev/null
+++ b/plugin/torch/torch_criterion.cc
@@ -0,0 +1,29 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file torch_criterion.cc
+ * \brief torch criterion op
+ * \author Junyuan Xie
+*/
+#include "./torch_criterion-inl.h"
+#include "../../src/operator/mshadow_op.h"
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<cpu>(TorchCriterionParam param) {
+  return new TorchCriterionOp<cpu>(param);
+}
+
+// DO_BIND_DISPATCH comes from operator_common.h
+Operator *TorchCriterionProp::CreateOperator(Context ctx) const {
+  DO_BIND_DISPATCH(CreateOp, param_);
+}
+
+DMLC_REGISTER_PARAMETER(TorchCriterionParam);
+
+MXNET_REGISTER_OP_PROPERTY(TorchCriterion, TorchCriterionProp)
+.describe("Criterions from torch.")
+.add_arguments(TorchCriterionParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/plugin/torch/torch_criterion.cu b/plugin/torch/torch_criterion.cu
new file mode 100644
index 000000000000..57730a0bd88b
--- /dev/null
+++ b/plugin/torch/torch_criterion.cu
@@ -0,0 +1,18 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file torch_criterion.cu
+ * \brief torch criterion op
+ * \author Bing Xu
+*/
+#include "./torch_criterion-inl.h"
+#include "../../src/operator/mshadow_op.h"
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<gpu>(TorchCriterionParam param) {
+  return new TorchCriterionOp<gpu>(param);
+}
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/plugin/torch/torch_module-inl.h b/plugin/torch/torch_module-inl.h
index 1e7cb74a7cf7..074dd3e30362 100644
--- a/plugin/torch/torch_module-inl.h
+++ b/plugin/torch/torch_module-inl.h
@@ -28,7 +28,7 @@ struct TorchModuleParam : public dmlc::Parameter<TorchModuleParam> {
   uint32_t num_outputs;
   DMLC_DECLARE_PARAMETER(TorchModuleParam) {
     DMLC_DECLARE_FIELD(lua_string)
-    .describe("lua string that is called to generate the object");
+    .describe("lua string that is called to generate the torch module object");
     DMLC_DECLARE_FIELD(num_data)
     .describe("the number of input data");
     DMLC_DECLARE_FIELD(num_params)
@@ -73,6 +73,23 @@ class TorchModuleOp : public Operator {
       lua_pop(L, 2);
     }
     CHECK_EQ(param_num, param_.num_params);
+    // // Free the parameters allocated by torch so it doesn't take up memory.
+    // if (param_.num_params != 0) {
+    //   // get the parameters into the stack
+    //   lua_getfield(L, -1, "parameters");
+    //   lua_pushvalue(L, -2);
+    //   int err = lua_pcall(L, 1, 1, 0);
+    //   CHECK_EQ(err, 0);
+    //   // iterate the parameters table to put tblobs inside
+    //   lua_pushnil(L);
+    //   while (lua_next(L, -2)) {
+    //     CHECK(luaT_isudata(L, -1, TorchTensor::TensorType(xpu::kDevMask)));
+    //     void* udata = luaT_toudata(L, -1, TorchTensor::TensorType(xpu::kDevMask));
+    //     TorchTensor::FreeInternal(static_cast<THGeneralTensor>(udata), xpu::kDevMask);
+    //     lua_pop(L, 1);
+    //   }
+    //   lua_pop(L, 1);  // pop the parameter table
+    // }
     // serialize
     TorchState::Serialize(&chunk_);
   }
diff --git a/plugin/torch/torch_module.cu b/plugin/torch/torch_module.cu
index 893ebacd4fef..f5d437dd9b9f 100644
--- a/plugin/torch/torch_module.cu
+++ b/plugin/torch/torch_module.cu
@@ -6,9 +6,6 @@
 */
 #include "./torch_module-inl.h"
 #include "../../src/operator/mshadow_op.h"
-extern "C" {
-#include "THCTensor.h"
-}
 
 namespace mxnet {
 namespace op {
diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py
index 4b44272cab7f..c74e7598d256 100644
--- a/python/mxnet/executor.py
+++ b/python/mxnet/executor.py
@@ -353,7 +353,7 @@ def __init__(self, symbol, ctx, train_data,
         self.train_execs = []
         for i in range(len(ctx)):
             data_shapes = {k: tuple([slices[i].stop-slices[i].start] + list(v[1:]))
-                           for k, v in train_data.provide_data}
+                           for k, v in train_data.provide_data + train_data.provide_label}
             train_exec = symbol.simple_bind(ctx[i], 'write', **data_shapes)
             self.train_execs.append(train_exec)
 
diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py
index 4cb807e7232c..d8b9f97618a8 100644
--- a/python/mxnet/metric.py
+++ b/python/mxnet/metric.py
@@ -83,6 +83,17 @@ def update(self, labels, preds):
             self.sum_metric += numpy.sqrt(numpy.mean((label.asnumpy() - pred.asnumpy())**2))
         self.num_inst += 1
 
+class Torch(EvalMetric):
+    """Dummy metric for torch criterions"""
+    def __init__(self):
+        super(Torch, self).__init__('torch')
+
+    def update(self, labels, preds):
+        self.reset()
+        for p in preds:
+            self.sum_metric += p.asnumpy().mean()
+        self.num_inst += 1
+
 class CustomMetric(EvalMetric):
     """Custom evaluation metric that takes a NDArray function.
 

From 34b78072cb738b62b2fb9155d76a4cb190d5447c Mon Sep 17 00:00:00 2001
From: Junyuan Xie <eric.jy.xie@gmail.com>
Date: Sun, 10 Jan 2016 22:33:32 -0800
Subject: [PATCH 25/32] fix random crash caused by torch_module.list_arguments

---
 README.md                       |   1 +
 doc/tutorial/torch_howto.md     |   9 ++-
 example/torch/torch_module.py   |  29 ++++----
 plugin/torch/torch_base.h       |  51 --------------
 plugin/torch/torch_module-inl.h | 119 ++++++++++++++++----------------
 python/mxnet/metric.py          |   7 +-
 6 files changed, 86 insertions(+), 130 deletions(-)

diff --git a/README.md b/README.md
index bb138a27a402..ce44d12becf1 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,7 @@ deep learning system, and interesting insights of DL systems for hackers.
 
 What's New
 ----------
+* [Embedding Torch layers and functions in MXNet](https://mxnet.readthedocs.org/en/latest/tutorial/torch_howto.html)
 * [MXNet.js: Javascript Package for Deep Learning in Browser (without server)
 ](https://github.com/dmlc/mxnet.js/)
 * [Design Note: Design Efficient Deep Learning Data Loading Module](http://mxnet.readthedocs.org/en/latest/developer-guide/note_data_loading.html)
diff --git a/doc/tutorial/torch_howto.md b/doc/tutorial/torch_howto.md
index 0b8288399634..d5ddb471aef3 100644
--- a/doc/tutorial/torch_howto.md
+++ b/doc/tutorial/torch_howto.md
@@ -46,7 +46,14 @@ mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax')
 ```
 Let's break it down. First `data = mx.symbol.Variable('data')` defines a Variable as placeholder for input.
 Then it's fed through Torch's nn modules with `fc1 = mx.symbol.TorchModule(data_0=data, lua_string='nn.Linear(784, 128)', num_data=1, num_params=2, num_outputs=1, name='fc1')`.
-Note that we used `mx.symbol.SoftmaxOutput` instead of Torch module because torch implement loss layers in separate class torch.Critirion and it's not supported yet.
+We can also replace the last line with:
+```Python
+logsoftmax = mx.symbol.TorchModule(data_0=fc3, lua_string='nn.LogSoftMax()', num_data=1, num_params=0, num_outputs=1, name='logsoftmax')
+# Torch's label starts from 1
+label = mx.symbol.Variable('softmax_label') + 1
+mlp = mx.symbol.TorchCriterion(data=logsoftmax, label=label, lua_string='nn.ClassNLLCriterion()', name='softmax')
+```
+to use Torch's criterion as loss functions.
 The input to nn module is named as data_i for i = 0 ... num_data-1. `lua_string` is a single Lua statement that creates the module object.
 For Torch's built-in module this is simply `nn.module_name(arguments)`.
 If you are using custom module, place it in a .lua script file and load it with `require 'module_file.lua'` if your script returns an torch.nn object, or `(require 'module_file.lua')()` if your script returns a torch.nn class.
diff --git a/example/torch/torch_module.py b/example/torch/torch_module.py
index 9ac0e35037c1..02eacc311d73 100644
--- a/example/torch/torch_module.py
+++ b/example/torch/torch_module.py
@@ -6,22 +6,22 @@
 
 # define mlp
 
+use_torch_criterion = False
+
 data = mx.symbol.Variable('data')
 fc1 = mx.symbol.TorchModule(data_0=data, lua_string='nn.Linear(784, 128)', num_data=1, num_params=2, num_outputs=1, name='fc1')
 act1 = mx.symbol.TorchModule(data_0=fc1, lua_string='nn.ReLU(false)', num_data=1, num_params=0, num_outputs=1, name='relu1')
 fc2 = mx.symbol.TorchModule(data_0=act1, lua_string='nn.Linear(128, 64)', num_data=1, num_params=2, num_outputs=1, name='fc2')
 act2 = mx.symbol.TorchModule(data_0=fc2, lua_string='nn.ReLU(false)', num_data=1, num_params=0, num_outputs=1, name='relu2')
 fc3 = mx.symbol.TorchModule(data_0=act2, lua_string='nn.Linear(64, 10)', num_data=1, num_params=2, num_outputs=1, name='fc3')
-mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax')
-# logsoftmax = mx.symbol.TorchModule(data_0=fc3, lua_string='nn.LogSoftMax()', num_data=1, num_params=0, num_outputs=1, name='logsoftmax')
-# label = mx.symbol.Variable('softmax_label') + 1
-# mlp = mx.symbol.TorchCriterion(data=logsoftmax, label=label, lua_string='nn.ClassNLLCriterion()', name='softmax')
-
-# exe = mlp.simple_bind(mx.cpu(0), 'write', {'data': np.float32}, data=(128,784))
-# exe.forward(is_train=True)
-# exe.backward()
 
-# exit(0)
+if use_torch_criterion:
+    logsoftmax = mx.symbol.TorchModule(data_0=fc3, lua_string='nn.LogSoftMax()', num_data=1, num_params=0, num_outputs=1, name='logsoftmax')
+    # Torch's label starts from 1
+    label = mx.symbol.Variable('softmax_label') + 1
+    mlp = mx.symbol.TorchCriterion(data=logsoftmax, label=label, lua_string='nn.ClassNLLCriterion()', name='softmax')
+else:
+    mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax')
 
 # data
 
@@ -31,14 +31,11 @@
 
 logging.basicConfig(level=logging.DEBUG)
 
-mon = mx.monitor.Monitor(1, None, pattern='.*', sort=True)
-
 model = mx.model.FeedForward(
     ctx = mx.cpu(0), symbol = mlp, num_epoch = 20,
     learning_rate = 0.1, momentum = 0.9, wd = 0.00001)
 
-model.fit(X=train, eval_data=val,
-    #eval_metric=mx.metric.Torch())
-    monitor = mon,
-    batch_end_callback = [mx.callback.Speedometer(100, 1), mx.callback.log_train_metric(1)])
-
+if use_torch_criterion:
+    model.fit(X=train, eval_data=val, eval_metric=mx.metric.Torch())
+else:
+    model.fit(X=train, eval_data=val)
diff --git a/plugin/torch/torch_base.h b/plugin/torch/torch_base.h
index e667ff323a92..ecc6d67e6ed6 100644
--- a/plugin/torch/torch_base.h
+++ b/plugin/torch/torch_base.h
@@ -169,57 +169,6 @@ class TorchTensor {
     return tensor;
   }
 
-  static TBlob THTensorToTBlob(THGeneralTensor* handle) {
-    using namespace mshadow;
-    lua_State* L = TorchState::LuaState();
-    TBlob res;
-    lua_getfield(L, -1, "contiguous");
-    lua_pushvalue(L, -2);
-    int err = lua_pcall(L, 1, 1, 0);
-    CHECK_EQ(err, 0);
-    if (luaT_isudata(L, -1, TorchTensor::TensorType(cpu::kDevMask))) {
-      THFloatTensor* tensor = static_cast<THFloatTensor*>(luaT_toudata(L, -1,
-        TorchTensor::TensorType(cpu::kDevMask)));
-      *handle = static_cast<THGeneralTensor>(tensor);
-      THFloatStorage* storage = tensor->storage;
-      TShape shape(tensor->size, tensor->size + tensor->nDimension);
-      res = TBlob(storage->data, shape, cpu::kDevMask);
-#if MXNET_USE_CUDA
-    } else if (luaT_isudata(L, -1, TorchTensor::TensorType(gpu::kDevMask))) {
-      THCudaTensor* tensor = static_cast<THCudaTensor*>(luaT_toudata(L, -1,
-        TorchTensor::TensorType(gpu::kDevMask)));
-      *handle = static_cast<THGeneralTensor>(tensor);
-      THCudaStorage* storage = tensor->storage;
-      TShape shape(tensor->size, tensor->size + tensor->nDimension);
-      res = TBlob(storage->data, shape, gpu::kDevMask);
-#endif
-    } else {
-      LOG(FATAL) << "Unsupported Torch Tensor type " << luaT_typename(L, -1);
-    }
-    lua_pop(L, 2);
-    return res;
-  }
-
-  static void THTensorFree(THGeneralTensor handle, int dev_mask) {
-    switch (dev_mask) {
-      case cpu::kDevMask: {
-        THFloatTensor* original = static_cast<THFloatTensor*>(handle);
-        THFloatTensor_free(original);
-        break;
-      }
-#if MXNET_USE_CUDA
-      case gpu::kDevMask: {
-        THCState* state = TorchState::CudaState();
-        THCudaTensor* original = static_cast<THCudaTensor*>(handle);
-        THCudaTensor_free(state, original);
-        break;
-      }
-#endif
-      default:
-        LOG(FATAL) << "Unknown device type " << dev_mask;
-    }
-  }
-
   static void FreeInternal(THGeneralTensor tensor, int dev_mask) {
     switch (dev_mask) {
       case cpu::kDevMask: {
diff --git a/plugin/torch/torch_module-inl.h b/plugin/torch/torch_module-inl.h
index 074dd3e30362..8e3290dd35c0 100644
--- a/plugin/torch/torch_module-inl.h
+++ b/plugin/torch/torch_module-inl.h
@@ -73,23 +73,23 @@ class TorchModuleOp : public Operator {
       lua_pop(L, 2);
     }
     CHECK_EQ(param_num, param_.num_params);
-    // // Free the parameters allocated by torch so it doesn't take up memory.
-    // if (param_.num_params != 0) {
-    //   // get the parameters into the stack
-    //   lua_getfield(L, -1, "parameters");
-    //   lua_pushvalue(L, -2);
-    //   int err = lua_pcall(L, 1, 1, 0);
-    //   CHECK_EQ(err, 0);
-    //   // iterate the parameters table to put tblobs inside
-    //   lua_pushnil(L);
-    //   while (lua_next(L, -2)) {
-    //     CHECK(luaT_isudata(L, -1, TorchTensor::TensorType(xpu::kDevMask)));
-    //     void* udata = luaT_toudata(L, -1, TorchTensor::TensorType(xpu::kDevMask));
-    //     TorchTensor::FreeInternal(static_cast<THGeneralTensor>(udata), xpu::kDevMask);
-    //     lua_pop(L, 1);
-    //   }
-    //   lua_pop(L, 1);  // pop the parameter table
-    // }
+    // Free the parameters allocated by torch so it doesn't take up memory.
+    if (param_.num_params != 0) {
+      // get the parameters into the stack
+      lua_getfield(L, -1, "parameters");
+      lua_pushvalue(L, -2);
+      int err = lua_pcall(L, 1, 1, 0);
+      CHECK_EQ(err, 0);
+      // iterate the parameters table to free tblobs inside
+      lua_pushnil(L);
+      while (lua_next(L, -2)) {
+        CHECK(luaT_isudata(L, -1, TorchTensor::TensorType(xpu::kDevMask)));
+        void* udata = luaT_toudata(L, -1, TorchTensor::TensorType(xpu::kDevMask));
+        TorchTensor::FreeInternal(static_cast<THGeneralTensor>(udata), xpu::kDevMask);
+        lua_pop(L, 1);
+      }
+      lua_pop(L, 1);  // pop the parameter table
+    }
     // serialize
     TorchState::Serialize(&chunk_);
   }
@@ -223,6 +223,8 @@ Operator* CreateOp(TorchModuleParam type);
 class TorchModuleProp : public OperatorProperty {
  protected:
   mutable THCharStorage* chunk_;
+  mutable std::vector<std::string> arguments_;
+
   void InitChunk_() const {
     lua_State* L = TorchState::LuaState();
     std::string exec = std::string("return ") + param_.lua_string;
@@ -241,49 +243,50 @@ class TorchModuleProp : public OperatorProperty {
 
  public:
   std::vector<std::string> ListArguments() const override {
-    std::vector<std::string> ret;
-    if (!chunk_) {
-      InitChunk_();
-    }
-    std::string data = "data";
-    for (uint32_t i = 0; i < param_.num_data; ++i) {
-      ret.push_back(data + "_" + std::to_string(i));
-    }
-    std::string lua_code =
-        "return function(module)\n"
-        "          local params = module:parameters()\n"
-        "          local dict = {}\n"
-        "          if params == nil then\n"
-        "             return {}\n"
-        "          end\n"
-        "          for id, p in ipairs(params) do\n"
-        "             dict[p] = string.format('param_%d', id)\n"
-        "          end\n"
-        "          for key, value in pairs(module) do\n"
-        "             if dict[value] then\n"
-        "                dict[value] = key\n"
-        "             end\n"
-        "          end\n"
-        "          local ret = {}\n"
-        "          for _, p in ipairs(params) do\n"
-        "             table.insert(ret, dict[p])\n"
-        "          end\n"
-        "          return ret\n"
-        "end\n";
-    lua_State* L = TorchState::LuaState();
-    luaL_loadstring(L, lua_code.c_str());
-    int err = lua_pcall(L, 0, 1, 0);  // return the function
-    CHECK_EQ(err, 0) << lua_tostring(L, -1);
-    TorchState::Deserialize(chunk_);
-    err = lua_pcall(L, 1, 1, 0);  // call the function
-    CHECK_EQ(err, 0) << lua_tostring(L, -1);
-    lua_pushnil(L);
-    while (lua_next(L, -2)) {
-      ret.push_back(lua_tostring(L, -1));
+    if (arguments_.size() == 0) {
+      if (!chunk_) {
+        InitChunk_();
+      }
+      for (uint32_t i = 0; i < param_.num_data; ++i) {
+        std::string data = "data_" + std::to_string(i);
+        arguments_.push_back(data);
+      }
+      std::string lua_code =
+          "return function(module)\n"
+          "          local params = module:parameters()\n"
+          "          local dict = {}\n"
+          "          if params == nil then\n"
+          "             return {}\n"
+          "          end\n"
+          "          for id, p in ipairs(params) do\n"
+          "             dict[p] = string.format('param_%d', id)\n"
+          "          end\n"
+          "          for key, value in pairs(module) do\n"
+          "             if dict[value] then\n"
+          "                dict[value] = key\n"
+          "             end\n"
+          "          end\n"
+          "          local ret = {}\n"
+          "          for _, p in ipairs(params) do\n"
+          "             table.insert(ret, dict[p])\n"
+          "          end\n"
+          "          return ret\n"
+          "end\n";
+      lua_State* L = TorchState::LuaState();
+      luaL_loadstring(L, lua_code.c_str());
+      int err = lua_pcall(L, 0, 1, 0);  // return the function
+      CHECK_EQ(err, 0) << lua_tostring(L, -1);
+      TorchState::Deserialize(chunk_);
+      err = lua_pcall(L, 1, 1, 0);  // call the function
+      CHECK_EQ(err, 0) << lua_tostring(L, -1);
+      lua_pushnil(L);
+      while (lua_next(L, -2)) {
+        arguments_.push_back(lua_tostring(L, -1));
+        lua_pop(L, 1);
+      }
       lua_pop(L, 1);
     }
-    lua_pop(L, 1);
-    return ret;
+    return arguments_;
   }
 
   virtual std::vector<std::string> ListOutputs() const {
diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py
index d8b9f97618a8..e779d8eef22e 100644
--- a/python/mxnet/metric.py
+++ b/python/mxnet/metric.py
@@ -88,10 +88,9 @@ class Torch(EvalMetric):
     def __init__(self):
         super(Torch, self).__init__('torch')
 
-    def update(self, labels, preds):
-        self.reset()
-        for p in preds:
-            self.sum_metric += p.asnumpy().mean()
+    def update(self, _, preds):
+        for pred in preds:
+            self.sum_metric += pred.asnumpy().mean()
         self.num_inst += 1
 
 class CustomMetric(EvalMetric):

From 22bb168d8d16ff694585ba803439323c34b8c76c Mon Sep 17 00:00:00 2001
From: svohara <svohara@gmail.com>
Date: Mon, 11 Jan 2016 12:36:02 -0700
Subject: [PATCH 26/32] Fix bug preventing train_mnist.py from working when
 data-dir is an S3 uri.

---
 example/image-classification/train_mnist.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example/image-classification/train_mnist.py b/example/image-classification/train_mnist.py
index d8751833a1e7..f7fb07777d79 100644
--- a/example/image-classification/train_mnist.py
+++ b/example/image-classification/train_mnist.py
@@ -95,7 +95,7 @@ def get_iterator(args, kv):
     data_dir = args.data_dir
     if '://' not in args.data_dir:
         _download(args.data_dir)
-        flat = False if len(data_shape) == 3 else True
+    flat = False if len(data_shape) == 3 else True
 
     train           = mx.io.MNISTIter(
         image       = data_dir + "train-images-idx3-ubyte",

From 8978b30882f1a729160a90d7c30c370623665049 Mon Sep 17 00:00:00 2001
From: Chuntao Hong <chuntao.hong@gmail.com>
Date: Tue, 12 Jan 2016 11:06:40 +0800
Subject: [PATCH 27/32] remove MXNET_API in ndarray.h

---
 include/mxnet/ndarray.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index fa19f93caf86..811fa2c968a4 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -28,7 +28,7 @@ namespace mxnet {
 /*!
  * \brief ndarray interface
  */
-class MXNET_API NDArray {
+class NDArray {
  public:
   /*! \brief default cosntructor */
   NDArray() {}
@@ -354,7 +354,7 @@ class MXNET_API NDArray {
  * \note The function name explicitly marks the order of from and to
  *     due to different possible convention carried by copy function.
  */
-MXNET_API void CopyFromTo(const NDArray &from, NDArray *to, int priority = 0);
+void CopyFromTo(const NDArray &from, NDArray *to, int priority = 0);
 
 /*!
  * \brief Perform elementwise sum over each data from source, store result into out.
@@ -362,7 +362,7 @@ MXNET_API void CopyFromTo(const NDArray &from, NDArray *to, int priority = 0);
  * \param out the target ndarray
  * \param priority Priority of the action.
  */
-MXNET_API void ElementwiseSum(const std::vector<NDArray> &source, NDArray *out, int priority = 0);
+void ElementwiseSum(const std::vector<NDArray> &source, NDArray *out, int priority = 0);
 
 /*!
  * \brief elementwise add
@@ -370,69 +370,69 @@ MXNET_API void ElementwiseSum(const std::vector<NDArray> &source, NDArray *out,
  * \param rhs right operand
  * \return a new result ndarray
  */
-MXNET_API NDArray operator+(const NDArray &lhs, const NDArray &rhs);
+NDArray operator+(const NDArray &lhs, const NDArray &rhs);
 /*!
  * \brief elementwise add
  * \param lhs left operand
  * \param rhs right operand
  * \return a new result ndarray
  */
-MXNET_API NDArray operator+(const NDArray &lhs, const real_t &rhs);
+NDArray operator+(const NDArray &lhs, const real_t &rhs);
 /*!
  * \brief elementwise substraction
  * \param lhs left operand
  * \param rhs right operand
  * \return a new result ndarray
  */
-MXNET_API NDArray operator-(const NDArray &lhs, const NDArray &rhs);
+NDArray operator-(const NDArray &lhs, const NDArray &rhs);
 /*!
  * \brief elementwise substraction
  * \param lhs left operand
  * \param rhs right operand
  * \return a new result ndarray
  */
-MXNET_API NDArray operator-(const NDArray &lhs, const real_t &rhs);
+NDArray operator-(const NDArray &lhs, const real_t &rhs);
 /*!
  * \brief elementwise multiplication
  * \param lhs left operand
  * \param rhs right operand
  * \return a new result ndarray
  */
-MXNET_API NDArray operator*(const NDArray &lhs, const NDArray &rhs); \
+NDArray operator*(const NDArray &lhs, const NDArray &rhs); \
 /*!
  * \brief elementwise multiplication
  * \param lhs left operand
  * \param rhs right operand
  * \return a new result ndarray
  */
-MXNET_API NDArray operator*(const NDArray &lhs, const real_t &rhs);
+NDArray operator*(const NDArray &lhs, const real_t &rhs);
 /*!
  * \brief elementwise division
  * \param lhs left operand
  * \param rhs right operand
  * \return a new result ndarray
  */
-MXNET_API NDArray operator/(const NDArray &lhs, const NDArray &rhs);
+NDArray operator/(const NDArray &lhs, const NDArray &rhs);
 /*!
  * \brief elementwise division
  * \param lhs left operand
  * \param rhs right operand
  * \return a new result ndarray
  */
-MXNET_API NDArray operator/(const NDArray &lhs, const real_t &rhs);
+NDArray operator/(const NDArray &lhs, const real_t &rhs);
 
 /*!
  * \brief Seed the random number generator.
  * \param seed the seed to set to global random number generators.
  */
-MXNET_API void RandomSeed(uint32_t seed);
+void RandomSeed(uint32_t seed);
 /*!
  * \brief Sample uniform distribution for each elements of out.
  * \param begin lower bound of distribution.
  * \param end upper bound of distribution.
  * \param out output NDArray.
  */
-MXNET_API void SampleUniform(real_t begin, real_t end, NDArray *out);
+void SampleUniform(real_t begin, real_t end, NDArray *out);
 
 /*!
  * \brief Sample gaussian distribution for each elements of out.
@@ -440,7 +440,7 @@ MXNET_API void SampleUniform(real_t begin, real_t end, NDArray *out);
  * \param sigma standard deviation of gaussian distribution.
  * \param out output NDArray.
  */
-MXNET_API void SampleGaussian(real_t mu, real_t sigma, NDArray *out);
+void SampleGaussian(real_t mu, real_t sigma, NDArray *out);
 //--------------------------------------------------------------
 // The following part are API Registration of NDArray functions.
 //--------------------------------------------------------------

From 120dd4b78154a80d64d5e019969f96476176a6f5 Mon Sep 17 00:00:00 2001
From: qiao hai-jun <qiaohaijun@users.noreply.github.com>
Date: Tue, 12 Jan 2016 22:35:46 +0800
Subject: [PATCH 28/32] minor fix network choice

---
 example/image-classification/train_imagenet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example/image-classification/train_imagenet.py b/example/image-classification/train_imagenet.py
index 54bf605e5d28..dfd573a4e17e 100644
--- a/example/image-classification/train_imagenet.py
+++ b/example/image-classification/train_imagenet.py
@@ -7,7 +7,7 @@
 # don't use -n and -s, which are resevered for the distributed training
 parser = argparse.ArgumentParser(description='train an image classifer on imagenet')
 parser.add_argument('--network', type=str, default='inception-bn',
-                    choices = ['alexnet', 'vgg', 'googlenet', 'inception-bn', 'inception-bn-full.py'],
+                    choices = ['alexnet', 'vgg', 'googlenet', 'inception-bn', 'inception-bn-full'],
                     help = 'the cnn to use')
 parser.add_argument('--data-dir', type=str, required=True,
                     help='the input data directory')

From 69c76baab57d2b24b7dd38f6c4f482f563b22352 Mon Sep 17 00:00:00 2001
From: qiao hai-jun <qiaohaijun@users.noreply.github.com>
Date: Tue, 12 Jan 2016 22:49:46 +0800
Subject: [PATCH 29/32] minor fix function name for AttributeError

Fix the following error:
File "train_imagenet.py", line 56, in <module>
    net = importlib.import_module('symbol_' + args.network).get_symbol(args.num_classes)
AttributeError: 'module' object has no attribute 'get_symbol'
---
 example/image-classification/symbol_inception-bn-full.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example/image-classification/symbol_inception-bn-full.py b/example/image-classification/symbol_inception-bn-full.py
index b984f3fa4fdb..27f6bebd9815 100644
--- a/example/image-classification/symbol_inception-bn-full.py
+++ b/example/image-classification/symbol_inception-bn-full.py
@@ -42,7 +42,7 @@ def InceptionFactoryB(data, num_3x3red, num_3x3, num_d3x3red, num_d3x3, name):
     concat = mx.symbol.Concat(*[c3x3, cd3x3, pooling], name='ch_concat_%s_chconcat' % name)
     return concat
 
-def inception(num_classes = 21841):
+def get_symbol(num_classes = 21841):
     # data
     data = mx.symbol.Variable(name="data")
     # stage 1

From 42dd1e478304b90385d9c25e977965e09303026b Mon Sep 17 00:00:00 2001
From: Jacob Schreiber <jmschr@cs.washington.edu>
Date: Mon, 4 Jan 2016 17:24:59 -0800
Subject: [PATCH 30/32] ENH python metrics added, fixes

fix
---
 .gitignore                                    |   6 +-
 Makefile                                      |   3 +-
 cmake/Utils.cmake                             |   6 +-
 doc/build.md                                  |  27 ++-
 example/cpp/Makefile                          |   9 +-
 example/kaggle-ndsb1/README.md                |  34 ++++
 example/kaggle-ndsb1/gen_img_list.py          |  43 +++++
 example/kaggle-ndsb1/run_local.py             |  96 +++++++++++
 example/kaggle-ndsb2/Preprocessing.py         |  44 +++--
 python/mxnet/metric.py                        | 162 +++++++++++++++---
 .../elementwise_binary_scalar_op-inl.h        |   2 +-
 tools/accnn/acc_conv.py                       |   4 +-
 tools/accnn/acc_fc.py                         |   2 +-
 tools/accnn/accnn.py                          |   3 +-
 tools/accnn/rank_selection.py                 |   9 +-
 tools/accnn/utils.py                          |  55 ++----
 16 files changed, 409 insertions(+), 96 deletions(-)
 create mode 100644 example/kaggle-ndsb1/README.md
 create mode 100644 example/kaggle-ndsb1/gen_img_list.py
 create mode 100644 example/kaggle-ndsb1/run_local.py

diff --git a/.gitignore b/.gitignore
index c794bbaeef7e..89c5c7af2b56 100644
--- a/.gitignore
+++ b/.gitignore
@@ -82,12 +82,12 @@ R-package/inst/*
 *.bin
 
 # ipython notebook
-example/notebooks/.ipynb_checkpoints/*
 *_pb2.py
+*.ipynb_checkpoints*
+input.txt*
 
 # Jetbrain
 .idea
 
 # ctags
-tags
-
+tags
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 08356bb02f79..d09e92a9b685 100644
--- a/Makefile
+++ b/Makefile
@@ -145,6 +145,8 @@ $(EXTRA_OPERATORS)/build/%_gpu.o: $(EXTRA_OPERATORS)/%.cu
 	$(NVCC) $(NVCCFLAGS) -Xcompiler "$(CFLAGS) -Isrc/operator" -M -MT $(EXTRA_OPERATORS)/build/$*_gpu.o $< >$(EXTRA_OPERATORS)/build/$*_gpu.d
 	$(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS) -Isrc/operator" $<
 
+# NOTE: to statically link libmxnet.a we need the option
+# -Wl,--whole-archive -lmxnet -Wl,--no-whole-archive
 lib/libmxnet.a: $(ALL_DEP)
 	@mkdir -p $(@D)
 	ar crv $@ $(filter %.o, $?)
@@ -153,7 +155,6 @@ lib/libmxnet.so: $(ALL_DEP)
 	@mkdir -p $(@D)
 	$(CXX) $(CFLAGS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
 
-# ps-lite
 $(PS_PATH)/build/libps.a:
 	$(MAKE) CXX=$(CXX) DEPS_PATH=$(DEPS_PATH) -C $(PS_PATH) ps
 	ln -fs $(PS_PATH)/tracker .
diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
index 342689c8256c..0308645df6b8 100644
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -1,3 +1,6 @@
+# For cmake_parse_arguments
+include(CMakeParseArguments)
+
 ################################################################################################
 # Command alias for debugging messages
 # Usage:
@@ -395,4 +398,5 @@ function(mxnet_source_group group)
     file(GLOB_RECURSE srcs2 ${CAFFE_SOURCE_GROUP_GLOB_RECURSE})
     source_group(${group} FILES ${srcs2})
   endif()
-endfunction()
\ No newline at end of file
+endfunction()
+
diff --git a/doc/build.md b/doc/build.md
index b3d8ff2559dd..1d144949cb8d 100644
--- a/doc/build.md
+++ b/doc/build.md
@@ -35,7 +35,7 @@ Our goal is to build the shared library:
 The minimal building requirement is
 
 - A recent c++ compiler supporting C++ 11 such as `g++ >= 4.8` or `clang`
-- A BLAS library, such as `libblas`, `libblas`, `openblas` `intel mkl`
+- A BLAS library, such as `libblas`, `atlas`, `openblas` or `intel mkl`
 
 Optional libraries
 
@@ -239,6 +239,31 @@ Now you should have the R package as a tar.gz file and you can install it as a n
 R CMD INSTALL mxnet_0.5.tar.gz
 ```
 
+
+To install the package with GPU support on Windows without building it from scratch, follow the steps below. Note that you need a couple of programs installed already:  
+- You'll need the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit). This depends on Visual Studio, and a free compatible version would be [Visual Studio Community 2013](https://www.visualstudio.com/en-us/news/vs2013-community-vs.aspx). For instructions and compatibility checks, read http://docs.nvidia.com/cuda/cuda-getting-started-guide-for-microsoft-windows/ .
+
+- You will also need to register as a developer at nvidia and download CUDNN V3, https://developer.nvidia.com/cudnn . 
+
+
+1. Download the mxnet package as a ZIP from the Github repository https://github.com/dmlc/mxnet and unpack it. You will be editing the `/mxnet/R-package` folder.
+
+2. Download the most recent GPU-enabled package from the [Releases tab](https://github.com/dmlc/mxnet/releases). Unzip this file so you have a folder `/nocudnn`. Note that this file and the folder you'll save it in will be used for future reference and not directly for installing the package. Only some files will be copied from it into the `R-package` folder.
+
+(Note: you now have 2 folders we're working with, possibly in different locations, that we'll reference with `R-package/` and `nocudnn/`.)
+
+3. Download CUDNN V3 from https://developer.nvidia.com/cudnn. Unpack the .zip file and you'll see 3 folders, `/bin`, `/include`, `/lib`. Copy and replace these 3 folders into `nocudnn/3rdparty/cudnn/`, or unpack the .zip file there directly.
+
+4. Create the folder `R-package/inst/libs/x64`. We only support 64-bit operating systems for now, so you need the x64 folder.
+
+5. Put dll files in `R-package/inst/libs/x64`. 
+
+The first dll file you need is `nocudnn/lib/libmxnet.dll`. The other dll files you need are the ones in all 4 subfolders of `nocudnn/3rdparty/`, for the `cudnn` and `openblas` you'll need to look in the `/bin` folders. There should be 11 dll files now in `R-package/inst/libs/x64`.
+
+6. Copy the folder `nocudnn/include/` to `R-package/inst/`. So now you should have a folder `R-package/inst/include/` with 3 subfolders.
+
+7. Run `R CMD INSTALL --no-multiarch R-package`. Make sure that R is added to your PATH in Environment Variables. Running the command `Where R` in Command Prompt should return the location.
+
 Note on Library Build:
 
 We isolate the library build with Rcpp end to maximize the portability
diff --git a/example/cpp/Makefile b/example/cpp/Makefile
index f8a85278a2c0..dc61757126d1 100644
--- a/example/cpp/Makefile
+++ b/example/cpp/Makefile
@@ -1,11 +1,16 @@
 CFLAGS=-I ../../include -Wall -O3 -msse3 -funroll-loops -Wno-unused-parameter -Wno-unknown-pragmas -fopenmp -I ../../mshadow -I ../../dmlc-core/include
 LDFLAGS=-L ../../lib -lmxnet -lopenblas -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0 -DMSHADOW_USE_CUDA=1
 
+CXX=g++
+
 mlp: ./mlp.cpp
-	g++ -std=c++0x $(CFLAGS) $(LDFLAGS) -o $@ $^
+	$(CXX) -std=c++0x $(CFLAGS) -o $@ $^ $(LDFLAGS)
 
 use_ndarray: ./use_ndarray.cpp
-	g++ -std=c++0x $(CFLAGS) $(LDFLAGS) -o $@ $^
+	$(CXX) -std=c++0x $(CFLAGS) -o $@ $^ $(LDFLAGS)
 
 lint:
 	python2 ../../dmlc-core/scripts/lint.py mxnet "cpp" ./
+
+clean:
+	rm -f mlp use_ndarray
diff --git a/example/kaggle-ndsb1/README.md b/example/kaggle-ndsb1/README.md
new file mode 100644
index 000000000000..057c69c5d368
--- /dev/null
+++ b/example/kaggle-ndsb1/README.md
@@ -0,0 +1,34 @@
+Tutorial for Kaggle NDSB-1
+-----
+
+This is an MXNet example for the Kaggle National Data Science Bowl 1.
+
+In this example we skip the submission part and only show the local validation result.
+
+#### Step 1: Generate image list
+- Prepare original data, in layout like
+```
+--gen_img_list.py
+--data/
+    |
+    |--train/
+    |   |
+    |   |--acantharia_protist/...
+    |   |--.../
+    |--sampleSubmission.csv
+```
+- Run command ``` python gen_img_list.py train data/sampleSubmission.csv data/train/ train.lst``` to generate a full image list
+- Run command ```sed -n '1, 20000p' train.lst > tr.lst``` to generate local train list
+- Run command ```sed -n '20001,30337p' train.lst > va.lst``` to generate local validation list
+
+
+#### Step 2: Generate Image Record (new shape with short edge = 48)
+- Run command ```../../bin/im2rec tr.lst ./ tr.rec resize=48``` to generate training data record file
+- Run command ```../../bin/im2rec va.lst ./ va.rec resize=48``` to generate validation data record file
+
+#### Step 3: Train Model
+- Feel free to change the hyperparameters in ```run_local.py```
+- Run ```python run_local.py``` to train the model
+- Sample code result: Train-accuracy=60.1%,  Validation-accuracy=62.1%
+
+
diff --git a/example/kaggle-ndsb1/gen_img_list.py b/example/kaggle-ndsb1/gen_img_list.py
new file mode 100644
index 000000000000..c88fb3c562e6
--- /dev/null
+++ b/example/kaggle-ndsb1/gen_img_list.py
@@ -0,0 +1,43 @@
+import csv
+import os
+import sys
+import random
+
+if len(sys.argv) < 5:  # four arguments are required: sys.argv[4] is read below
+    print "Usage: gen_img_list.py train/test sample_submission.csv train_folder img.lst"
+    sys.exit(1)
+
+random.seed(888)  # fixed seed for the shuffle below
+
+task = sys.argv[1]
+fc = csv.reader(open(sys.argv[2]))
+fi = sys.argv[3]  # joined to names by plain concatenation, so it needs a trailing '/'
+fo = csv.writer(open(sys.argv[4], "w"), delimiter='\t', lineterminator='\n')
+
+# make class map: header row of the csv without its first column
+head = fc.next()
+head = head[1:]
+
+# make image list of (index, label, path) tuples
+img_lst = []
+cnt = 0
+if task == "train":
+    for i in xrange(len(head)):
+        path = fi + head[i]
+        lst = os.listdir(fi + head[i])
+        for img in lst:
+            img_lst.append((cnt, i, path + '/' + img))
+            cnt += 1
+else:
+    lst = os.listdir(fi)
+    for img in lst:
+        img_lst.append((cnt, 0, fi + img))
+        cnt += 1
+
+# shuffle
+random.shuffle(img_lst)
+
+# write one tab-separated row per image
+for item in img_lst:
+    fo.writerow(item)
+
diff --git a/example/kaggle-ndsb1/run_local.py b/example/kaggle-ndsb1/run_local.py
new file mode 100644
index 000000000000..172035ca443b
--- /dev/null
+++ b/example/kaggle-ndsb1/run_local.py
@@ -0,0 +1,96 @@
+import mxnet as mx
+import numpy as np
+import logging
+
+# Example performance:
+# INFO:root:Epoch[34] Train-accuracy=0.601388
+# INFO:root:Epoch[34] Validation-accuracy=0.620949
+
+logger = logging.getLogger()
+logger.setLevel(logging.DEBUG)
+
+# running device
+dev = mx.gpu()
+# batch size and input shape
+batch_size = 64
+data_shape = (3, 36, 36)
+# training data info for learning rate reduction
+num_examples = 20000
+epoch_size = num_examples / batch_size
+lr_factor_epoch = 15
+# model saving parameter
+model_prefix = "./models/sample_net"
+
+# train data iterator
+train = mx.io.ImageRecordIter(
+        path_imgrec = "tr.rec",
+        mean_r      = 128,
+        mean_g      = 128,
+        mean_b      = 128,
+        scale       = 0.0078125,
+        max_aspect_ratio = 0.35,
+        data_shape  = data_shape,
+        batch_size  = batch_size,
+        rand_crop   = True,
+        rand_mirror = True)
+
+# validate data iterator
+val = mx.io.ImageRecordIter(
+        path_imgrec = "va.rec",
+        mean_r      = 128,
+        mean_b      = 128,
+        mean_g      = 128,
+        scale       = 0.0078125,
+        rand_crop   = False,
+        rand_mirror = False,
+        data_shape  = data_shape,
+        batch_size  = batch_size)
+
+# network definition
+# stage 1
+net = mx.sym.Variable("data")
+net = mx.sym.Convolution(data=net, kernel=(5, 5), num_filter=32, pad=(2, 2))
+net = mx.sym.Activation(data=net, act_type="relu")
+net = mx.sym.Convolution(data=net, kernel=(5, 5), num_filter=64, pad=(2, 2))
+net = mx.sym.Activation(data=net, act_type="relu")
+net = mx.sym.Pooling(data=net, pool_type="max", kernel=(3, 3), stride=(2, 2))
+# stage 2
+net = mx.sym.Convolution(data=net, kernel=(3, 3), num_filter=64, pad=(1, 1))
+net = mx.sym.Activation(data=net, act_type="relu")
+net = mx.sym.Convolution(data=net, kernel=(3, 3), num_filter=64, pad=(1, 1))
+net = mx.sym.Activation(data=net, act_type="relu")
+net = mx.sym.Convolution(data=net, kernel=(3, 3), num_filter=128, pad=(1, 1))
+net = mx.sym.Activation(data=net, act_type="relu")
+net = mx.sym.Pooling(data=net, pool_type="max", kernel=(3, 3), stride=(2, 2))
+# stage 3
+net = mx.sym.Convolution(data=net, kernel=(3, 3), num_filter=256, pad=(1, 1))
+net = mx.sym.Activation(data=net, act_type="relu")
+net = mx.sym.Convolution(data=net, kernel=(3, 3), num_filter=256, pad=(1, 1))
+net = mx.sym.Activation(data=net, act_type="relu")
+net = mx.sym.Pooling(data=net, pool_type="avg", kernel=(9, 9), stride=(1, 1))
+# stage 4
+net = mx.sym.Flatten(data=net)
+net = mx.sym.Dropout(data=net, p=0.25)
+net = mx.sym.FullyConnected(data=net, num_hidden=121)
+net = mx.symbol.SoftmaxOutput(data=net, name='softmax')
+
+# Model parameter
+# This model will reduce the learning rate by a factor of 0.1 every 15 epochs
+model = mx.model.FeedForward(
+        ctx                = dev,
+        symbol             = net,
+        num_epoch          = 35,
+        learning_rate      = 0.01,
+        momentum           = 0.9,
+        wd                 = 0.0001,
+        clip_gradient      = 5,
+        lr_scheduler       = mx.lr_scheduler.FactorScheduler(step=epoch_size * lr_factor_epoch, factor = 0.1),
+        initializer        = mx.init.Xavier(factor_type="in", magnitude=2.34))
+
+# fit the model
+model.fit(
+        X                  = train,
+        eval_data          = val,
+        batch_end_callback = mx.callback.Speedometer(batch_size, 50),
+        epoch_end_callback = mx.callback.do_checkpoint(model_prefix))
+
diff --git a/example/kaggle-ndsb2/Preprocessing.py b/example/kaggle-ndsb2/Preprocessing.py
index fb55b4634066..ee32a7775a73 100644
--- a/example/kaggle-ndsb2/Preprocessing.py
+++ b/example/kaggle-ndsb2/Preprocessing.py
@@ -10,6 +10,8 @@
 import numpy as np
 import dicom
 from skimage import io, transform
+from joblib import Parallel, delayed
+import dill
 
 def mkdir(fname):
    try:
@@ -53,29 +55,33 @@ def write_label_csv(fname, frames, label_map):
    fo.close()
 
 
+def get_data(lst,preproc):
+   data = []
+   result = []
+   for path in lst:
+       f = dicom.read_file(path)
+       img = preproc(f.pixel_array.astype(float) / np.max(f.pixel_array))
+       dst_path = path.rsplit(".", 1)[0] + ".64x64.jpg"
+       scipy.misc.imsave(dst_path, img)
+       result.append(dst_path)
+       data.append(img)
+   data = np.array(data, dtype=np.uint8)
+   data = data.reshape(data.size)
+   data = np.array(data,dtype=np.str_)
+   data = data.reshape(data.size)
+   return [data,result]
+
+
 def write_data_csv(fname, frames, preproc):
    """Write data to csv file"""
    fdata = open(fname, "w")
-   dwriter = csv.writer(fdata)
-   counter = 0
-   result = []
-   for lst in frames:
-       data = []
-       for path in lst:
-           f = dicom.read_file(path)
-           img = preproc(f.pixel_array.astype(float) / np.max(f.pixel_array))
-           dst_path = path.rsplit(".", 1)[0] + ".64x64.jpg"
-           scipy.misc.imsave(dst_path, img)
-           result.append(dst_path)
-           data.append(img)
-       data = np.array(data, dtype=np.uint8)
-       data = data.reshape(data.size)
-       dwriter.writerow(data)
-       counter += 1
-       if counter % 100 == 0:
-           print("%d slices processed" % counter)
-   print("All finished, %d slices in total" % counter)
+   dr = Parallel()(delayed(get_data)(lst,preproc) for lst in frames)
+   data,result = zip(*dr)
+   for entry in data:
+      fdata.write(','.join(entry)+'\r\n')
+   print("All finished, %d slices in total" % len(data))
    fdata.close()
+   result = np.ravel(result)
    return result
 
 
diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py
index 4cb807e7232c..0c86f115a33d 100644
--- a/python/mxnet/metric.py
+++ b/python/mxnet/metric.py
@@ -1,12 +1,23 @@
 # coding: utf-8
 """Online evaluation metric module."""
 from __future__ import absolute_import
-
-from .base import string_types
 import numpy
 
+def check_label_shapes(labels, preds, shape=0):
+    """Check to see if the two arrays are the same size."""
+
+    if shape == 0:
+        label_shape, pred_shape = len(labels), len(preds)
+    else:
+        label_shape, pred_shape = labels.shape, preds.shape
+
+    if label_shape != pred_shape:
+        raise ValueError("Shape of labels {} does not match shape of "
+                         "predictions {}".format(label_shape, pred_shape))
+
 class EvalMetric(object):
     """Base class of all evaluation metrics."""
+
     def __init__(self, name):
         self.name = name
         self.reset()
@@ -41,34 +52,118 @@ def get(self):
         """
         return (self.name, self.sum_metric / self.num_inst)
 
+########################
+# CLASSIFICATION METRICS
+########################
 
 class Accuracy(EvalMetric):
     """Calculate accuracy"""
+
     def __init__(self):
         super(Accuracy, self).__init__('accuracy')
 
     def update(self, labels, preds):
-        assert len(labels) == len(preds)
+        check_label_shapes(labels, preds)
+
         for i in range(len(labels)):
             pred = preds[i].asnumpy()
             label = labels[i].asnumpy().astype('int32')
             pred_label = numpy.argmax(pred, axis=1)
-            if label.shape[0] < pred_label.shape[0]:
-                raise Exception("Predict label is more than data label? ")
-            self.sum_metric += numpy.sum(pred_label == label[:pred_label.shape[0]])
-            num_inst = pred_label.size
-        self.num_inst += num_inst
+
+            check_label_shapes(label, pred)
+
+            self.sum_metric += (pred_label == label).sum()
+            self.num_inst += pred_label.shape[0]
+
+class F1(EvalMetric):
+    """Calculate the F1 score of a binary classification problem."""
+
+    def __init__(self):
+        super(F1, self).__init__('f1')
+
+    def update(self, labels, preds):
+        check_label_shapes(labels, preds)
+
+        for i in range(len(labels)):
+            pred = preds[i].asnumpy()
+            label = labels[i].asnumpy().astype('int32')
+            pred_label = numpy.argmax(pred, axis=1)
+
+            check_label_shapes(label, pred)
+            if len(numpy.unique(label)) > 2:
+                raise ValueError("F1 currently only supports binary classification.")
+
+            true_positives, false_positives, false_negatives = 0., 0., 0.
+
+            for y_pred, y_true in zip(pred_label, label):
+                if y_pred == 1 and y_true == 1:
+                    true_positives += 1.
+                elif y_pred == 1 and y_true == 0:
+                    false_positives += 1.
+                elif y_pred == 0 and y_true == 1:
+                    false_negatives += 1.
+
+            if true_positives + false_positives > 0:
+                precision = true_positives / (true_positives + false_positives)
+            else:
+                precision = 0.
+
+            if true_positives + false_negatives > 0:
+                recall = true_positives / (true_positives + false_negatives)
+            else:
+                recall = 0.
+
+            if precision + recall > 0:
+                f1_score = 2 * precision * recall / (precision + recall)
+            else:
+                f1_score = 0.
+
+            self.sum_metric += f1_score
+            self.num_inst += 1
+
+####################
+# REGRESSION METRICS
+####################
 
 class MAE(EvalMetric):
     """Calculate Mean Absolute Error loss"""
+
     def __init__(self):
         super(MAE, self).__init__('mae')
 
     def update(self, labels, preds):
-        assert len(labels) == len(preds)
+        check_label_shapes(labels, preds)
+
+        for label, pred in zip(labels, preds):
+            label = label.asnumpy()
+            pred = pred.asnumpy()
+
+            if len(label.shape) == 1:
+                label = label.reshape(label.shape[0], 1)
+
+            check_label_shapes(label, pred, shape=1)
+
+            self.sum_metric += numpy.abs(label - pred).sum()
+            self.num_inst += numpy.prod(label.shape)
+
+class MSE(EvalMetric):
+    """Calculate Mean Squared Error loss, averaged over all label elements seen."""
+    def __init__(self):
+        super(MSE, self).__init__('mse')
+
+    def update(self, labels, preds):
+        check_label_shapes(labels, preds)
+
         for label, pred in zip(labels, preds):
-            assert label.shape == pred.shape
-            self.sum_metric += numpy.sum(numpy.abs(label.asnumpy() - pred.asnumpy()))
+            label = label.asnumpy()
+            pred = pred.asnumpy()
+
+            if len(label.shape) == 1:
+                label = label.reshape(label.shape[0], 1)  # 1-D labels become a column
+
+            check_label_shapes(label, pred, shape=1)
+
+            self.sum_metric += ((label - pred)**2.0).sum()  # sum, since num_inst counts elements
             self.num_inst += numpy.prod(label.shape)
 
 class RMSE(EvalMetric):
@@ -77,10 +172,18 @@ def __init__(self):
         super(RMSE, self).__init__('rmse')
 
     def update(self, labels, preds):
-        assert len(labels) == len(preds)
+        check_label_shapes(labels, preds)
+
         for label, pred in zip(labels, preds):
-            assert label.shape == pred.shape
-            self.sum_metric += numpy.sqrt(numpy.mean((label.asnumpy() - pred.asnumpy())**2))
+            label = label.asnumpy()
+            pred = pred.asnumpy()
+
+            if len(label.shape) == 1:
+                label = label.reshape(label.shape[0], 1)
+
+            check_label_shapes(label, pred, shape=1)
+
+            self.sum_metric += numpy.sqrt(((label - pred)**2.0).mean())
         self.num_inst += 1
 
 class CustomMetric(EvalMetric):
@@ -103,8 +206,14 @@ def __init__(self, feval, name=None):
         self._feval = feval
 
     def update(self, labels, preds):
-        assert len(labels) == len(preds)
+        check_label_shapes(labels, preds)
         for pred, label in zip(preds, labels):
+            label = label.asnumpy()
+            pred = pred.asnumpy()
+
+            if pred.shape[1] == 2:
+                pred = pred[:, 1]
+
             self.sum_metric += self._feval(label, pred)
             self.num_inst += 1
 
@@ -122,7 +231,7 @@ def np(numpy_feval, name=None):
     """
     def feval(label, pred):
         """Internal eval function."""
-        return numpy_feval(label.asnumpy(), pred.asnumpy())
+        return numpy_feval(label, pred)
     feval.__name__ = numpy_feval.__name__
     return CustomMetric(feval, name)
 # pylint: enable=invalid-name
@@ -136,11 +245,20 @@ def create(metric):
         The name of the metric, or a function
         providing statistics given pred, label NDArray.
     """
+
+    metrics = {
+        'accuracy' : Accuracy(),
+        'f1' : F1(),
+        'acc' : Accuracy(),
+        'rmse' : RMSE(),
+        'mae' : MAE(),
+        'mse' : MSE()
+    }
+
     if callable(metric):
         return CustomMetric(metric)
-    if not isinstance(metric, string_types):
-        raise TypeError('metric should either be callable or str')
-    if metric == 'acc' or metric == 'accuracy':
-        return Accuracy()
-    else:
-        raise ValueError('Cannot find metric %s' % metric)
+    try:
+        return metrics[metric.lower()]
+    except:
+        raise ValueError("Metric must be either callable or in {}".format(
+            metrics.keys()))
diff --git a/src/operator/elementwise_binary_scalar_op-inl.h b/src/operator/elementwise_binary_scalar_op-inl.h
index 3a35cfba2232..dc1d60b485cd 100644
--- a/src/operator/elementwise_binary_scalar_op-inl.h
+++ b/src/operator/elementwise_binary_scalar_op-inl.h
@@ -252,7 +252,7 @@ class ElementwiseBinaryScalarOpProp : public OperatorProperty {
     param_.Init(kwargs);
   }
   std::map<std::string, std::string> GetParams() const override {
-    return std::map<std::string, std::string>();
+    return param_.__DICT__();
   }
 
   bool InferShape(std::vector<TShape> *in_shape,
diff --git a/tools/accnn/acc_conv.py b/tools/accnn/acc_conv.py
index 8f468def14fc..095e386beebc 100644
--- a/tools/accnn/acc_conv.py
+++ b/tools/accnn/acc_conv.py
@@ -67,11 +67,11 @@ def main():
 if __name__ == '__main__':
   parser=argparse.ArgumentParser()
   parser.add_argument('-m', '--model', help='the model to speed up')
-  parser.add_argument('-g', '--gpus', default='0,1,2,3', help='the gpus to be used in ctx')
+  parser.add_argument('-g', '--gpus', default='0', help='the gpus to be used in ctx')
   parser.add_argument('--load-epoch',type=int,default=1)
   parser.add_argument('--layer')
   parser.add_argument('--K', type=int)
   parser.add_argument('--save-model')
   args = parser.parse_args()
   main()
-  
\ No newline at end of file
+  
diff --git a/tools/accnn/acc_fc.py b/tools/accnn/acc_fc.py
index a7b7da163990..dcc255452b1d 100644
--- a/tools/accnn/acc_fc.py
+++ b/tools/accnn/acc_fc.py
@@ -48,7 +48,7 @@ def main():
 if __name__ == '__main__':
   parser=argparse.ArgumentParser()
   parser.add_argument('-m', '--model', help='the model to speed up')
-  parser.add_argument('-g', '--gpus', default='0,1,2,3', help='the gpus to be used in ctx')
+  parser.add_argument('-g', '--gpus', default='0', help='the gpus to be used in ctx')
   parser.add_argument('--load-epoch',type=int,default=1)
   parser.add_argument('--layer')
   parser.add_argument('--K', type=int)
diff --git a/tools/accnn/accnn.py b/tools/accnn/accnn.py
index a5e3c8fdd5bf..22aad24a83f0 100644
--- a/tools/accnn/accnn.py
+++ b/tools/accnn/accnn.py
@@ -12,7 +12,7 @@
 parser.add_argument('-m', '--model',  help='the model to speed up')
 parser.add_argument('-g', '--gpus', default='0', help='the gpus will be used, e.g "0,1,2,3"')
 parser.add_argument('--load-epoch',type=int, default=1, help="load the model on an epoch using the model-prefix")
-parser.add_argument('--save-model', help='output model prefix')
+parser.add_argument('--save-model', type=str, default='new-model', help='output model prefix')
 parser.add_argument('--config', default=None, help='specify the config file')
 parser.add_argument('--ratio', type=float, default=2, help='speed up ratio')
 args = parser.parse_args()
@@ -25,6 +25,7 @@
   config['conv_params'] = rank_selection.get_ranksel(model, args.ratio)
   config['fc_params'] = {}
   json.dump(config, open('config-rksel-%.1f.json'%(args.ratio), 'w'), indent=2)
+  args.config = config
 
 new_model = model
 Args = collections.namedtuple('ConvArgs', 'layer K')
diff --git a/tools/accnn/rank_selection.py b/tools/accnn/rank_selection.py
index 57e3bcc8acd1..ee0fd98acb9d 100644
--- a/tools/accnn/rank_selection.py
+++ b/tools/accnn/rank_selection.py
@@ -33,12 +33,13 @@ def get_ranksel(model, ratio):
   for node in nodes:
     if node['op'] == 'Convolution':        
       input_nodes = [nodes[int(j[0])] for j in node['inputs']]
-      data = [input_node['name'] for input_node in input_nodes\
+      data = [input_node for input_node in input_nodes\
                                   if not input_node['name'].startswith(node['name'])][0]      
-      if utils.is_input(node):
+
+      if utils.is_input(data):
         ishape = (3, 224, 224)
       else:
-        ishape = out_shape_dic[data + '_output'][1:]
+        ishape = out_shape_dic[data['name'] + '_output'][1:]
       C.append(calc_complexity(ishape, node))
       D.append(int(node['param']['num_filter']))
       S.append(calc_eigenvalue(model, node))
@@ -81,6 +82,6 @@ def get_ranksel(model, ratio):
   res = [0]*n
   nowc = target_c
   for i in xrange(n-1,-1,-1):    
-    res[i] = dpc[i][nowc][0]
+    res[i] = dpc[i][nowc][0] + 1
     nowc = dpc[i][nowc][1]
   return dict(zip(conv_names, res))
diff --git a/tools/accnn/utils.py b/tools/accnn/utils.py
index a57a384b1fab..6ac13dab4a05 100644
--- a/tools/accnn/utils.py
+++ b/tools/accnn/utils.py
@@ -1,6 +1,7 @@
 import mxnet as mx
 import copy
 import json
+import ast
 
 def load_model(args):
   devs = mx.cpu() if args.gpus == None else [mx.gpu(int(i)) for i in args.gpus.split(',')]  
@@ -14,10 +15,7 @@ def topsort(nodes):
     if node.has_key('inputs'):
       for j in node['inputs']:
         deg[i] += 1
-        g[j[0]].append(i)
-        if node['name'] == '':
-          print node
-          print '!!!',j[0]
+        g[j[0]].append(i)        
   from collections import deque
   q = deque([i for i in xrange(n) if deg[i]==0])
   res = []  
@@ -38,7 +36,18 @@ def topsort(nodes):
 def is_input(node):
   name = node['name']
   return len(node['inputs']) == 0 and ('weight' not in name) and ('bias' not in name) and ('label' not in name)
-  
+
+def sym_factory(node, data):  # build the mx.symbol operator named by node['op'] on top of data
+  name = node['name']
+  params = {}
+  if 'param' in node:
+    for k, v in node['param'].iteritems():
+      try:
+        params[k] = ast.literal_eval(v)  # e.g. "(3, 3)" -> tuple, "64" -> int, "True" -> bool
+      except (ValueError, SyntaxError):  # not a Python literal (e.g. "relu"): keep the string
+        params[k] = v
+  return getattr(mx.symbol, node['op'])(data=data, name=name, **params)
+
 def replace_conv_layer(layer_name, old_model, sym_handle, arg_handle):
   conf = json.loads(old_model.symbol.tojson())
   sym_dict = {}
@@ -57,45 +66,15 @@ def replace_conv_layer(layer_name, old_model, sym_handle, arg_handle):
       try:
         data=sym_dict[datas[0]]
       except Exception, e:
-        print 'can not find symbol %s'%(datas[0])      
+        print 'can not find symbol %s'%(datas[0])
         raise e    
       if node['name'] == layer_name:
         sym = sym_handle(data, node)          
       else:
-        if node['op'] == 'Convolution':           
-          kernel = eval(node['param']['kernel'])
-          pad = eval(node['param']['pad'])
-          num_filter = int(node['param']['num_filter'])
-          name = node['name']
-          sym = mx.symbol.Convolution(data=data, kernel=kernel, pad=pad, num_filter=num_filter, name=name)        
-        elif node['op'] == 'Activation':
-          sym = mx.symbol.Activation(data=data, act_type=node['param']['act_type'], name=node['name'])
-        elif node['op'] == 'Pooling':
-          kernel = eval(node['param']['kernel'])
-          pad = eval(node['param']['pad'])
-          pool_type = node['param']['pool_type']
-          stride = eval(node['param']['stride'])
-          sym = mx.symbol.Pooling(data=data, kernel=kernel, pad=pad, pool_type=pool_type, stride=stride, name=node['name'])
-        elif node['op'] == 'Dropout':
-          p = float(node['param']['p'])
-          sym = mx.symbol.Dropout(data=data, p=p, name=node['name'])
-        elif node['op'] == 'FullyConnected':
-          no_bias = True if node['param']['no_bias']=='True' else False
-          num_hidden = int(node['param']['num_hidden'])
-          sym = mx.symbol.FullyConnected(data=data, num_hidden=num_hidden, no_bias=no_bias, name=node['name'])
-        elif node['op'] == 'Flatten':        
-          sym = mx.symbol.Flatten(data=data, name=node['name'])
-        elif node['op'] == 'SoftmaxOutput':        
-          sym = mx.symbol.SoftmaxOutput(data=data, name='softmax')
-          res_sym = sym      
-        elif node['op'] == 'Reshape':
-          target_shape = eval(node['param']['target_shape'])
-          sym = mx.symbol.Reshape(data=data, target_shape=target_shape)
-          res_sym = sym
-        else:
-          raise Exception("Invalid symbol")
+        sym = sym_factory(node, data)        
     if sym:
       sym_dict[node['name']] = sym
+      res_sym = sym
 
   arg_params = copy.deepcopy(old_model.arg_params)
   if layer_name:  

From 1d5a5cc33a475f0e032e155f18fd3a3d62276465 Mon Sep 17 00:00:00 2001
From: qiao hai-jun <qiaohaijun@users.noreply.github.com>
Date: Wed, 13 Jan 2016 13:18:21 +0800
Subject: [PATCH 31/32] minor fix donot to don't

---
 example/cpp/image-classification/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example/cpp/image-classification/README.md b/example/cpp/image-classification/README.md
index 71723dd30309..241b652e2106 100644
--- a/example/cpp/image-classification/README.md
+++ b/example/cpp/image-classification/README.md
@@ -43,7 +43,7 @@ The only parameter is the path of the test image.
 * The model used in the sample can be downloaded here:
 http://pan.baidu.com/s/1sjXKrqX
 
-* If you donot run it in the mxnet root path, maybe you will need to copy lib folder here.
+* If you don't run it from the mxnet root path, you may need to copy the lib folder here.
 
 # Author
 * **Xiao Liu**

From 25f5a265b3a507160e5bcb09b6e9eff568c60604 Mon Sep 17 00:00:00 2001
From: Bing Xu <antinucleon@gmail.com>
Date: Tue, 12 Jan 2016 09:39:52 -0700
Subject: [PATCH 32/32] [io] sframe iter

---
 Makefile                     |   6 +-
 dmlc-core                    |   2 +-
 make/config.mk               |   5 +
 plugin/sframe/SFrame.mk      |   7 ++
 plugin/sframe/iter_sframe.cc | 225 +++++++++++++++++++++++++++++++++++
 ps-lite                      |   2 +-
 6 files changed, 243 insertions(+), 4 deletions(-)
 create mode 100644 plugin/sframe/SFrame.mk
 create mode 100644 plugin/sframe/iter_sframe.cc

diff --git a/Makefile b/Makefile
index cc68e734924a..3d554478b715 100644
--- a/Makefile
+++ b/Makefile
@@ -89,6 +89,8 @@ ifeq ($(USE_DIST_KVSTORE), 1)
 	LDFLAGS += $(PS_LDFLAGS_A)
 endif
 
+include $(MXNET_PLUGINS)
+
 .PHONY: clean all test lint doc clean_all rcpplint rcppexport roxygen
 
 all: lib/libmxnet.a lib/libmxnet.so $(BIN)
@@ -117,7 +119,7 @@ ifeq ($(USE_TORCH), 1)
 	ifeq ($(USE_CUDA), 1)
 		LDFLAGS += -lcutorch -lcunn
 	endif
-	
+
 	TORCH_SRC = $(wildcard plugin/torch/*.cc)
 	PLUGIN_OBJ += $(patsubst %.cc, build/%.o, $(TORCH_SRC))
 	TORCH_CUSRC = $(wildcard plugin/torch/*.cu)
@@ -200,7 +202,7 @@ include tests/cpp/unittest.mk
 test: $(TEST)
 
 lint: rcpplint
-	python2 dmlc-core/scripts/lint.py mxnet ${LINT_LANG} include src plugin scripts python predict/python 
+	python2 dmlc-core/scripts/lint.py mxnet ${LINT_LANG} include src plugin scripts python predict/python
 
 doc: doxygen
 
diff --git a/dmlc-core b/dmlc-core
index ec454218564f..ea9b247b6f99 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit ec454218564fee8e531aee02b8943a4634330ce1
+Subproject commit ea9b247b6f9965c95aa66f42374d0867c46d9abd
diff --git a/make/config.mk b/make/config.mk
index e18cc7776a41..223e4edc056c 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -114,3 +114,8 @@ EXTRA_OPERATORS =
 # whether to use torch integration. This requires installing torch.
 USE_TORCH = 0
 TORCH_PATH = $(HOME)/torch
+
+# whether to use sframe integration. This requires building SFrame first:
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/SFrame.mk
diff --git a/plugin/sframe/SFrame.mk b/plugin/sframe/SFrame.mk
new file mode 100644
index 000000000000..a6b199eec014
--- /dev/null
+++ b/plugin/sframe/SFrame.mk
@@ -0,0 +1,7 @@
+SFRMAE_SRC = plugin/sframe/iter_sframe.cc
+PLUGIN_OBJ += build/plugin/sframe/iter_sframe.o
+CFLAGS += -I$(SFRAME_PATH)/oss_src/unity/lib/
+CFLAGS += -I$(SFRAME_PATH)/oss_src/
+LDFLAGS += -L$(SFRAME_PATH)/release/oss_src/unity/python/sframe/
+LDFLAGS += -lunity_shared
+LDFLAGS += -lboost_system
diff --git a/plugin/sframe/iter_sframe.cc b/plugin/sframe/iter_sframe.cc
new file mode 100644
index 000000000000..615fad0e02d6
--- /dev/null
+++ b/plugin/sframe/iter_sframe.cc
@@ -0,0 +1,225 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file iter_sframe.cc
+ * \brief iterators that read image or vector data from an SFrame
+ * \author Bing Xu
+*/
+
+#include <unity/lib/image_util.hpp>
+#include <unity/lib/gl_sframe.hpp>
+#include <unity/lib/gl_sarray.hpp>
+#include <mxnet/io.h>
+#include <dmlc/base.h>
+#include <dmlc/io.h>
+#include <dmlc/omp.h>
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <string>
+#include <memory>
+#include "../../src/io/inst_vector.h"
+#include "../../src/io/image_recordio.h"
+#include "../../src/io/image_augmenter.h"
+#include "../../src/io/iter_prefetcher.h"
+#include "../../src/io/iter_normalize.h"
+#include "../../src/io/iter_batchloader.h"
+
+namespace mxnet {
+namespace io {
+
+struct SFrameParam : public dmlc::Parameter<SFrameParam> {
+  /*! \brief sframe path */
+  std::string path_sframe;
+  std::string data_field;
+  std::string label_field;
+  TShape data_shape;
+  TShape label_shape;
+  DMLC_DECLARE_PARAMETER(SFrameParam) {
+    DMLC_DECLARE_FIELD(path_sframe).set_default("")
+    .describe("Dataset Param: path to image dataset sframe");
+    DMLC_DECLARE_FIELD(data_field).set_default("data")
+    .describe("Dataset Param: data column in sframe");
+    DMLC_DECLARE_FIELD(label_field).set_default("label")
+    .describe("Dataset Param: label column in sframe");
+    DMLC_DECLARE_FIELD(data_shape)
+    .describe("Dataset Param: input data instance shape");
+    DMLC_DECLARE_FIELD(label_shape)
+    .describe("Dataset Param: input label instance shape");
+  }
+};  // struct SFrameParam
+
+class SFrameIterBase : public IIterator<DataInst> {
+ public:
+  SFrameIterBase() {}
+
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.InitAllowUnknown(kwargs);
+    sframe_ = graphlab::gl_sframe(param_.path_sframe)[{param_.data_field, param_.label_field}];
+    range_it_.reset(new graphlab::gl_sframe_range(sframe_.range_iterator()));
+    this->BeforeFirst();
+  }
+
+  virtual ~SFrameIterBase() {}
+
+  virtual void BeforeFirst() {
+    idx_ = 0;
+    *range_it_ = sframe_.range_iterator();
+    current_it_ = range_it_->begin();
+  }
+
+  virtual const DataInst &Value(void) const {
+    return out_;
+  }
+
+  virtual bool Next() = 0;
+
+ protected:
+  /*! \brief index of instance */
+  index_t idx_;
+  /*! \brief output of sframe iterator */
+  DataInst out_;
+  /*! \brief temp space */
+  InstVector tmp_;
+  /*! \brief sframe iter parameter */
+  SFrameParam param_;
+  /*! \brief sframe object*/
+  graphlab::gl_sframe sframe_;
+  /*! \brief sframe range iterator */
+  std::unique_ptr<graphlab::gl_sframe_range> range_it_;
+  /*! \brief current iterator in range iterator */
+  graphlab::gl_sframe_range::iterator current_it_;
+
+ protected:
+  /*! \brief copy data */
+  template<int dim>
+  void Copy_(mshadow::Tensor<cpu, dim> tensor, const graphlab::flex_vec &vec) {
+    CHECK_EQ(tensor.shape_.Size(), vec.size());
+    CHECK_EQ(tensor.CheckContiguous(), true);
+    mshadow::Tensor<cpu, 1> flatten(tensor.dptr_, mshadow::Shape1(tensor.shape_.Size()));
+    for (index_t i = 0; i < vec.size(); ++i) {
+      flatten[i] = static_cast<float>(vec[i]);
+    }
+  }
+};  // class SFrameIterBase
+
+class SFrameImageIter : public SFrameIterBase {
+ public:
+  SFrameImageIter() :
+    augmenter_(new ImageAugmenter()), prnd_(new common::RANDOM_ENGINE(8964)) {}
+
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    Parent::Init(kwargs);
+    augmenter_->Init(kwargs);
+    CHECK_EQ(Parent::param_.data_shape.ndim(), 3)
+      << "Image shpae must be (channel, height, width)";
+  }
+
+  bool Next(void) override {
+    if (Parent::current_it_ == Parent::range_it_->end()) {
+      return false;
+    }
+    graphlab::image_type gl_img = (*Parent::current_it_)[0];
+    graphlab::flex_vec gl_label = (*Parent::current_it_)[1];
+    // TODO(bing): check not decoded
+    // TODO(bing): check img shape
+    CHECK_EQ(gl_label.size(), Parent::param_.label_shape.Size()) << "Label shape does not match";
+    const unsigned char *raw_data = gl_img.get_image_data();
+    cv::Mat res;
+    cv::Mat buf(1, gl_img.m_image_data_size, CV_8U, const_cast<unsigned char*>(raw_data));
+    res = cv::imdecode(buf, -1);
+    res = augmenter_->Process(res, prnd_.get());
+    const int n_channels = res.channels();
+    if (!tmp_.Size()) {
+      tmp_.Push(Parent::idx_++,
+                Parent::param_.data_shape.get<3>(),
+                Parent::param_.label_shape.get<1>());
+    }
+    mshadow::Tensor<cpu, 3> data = Parent::tmp_.data().Back();
+    std::vector<int> swap_indices;
+    if (n_channels == 1) swap_indices = {0};
+    if (n_channels == 3) swap_indices = {2, 1, 0};
+    for (int i = 0; i < res.rows; ++i) {
+      uchar* im_data = res.ptr<uchar>(i);
+      for (int j = 0; j < res.cols; ++j) {
+        for (int k = 0; k < n_channels; ++k) {
+          data[k][i][j] = im_data[swap_indices[k]];
+        }
+        im_data += n_channels;
+      }
+    }
+    mshadow::Tensor<cpu, 1> label = Parent::tmp_.label().Back();
+    Parent::Copy_<1>(label, gl_label);
+    res.release();
+    out_ = Parent::tmp_[0];
+    ++current_it_;
+    return true;
+  }
+
+ private:
+  /*! \brief parent type */
+  typedef SFrameIterBase Parent;
+  /*! \brief image augmenter */
+  std::unique_ptr<ImageAugmenter> augmenter_;
+  /*! \brief random number generator */
+  std::unique_ptr<common::RANDOM_ENGINE> prnd_;
+};  // class SFrameImageIter
+
+class SFrameDataIter : public SFrameIterBase {
+ public:
+  bool Next() override {
+    if (Parent::current_it_ == Parent::range_it_->end()) {
+      return false;
+    }
+    graphlab::flex_vec gl_data = (*Parent::current_it_)[0];
+    graphlab::flex_vec gl_label = (*Parent::current_it_)[1];
+    CHECK_EQ(gl_data.size(), Parent::param_.data_shape.Size()) << "Data shape does not match";
+    CHECK_EQ(gl_label.size(), Parent::param_.label_shape.Size()) << "Label shape does not match";
+    if (!Parent::tmp_.Size()) {
+        Parent::tmp_.Push(Parent::idx_++,
+                  Parent::param_.data_shape.get<3>(),
+                  Parent::param_.label_shape.get<1>());
+    }
+    mshadow::Tensor<cpu, 3> data = Parent::tmp_.data().Back();
+    Parent::Copy_<3>(data, gl_data);
+    mshadow::Tensor<cpu, 1> label = Parent::tmp_.label().Back();
+    Parent::Copy_<1>(label, gl_label);
+    out_ = Parent::tmp_[0];
+    ++current_it_;
+    return true;
+  }
+
+ private:
+  /*! \brief parent type */
+  typedef SFrameIterBase Parent;
+};  // class SFrameDataIter
+
+DMLC_REGISTER_PARAMETER(SFrameParam);
+
+MXNET_REGISTER_IO_ITER(SFrameImageIter)
+.describe("Naive SFrame image iterator prototype")
+.add_arguments(SFrameParam::__FIELDS__())
+.add_arguments(BatchParam::__FIELDS__())
+.add_arguments(PrefetcherParam::__FIELDS__())
+.add_arguments(ImageAugmentParam::__FIELDS__())
+.add_arguments(ImageNormalizeParam::__FIELDS__())
+.set_body([]() {
+    return new PrefetcherIter(
+        new BatchLoader(
+            new ImageNormalizeIter(
+              new SFrameImageIter())));
+    });
+
+MXNET_REGISTER_IO_ITER(SFrameDataIter)
+.describe("Naive SFrame data iterator prototype")
+.add_arguments(SFrameParam::__FIELDS__())
+.add_arguments(BatchParam::__FIELDS__())
+.add_arguments(PrefetcherParam::__FIELDS__())
+.set_body([]() {
+    return new PrefetcherIter(
+        new BatchLoader(
+              new SFrameDataIter()));
+    });
+
+
+}  // namespace io
+}  // namespace mxnet
+
diff --git a/ps-lite b/ps-lite
index e86dac79d4ae..ca2a28e27a6d 160000
--- a/ps-lite
+++ b/ps-lite
@@ -1 +1 @@
-Subproject commit e86dac79d4ae93274af8abf6c20c91dc118dc9a2
+Subproject commit ca2a28e27a6d3b305d14222f5aa44d419a1a8c14