diff --git a/CHANGES.md b/CHANGES.md new file mode 100644 index 000000000000..03bb16936acd --- /dev/null +++ b/CHANGES.md @@ -0,0 +1,12 @@ +Change Log +===== + +mshadow-1.0 +===== +* Initial release + +mshadow-2.0: in progress +===== +* Support multiple data type +* Great refactoring of code +* Parameter server interface for MultiGPU and distributed learning diff --git a/README.md b/README.md index 8cc278707b64..c19420f7e4a3 100644 --- a/README.md +++ b/README.md @@ -1,48 +1,38 @@ mshadow: Matrix Shadow ====== -Lightweight CPU/GPU Matrix/Tensor Template Library in C++/CUDA +MShadow is a lightweight CPU/GPU Matrix/Tensor Template Library in C++/CUDA. The goal of mshadow is to support ***efficient***, +***device invariant*** and ***simple*** tensor library for machine learning project that aims for both simplicity and performance. -Creater: Bing Xu and Tianqi Chen - - -Documentation and Tutorial: https://github.com/tqchen/mshadow/wiki - -Description -===== -Most machine learning algorithms requires matrix,tensor operations frequently. For example, Eq.(1) is a common SGD update rule, where the weight can be a vector, matrix or 3D tensor. Eq.(2) is the backpropagtion rule: -``` -(1) weight = - eta * ( grad + lambda * weight ); -(2) gradin = dot( gradout, netweight.T() ); -``` - -These operations are not hard to implement, even in C++. The first one is elementwise operations, and can easily be written as -``` -for( int index = 0; index < weight.length; index ++ ){ - weight[index] = - eta * ( grad[index] + lambda * weight[index] ); -} -``` -Eq.(2) is matrix product, and we can use standard BLAS packages such as Intel MKL. It will looklike -``` -sgemm( CblasNoTrans, CblasTrans, n, m, k, 1.0, gradout.ptr, lda, netweight.ptr, ldb, 0.0, gradin.ptr, ldc ); -``` -However: - -* It is annoying to write these codes repeatively, and they are not intuitive. -* What if we want to port our code to GPU? We need to rewrite our code in CUDA - -mshadow is a unified C++/CUDA lib to to write Eq.(1) and Eq.(2) in C++, and *translate* them to the for loop and standard packages such as MKL, CuBLAS *in compile time*. +MShadow also provides interface that allows writing Multi-GPU and distributed deep learning programs in an easy and unified way. +* [Contributors](https://github.com/tqchen/mshadow/graphs/contributors) +* [Tutorial](guide) +* [Documentation](doc) +* [Parameter Server Interface for GPU Tensor](guide/mshadow-ps) Features ===== -* Shadow instead of giant: mshadow does not implement all of the functions, it is more of a wrapper to translated easy-to-read code to standard 'giant' packages such as MKL -* Whitebox instead of blackbox: put a float* into the Tensor struct and take the benefit of the package, no memory allocation is happened unless explicitly called -* Unified CPU/GPU code: write a code and it should run in both CPU and GPU +* Efficient: all the expression you write will be lazily evaluated and compiled into optimized code + - No temporal memory allocation will happen for expression you write + - mshadow will generate specific kernel for every expression you write in compile time. +* Device invariant: you can write one code and it will run on both CPU and GPU +* Simple: mshadow allows you to write machine learning code using expressions. 
+* Whitebox: put a float* into the Tensor struct and take the benefit of the package, no memory allocation is happened unless explicitly called * Lightweight library: light amount of code to support frequently used functions in machine learning * Extendable: user can write simple functions that plugs into mshadow and run on GPU/CPU, no experience in CUDA is required. +* MultiGPU and Distributed ML: mshadow-ps interface allows user to write efficient MultiGPU and distributed programs in an unified way. +Version +====== +* This version mshadow-2.x, there are a lot of changes in the interface and it is not backward compatible with mshadow-1.0 + - If you use older version of cxxnet, you will need to use the legacy mshadow code +* For legacy code, refer to [Here](https://github.com/tqchen/mshadow/releases/tag/v1.1) +* Change log in [CHANGES.md](CHANGES.md) Related Projects ===== -* CXXNET: neural network implementation based on mshadow: https://github.com/antinucleon/cxxnet +* [CXXNET: large-scale deep learning backed by mshadow](https://github.com/antinucleon/cxxnet) +* [Parameter Server](https://github.com/mli/parameter_server) + - Parameter server project provides distributed back-end for mshadow-ps + - mshadow-ps extends original parameter server to support async updates for GPU Tensor diff --git a/doc/Doxyfile b/doc/Doxyfile index bef8089a3021..f3cc429213c9 100644 --- a/doc/Doxyfile +++ b/doc/Doxyfile @@ -8,7 +8,7 @@ PROJECT_NAME = "mshadow" PROJECT_NUMBER = PROJECT_BRIEF = PROJECT_LOGO = -OUTPUT_DIRECTORY = ../doc +OUTPUT_DIRECTORY = doc CREATE_SUBDIRS = NO OUTPUT_LANGUAGE = English BRIEF_MEMBER_DESC = YES @@ -95,13 +95,13 @@ WARN_LOGFILE = #--------------------------------------------------------------------------- # configuration options related to the input files #--------------------------------------------------------------------------- -INPUT = +INPUT = mshadow mshadow-ps INPUT_ENCODING = UTF-8 FILE_PATTERNS = RECURSIVE = NO EXCLUDE = EXCLUDE_SYMLINKS = NO -EXCLUDE_PATTERNS = *-inl.hpp +EXCLUDE_PATTERNS = *-inl.* utils.h thread_util.h thread.h kv_array.h EXCLUDE_SYMBOLS = mshadow::expr::Plan* mshadow::expr::*Engine* EXAMPLE_PATH = EXAMPLE_PATTERNS = diff --git a/doc/README.md b/doc/README.md new file mode 100644 index 000000000000..9ea6172f37a7 --- /dev/null +++ b/doc/README.md @@ -0,0 +1,321 @@ +MShadow Documentation +===== +This is the documentation for mshadow: A Lightweight CPU/GPU Matrix/Tensor Template Library in C++/CUDA. + +### Links to Topics + +* [Tutorial](../guide) +* [API Documentation](http://homes.cs.washington.edu/~tqchen/mshadow/doc) + - You can run ```./mkdoc.sh``` to make the document locally +* [Tutorial about Expression Template](../guide/exp-template) +* [Writing Multi-GPU and Distributed ML](../guide/mshadow-ps) +* [Compile Configuration script](../make) +* [Expression API](#expression-api) + - Expression api introduces the concept of expression in mshadow + +Expression API +===== +Expression is the key concept in mshadow, a common operation of mshadow is ```tensor = some code to construct expression``` + +There are three major types of expression: +* Mapper expression: only contain element-wise operations of Mapper expressions + - Mapper expression can used as composition component of other operations. + - Tensor, scalar are Mapper expressions + - Example: ``` weight = - eta * (grad + lambda * weight)``` is a Mapper expression. + - Mapper expressions are translated using expression template code implemented by mshadow. 
+ - ***Assign safety***: Element-wise mapping are assign safe, which means, we can write ```A = A * 2 + B```, making lvalue appear in expression, the results are still correct. +* Chainer expression: may contain element-wise operation such as reduction and broadcast + - Example: ```dst = mirror(src)``` is a chainer expression + - ***Assign safety***: Most of the chainer extensions are not assignment safe, which means user should avoid putting target in source epression. +* Complex expression: complex operations, need special translation rule to translate to specific implementations. + - Complex expression can not be used as composition component of other operations. + - Example: ``` dot(lhs.T(), rhs)```, is complex expression, we can not write +``` dst = 1.0 + dot(lhs.T(), rhs)``` + - But limited syntax is supported depending on specification, for example, we do support ``` dst += 2.0f * dot(lhs.T(), rhs)``` + - Complex expressions are translated into specific implementations such as BLAS. + +### Element-wise Operations +The basic binary operators are overloaded to composite Mapper expressions, so we can write +```c++ +weight = (-eta) * (grad + lambda * weight); +``` +We can also use customized binary operators, and unary operators: +```c++ +struct maximum { + MSHADOW_XINLINE static float Map(float a, float b) { + return a > b ? a : b; + } +}; +template +void ExampleMaximum(Tensor out, + const Tensor &A, + const Tensor &B) { + out= 10.0f * F(A+1.0f, B); +} +struct sigmoid { + MSHADOW_XINLINE static float Map(float a) { + return 1.0f/(1.0f+expf(-a)); + } +}; +template +void ExampleSigmoid(Tensor out, const Tensor &in) { + // equivalent to out = sigmoid(in*2) + 1; + out = F(F(in * 2.0f), ScalarExp(1.0f)); +} +``` +### Matrix Multiplications +Matrix multiplications are supported by following syntax, with things brackets [] are optional +``` +dst [scale*] dot(lhs [.T()] , rhs [.T()]), can be =,+=,-= +``` +Example: +```c++ +template +void Backprop(Tensor gradin, + const Tensor &gradout, + const Tensor &netweight) { + gradin = 2.0 * dot(gradout, netweight.T()); +} +``` + +### Introducing Expression Extensions +Naming conventions: +* ```Tensor``` to refer to any Tensor with device any device and dimension. +* ```xpu```, ```dim```, are implicit template parameters. +* ```Expr``` will be used to refer to any mapper expression with type ```Tensor```. 
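+Before the list of extension functions, here is a minimal sketch of the assign-safety rules described above (the helper function name, the 2-D float shapes and the temporary tensor are illustrative assumptions, not part of the API):
+```c++
+template<typename xpu>
+void AssignSafetyExample(Tensor<xpu, 2, float> A,
+                         const Tensor<xpu, 2, float> &B,
+                         Tensor<xpu, 2, float> tmp) {
+  // Mapper expressions are assign safe: the target A may appear on the right.
+  A = A * 2.0f + B;
+  // Chainer extensions such as mirror are generally not assign safe,
+  // so stage the result in a separate tensor rather than writing A = mirror(A).
+  tmp = mirror(A);
+}
+```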
+ +List of functions: +* [reshape](#reshape): reshapes a tensor to another shape, number of content must be same +* [broadcast](#broadcast): replicate a 1 dimension tensor in certain dimension +* [repmat](#repmat), special case of broadcast<0>: repeat vector over rows to form a matrix +* [sumall_except_dim](#sumall_except_dim): sum over all the dimensions, except the dimension specified in template parameter +* [sum_rows](#sum_rows): special case of sumall_except_dim<0>, sum of rows in the matrix +* [unpack_patch2col](#unpack_patch2col): unpack local (overlap) patches of image to column of mat, can be used to implement convolution +* [pack_col2patch](#pack_col2patch): reverse operation of unpack_patch2col, can be used to implement deconvolution +* [pool](#pool): do pooling on image +* [unpool](#unpool): get gradient of pooling result +* [crop](#crop): crop the original image to a smaller size +* [mirror](#mirror): get the mirrored result of input expression + +====== +##### reshape +* ```reshape(Expr src, Shape oshape)``` +* reshapes a tensor to another shape, total number of elements must be same +* parameters: + - src: input data + - oshape: target shape +* result expression type: ```Tensor``` with ```shape=oshape```, is Mapper expression +```c++ +void ExampleReshape(void) { + Tensor dst = NewTensor(Shape2(4, 5)); + Tensor src = NewTensor(Shape1(20), 1.0f); + dst = reshape(src, dst.shape_); + ... +} +``` +====== + +##### broadcast +* ```broadcast(Tensor src, Shape oshape)``` +* replicate a 1 dimension tensor certain dimension, specified by template parameter dimcast +* parameters: + - src: input 1 dimensional tensor + - oshape: shape of output +* return expression type: ```Tensor```, ```shape = oshape```, is Chainer expression +```c++ +void ExampleBroadcast(void) { + Tensor dst = NewTensor(Shape2(2, 3)); + Tensor src = NewTensor(Shape1(2), 1.0f); + src[0] = 2.0f; src[1] = 1.0f; + dst = broadcast<0>(src, dst.shape_); + // dst[0][0] = 2, dst[0][1] = 2; dst[1][0]=1, dst[1][1] = 1 + ... +} +``` +====== +##### repmat +* ```repmat(Tensor src, int nrows) ```` +* special case of broadcast, repeat 1d tensor over rows +* input parameters: + - src: input vector + - nrows: number of rows in target +* return expression type: ```Tensor```, with ```shape=(nrows, src.size(0))```, is Chainer expression +```c++ +void ExampleRepmat(void) { + Tensor dst = NewTensor(Shape2(3, 2)); + Tensor src = NewTensor(Shape1(2), 1.0f); + src[0] = 2.0f; src[1] = 1.0f; + dst = repmat(src, 3); + // dst[0][0] = 2, dst[0][1] = 1; dst[1][0]=2, dst[1][1] = 1 + ... +} +``` +====== +##### sumall_except_dim +* ```sumall_except_dim(Expr src) ```` +* sum over all dimensions, except dimkeep +* input parameters: + - src: input mapper expression +* return expression type: ```Tensor```, with ```shape=(src.size(dimkeep))```, is Complex expression +* Syntax: ```dst [sv] [scale*] sumall_except_dim(src) , can be =, +=, -=, *=, /=```` +```c++ +void ExampleSumAllExceptDim(void) { + Tensor src = NewTensor(Shape3(2, 3, 2), 1.0f); + Tensor dst = NewTensor(Shape1(3), 1.0f); + dst += sum_all_except<1>(src * 2.0f); + // dst[0] = 1.0 + 4.0 *2.0 = 9.0 + ... 
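+  // note: dimension 1 (size 3) is kept, so each dst element accumulates the
+  // remaining 2 x 2 = 4 entries of (src * 2.0f): dst[i] = 1.0 + 4 * 2.0 = 9.0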
+} +``` +====== +##### sum_rows +* ```sum_rows(Expr src) ```` +* sum of rows in the matrix +* input parameters: + - src: input mapper expression +* return expression type: ```Tensor```, with ```shape=(src.size(0))```, is Complex expression +* Syntax: ```dst [sv] [scale*] sum_rows(src) , can be =,+=,-=,*=,/=```` +```c++ +void ExampleSumRows(void) { + Tensor src = NewTensor(Shape2(3, 2), 1.0f); + Tensor dst = NewTensor(Shape1(2), 1.0f); + dst += sum_rows(src + 1.0f); + // dst[0] = 1.0 + 3.0 *(1.0+1.0) = 7.0 + ... +} +``` +====== +##### unpack_patch2col +* ```unpack_patch2col(Expr img, int psize_y, int p_size_x, int pstride) ```` +* unpack local (overlap) patches of image to column of mat, can be used to implement convolution, after getting unpacked mat, we can use: ```output = dot(weight, mat)``` to get covolved results, the relations: + - weight; shape[0]: out_channel, shape[1]: ichannel * psize_y * psize_x + - output; shape[0]: out_channel, shape[1]: out_height * out_width * num_of_images + - out_height = (in_height - psize_y) / pstride + 1, this means we pad inperfect patch with 0 + - out_width = (in_width - psize_x) / pstride + 1 +* input parameters: + - img: source image, can be expression; (in_channels, in_height, in_width) + - psize_y height of each patch + - psize_x width of each patch + - pstride: stride of each patch +* return expression type: ```Tensor```, with ```shape=(in_channel*psize*psize, out_height*out_width)```, is Chainer expression +```c++ +void ExampleCovolution(Tensor dst, Tensor src, + Tensor weight, int ksize, int stride) { + int o_height = (src.size(1)- ksize) / stride + 1; + int o_width = (src.size(2)- ksize) / stride + 1; + utils::Assert(weight.size(0) == src.size(0) * ksize * ksize); + TensorContainer tmp_col(Shape2(src.size(0) * ksize * ksize, + o_height * o_width)); + TensorContainer tmp_dst(Shape2(weight.size(0), + o_height * o_width)); + tmp_col = unpack_patch2col(src, ksize, ksize, stride); + tmp_dst = dot(weight, tmp_col); + dst = reshape(tmp_dst, dst.shape_); +} +``` + +====== +##### pack_col2patch +* ```pack_col2patch(Tensor mat, Shape<3> imshape, int psize_y, int psize_x, int pstride) ```` +* reverse operation of unpack_patch2col, can be used to implement deconvolution +* input parameters: + - mat: source mat, same shape as output of unpack_patch2col + - imshape: shape of target image + - psize_y height of each patch + - psize_x width of each patch + - pstride: stride of each patch +* return expression type: ```Tensor```, with ```shape = imshape```, is Chainer expression +```c++ +void ExampleDecovolution(Tensor bottom, Tensor top, + Tensor weight, int ksize, int stride) { + int o_height = (bottom.size(1)- ksize) / stride + 1; + int o_width = (bottom.size(2)- ksize) / stride + 1; + utils::Assert(weight.size(0) == bottom.size(0) * ksize * ksize); + TensorContainer tmp_col(Shape2(bottom.size(0) * ksize * ksize, + o_height * o_width)); + TensorContainer tmp_dst(Shape2(weight.size(0), o_height*o_width)); + tmp_dst = reshape(top, tmp_dst.shape_); + tmp_col = dot(weight.T(), tmp_dst); + bottom = pack_col2patch(tmp_col, bottom.shape_, ksize, ksize, stride); +} +``` + +====== +##### pool +* ```pool(Expr img, [Shape<2> pshape,] int ksize_y, int ksize_x, int kstride)``` +* Pooling on image with specify kernel size and stride, can be used to implement max pooilng and other pooling layer +* input parameters: + - Reducer: operation can be max or sum + - img: source image, can be expression; (in_channels, in_height, in_width) + - [optional] Shape<2> pshape, output 
shape + - ksize_y height of each patch + - ksize_x width of each patch + - kstride: stride of each patch +* return expression: ```Expr```, with ```shape = (in_channel, (out_height - ksize) / kstride + 1, (out_width - ksize) / kstride + 1)```, or expression in pshape + - Chainer expression +```c++ +void ExampleMaxPooling(TensorContainer &data, int ksize, int stride) { + TensorContainer pooled(Shape3(data.size(0), + (data.size(2) - ksize) / kstride + 1), + (data.size(1) - ksize) / kstride + 1)); + pooled = pool(data, ksize, ksize, stride); +} +``` + +====== +##### unpool +* ```unpool(Tensor data_src, Tensor data_pooled, Tensor grad_pooled, int ksize_y, int ksize_x, int kstride)``` +* Unpooling on image with specify kernel size and stride, can be used to implement backprop of max pooilng and other pooling layer +* input parameters: + - Reducer: operation can be max or sum + - data_src: source image batch. + - data_pooled: pooled image batch. + - grad_pooled: gradient of upper layer + - ksize_y height of each patch + - ksize_x width of each patch + - kstride: stride of each patch +* return: + Expression, same shape to data_src +```c++ +void ExampleMaxUnpooling(Tensor &data_src, Tensor &data_pooled, + Tensor &grad_pooled, int ksize, int kstride) { + TensorContainer grad(data_src.shape_); + grad = unpool(data_src, data_pooled, + grad_pooled, ksize, ksize, kstride); +} +``` + +====== +##### crop +* ```crop(Expr src, Shape<2> oshape, int start_height, int start_width)``` +* input parameters: + - src: input expression + - oshape: output shape after crop + - start_height: start height for cropping + - start_width: start width for cropping +* Can also be ```crop(Expr src, Shape<2> oshape)``` where the crop will happen in center. +* return + - cropped expression +```c++ +void ExampleCrop(TensorContainer img, int start_height, int start_width) { + TensorContainer cropped(Shape3(img.size(0), + img.size(1) - start_height, + img.size(2) - start_width)); + cropped = crop(img, start_height, start_width); +} +``` + +====== +##### mirror +* ```mirrow(Expr src)``` +* input: + - src, source expression to be mirrored +* output: + - expression of mirrored result +```c++ +void ExampleMirror(TensorContainer img) { + TensorContainer mirrored(img.shape_); + mirrored = mirror(img); +} +``` + diff --git a/doc/mkdoc.sh b/doc/mkdoc.sh index 2c4b038106c1..3ee3d71b8ce8 100755 --- a/doc/mkdoc.sh +++ b/doc/mkdoc.sh @@ -1,4 +1,4 @@ #!/bin/bash -cd ../mshadow -doxygen ../doc/Doxyfile -cd ../doc +cd .. 
+doxygen doc/Doxyfile +cd doc diff --git a/example/Makefile.openblas b/example/Makefile.openblas deleted file mode 100644 index bd90eca3922a..000000000000 --- a/example/Makefile.openblas +++ /dev/null @@ -1,37 +0,0 @@ -# set LD_LIBRARY_PATH -# echo "Link mshadow with precomplied Openblas" -export OPENBLAS_ROOT=../../OpenBLAS-v0.2.13-Win64-int32 -export CC = gcc -export CXX = g++ -export NVCC =nvcc -export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ -I$(OPENBLAS_ROOT)/include -DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_MKL=0 -DMSHADOW_USE_CBLAS=1 -D__APPLE__ -export LDFLAGS= -static -lpthread -lopenblas -L$(OPENBLAS_ROOT)/lib -export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) - -# specify tensor path -BIN = basic defop basic-matrix-dot -OBJ = -CUOBJ = -CUBIN = -.PHONY: clean all - -all: $(BIN) $(OBJ) $(CUBIN) $(CUOBJ) - -basic: basic.cpp -defop: defop.cpp -basic-matrix-dot: basic-matrix-dot.cpp - -$(BIN) : - $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) - -$(OBJ) : - $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) ) - -$(CUOBJ) : - $(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $(filter %.cu, $^) - -$(CUBIN) : - $(NVCC) -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" -Xlinker "$(LDFLAGS)" $(filter %.cu %.cpp %.o, $^) - -clean: - $(RM) $(OBJ) $(BIN) $(CUBIN) $(CUOBJ) *~ diff --git a/example/basic-matrix-dot.cpp b/example/basic-matrix-dot.cpp deleted file mode 100644 index 5c5485beb238..000000000000 --- a/example/basic-matrix-dot.cpp +++ /dev/null @@ -1,20 +0,0 @@ -// header file to use mshadow -#include "mshadow/tensor.h" -// this namespace contains all data structures, functions -using namespace mshadow; -// this namespace contains all operator overloads -using namespace mshadow::expr; - -int main( void ){ - // intialize tensor engine before using tensor operation, needed for CuBLAS - InitTensorEngine(); - - Tensor mat = NewTensor( Shape2(1000,1000), 1.0 ); - for (int i=0;i<100;i++) - mat = dot(mat, mat); - FreeSpace(mat); - // shutdown tensor enigne after usage - - ShutdownTensorEngine(); - return 0; -} diff --git a/example/basic.cpp b/example/basic.cpp deleted file mode 100644 index 53f85ae1a262..000000000000 --- a/example/basic.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// header file to use mshadow -#include "mshadow/tensor.h" -// this namespace contains all data structures, functions -using namespace mshadow; -// this namespace contains all operator overloads -using namespace mshadow::expr; - -int main( void ){ - // intialize tensor engine before using tensor operation, needed for CuBLAS - InitTensorEngine(); - // assume we have a float space - real_t data[ 20 ]; - // create a 2 x 5 x 2 tensor, from existing space - Tensor ts( data, Shape3(2,5,2) ); - // take first subscript of the tensor - Tensor mat = ts[0]; - // Tensor object is only a handle, assignment means they have same data content - Tensor mat2 = mat; - - // shape of matrix, note shape order is different from numpy - // shape[i] indicate the shape of i-th dimension - printf("%u X %u matrix\n", mat.shape[1], mat.shape[0] ); - - // initialize all element to zero - mat = 0.0f; - // assign some values - mat[0][1] = 1.0f; mat[1][0] = 2.0f; - // elementwise operations - mat += ( mat + 10.0f ) / 10.0f + 2.0f; - - // print out matrix, note: mat2 and mat1 are handles(pointers) - for( index_t i = 0; i < mat.shape[1]; i ++ ){ - for( index_t j = 0; j < mat.shape[0]; j ++ ){ - printf("%.2f ", mat2[i][j]); - } - printf("\n"); - } - // shutdown tensor enigne after usage - ShutdownTensorEngine(); - 
return 0; -} diff --git a/example/defop.cpp b/example/defop.cpp deleted file mode 100644 index 990f4728bed7..000000000000 --- a/example/defop.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include -// header file to use mshadow -#include "mshadow/tensor.h" -// this namespace contains all data structures, functions -using namespace mshadow; -// this namespace contains all operator overloads -using namespace mshadow::expr; - -// user defined unary operator addone -struct addone{ - MSHADOW_XINLINE static real_t Map(real_t a) { - return a + 1.0f; - } -}; -// user defined binary operator max of two -struct maxoftwo{ - MSHADOW_XINLINE static real_t Map(real_t a,real_t b) { - if( a > b ) return a; - else return b; - } -}; - -int main( void ){ - // intialize tensor engine before using tensor operation, needed for CuBLAS - InitTensorEngine(); - // take first subscript of the tensor - Tensor mat = NewTensor( Shape2(2,3), 0.0f ); - Tensor mat2= NewTensor( Shape2(2,3), 0.0f ); - - mat[0][0] = -2.0f; - mat = F( F( mat ) + 1.0f, mat2 ); - - for( index_t i = 0; i < mat.shape[1]; i ++ ){ - for( index_t j = 0; j < mat.shape[0]; j ++ ){ - printf("%.2f ", mat[i][j]); - } - printf("\n"); - } - - FreeSpace( mat ); FreeSpace( mat2 ); - // shutdown tensor enigne after usage - ShutdownTensorEngine(); - return 0; -} diff --git a/example/exp-template/README.md b/example/exp-template/README.md deleted file mode 100644 index 8c30a2998c2a..000000000000 --- a/example/exp-template/README.md +++ /dev/null @@ -1,4 +0,0 @@ -This folder is not example of mshadow code. -It is example code introducing expression template, the trick behind mshadow. - -See: https://github.com/tqchen/mshadow/wiki/Expression-Template diff --git a/example/exp-template/exp_lazy.cpp b/example/exp-template/exp_lazy.cpp deleted file mode 100644 index 91f49b4fca78..000000000000 --- a/example/exp-template/exp_lazy.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// Example Lazy evaluation code -// for simplicity, we use struct and make all members public -#include -struct Vec; -// expression structure holds the expression -struct BinaryAddExp{ - const Vec& lhs; - const Vec& rhs; - BinaryAddExp(const Vec& lhs, const Vec& rhs):lhs(lhs),rhs(rhs){} -}; -// no constructor and destructor to allocate and de-allocate memory, allocation done by user -struct Vec { - int len; - float* dptr; - Vec (void){} - Vec (float *dptr, int len):len(len),dptr(dptr){} - // here is where evaluation happens - inline Vec& operator= (const BinaryAddExp& src){ - for( int i = 0; i < len; ++i ){ - dptr[i] = src.lhs.dptr[i] + src.rhs.dptr[i]; - } - return *this; - } -}; -// no evaluation happens here -inline BinaryAddExp operator+ (const Vec& lhs, const Vec& rhs){ - return BinaryAddExp(lhs, rhs); -} - -const int n = 3; -int main( void ){ - float sa[n]={1,2,3},sb[n]={2,3,4},sc[n]={3,4,5}; - Vec A(sa,n), B(sb,n), C(sc,n); - // run expression - A = B + C; - for( int i = 0; i < n; ++ i ){ - printf("%d:%f==%f+%f\n", i, A.dptr[i], B.dptr[i], C.dptr[i] ); - } - return 0; -} diff --git a/example/exp-template/exp_template.cpp b/example/exp-template/exp_template.cpp deleted file mode 100644 index d9ec4622f706..000000000000 --- a/example/exp-template/exp_template.cpp +++ /dev/null @@ -1,64 +0,0 @@ -// Example code, expression template, and more length equations -// for simplicity, we use struct and make all members public - -#include - -// this is expression, all expressions must inheritate it, and put their type in subtype -template -struct Exp{ - // returns const reference of the actual type of this expression - 
inline const SubType& self(void) const{ - return *static_cast(this); - } -}; - -// binary add expression -// note how it is inheritates from Exp -// and put its own type into the template argument -template -struct BinaryAddExp: public Exp< BinaryAddExp >{ - const TLhs& lhs; - const TRhs& rhs; - BinaryAddExp(const TLhs& lhs, const TRhs& rhs):lhs(lhs),rhs(rhs){} - // evaluation function, evaluate this expression at position i - inline float Eval( int i ) const{ - return lhs.Eval(i) + rhs.Eval(i); - } -}; -// no constructor and destructor to allocate and de-allocate memory, allocation done by user -struct Vec: public Exp{ - int len; - float* dptr; - Vec (void){} - Vec (float *dptr, int len):len(len),dptr(dptr){} - // here is where evaluation happens - template - inline Vec& operator= (const Exp& src_){ - const EType &src = src_.self(); - for( int i=0; i < len; ++i ){ - dptr[i] = src.Eval(i); - } - return *this; - } - // evaluation function, evaluate this expression at position i - inline float Eval( int i ) const{ - return dptr[i]; - } -}; -// template add, works for any expressions -template -inline BinaryAddExp operator+ (const Exp& lhs, const Exp& rhs){ - return BinaryAddExp(lhs.self(), rhs.self()); -} - -const int n = 3; -int main( void ){ - float sa[n]={1,2,3},sb[n]={2,3,4},sc[n]={3,4,5}; - Vec A(sa,n), B(sb,n), C(sc,n); - // run expression, this expression is longer:) - A = B + C + C; - for( int i = 0; i < n; ++ i ){ - printf("%d:%f==%f+%f+%f\n", i, A.dptr[i], B.dptr[i], C.dptr[i], C.dptr[i] ); - } - return 0; -} diff --git a/example/exp-template/exp_template_op.cpp b/example/exp-template/exp_template_op.cpp deleted file mode 100644 index 4399936b6981..000000000000 --- a/example/exp-template/exp_template_op.cpp +++ /dev/null @@ -1,84 +0,0 @@ -// Example code, expression template -// with binary operator definition and extension -// for simplicity, we use struct and make all members public - -#include - -// this is expression, all expressions must inheritate it, and put their type in subtype -template -struct Exp{ - // returns const reference of the actual type of this expression - inline const SubType& self(void) const{ - return *static_cast(this); - } -}; - -// binary operators -struct mul{ - inline static float Map(float a, float b){ - return a * b; - } -}; - -// binary add expression -// note how it is inheritates from Exp -// and put its own type into the template argument -template -struct BinaryMapExp: public Exp< BinaryMapExp >{ - const TLhs& lhs; - const TRhs& rhs; - BinaryMapExp(const TLhs& lhs, const TRhs& rhs):lhs(lhs),rhs(rhs){} - // evaluation function, evaluate this expression at position i - inline float Eval( int i ) const{ - return OP::Map( lhs.Eval(i), rhs.Eval(i) ); - } -}; -// no constructor and destructor to allocate and de-allocate memory, allocation done by user -struct Vec: public Exp{ - int len; - float* dptr; - Vec (void){} - Vec (float *dptr, int len):len(len),dptr(dptr){} - // here is where evaluation happens - template - inline Vec& operator= (const Exp& src_){ - const EType &src = src_.self(); - for( int i=0; i < len; ++i ){ - dptr[i] = src.Eval(i); - } - return *this; - } - // evaluation function, evaluate this expression at position i - inline float Eval( int i ) const{ - return dptr[i]; - } -}; -// template add, works for any expressions -template -inline BinaryMapExp F(const Exp& lhs, const Exp& rhs){ - return BinaryMapExp(lhs.self(), rhs.self()); -} - -template -inline BinaryMapExp operator* (const Exp& lhs, const Exp& rhs){ - return F(lhs, rhs); -} - 
-// user defined operation -struct maximum{ - inline static float Map(float a, float b){ - return a > b ? a : b; - } -}; - -const int n = 3; -int main( void ){ - float sa[n]={1,2,3},sb[n]={2,3,4},sc[n]={3,4,5}; - Vec A(sa,n), B(sb,n), C(sc,n); - // run expression, this expression is longer:) - A = B * F(C, B); - for( int i = 0; i < n; ++ i ){ - printf("%d:%f==%f*max(%f,%f)\n", i, A.dptr[i], B.dptr[i], C.dptr[i], B.dptr[i] ); - } - return 0; -} diff --git a/example/neuralnet/README.md b/example/neuralnet/README.md deleted file mode 100644 index fb5b59a3fb07..000000000000 --- a/example/neuralnet/README.md +++ /dev/null @@ -1,10 +0,0 @@ -This folder contains a mshadow example of simple neural net implementation - -To compile the code, type make: -* You will need to have CUDA and MKL installed. -* Alternatively, you can compile with CBLAS packages to replace MKL such as BLAS or ATLAS, type make blas=1 - -To run the demo, download MNIST dataset from: http://yann.lecun.com/exdb/mnist/ -unzip all the files into current folder - -and run by ./nnet cpu or ./nnet gpu. ./convnet cpu or ./convnet gpu diff --git a/example/neuralnet/build_openblash.sh b/example/neuralnet/build_openblash.sh deleted file mode 100644 index dd33f2cbc07c..000000000000 --- a/example/neuralnet/build_openblash.sh +++ /dev/null @@ -1,3 +0,0 @@ -mv nnet.cu nnet.cpp -mv convnet.cu convnet.cpp -make -f Makefile.openblas \ No newline at end of file diff --git a/example/neuralnet/convnet.cu b/example/neuralnet/convnet.cu deleted file mode 100644 index de8f65b5568b..000000000000 --- a/example/neuralnet/convnet.cu +++ /dev/null @@ -1,259 +0,0 @@ -// this implements a simple convolution neural net: conv-maxpool-fullc -#include -// header file to use mshadow -#include "mshadow/tensor.h" -// helper function to load mnist dataset -#include "util.h" -// this namespace contains all data structures, functions -using namespace mshadow; -// this namespace contains all operator overloads -using namespace mshadow::expr; - -// define operations -struct relu{ - MSHADOW_XINLINE static real_t Map(real_t a) { - using namespace std; - return max( a, 0.0f ); - } -}; -struct relu_grad { - MSHADOW_XINLINE static real_t Map(real_t a) { - return a > 0.0f ? 1.0f : 0.0f; - } -}; - -/*! \brief interface for nnet, interfacd allows use to use GPU/CPU implementation in a unified way */ -class INNet{ -public: - virtual void Forward( const Tensor& inbatch, Tensor &oubatch ) = 0; - virtual void Backprop( const Tensor& gradout ) = 0; - virtual void Update( void ) = 0; - virtual ~INNet(){} -}; - -/*! 
- * \brief simple two layer conv-net conv-pool-flat-fullc - * this implementation is device invariant - */ -template -class ConvNet : public INNet{ -public: - // initialize the network - ConvNet( int batch_size, int insize, int nchannel, int ksize, int kstride, int psize, int num_out ) - :rnd(0), ksize(ksize), kstride(kstride), psize(psize){ - // setup nodes - ninput.Resize( Shape4( batch_size, 1, insize, insize ) ); - nhidden.Resize( Shape4( batch_size, nchannel, (insize - ksize)/kstride+1, (insize -ksize)/kstride+1) ); - nhiddenbak.Resize( nhidden.shape ); - npool.Resize( Shape4( batch_size, nchannel, (nhidden.shape[1]+1-psize)/psize, (nhidden.shape[0]+1-psize)/psize ) ); - npoolbak.Resize( npool.shape ); - nflat.Resize( Shape2( batch_size, npool.shape[2]*npool.shape[1]*npool.shape[0] ) ); - nout.Resize( Shape2( batch_size, num_out ) ); - // setup bias - hbias.Resize( Shape1( nchannel ) ); g_hbias.Resize( hbias.shape ); - obias.Resize( Shape1( num_out ) ); g_obias.Resize( obias.shape ); - hbias = 0.0f; obias = 0.0f; - // setup weights - Ki2h.Resize( Shape2( nchannel, ksize*ksize ) ); g_Ki2h.Resize( Ki2h.shape ); - Wh2o.Resize( Shape2( nflat.shape[0], num_out ) ); g_Wh2o.Resize( Wh2o.shape ); - rnd.SampleGaussian( Ki2h, 0, 0.01f ); - rnd.SampleGaussian( Wh2o, 0, 0.01f ); - - printf("conv=%d, pool=%d\n", nhidden.shape[0], npool.shape[0] ); - } - virtual ~ConvNet(){} - // forward propagation - virtual void Forward( const Tensor& inbatch, Tensor &oubatch ){ - index_t batch_size = inbatch.shape[3]; - // copy data to input layer - Copy( ninput, inbatch ); - // first layer, conv, use stride=2 - ConvForward( ninput, Ki2h, nhidden, ksize, kstride, tmp_col, tmp_dst ); - // add bias - nhidden += broadcast<2>( hbias, nhidden.shape ); - // activation, relu, backup activation in nhidden - nhidden = F( nhidden ); - Copy( nhiddenbak, nhidden ); - // max pooling - npool = pool( nhiddenbak, npool[0][0].shape, psize, psize ); - Copy( npoolbak, npool ); - // flat - nflat = reshape( npool, nflat.shape ); - // second layer fullc - nout = dot( nflat, Wh2o ); - nout += repmat( obias, batch_size ); - // softmax calculation - Softmax( nout, nout ); - // copy result out - Copy( oubatch, nout ); - } - // back propagation - virtual void Backprop( const Tensor& gradout ){ - // copy gradient to output layer - Copy( nout, gradout ); - // calc grad of final layer - g_obias = sum_rows( nout ); - g_Wh2o = dot( nflat.T(), nout ); - // backprop to previous layer - nflat = dot( nout, Wh2o.T() ); - npool = reshape( nflat, npool.shape ); - // backprop pooling layer - nhiddenbak = unpool( nhiddenbak, npoolbak, npool, psize, psize ); - // calculate gradient of relu layer - nhidden = F( nhidden ) * nhiddenbak; - // calc grad of layer 1 - g_hbias = sumall_except_dim<2>( nhidden ); - ConvBackWard( nhidden, Ki2h, g_Ki2h, ninput, ksize, kstride, tmp_col, tmp_dst ); - } - // update weight - virtual void Update( void ){ - // run SGD - const float eta = 0.1; - const float wd = 0.00001; - // update weight - Ki2h -= eta * ( wd * Ki2h + g_Ki2h ); - Wh2o -= eta * ( wd * Wh2o + g_Wh2o ); - // no regularization for bias - hbias-= eta * g_hbias; - obias-= eta * g_obias; - } -private: - // forward convolution, tmp_col and tmp_dst are helper structure - inline static void ConvForward( const Tensor &in, const Tensor &kernel, Tensor &out, - int ksize, int kstride, - TensorContainer &tmp_col, TensorContainer& tmp_dst ){ - index_t oheight = (in.shape[1] - ksize)/kstride + 1; - index_t owidth = (in.shape[0] - ksize)/kstride + 1; - index_t nbatch = 
in.shape[3]; - index_t nchannel = out.shape[2]; - // we directly unpack all local patches and do a dot product - // this cost lots of memory, normally for large image, only unpack several image at a time - tmp_col.Resize( Shape2( in.shape[2]*ksize*ksize, nbatch*oheight*owidth ) ); - tmp_dst.Resize( Shape2( nchannel, nbatch*oheight*owidth ) ); - // unpack local patches , stride=1 - tmp_col = unpack_patch2col( in, ksize, kstride ); - tmp_dst = dot( kernel, tmp_col ); - // reshape, then swap axis, we chain equations together - out = swapaxis<2,3>( reshape( tmp_dst, Shape4( nchannel, nbatch, oheight, owidth ) ) ); - } - - // backward convolution, calculate gradient of kernel, and backprop back to in - inline static void ConvBackWard( const Tensor &out, const Tensor &kernel, - Tensor &g_kernel, Tensor &in, - int ksize, int kstride, - TensorContainer &tmp_col, TensorContainer& tmp_dst ){ - index_t oheight = (in.shape[1] - ksize)/kstride + 1; - index_t owidth = (in.shape[0] - ksize)/kstride + 1; - index_t nbatch = in.shape[3]; - index_t nchannel = out.shape[2]; - // we directly unpack all local patches and do a dot product - // this cost lots of memory, normally for large image, only unpack several image at a time - tmp_col.Resize( Shape2( in.shape[2]*ksize*ksize, nbatch*oheight*owidth ) ); - tmp_dst.Resize( Shape2( nchannel, nbatch*oheight*owidth ) ); - // unpack local patches - tmp_col = unpack_patch2col( in, ksize, kstride ); - tmp_dst = reshape( swapaxis<2,3>( out ), tmp_dst.shape ); - g_kernel = dot( tmp_dst, tmp_col.T() ); - // backpropgation: not necessary for first layer, but included anyway - tmp_col = dot( kernel.T(), tmp_dst ); - in = pack_col2patch( tmp_col, in.shape, ksize, kstride ); - } -private: - // random seed generator - Random rnd; - // kernel size, pooling size - int ksize, kstride, psize; - // nodes in neural net - TensorContainer ninput, nhidden, nhiddenbak, npool, npoolbak; - TensorContainer nflat, nout; - // temp helper structure - TensorContainer tmp_col, tmp_dst; - // hidden bias, gradient - TensorContainer hbias, obias, g_hbias, g_obias; - // weight, gradient: Ki2h is actually convoltuion kernel, with shape=(num_channel,ksize*ksize) - TensorContainer Ki2h, Wh2o, g_Ki2h, g_Wh2o; -}; - -// helper function to get the max inde -inline int MaxIndex( Tensor pred ){ - int maxidx = 0; - for( index_t i = 1; i < pred.shape[0]; ++i ){ - if( pred[i] > pred[maxidx] ) maxidx = (int)i; - } - return maxidx; -} - -int main( int argc, char *argv[] ){ - if( argc < 2 ){ - printf("Usage: cpu or gpu\n"); return 0; - } - srand(0); - InitTensorEngine(); - - // settings - int batch_size = 100; - int insize = 28; - int nchannel = 10; - int ksize = 5; - int kstride = 1; - int psize = 2; - int num_out = 10; - - // choose which version to use - INNet *net; - if( !strcmp( argv[1], "gpu") ) { -#if DMSHADOW_USE_CUDA==1 - net = new ConvNet( batch_size, insize, nchannel, ksize, kstride, psize, num_out ); -#endif - }else{ - net = new ConvNet( batch_size, insize, nchannel, ksize, kstride, psize, num_out ); - } - - // temp output layer - TensorContainer pred; - pred.Resize( Shape2( batch_size, num_out ) ); - - // label - std::vector ytrain, ytest; - // data - TensorContainer xtrain_, xtest_; - LoadMNIST( "train-images-idx3-ubyte", "train-labels-idx1-ubyte", ytrain, xtrain_, true); - LoadMNIST( "t10k-images-idx3-ubyte", "t10k-labels-idx1-ubyte", ytest, xtest_, false); - - TensorContainer xtrain( Shape4(xtrain_.shape[1], 1, insize, insize) ); - TensorContainer xtest( Shape4(xtest_.shape[1], 1, insize, 
insize) ); - xtrain = reshape( xtrain_, xtrain.shape ); - xtest = reshape( xtest_, xtest.shape ); - - int num_iter = 20; - - for( int i = 0; i < num_iter; ++ i ){ - // training - for( index_t j = 0; j + batch_size <= xtrain.shape[3]; j += batch_size ){ - net->Forward( xtrain.Slice( j, j + batch_size ), pred ); - // set gradient into pred - for( int k = 0; k < batch_size; ++ k ){ - pred[k][ ytrain[k+j] ] -= 1.0f; - } - // scale gradient by batchs zie - pred *= 1.0f / batch_size; - // run backprop - net->Backprop( pred ); - // update net parameters - net->Update(); - } - // evaluation - long nerr = 0; - for( index_t j = 0; j + batch_size <= xtest.shape[3]; j += batch_size ){ - net->Forward( xtest.Slice( j, j + batch_size ), pred ); - for( int k = 0; k < batch_size; ++ k ){ - nerr += MaxIndex( pred[k] ) != ytest[j+k]; - - } - } - printf("round %d: test-err=%f\n", i, (float)nerr/xtest.shape[3] ); - } - delete net; - ShutdownTensorEngine(); - return 0; -} diff --git a/example/neuralnet/nnet.cu b/example/neuralnet/nnet.cu deleted file mode 100644 index a1b4dc2f67f5..000000000000 --- a/example/neuralnet/nnet.cu +++ /dev/null @@ -1,187 +0,0 @@ -// this implements a simple two layer neural net -#include -// header file to use mshadow -#include "mshadow/tensor.h" -// helper function to load mnist dataset -#include "util.h" -// this namespace contains all data structures, functions -using namespace mshadow; -// this namespace contains all operator overloads -using namespace mshadow::expr; - -// define sigmoid operation -struct sigmoid{ - MSHADOW_XINLINE static real_t Map(real_t a) { - return 1.0f/(1.0f+expf(-a)); - } -}; - -/*! \brief interface for nnet, interfacd allows use to use GPU/CPU implementation in a unified way */ -class INNet{ -public: - virtual void Forward( const Tensor& inbatch, Tensor &oubatch ) = 0; - virtual void Backprop( const Tensor& gradout ) = 0; - virtual void Update( void ) = 0; - virtual ~INNet(){} -}; - -/*! 
- * \brief simple two layer neural net - * this implementation is device invariant - */ -template -class NNet : public INNet{ -public: - // initialize the network - NNet( int batch_size, int num_in, int num_hidden, int num_out ):rnd(0){ - // setup nodes - ninput.Resize( Shape2( batch_size, num_in ) ); - nhidden.Resize( Shape2( batch_size, num_hidden ) ); - nhiddenbak.Resize( nhidden.shape ); - nout.Resize( Shape2( batch_size, num_out ) ); - // setup bias - hbias.Resize( Shape1( num_hidden ) ); g_hbias.Resize( hbias.shape ); - obias.Resize( Shape1( num_out ) ); g_obias.Resize( obias.shape ); - hbias = 0.0f; obias = 0.0f; - // setup weights - Wi2h.Resize( Shape2( num_in, num_hidden ) ); g_Wi2h.Resize( Wi2h.shape ); - Wh2o.Resize( Shape2( num_hidden, num_out ) ); g_Wh2o.Resize( Wh2o.shape ); - rnd.SampleGaussian( Wi2h, 0, 0.01f ); - rnd.SampleGaussian( Wh2o, 0, 0.01f ); - - } - virtual ~NNet(){} - // forward propagation - virtual void Forward( const Tensor& inbatch, Tensor &oubatch ){ - // note: in mshadow, shape[0] means lowest dimension, shape[1] is number of rows in matrix - // this is different from numpy convention - index_t batch_size = inbatch.shape[1]; - // copy data to input layer - Copy( ninput, inbatch ); - // first layer, fullc - nhidden = dot( ninput, Wi2h ); - nhidden+= repmat( hbias, batch_size ); - // activation, sigmloid, backup activation in nhidden - nhidden = F( nhidden ); - Copy( nhiddenbak, nhidden ); - // second layer fullc - nout = dot( nhiddenbak, Wh2o ); - nout += repmat( obias, batch_size ); - // softmax calculation - Softmax( nout, nout ); - // copy result out - Copy( oubatch, nout ); - } - // back propagation - virtual void Backprop( const Tensor& gradout ){ - // copy gradient to output layer - Copy( nout, gradout ); - // calc grad of layer 2 - g_obias = sum_rows( nout ); - g_Wh2o = dot( nhiddenbak.T(), nout ); - // backprop to layer 1 - nhiddenbak = dot( nout, Wh2o.T() ); - // calculate gradient of sigmoid layer - nhidden = nhidden * (1.0f-nhidden) * nhiddenbak; - // calc grad of layer 1 - g_hbias = sum_rows( nhidden ); - g_Wi2h = dot( ninput.T(), nhidden ); - } - // update weight - virtual void Update( void ){ - // run SGD - const float eta = 0.8; - const float wd = 0.00001; - // update weight - Wi2h -= eta * ( wd * Wi2h + g_Wi2h ); - Wh2o -= eta * ( wd * Wh2o + g_Wh2o ); - // no regularization for bias - hbias-= eta * g_hbias; - obias-= eta * g_obias; - } -private: - // random seed generator - Random rnd; - // nodes in neural net - TensorContainer ninput, nhidden, nhiddenbak, nout; - // hidden bias, gradient - TensorContainer hbias, obias, g_hbias, g_obias; - // weight gradient - TensorContainer Wi2h, Wh2o, g_Wi2h, g_Wh2o; -}; - -// helper function to get the max inde -inline int MaxIndex( Tensor pred ){ - int maxidx = 0; - for( index_t i = 1; i < pred.shape[0]; ++i ){ - if( pred[i] > pred[maxidx] ) maxidx = (int)i; - } - return maxidx; -} - -int main( int argc, char *argv[] ){ - if( argc < 2 ){ - printf("Usage: cpu or gpu\n"); return 0; - } - srand(0); - InitTensorEngine(); - - // settings - int batch_size = 100; - int num_in = 28 * 28; - int num_hidden = 100; - int num_out = 10; - - // choose which version to use - INNet *net; - if( !strcmp( argv[1], "gpu") ) { -#if DMSHADOW_USE_CUDA==1 - net = new NNet( batch_size, num_in, num_hidden, num_out ); -#endif - }else{ - net = new NNet( batch_size, num_in, num_hidden, num_out ); - } - - // temp output layer - TensorContainer pred; - pred.Resize( Shape2( batch_size, num_out ) ); - - // label - std::vector ytrain, 
ytest; - // data - TensorContainer xtrain, xtest; - LoadMNIST( "train-images-idx3-ubyte", "train-labels-idx1-ubyte", ytrain, xtrain, true); - LoadMNIST( "t10k-images-idx3-ubyte", "t10k-labels-idx1-ubyte", ytest, xtest, false); - - int num_iter = 20; - - for( int i = 0; i < num_iter; ++ i ){ - // training - for( index_t j = 0; j + batch_size <= xtrain.shape[1]; j += batch_size ){ - net->Forward( xtrain.Slice( j, j + batch_size ), pred ); - // set gradient into pred - for( int k = 0; k < batch_size; ++ k ){ - pred[k][ ytrain[k+j] ] -= 1.0f; - } - // scale gradient by batchs zie - pred *= 1.0f / batch_size; - // run backprop - net->Backprop( pred ); - // update net parameters - net->Update(); - } - // evaluation - long nerr = 0; - for( index_t j = 0; j + batch_size <= xtest.shape[1]; j += batch_size ){ - net->Forward( xtest.Slice( j, j + batch_size ), pred ); - for( int k = 0; k < batch_size; ++ k ){ - nerr += MaxIndex( pred[k] ) != ytest[j+k]; - - } - } - printf("round %d: test-err=%f\n", i, (float)nerr/xtest.shape[1] ); - } - delete net; - ShutdownTensorEngine(); - return 0; -} diff --git a/example/neuralnet/run.sh b/example/neuralnet/run.sh deleted file mode 100644 index 8b137891791f..000000000000 --- a/example/neuralnet/run.sh +++ /dev/null @@ -1 +0,0 @@ - diff --git a/example/neuralnet/util.h b/example/neuralnet/util.h deleted file mode 100644 index 50bcef3fdd90..000000000000 --- a/example/neuralnet/util.h +++ /dev/null @@ -1,82 +0,0 @@ -#pragma once -#include -#include -#include -#include "mshadow/tensor.h" - -using namespace mshadow; - -int pack( unsigned char zz[4] ){ - return (int)(zz[3]) - | (((int)(zz[2])) << 8) - | (((int)(zz[1])) << 16) - | (((int)(zz[0])) << 24); -} - -template -inline void shuffle( T *data, size_t sz ){ - if( sz == 0 ) return; - for( size_t i = sz - 1; i > 0; i-- ){ - std::swap( data[i], data[ rand() % ( i+1 ) ] ); - } -} -// random shuffle the data inside, require PRNG -template -inline void shuffle( std::vector &data ){ - shuffle( &data[0], data.size() ); -} - -// simple function to load in mnist -inline void LoadMNIST( const char *path_img, const char *path_label, - std::vector& ylabel, TensorContainer& xdata, bool do_shuffle ){ - // load in data - FILE *fi = fopen( path_img, "rb" ); - if( fi == NULL ){ - printf("cannot open %s\n", path_img ); - exit(-1); - } - unsigned char zz[4]; - unsigned char *t_data, *l_data; - int num_image, width, height, nlabel; - assert( fread(zz, 4 , 1, fi ) ); - assert( fread(zz, 4 , 1, fi ) ); - num_image = pack( zz ); - assert( fread(zz, 4 , 1, fi ) ); - width = pack( zz ); - assert( fread(zz, 4 , 1, fi ) ); - height = pack( zz ); - - int step = width * height; - t_data = new unsigned char[ num_image * step ]; - assert( fread( t_data, step*num_image , 1 , fi ) ); - fclose( fi ); - - // load in label - fi = fopen( path_label, "rb" ); - assert( fread(zz, 4 , 1, fi ) ); - assert( fread(zz, 4 , 1, fi ) ); - nlabel = pack( zz ); - assert( num_image == nlabel ); - l_data = new unsigned char[ num_image ]; - assert( fread( l_data, num_image , 1 , fi ) ); - // try to do shuffle - std::vector rindex; - for( int i = 0; i < num_image; ++ i ){ - rindex.push_back( i ); - } - if( do_shuffle ){ - shuffle( rindex ); - } - - // save out result - ylabel.resize( num_image ); - xdata.Resize( Shape2( num_image, width * height ) ); - for( int i = 0 ; i < num_image ; ++i ){ - for( int j = 0; j < step; ++j ) { - xdata[ i ][ j ] = (float)(t_data[ rindex[i]*step + j ]) / 256.0f; - } - ylabel[ i ] = l_data[ rindex[i] ]; - } - delete[] t_data; delete [] 
l_data; - printf("finish loading %dx%d matrix from %s, shuffle=%d\n", num_image, step, path_img, (int)do_shuffle ); -} diff --git a/guide/.gitignore b/guide/.gitignore new file mode 100644 index 000000000000..f4ccede58e76 --- /dev/null +++ b/guide/.gitignore @@ -0,0 +1,2 @@ +defop +basic \ No newline at end of file diff --git a/example/Makefile b/guide/Makefile similarity index 72% rename from example/Makefile rename to guide/Makefile index cceb3567f859..930867bb7bf2 100644 --- a/example/Makefile +++ b/guide/Makefile @@ -2,15 +2,17 @@ export CC = gcc export CXX = g++ export NVCC =nvcc -export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ -export LDFLAGS= -lm -lcudart -lcublas -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lpthread -export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) +include config.mk +include ../make/mshadow.mk +export CFLAGS = -Wall -O3 -I../ $(MSHADOW_CFLAGS) +export LDFLAGS= -lm $(MSHADOW_LDFLAGS) +export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) # specify tensor path BIN = basic defop OBJ = CUOBJ = -CUBIN = +CUBIN = .PHONY: clean all all: $(BIN) $(OBJ) $(CUBIN) $(CUOBJ) diff --git a/guide/README.md b/guide/README.md new file mode 100644 index 000000000000..ee36ca789208 --- /dev/null +++ b/guide/README.md @@ -0,0 +1,221 @@ +Tutorial of mshadow +===== +This is a beginner's tutorial of mshadow. If you like mshadow and have ideas to improve this tutorial, you are more than welcomed:) +Please send a pull-request if you would like to share your experience. + +See also other related materials about mshadow +* [Expression Template Tutorial](exp-template) +* [Writing Multi-GPU and Distributed ML](mshadow-ps) + +**List of Topics** +* [Tensor Data Structure](#tensor-data-structure) +* [Memory Allocation](#memory-allocation) +* [Elementwise Operations](#elementwise-operations) +* [One code for both CPU and GPU](#one-code-for-both-cpu-and-gpu) +* [Matrix Multiplications](#matrix-multiplications) +* [User Defined Operator](#user-defined-operator) + +Tensor Data Structure +==== +The basic data structure of mshadow is Tensor. The following is a simplified equivalent version of +the declaration in [mashadow/tensor.h](../mshadow/tensor.h) +```c++ +typedef unsigned index_t; +template +struct Shape { + index_t shape_[dimension]; +}; +template +struct Tensor { + DType *dptr_; + Shape shape_; + index_t stride_; +}; +// this is how shape object declaration look like +Shape<2> shape2; +// this is how tensor object declaration look like +// you can +Tensor ts2; +Tensor ts3; +``` +``` Tensor``` means a two dimensional tensor in CPU, while ``` Tensor``` means three dimensional tensor in GPU. +```Shape``` gives the shape information of k-dimensional tensor. The declaration use template, and +can be specialized into tensor of specific device and dimension. This is what two dimensional tensor will look like: +```c++ +struct Shape<2> { + index_t shape_[2]; +}; +struct Tensor { + float *dptr_; + Shape<2> shape_; + index_t stride_; +}; +``` +* ``` Tensor``` contains ```dptr_```, which points to the space that backup the tensor. +* ```Shape<2>``` is a structure that stores shape information, the convention is same as numpy +* ```stride_``` gives the number of cell space allocated in the smallest dimension (if we use numpy convention, the dimension corresponds to shape_[-1]). + This is introduced when we introduce some padding cells in lowest dimension to make sure memory is aligned. 
+ - ```stride_``` is automatically set during memory allocation of tensor in mshadow. + +To understand the data structure, consider the following code: +``` c++ +float data[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; +Tensor ts; +ts.dptr_ = data; +ts.shape_ = mshadow::Shape2(3, 2); +ts.stride_ = 3; +// now: ts[0][0] == 0, ts[0][1] == 1 , ts[1][0] == 3, ts[1][1] == 4 +for (index_t i = 0; i < ts.size(0); ++i) { + for (index_t j = 0; j < ts.size(1), ++j) { + printf("ts[%u][%u]=%f\n", i, j, ts[i][j]); + } +} +``` +The result ts should be a 3 * 2 matrix, where data[2], data[5], data[8] are padding cells that are ignored. If you want a continuous memory, set ```stride_=shape_[1]```. + +Memory Allocation +==== +An important design choice about mshadow is that the data structure is a **whitebox**: +it works so long as we set the space pointer ```dptr_```, corresponding ```shape_``` and ```stride_```: +* For ```Tensor```, the space can be created by ```new float[]```, or pointer to some existing space such as float array in last example. +* For ```Tensor```, the space need to lie in GPU, created by ```cudaMallocPitch``` + +mshadow also provide explicit memory allocation routine, demonstrated shown by following code +``` c++ +// create a 5 x 3 tensor on GPU, and allocate space +Tensor ts2(Shape2(5, 3)); +AllocSpace(&ts2); +// allocate 5 x 3 x 2 tensor on CPU, initialized by 0 +Tensor ts3 = NewTensor(Shape3(5,3,2), 0.0f); +// free space +FreeSpace(&ts2); FreeSpace(&ts3); +``` +All memory allocations in mshadow are **explicit**. There is **no** implicit memory allocation and de-allocation during any operations. +This means ```Tensor``` variable is more like a reference handle(pointer), instead of a object. If we assign a tensor to another variable, the two share the same content space. + +This also allows user to use mshadow in their existing project easily, simply give mshadow the pointer of the memory and you can get the benefit of all the mshadow expressions with zero cost:) + +Elementwise Operations +==== +All the operators(+, -, *, /, += etc.) in mshadow are element-wise. Consider the following SGD update code: +```c++ +void UpdateSGD(Tensor weight, Tensor grad, float eta, float lambda) { + weight -= eta * (grad + lambda * weight); +} +``` +During compilation, this code will be translated to the following form: +```c++ +void UpdateSGD(Tensor weight, Tensor grad, float eta, float lambda) { + for (index_t y = 0; y < weight.size(0); ++y) { + for (index_t x = 0; x < weight.size(1); ++x) { + weight[y][x] -= eta * (grad[y][x] + lambda * weight[y][x]); + } + } +} +``` +As we can see, *no memory allocation* is happened in the translated code. For ```Tensor```, the corresponding function will be translated into a CUDA kernel of same spirit. +Using [Expression Template](exp-template), the translation is happened during compile time. We can write simple lines of code while get the full performance of the translated code. + +One code for both CPU and GPU +==== +Since mshadow have identical interface for ```Tensor``` and ```Tensor```, we can easily write one code that works in both CPU and GPU. +For example, the following code compiles for both GPU and CPU Tensors. +```c++ +template +void UpdateSGD(Tensor weight, const Tensor &grad, + float eta, float lambda) { + weight -= eta * (grad + lambda * weight); +} +``` +Matrix Multiplications +==== +We also have short hands for dot product, as like follows. The code will be translated to call standard packages such as MKL and CuBLAS. 
+```c++ +template +void Backprop(Tensor gradin, + const Tensor &gradout, + const Tensor &netweight) { + gradin = dot(gradout, netweight.T()); +} +``` +Again, the code can compile for both GPU and CPU Tensors + +User Defined Operator +==== +There are common cases when we want to define our own function. For example, assume we do not have element-wise sigmoid transformation in mshadow, +which is very commonly used in machine learning algorithms. We simply use the following code to add sigmoid to mshadow +```c++ +struct sigmoid { + MSHADOW_XINLINE static float Map(float a) { + return 1.0f / (1.0f + expf(-a)); + } +}; +template +void ExampleSigmoid(Tensor out, const Tensor &in) { + out = F(in * 2.0f) + 1.0f; +} +``` +The equivalent translated code for CPU is given by +```c++ +template +void ExampleSigmoid(Tensor out, const Tensor &in) { + for (index_t y = 0; y < out.size(0); ++y) { + for(index_t x = 0; x < out.size(1); ++x) { + out[y][x] = sigmoid::Map(in[y][x] * 2.0f) + 1.0f; + } + } +} +``` +Also note that the defined operation can be **composited into expressions**, not only we can write ```out = F(in)```, +we can also write ```out = F+2.0``` or ```out = F(F(in))```. + +There will also be a translated CUDA kernel version that runs in GPU. Check out [defop.cpp](defop.cpp) for complete example. + +Complete Example +==== +The following code is from [basic.cpp](basic.cpp), that illustrate basic usage of mshadow. + +```c++ +// header file to use mshadow +#include "mshadow/tensor.h" +// this namespace contains all data structures, functions +using namespace mshadow; +// this namespace contains all operator overloads +using namespace mshadow::expr; + +int main(void) { + // intialize tensor engine before using tensor operation, needed for CuBLAS + InitTensorEngine(); + // assume we have a float space + float data[20]; + // create a 2 x 5 x 2 tensor, from existing space + Tensor ts(data, Shape3(2,5,2)); + // take first subscript of the tensor + Tensor mat = ts[0]; + // Tensor object is only a handle, assignment means they have same data content + // we can specify content type of a Tensor, if not specified, it is float bydefault + Tensor mat2 = mat; + + // shaape of matrix, note size order is same as numpy + printf("%u X %u matrix\n", mat.size(1), mat.size(1)); + + // initialize all element to zero + mat = 0.0f; + // assign some values + mat[0][1] = 1.0f; mat[1][0] = 2.0f; + // elementwise operations + mat += (mat + 10.0f) / 10.0f + 2.0f; + + // print out matrix, note: mat2 and mat1 are handles(pointers) + for (index_t i = 0; i < mat.size(0); ++i) { + for (index_t j = 0; j < mat.size(1); ++j) { + printf("%.2f ", mat2[i][j]); + } + printf("\n"); + } + // shutdown tensor enigne after usage + ShutdownTensorEngine(); + return 0; +} +``` + diff --git a/guide/basic.cpp b/guide/basic.cpp new file mode 100644 index 000000000000..cb6586d398d0 --- /dev/null +++ b/guide/basic.cpp @@ -0,0 +1,42 @@ +// header file to use mshadow +#include "mshadow/tensor.h" +// this namespace contains all data structures, functions +using namespace mshadow; +// this namespace contains all operator overloads +using namespace mshadow::expr; + +int main(void) { + // intialize tensor engine before using tensor operation, needed for CuBLAS + InitTensorEngine(); + // assume we have a float space + float data[20]; + // create a 2 x 5 x 2 tensor, from existing space + Tensor ts(data, Shape3(2,5,2)); + // take first subscript of the tensor + Tensor mat = ts[0]; + // Tensor object is only a handle, assignment means they have same data 
content + // we can specify content type of a Tensor, if not specified, it is float bydefault + Tensor mat2 = mat; + mat = Tensor(data, Shape1(10)).FlatTo2D(); + + // shaape of matrix, note size order is same as numpy + printf("%u X %u matrix\n", mat.size(0), mat.size(1)); + return 0; + // initialize all element to zero + mat = 0.0f; + // assign some values + mat[0][1] = 1.0f; mat[1][0] = 2.0f; + // elementwise operations + mat += (mat + 10.0f) / 10.0f + 2.0f; + + // print out matrix, note: mat2 and mat1 are handles(pointers) + for (index_t i = 0; i < mat.size(0); ++i) { + for (index_t j = 0; j < mat.size(1); ++j) { + printf("%.2f ", mat2[i][j]); + } + printf("\n"); + } + // shutdown tensor enigne after usage + ShutdownTensorEngine(); + return 0; +} diff --git a/guide/basic_stream.cu b/guide/basic_stream.cu new file mode 100644 index 000000000000..18dc64ed4c7f --- /dev/null +++ b/guide/basic_stream.cu @@ -0,0 +1,33 @@ +// header file to use mshadow +#include "mshadow/tensor.h" +// this namespace contains all data structures, functions +using namespace mshadow; +// this namespace contains all operator overloads +using namespace mshadow::expr; + +int main(void) { + // intialize tensor engine before using tensor operation, needed for CuBLAS + InitTensorEngine(); + // create a 2 x 5 tensor, from existing space + Tensor ts1 = NewTensor(Shape2(2, 5), 0.0f); + Tensor ts2 = NewTensor(Shape2(2, 5), 0.0f); + ts1.stream_ = NewStream(); + ts2.stream_ = NewStream(); + ts1 = 1; // Should use stream 0. + ts2 = 2; // Should use stream 1. Can run in parallel with stream 0. + Tensor res = NewTensor(Shape2(2, 2), 0.0f); + res.stream_ = NewStream(); + res = dot(ts1, ts2.T()); //Should use stream 2. + + Tensor cpu_res = NewTensor(Shape2(2, 2), 0.0f); + Copy(cpu_res, res); // default stream, should be 0. 
+ for (index_t i = 0; i < cpu_res.size(0); ++i){ + for (index_t j = 0; j < cpu_res.size(1); ++j){ + printf("%.2f ", cpu_res[i][j]); + } + printf("\n"); + } + // shutdown tensor enigne after usage + ShutdownTensorEngine(); + return 0; +} diff --git a/guide/config.mk b/guide/config.mk new file mode 100644 index 000000000000..b28f41741543 --- /dev/null +++ b/guide/config.mk @@ -0,0 +1,35 @@ +#--------------------------------------------------------------------------------------- +# mshadow: the configuration compile script +# +# This is configuration script that you can use to compile mshadow +# Usage: +# +# include config.mk in your Makefile, or directly include the definition of variables +# include mshadow.mk after the variables are set +# +# Add MSHADOW_CFLAGS to the compile flags +# Add MSHADOW_LDFLAGS to the linker flags +# Add MSHADOW_NVCCFLAGS to the nvcc compile flags +#---------------------------------------------------------------------------------------- + +# whether use CUDA during compile +USE_CUDA = 0 + +# add the path to CUDA libary to link and compile flag +# if you have already add them to enviroment variable, leave it as NONE +USE_CUDA_PATH = NONE + +# +# choose the version of blas you want to use +# can be: mkl, blas, atlas, openblas, apple +USE_BLAS = atlas +# +# add path to intel library, you may need it +# for MKL, if you did not add the path to enviroment variable +# +USE_INTEL_PATH = NONE + +# whether compile with parameter server +USE_DIST_PS = 0 +PS_PATH = NONE +PS_THIRD_PATH = NONE diff --git a/guide/defop.cpp b/guide/defop.cpp new file mode 100644 index 000000000000..074b81cc141e --- /dev/null +++ b/guide/defop.cpp @@ -0,0 +1,47 @@ +#include +// header file to use mshadow +#include "mshadow/tensor.h" +// this namespace contains all data structures, functions +using namespace mshadow; +// this namespace contains all operator overloads +using namespace mshadow::expr; + +// user defined unary operator addone +struct addone { + // map can be template function + template + MSHADOW_XINLINE static DType Map(DType a) { + return a + static_cast(1); + } +}; +// user defined binary operator max of two +struct maxoftwo { + // map can also be normal functions, + // however, this can only be applied to float tensor + MSHADOW_XINLINE static float Map(float a, float b) { + if(a > b) return a; + else return b; + } +}; + +int main(void) { + // intialize tensor engine before using tensor operation, needed for CuBLAS + InitTensorEngine(); + // take first subscript of the tensor + Tensor mat = NewTensor(Shape2(2,3), 0.0f); + Tensor mat2= NewTensor(Shape2(2,3), 0.0f); + + mat[0][0] = -2.0f; + mat = F(F(mat) + 0.5f, mat2); + + for (index_t i = 0; i < mat.size(0); ++i) { + for (index_t j = 0; j < mat.size(1); ++j) { + printf("%.2f ", mat[i][j]); + } + printf("\n"); + } + FreeSpace(&mat); FreeSpace(&mat2); + // shutdown tensor enigne after usage + ShutdownTensorEngine(); + return 0; +} diff --git a/guide/exp-template/.gitignore b/guide/exp-template/.gitignore new file mode 100644 index 000000000000..fc070ad5bd7e --- /dev/null +++ b/guide/exp-template/.gitignore @@ -0,0 +1 @@ +exp_* \ No newline at end of file diff --git a/example/exp-template/Makefile b/guide/exp-template/Makefile similarity index 100% rename from example/exp-template/Makefile rename to guide/exp-template/Makefile diff --git a/guide/exp-template/README.md b/guide/exp-template/README.md new file mode 100644 index 000000000000..c824d8e4e3c6 --- /dev/null +++ b/guide/exp-template/README.md @@ -0,0 +1,340 @@ +Expression 
Template Tutorial +==== +This page explains how mshadow works. The main trick behind mshadow is called [Expression Template](http://en.wikipedia.org/wiki/Expression_templates). +We will explain how it will affect the performance of compiled code. Expression template is the major trick behind the C++ matrix libraries such as Eigen, GSL, boost.uBLAS. + +How to write efficient machine learning code +==== +Before we start, let us think of the question above. Assume we want to write down the update rule +```c++ +weight = - eta * (grad + lambda * weight); +``` +Where weight and grad are vectors of length ```n```. When you choose C++ as your programming language, +I guess the major concern is efficiency. There is one principle that is important and used in most C/C++ programs: +* Pre-allocate necessary memory, **no temporal memory allocation** during running. + +An example code is like +```c++ +void UpdateWeight (const float *grad, float eta, float lambda, + int n, float *weight) { + for (int i = 0; i < n; ++i) { + weight[i] = - eta * (grad[i] + lambda * weight[i]); + } +} +``` +The function takes the pre-allocated space grad, and weight, and run the calculation. Writing these functions are simple, +however, it can be annoying when we write them repeatedly. So the question is, can we write as follows, and get same performance as previous code? +```c++ +void UpdateWeight (const Vec& grad, float eta, float lambda, Vec& weight) { + weight = -eta * (grad + lambda * weight); +} +``` +The answer is yes, but not by the most obvious solution. + +A Naive Bad Solution +==== +Let us first take a look at a most straight forward solution: operator overloading. +```c++ +// Naive solution for vector operation overloading +struct Vec { + int len; + float* dptr; + Vec(int len) : len(len) { + dptr = new float[len]; + } + Vec(const Vec& src) : len(src.len) { + dptr = new float[len]; + memcpy(dptr, src.dptr, sizeof(float)*len ); + } + ~Vec(void) { + delete [] dptr; + } +}; + +inline Vec operator+(const Vec &lhs, const Vec &rhs) { + Vec res(lhs.len); + for (int i = 0; i < lhs.len; ++i) { + res.dptr[i] = lhs.dptr[i] + rhs.dptr[i]; + } + return res; +} +``` +If we add more operators overloading in the same style, we can get what we want, and write equations instead of loop. +However, this kind of approach is inefficient, because temporal memory is allocated and de-allocated during each operation, while we could have done better. + +An alternative, more effective way is only overload operator+=, operator-=, which can be implemented without temporal memory allocation. But this limits the equations we can write. + +We will discuss why we still need expression template although C++11 provides move assignment operator and rvalue reference at the end of this tutorial. + +Lazy Evaluation +==== +Let us think why we need temporal memory allocation when doing operator+. This is because we *do not know* the target that will be assigned to in operator+, +otherwise we could have directly storing into target memory instead of temporal memory. + +What if we can know the target? The following code ([exp_lazy.cpp](exp_lazy.cpp)) achieves this. 
+```c++ +// Example Lazy evaluation code +// for simplicity, we use struct and make all members public +#include +struct Vec; +// expression structure holds the expression +struct BinaryAddExp { + const Vec &lhs; + const Vec &rhs; + BinaryAddExp(const Vec &lhs, const Vec &rhs) + : lhs(lhs), rhs(rhs) {} +}; +// no constructor and destructor to allocate and de-allocate memory, +// allocation done by user +struct Vec { + int len; + float* dptr; + Vec(void) {} + Vec(float *dptr, int len) + : len(len), dptr(dptr) {} + // here is where evaluation happens + inline Vec &operator=(const BinaryAddExp &src) { + for (int i = 0; i < len; ++i) { + dptr[i] = src.lhs.dptr[i] + src.rhs.dptr[i]; + } + return *this; + } +}; +// no evaluation happens here +inline BinaryAddExp operator+(const Vec &lhs, const Vec &rhs) { + return BinaryAddExp(lhs, rhs); +} + +const int n = 3; +int main(void) { + float sa[n] = {1, 2, 3}; + float sb[n] = {2, 3, 4}; + float sc[n] = {3, 4, 5}; + Vec A(sa, n), B(sb, n), C(sc, n); + // run expression + A = B + C; + for (int i = 0; i < n; ++i) { + printf("%d:%f==%f+%f\n", i, A.dptr[i], B.dptr[i], C.dptr[i]); + } + return 0; +} +``` +The idea is that we do not actually do computation in operator+, but only return a expression structure (like abstract syntax tree), +and when we overload operator=, we see the target, as well as all the operands, and we can run computation without introducing extra memory! +Similarly, we can define a DotExp and lazily evaluate at operator=, and redirect matrix(vector) multiplications to BLAS. + + +More Lengthy Expressions and Expression Template +==== +By using lazy evaluation, we are cool by avoiding temporal memory allocations. But the ability of the code is limited: +* We can only write ```A=B+C```, but not more lengthy expressions. +* When we add more expression, we need to write more operator= to evaluate each equations. + +Here is where the magic of template programming comes to rescue. The following code ([exp_template.cpp](exp_template.cpp)), +which is a bit more lengthy, also allows you to write lengthy equations. 
+```c++ +// Example code, expression template, and more length equations +// for simplicity, we use struct and make all members public +#include + +// this is expression, all expressions must inheritate it, +// and put their type in subtype +template +struct Exp { + // returns const reference of the actual type of this expression + inline const SubType& self(void) const { + return *static_cast(this); + } +}; + +// binary add expression +// note how it is inheritates from Exp +// and put its own type into the template argument +template +struct BinaryAddExp: public Exp > { + const TLhs &lhs; + const TRhs &rhs; + BinaryAddExp(const TLhs& lhs, const TRhs& rhs) + : lhs(lhs), rhs(rhs) {} + // evaluation function, evaluate this expression at position i + inline float Eval(int i) const { + return lhs.Eval(i) + rhs.Eval(i); + } +}; +// no constructor and destructor to allocate +// and de-allocate memory, allocation done by user +struct Vec: public Exp { + int len; + float* dptr; + Vec(void) {} + Vec(float *dptr, int len) + :len(len), dptr(dptr) {} + // here is where evaluation happens + template + inline Vec& operator= (const Exp& src_) { + const EType &src = src_.self(); + for (int i = 0; i < len; ++i) { + dptr[i] = src.Eval(i); + } + return *this; + } + // evaluation function, evaluate this expression at position i + inline float Eval(int i) const { + return dptr[i]; + } +}; +// template add, works for any expressions +template +inline BinaryAddExp +operator+(const Exp &lhs, const Exp &rhs) { + return BinaryAddExp(lhs.self(), rhs.self()); +} + +const int n = 3; +int main(void) { + float sa[n] = {1, 2, 3}; + float sb[n] = {2, 3, 4}; + float sc[n] = {3, 4, 5}; + Vec A(sa, n), B(sb, n), C(sc, n); + // run expression, this expression is longer:) + A = B + C + C; + for (int i = 0; i < n; ++i) { + printf("%d:%f == %f + %f + %f\n", i, + A.dptr[i], B.dptr[i], + C.dptr[i], C.dptr[i]); + } + return 0; +} +``` +The key idea of the code is the template ```Exp``` takes type of its derived class as template argument, so it can convert itself to +the SubType via ```self()```. BinaryAddExp now is a template class that can composite expressions together, like a template version of Composite pattern. +The evaluation is done through function Eval, which is done in a recursive way in BinaryAddExp. +* Due to inlining, the function calls of ```src.Eval(i)``` in ```operator=``` will be compiled into ```B.dptr[i] + C.dptr[i] + C.dptr[i]``` in compile time. +* We can write equations for element-wise operations with same efficiency as if we write a loop + +Make it more flexible +==== +As we can find in the previous example, template programming is a powerful to make things flexible in compile time, our final example, +which is closer to mshadow, allows user customized binary operators ([exp_template_op.cpp](exp_template_op.cpp)). 
+```c++ +// Example code, expression template +// with binary operator definition and extension +// for simplicity, we use struct and make all members public +#include + +// this is expression, all expressions must inheritate it, +// and put their type in subtype +template +struct Exp{ + // returns const reference of the actual type of this expression + inline const SubType& self(void) const { + return *static_cast(this); + } +}; + +// binary operators +struct mul{ + inline static float Map(float a, float b) { + return a * b; + } +}; + +// binary add expression +// note how it is inheritates from Exp +// and put its own type into the template argument +template +struct BinaryMapExp: public Exp >{ + const TLhs& lhs; + const TRhs& rhs; + BinaryMapExp(const TLhs& lhs, const TRhs& rhs) + :lhs(lhs), rhs(rhs) {} + // evaluation function, evaluate this expression at position i + inline float Eval(int i) const { + return OP::Map(lhs.Eval(i), rhs.Eval(i)); + } +}; +// no constructor and destructor to allocate and de-allocate memory +// allocation done by user +struct Vec: public Exp{ + int len; + float* dptr; + Vec(void) {} + Vec(float *dptr, int len) + : len(len), dptr(dptr) {} + // here is where evaluation happens + template + inline Vec& operator=(const Exp& src_) { + const EType &src = src_.self(); + for (int i = 0; i < len; ++i) { + dptr[i] = src.Eval(i); + } + return *this; + } + // evaluation function, evaluate this expression at position i + inline float Eval(int i) const { + return dptr[i]; + } +}; +// template add, works for any expressions +template +inline BinaryMapExp +F(const Exp& lhs, const Exp& rhs) { + return BinaryMapExp(lhs.self(), rhs.self()); +} + +template +inline BinaryMapExp +operator*(const Exp& lhs, const Exp& rhs) { + return F(lhs, rhs); +} + +// user defined operation +struct maximum{ + inline static float Map(float a, float b) { + return a > b ? a : b; + } +}; + +const int n = 3; +int main(void) { + float sa[n] = {1, 2, 3}; + float sb[n] = {2, 3, 4}; + float sc[n] = {3, 4, 5}; + Vec A(sa, n), B(sb, n), C(sc, n); + // run expression, this expression is longer:) + A = B * F(C, B); + for (int i = 0; i < n; ++i) { + printf("%d:%f == %f * max(%f, %f)\n", + i, A.dptr[i], B.dptr[i], C.dptr[i], B.dptr[i]); + } + return 0; +} +``` + +Summary +===== +Up to this point, you should have understand basic ideas how it works: +* Lazy evaluation, to allow us see all the operands and target +* Template composition and recursive evaluation, to allows us evaluate arbitrary composite expressions for element-wise operations. +* Due to template and inlining, writing expressions are as efficient as if we directly write a for loop to implement the update rule:) + +So write expressions when you write machine learning codes, and focus your energy on the algorithm part that matters. + +The Expression Template in MShadow +===== +Expression template in mshadow use the same key points as we introduced in the tutorial, with some minor differences: +* We separate evaluation code from expression construction and composition code. + - Instead of putting Eval in Exp class. A Plan class is created from expression, and used to evaluate the result. + - This allows us to put less variables in Plan, for example, we do not need array length when we evaluate a data. + - One important reason is CUDA kernel cannot take class with const references + - This design choice is debatable, but we find it is useful so far. 
+* Lazy support for complex expressions such as matrix dot product + - Besides element-wise expressions, we also want to support sugars such as ```A = dot(B.T(), C)```, again, lazy evaluation is used and no extra memory is allocated. +* Type checking and array length checking. + +Notes +==== +* Expression Template and C++11: in C++11, move constructor can be used to save repetitive allocation memory, which removes some need to expression template. However, the space still needs to be allocated at least once. + - This only removes the need of expression template then expression generate space, say dst = A+B+C, dst does not contain space allocated before assignment. + - If we want to keep the syntax that everything is pre-allocated, and expression executes without memory allocation (which is what we did in mshadow), we still need expression template. + diff --git a/guide/exp-template/exp_lazy.cpp b/guide/exp-template/exp_lazy.cpp new file mode 100644 index 000000000000..4e6a6b14b9de --- /dev/null +++ b/guide/exp-template/exp_lazy.cpp @@ -0,0 +1,45 @@ +// Example Lazy evaluation code +// for simplicity, we use struct and make all members public +#include +struct Vec; +// expression structure holds the expression +struct BinaryAddExp { + const Vec &lhs; + const Vec &rhs; + BinaryAddExp(const Vec &lhs, const Vec &rhs) + : lhs(lhs), rhs(rhs) {} +}; +// no constructor and destructor to allocate and de-allocate memory, +// allocation done by user +struct Vec { + int len; + float* dptr; + Vec(void) {} + Vec(float *dptr, int len) + : len(len), dptr(dptr) {} + // here is where evaluation happens + inline Vec &operator=(const BinaryAddExp &src) { + for (int i = 0; i < len; ++i) { + dptr[i] = src.lhs.dptr[i] + src.rhs.dptr[i]; + } + return *this; + } +}; +// no evaluation happens here +inline BinaryAddExp operator+(const Vec &lhs, const Vec &rhs) { + return BinaryAddExp(lhs, rhs); +} + +const int n = 3; +int main(void) { + float sa[n] = {1, 2, 3}; + float sb[n] = {2, 3, 4}; + float sc[n] = {3, 4, 5}; + Vec A(sa, n), B(sb, n), C(sc, n); + // run expression + A = B + C; + for (int i = 0; i < n; ++i) { + printf("%d:%f==%f+%f\n", i, A.dptr[i], B.dptr[i], C.dptr[i]); + } + return 0; +} diff --git a/guide/exp-template/exp_template.cpp b/guide/exp-template/exp_template.cpp new file mode 100644 index 000000000000..556b10316a3b --- /dev/null +++ b/guide/exp-template/exp_template.cpp @@ -0,0 +1,72 @@ +// Example code, expression template, and more length equations +// for simplicity, we use struct and make all members public +#include + +// this is expression, all expressions must inheritate it, +// and put their type in subtype +template +struct Exp { + // returns const reference of the actual type of this expression + inline const SubType& self(void) const { + return *static_cast(this); + } +}; + +// binary add expression +// note how it is inheritates from Exp +// and put its own type into the template argument +template +struct BinaryAddExp: public Exp > { + const TLhs &lhs; + const TRhs &rhs; + BinaryAddExp(const TLhs& lhs, const TRhs& rhs) + : lhs(lhs), rhs(rhs) {} + // evaluation function, evaluate this expression at position i + inline float Eval(int i) const { + return lhs.Eval(i) + rhs.Eval(i); + } +}; +// no constructor and destructor to allocate +// and de-allocate memory, allocation done by user +struct Vec: public Exp { + int len; + float* dptr; + Vec(void) {} + Vec(float *dptr, int len) + :len(len), dptr(dptr) {} + // here is where evaluation happens + template + inline Vec& operator= (const Exp& 
src_) { + const EType &src = src_.self(); + for (int i = 0; i < len; ++i) { + dptr[i] = src.Eval(i); + } + return *this; + } + // evaluation function, evaluate this expression at position i + inline float Eval(int i) const { + return dptr[i]; + } +}; +// template add, works for any expressions +template +inline BinaryAddExp +operator+(const Exp &lhs, const Exp &rhs) { + return BinaryAddExp(lhs.self(), rhs.self()); +} + +const int n = 3; +int main(void) { + float sa[n] = {1, 2, 3}; + float sb[n] = {2, 3, 4}; + float sc[n] = {3, 4, 5}; + Vec A(sa, n), B(sb, n), C(sc, n); + // run expression, this expression is longer:) + A = B + C + C; + for (int i = 0; i < n; ++i) { + printf("%d:%f == %f + %f + %f\n", i, + A.dptr[i], B.dptr[i], + C.dptr[i], C.dptr[i]); + } + return 0; +} diff --git a/guide/exp-template/exp_template_op.cpp b/guide/exp-template/exp_template_op.cpp new file mode 100644 index 000000000000..249b181ada5b --- /dev/null +++ b/guide/exp-template/exp_template_op.cpp @@ -0,0 +1,92 @@ +// Example code, expression template +// with binary operator definition and extension +// for simplicity, we use struct and make all members public +#include + +// this is expression, all expressions must inheritate it, +// and put their type in subtype +template +struct Exp{ + // returns const reference of the actual type of this expression + inline const SubType& self(void) const { + return *static_cast(this); + } +}; + +// binary operators +struct mul{ + inline static float Map(float a, float b) { + return a * b; + } +}; + +// binary add expression +// note how it is inheritates from Exp +// and put its own type into the template argument +template +struct BinaryMapExp: public Exp >{ + const TLhs& lhs; + const TRhs& rhs; + BinaryMapExp(const TLhs& lhs, const TRhs& rhs) + :lhs(lhs), rhs(rhs) {} + // evaluation function, evaluate this expression at position i + inline float Eval(int i) const { + return OP::Map(lhs.Eval(i), rhs.Eval(i)); + } +}; +// no constructor and destructor to allocate and de-allocate memory +// allocation done by user +struct Vec: public Exp{ + int len; + float* dptr; + Vec(void) {} + Vec(float *dptr, int len) + : len(len), dptr(dptr) {} + // here is where evaluation happens + template + inline Vec& operator=(const Exp& src_) { + const EType &src = src_.self(); + for (int i = 0; i < len; ++i) { + dptr[i] = src.Eval(i); + } + return *this; + } + // evaluation function, evaluate this expression at position i + inline float Eval(int i) const { + return dptr[i]; + } +}; +// template add, works for any expressions +template +inline BinaryMapExp +F(const Exp& lhs, const Exp& rhs) { + return BinaryMapExp(lhs.self(), rhs.self()); +} + +template +inline BinaryMapExp +operator*(const Exp& lhs, const Exp& rhs) { + return F(lhs, rhs); +} + +// user defined operation +struct maximum{ + inline static float Map(float a, float b) { + return a > b ? 
a : b; + } +}; + +const int n = 3; +int main(void) { + float sa[n] = {1, 2, 3}; + float sb[n] = {2, 3, 4}; + float sc[n] = {3, 4, 5}; + Vec A(sa, n), B(sb, n), C(sc, n); + // run expression, this expression is longer:) + A = B * F(C, B); + for (int i = 0; i < n; ++i) { + printf("%d:%f == %f * max(%f, %f)\n", + i, A.dptr[i], B.dptr[i], C.dptr[i], B.dptr[i]); + } + return 0; +} diff --git a/guide/mshadow-ps/Makefile b/guide/mshadow-ps/Makefile new file mode 100644 index 000000000000..70cb724248f0 --- /dev/null +++ b/guide/mshadow-ps/Makefile @@ -0,0 +1,36 @@ +# set LD_LIBRARY_PATH +export CC = gcc +export CXX = g++ +export NVCC =nvcc +include config.mk +include ../../make/mshadow.mk +export CFLAGS = -Wall -O3 -fopenmp -I../../ $(MSHADOW_CFLAGS) +export LDFLAGS= -lm $(MSHADOW_LDFLAGS) +export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) + +# specify tensor path +BIN = local_sum.cpu +OBJ = +CUOBJ = +CUBIN = local_sum.gpu +.PHONY: clean all + +all: $(BIN) $(CUBIN) + +local_sum.cpu: local_sum.cpp +local_sum.gpu: local_sum.cu + +$(BIN) : + $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) + +$(OBJ) : + $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) ) + +$(CUOBJ) : + $(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $(filter %.cu, $^) + +$(CUBIN) : + $(NVCC) -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" -Xlinker "$(LDFLAGS)" $(filter %.cu %.cpp %.o, $^) + +clean: + $(RM) $(OBJ) $(BIN) $(CUBIN) $(CUOBJ) *~ diff --git a/guide/mshadow-ps/README.md b/guide/mshadow-ps/README.md new file mode 100644 index 000000000000..3a95798aae98 --- /dev/null +++ b/guide/mshadow-ps/README.md @@ -0,0 +1,174 @@ +mshadow-ps +==== +### Parameter Server Interface for GPU Tensor + +mshadow-ps provides asynchronize parameter server interface for mshadow GPU/CPU Tensor. +This allows you to do ***multi-GPU*** and ***disrtibuted*** (deep) learning in +an ***easy*** and ***unified*** way. + +####List of Resources +* [API Documentation](http://homes.cs.washington.edu/~tqchen/mshadow/doc/namespacemshadow_1_1ps.html) +* [Library Interface Header](../../mshadow-ps/ps.h) +* Tutorial in this page + +Tutorial +==== +Suppose that we are now implementing a Multi-GPU learning program. +One way to do that is through data parallelism. We can launch many +threads, with each thread compute gradient on one GPU, and aggregate +the statistics together. +However, the gradient synchronization step could be cost time, and in +many cases, we can do the computation in an smarter way, so that +we ***overlaps the computation with the synchronization***. + +mshadow-ps provides interface to do such synchronization in an easy way. +The following documents provides a way + +### Getting Sum from Multiple GPUs +We first get familiar with the interface of mshadow-ps. Through the following +program in [local_sum-inl.h](local_sum-inl.h). You can compile the program +by setup the [config.mk](config.mk) according to your computers's enviroment, and type make. + +In the following program, each thread first does some computation locally, then tries to get the sum +of ```data``` through mshadow-ps interface. 
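+
+Assuming the Makefile keeps its default target names, the resulting binaries take a device list on the
+command line: for example ```./local_sum.cpu 0 1``` launches two CPU worker threads (for CPU the ids are
+arbitrary), while ```./local_sum.gpu 0 1``` runs one worker thread on each of GPU 0 and GPU 1 and requires
+```USE_CUDA = 1``` in [config.mk](config.mk). Running on devices 0 and 1, for instance, each worker should
+see ```data[0]``` filled with 2 and ```data[1]``` filled with 3 after synchronization, because the default
+local behaviour is to return the sum of what every device pushed.
+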
+There are four key functions in ```ISharedModel``` interface +* [InitKey](../../mshadow-ps/ps.h#L76) allocates a key to specific tensor shape +* [Push](../../mshadow-ps/ps.h#L100) pushes out the local data to the synchronization interface + - The data pushed by different devices will be aggregated together by key + - Push is an asynchronize call and returns immediately +* [PullReq](../../mshadow-ps/ps.h#L122) requests the result of synchronization to be copied back + - In the local default case, the synchronized result is the sum of pushed data + - mshadow-ps also support the weight update on server side, where the result of PullReq is the updated weight instead of sum of gradient + - PullReq is also asynchronize +* [PullWait](../../mshadow-ps/ps.h#L87) wait until the pull request of corresponding key finishes + +```c++ +// this function is runed by specific thread +template +inline void RunWorkerThread(int devid, + mshadow::ps::ISharedModel *ps) { + // initialize tensor engine + mshadow::InitTensorEngine(devid); + mshadow::Stream *stream = mshadow::NewStream(); + // allocate tensor on xpu + mshadow::TensorContainer data(mshadow::Shape2(2, 3)); + // set the computation stream to the new allocated stream + // this will make subsequent computation whose target is data + // to use the stream, stream is needed for async execution in GPU + data.set_stream(stream); + // assume these operations sets the content of dataient + data[0] = 1.0f; + data[1] = devid + data[0]; + printf("dev%d: before sync, data:\n", devid); + // use print to show result, do not call + // print normally since Copy will block + Print(data); + printf("====================\n"); + // intiaialize the key, register the shape on parameter server + ps->InitKey(data[0].shape_, 0, devid); + ps->InitKey(data[1].shape_, 1, devid); + // push data[0] out, for update, or aggregation + // 0 is the key of the data, devid is the current device id + ps->Push(data[0], 0, devid); + // pull request is used to request the data to be copied back + // once computation is done + ps->PullReq(data[0], 0, devid); + // computation can be done here.. + // the pull request handler will be overlapped with + // similar as previous call + ps->Push(data[1], 1, devid); + ps->PullReq(data[1], 1, devid); + // more computation can be done here... 
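+  // (for example, the gradient of another network layer could be computed here)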
+ // the computation will be overlapped + // PullWait will block until these request finishes + ps->PullWait(0, devid); + ps->PullWait(1, devid); + printf("dev%d: after sync, data:\n", devid); + // use print to show result, do not call + // print normally since Copy will block + Print(data); + printf("====================\n"); + mshadow::DeleteStream(stream); + mshadow::ShutdownTensorEngine(); +} + +template +inline int Run(int argc, char *argv[]) { + if (argc < 2) { + printf("Usage: device list\n"\ + "\tfor CPU the device list can be arbitrary\n"\ + "\tfor GPU the device list need to be actual device index\n"); + return 0; + } + // list of device ids + std::vector devs; + // initialization + for (int i = 1; i < argc; ++i) { + // record the device id + devs.push_back(atoi(argv[i])); + } + mshadow::ps::ISharedModel + *ps = mshadow::ps::CreateSharedModel("local"); + // intiaialize the ps + ps->Init(devs); + // use openmp to launch #devs threads + #pragma omp parallel num_threads(devs.size()) + { + int tid = omp_get_thread_num(); + RunWorkerThread(devs[tid], ps); + } + delete ps; + return 0; +} +``` +In the above example, we did not do weight update on server side, so the synchronization result is +simply the sum of data on each device. The key property of this interface is that the Push and PullReq are asynchronize. +* We can call these two functions once the gradient is ready, and the mshadow-ps will do the data synchronization in the background. +* When we need the result of synchronization, we simply call PullWait to wait the synchronization task to finish. +* Such interface allows us to do additional computation between the Push/PullReq and PullWait + +### A MultiGPU Neural Net +To get a more concrete understanding of the interface. We give an example of multi-GPU two layer neuralnet +in [../neuralnet/nnet_ps.cu](../neuralnet/nnet_ps.cu). The general idea is follows +* Push and PullReq is called once we get the gradient of certain layer +* PullWait is called before we do forward on that layer next time +* This creates a ***time lag*** between the backprop and next forward to that layer + - mshadow-ps do synchronization concurrently with computations during the time lag + - The time lag is big for latter layers, which also usually need more time to synchronize + +There are several note of the mshadow-ps on the neural net code +* Callback function in PullReq + - A callback function can be pass to PullReq to be called when the request complete + - We place weight update in the callback to perform update when we get the gradient sum +* Computing stream + - Due to GPU's programming model, we need to do computation on non-default stream + - Use set_stream in mshadow tensors to set stream to computation stream + - To report error when you did not use stream, you can compile with -DMSHADOW_FORCE_STREAM + +We should note thate because the example runs on MNIST, which is an quite small dataset, you may not observe +speedup with multiple cards. However, you will find significant speedup when you run on other tasks. +The newest version of [cxxnet](https://github.com/antinucleon/cxxnet) + +### Moving Parameter Update to the Server +In all the examples so far, we use mshadow-ps to get the aggregated sum of gradients, and update +weights locally on each GPU. For more advanced usage of mshadow-ps, we can move the weight update +to the server. 
The communication pattern is as follows +* Each thread still call Push to push out gradient +* The server will apply the update rule to update the weight +* Each thread call PullReq to pull back the weight from server + +Such update pattern is suitable under distributed setting. To do so, user need to implement an +[IModelUpdater](../../mshadow-ps/ps.h#L202) interface. And define the following CreateModelUpdater function +in the program +```c++ +namespace mshadow { +namespace ps { +template<> +IModelUpdater *CreateModelUpdater() { + return new MyModelUpdater(); +} +} +} +``` +Before calling ISharedModel.Init, user need to call ```ps->SetParam("update_on_server", "1")``` to set the update +mode on the server side. If user uses distributed shared model, user must define ModelUpdater. diff --git a/guide/mshadow-ps/config.mk b/guide/mshadow-ps/config.mk new file mode 100644 index 000000000000..834b430c0f8c --- /dev/null +++ b/guide/mshadow-ps/config.mk @@ -0,0 +1,35 @@ +#--------------------------------------------------------------------------------------- +# mshadow: the configuration compile script +# +# This is configuration script that you can use to compile mshadow +# Usage: +# +# include config.mk in your Makefile, or directly include the definition of variables +# include mshadow.mk after the variables are set +# +# Add MSHADOW_CFLAGS to the compile flags +# Add MSHADOW_LDFLAGS to the linker flags +# Add MSHADOW_NVCCFLAGS to the nvcc compile flags +#---------------------------------------------------------------------------------------- + +# whether use CUDA during compile +USE_CUDA = 1 + +# add the path to CUDA libary to link and compile flag +# if you have already add them to enviroment variable, leave it as NONE +USE_CUDA_PATH = NONE + +# +# choose the version of blas you want to use +# can be: mkl, blas, atlas, openblas, apple +USE_BLAS = atlas +# +# add path to intel library, you may need it +# for MKL, if you did not add the path to enviroment variable +# +USE_INTEL_PATH = NONE + +# whether compile with parameter server +USE_DIST_PS = 0 +PS_PATH = NONE +PS_THIRD_PATH = NONE diff --git a/guide/mshadow-ps/local_sum-inl.h b/guide/mshadow-ps/local_sum-inl.h new file mode 100644 index 000000000000..5120590a2768 --- /dev/null +++ b/guide/mshadow-ps/local_sum-inl.h @@ -0,0 +1,113 @@ +// This is an example demonstrating the usage of mshadow ps +#include +// use openmp to launch multiple threads +#include +#include +#include + +// simple util to print result +void Print_(mshadow::Tensor ts) { + for (mshadow::index_t i = 0; i < ts.size(0); ++i) { + for (mshadow::index_t j = 0; j < ts.size(1); ++j) { + printf("%g ", ts[i][j]); + } + printf("\n"); + } +} +template +inline void Print(mshadow::Tensor ts) { + mshadow::TensorContainer tmp; + tmp.Resize(ts.shape_); + mshadow::Copy(tmp, ts); + Print_(tmp); +} + +// this function is runed by specific thread +template +inline void RunWorkerThread(int devid, + mshadow::ps::ISharedModel *ps) { + // initialize tensor engine + mshadow::InitTensorEngine(devid); + mshadow::Stream *stream = mshadow::NewStream(); + // allocate tensor on xpu + mshadow::TensorContainer data(mshadow::Shape2(2, 3)); + // set the computation stream to the new allocated stream + // this will make subsequent computation whose target is data + // to use the stream, stream is needed for async execution in GPU + data.set_stream(stream); + // assume these operations sets the content of dataient + data[0] = 1.0f; + data[1] = devid + data[0]; + printf("dev%d: before sync, data:\n", 
devid); + // use print to show result, do not call + // print normally since Copy will block + Print(data); + printf("====================\n"); + // intiaialize the key, register the shape on parameter server + ps->InitKey(data[0].shape_, 0, devid); + ps->InitKey(data[1].shape_, 1, devid); + // push data[0] out, for update, or aggregation + // 0 is the key of the data, devid is the current device id + ps->Push(data[0], 0, devid); + // pull request is used to request the data to be copied back + // once computation is done + ps->PullReq(data[0], 0, devid); + // computation can be done here.. + // the pull request handler will be overlapped with + // similar as previous call + ps->Push(data[1], 1, devid); + ps->PullReq(data[1], 1, devid); + // more computation can be done here... + // the computation will be overlapped + // PullWait will block until these request finishes + ps->PullWait(0, devid); + ps->PullWait(1, devid); + printf("dev%d: after sync, data:\n", devid); + // use print to show result, do not call + // print normally since Copy will block + Print(data); + printf("====================\n"); + mshadow::DeleteStream(stream); + mshadow::ShutdownTensorEngine(); +} + +namespace mshadow { +namespace ps { +// model updater is used when update is happening on server side +// if we only use parameter server for sum aggregation +// this is not needed, but we must declare this function to return NULL +template<> +IModelUpdater *CreateModelUpdater(void) { + return NULL; +} +} +} + +template +inline int Run(int argc, char *argv[]) { + if (argc < 2) { + printf("Usage: device list\n"\ + "\tfor CPU the device list can be arbitrary\n"\ + "\tfor GPU the device list need to be actual device index\n"); + return 0; + } + // list of device ids + std::vector devs; + // initialization + for (int i = 1; i < argc; ++i) { + // record the device id + devs.push_back(atoi(argv[i])); + } + mshadow::ps::ISharedModel + *ps = mshadow::ps::CreateSharedModel("local"); + // intiaialize the ps + ps->Init(devs); + // use openmp to launch #devs threads + #pragma omp parallel num_threads(devs.size()) + { + int tid = omp_get_thread_num(); + RunWorkerThread(devs[tid], ps); + } + delete ps; + return 0; +} diff --git a/guide/mshadow-ps/local_sum.cpp b/guide/mshadow-ps/local_sum.cpp new file mode 100644 index 000000000000..7f0eed0df42e --- /dev/null +++ b/guide/mshadow-ps/local_sum.cpp @@ -0,0 +1,4 @@ +#include "./local_sum-inl.h" +int main(int argc, char *argv[]) { + return Run(argc, argv); +} diff --git a/guide/mshadow-ps/local_sum.cu b/guide/mshadow-ps/local_sum.cu new file mode 100644 index 000000000000..6e839601a265 --- /dev/null +++ b/guide/mshadow-ps/local_sum.cu @@ -0,0 +1,4 @@ +#include "./local_sum-inl.h" +int main(int argc, char *argv[]) { + return Run(argc, argv); +} diff --git a/example/neuralnet/Makefile b/guide/neuralnet/Makefile similarity index 54% rename from example/neuralnet/Makefile rename to guide/neuralnet/Makefile index 7cb45e4afa2d..826384b5f3b0 100644 --- a/example/neuralnet/Makefile +++ b/guide/neuralnet/Makefile @@ -2,31 +2,27 @@ export CC = gcc export CXX = g++ export NVCC =nvcc -export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../../ - - -ifeq ($(blas),1) - LDFLAGS= -lcblas -lm -lcudart -lcublas -lcurand - CFLAGS+= -DMSHADOW_USE_MKL=0 -DMSHADOW_USE_CBLAS=1 -else - LDFLAGS= -lm -lcudart -lcublas -lcurand -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lpthread -endif -export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) +include config.mk +include 
../../make/mshadow.mk +export CFLAGS = -Wall -O3 -I../../ -fopenmp $(MSHADOW_CFLAGS) +export LDFLAGS= -lm $(MSHADOW_LDFLAGS) +export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) # specify tensor path BIN = OBJ = CUOBJ = -CUBIN = nnet convnet +CUBIN = nnet convnet nnet_ps .PHONY: clean all all: $(BIN) $(OBJ) $(CUBIN) $(CUOBJ) nnet: nnet.cu +nnet_ps: nnet_ps.cu convnet: convnet.cu $(BIN) : - $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) + $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) $(OBJ) : $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) ) @@ -39,3 +35,4 @@ $(CUBIN) : clean: $(RM) $(OBJ) $(BIN) $(CUBIN) $(CUOBJ) *~ + diff --git a/guide/neuralnet/README.md b/guide/neuralnet/README.md new file mode 100644 index 000000000000..dd181e758c65 --- /dev/null +++ b/guide/neuralnet/README.md @@ -0,0 +1,16 @@ +Example Neural Net code with MShadow +==== + +To compile the code, modify ```config.mk``` to the setting you like and type make +* You will need to have CUDA and a version of BLAS + +To run the demo, download MNIST dataset from: http://yann.lecun.com/exdb/mnist/ +unzip all the files into current folder + +and run by ./nnet cpu or ./nnet gpu. ./convnet cpu or ./convnet gpu + +MultiGPU Version +==== +* If you have two GPUs, you can run it by ```./nnet_ps gpu 0 1```. +* You can run it using CPUs ```./nnet_ps cpu 0 1```. +* This is an demonstration of mshadow-ps interface, see introduction in [../mshadow-ps](../mshadow-ps) diff --git a/guide/neuralnet/config.mk b/guide/neuralnet/config.mk new file mode 100644 index 000000000000..112396d5557b --- /dev/null +++ b/guide/neuralnet/config.mk @@ -0,0 +1,35 @@ +#--------------------------------------------------------------------------------------- +# mshadow: the configuration compile script +# +# This is configuration script that you can use to compile mshadow +# Usage: +# +# include config.mk in your Makefile, or directly include the definition of variables +# include mshadow.mk after the variables are set +# +# Add MSHADOW_CFLAGS to the compile flags +# Add MSHADOW_LDFLAGS to the linker flags +# Add MSHADOW_NVCCFLAGS to the nvcc compile flags +#---------------------------------------------------------------------------------------- + +# whether use CUDA during compile +USE_CUDA = 1 + +# add the path to CUDA libary to link and compile flag +# if you have already add them to enviroment variable, leave it as NONE +USE_CUDA_PATH = NONE + +# +# choose the version of blas you want to use +# can be: mkl, blas, atlas, openblas, apple +USE_BLAS = mkl +# +# add path to intel library, you may need it +# for MKL, if you did not add the path to enviroment variable +# +USE_INTEL_PATH = NONE + +# whether compile with parameter server +USE_DIST_PS = 0 +PS_PATH = NONE +PS_THIRD_PATH = NONE diff --git a/guide/neuralnet/convnet.cu b/guide/neuralnet/convnet.cu new file mode 100644 index 000000000000..97b6a03fc416 --- /dev/null +++ b/guide/neuralnet/convnet.cu @@ -0,0 +1,267 @@ +// this implements a simple convolution neural net: conv-maxpool-fullc +#include +// header file to use mshadow +#include "mshadow/tensor.h" +// helper function to load mnist dataset +#include "util.h" +// this namespace contains all data structures, functions +using namespace mshadow; +// this namespace contains all operator overloads +using namespace mshadow::expr; + +// define operations +struct relu{ + MSHADOW_XINLINE static real_t Map(real_t a) { + using namespace std; + return max(a, 0.0f); + } +}; +struct relu_grad { + 
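+  // derivative of relu with respect to its input: 1 for positive inputs, 0 otherwise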
MSHADOW_XINLINE static real_t Map(real_t a) { + return a > 0.0f ? 1.0f : 0.0f; + } +}; + +/*! \brief interface for nnet, interfacd allows use to use GPU/CPU implementation in a unified way */ +class INNet{ + public: + virtual void Forward(const Tensor& inbatch, Tensor &oubatch) = 0; + virtual void Backprop(const Tensor& gradout) = 0; + virtual void Update(void) = 0; + virtual ~INNet() {} +}; + +/*! + * \brief simple two layer conv-net conv-pool-flat-fullc + * this implementation is device invariant + */ +template +class ConvNet : public INNet { + public: + // initialize the network + ConvNet(int batch_size, int insize, int nchannel, int ksize, int kstride, int psize, int num_out) + :rnd(0), ksize(ksize), kstride(kstride), psize(psize) { + // setup nodes + ninput.Resize(Shape4(batch_size, 1, insize, insize)); + nhidden.Resize(Shape4(batch_size, nchannel, (insize - ksize)/kstride+1, (insize -ksize)/kstride+1)); + nhiddenbak.Resize(nhidden.shape_); + npool.Resize(Shape4(batch_size, nchannel, (nhidden.size(2)+1-psize)/psize, (nhidden.size(3)+1-psize)/psize)); + npoolbak.Resize(npool.shape_); + nflat.Resize(Shape2(batch_size, npool.size(1)*npool.size(2)*npool.size(3))); + nout.Resize(Shape2(batch_size, num_out)); + // setup bias + hbias.Resize(Shape1(nchannel)); g_hbias.Resize(hbias.shape_); + obias.Resize(Shape1(num_out)); g_obias.Resize(obias.shape_); + hbias = 0.0f; obias = 0.0f; + // setup weights + Ki2h.Resize(Shape2(nchannel, ksize*ksize)); g_Ki2h.Resize(Ki2h.shape_); + Wh2o.Resize(Shape2(nflat.size(1), num_out)); g_Wh2o.Resize(Wh2o.shape_); + rnd.SampleGaussian(&Ki2h, 0, 0.01f); + rnd.SampleGaussian(&Wh2o, 0, 0.01f); + + printf("conv=%d, pool=%d\n", nhidden.size(3), npool.size(3)); + } + virtual ~ConvNet() {} + // forward propagation + virtual void Forward(const Tensor& inbatch, Tensor &oubatch) { + index_t batch_size = inbatch.size(0); + // copy data to input layer + Copy(ninput, inbatch); + // first layer, conv, use stride=2 + ConvForward(ninput, Ki2h, nhidden, ksize, kstride, tmp_col, tmp_dst); + // add bias + nhidden += broadcast<1>(hbias, nhidden.shape_); + // activation, relu, backup activation in nhidden + nhidden = F(nhidden); + Copy(nhiddenbak, nhidden); + // max pooling + npool = pool(nhiddenbak, npool[0][0].shape_, psize, psize, psize); + Copy(npoolbak, npool); + // flat + nflat = reshape(npool, nflat.shape_); + // second layer fullc + nout = dot(nflat, Wh2o); + nout += repmat(obias, batch_size); + // softmax calculation + Softmax(nout, nout); + // copy result out + Copy(oubatch, nout); + } + // back propagation + virtual void Backprop(const Tensor& gradout) { + // copy gradient to output layer + Copy(nout, gradout); + // calc grad of final layer + g_obias = sum_rows(nout); + g_Wh2o = dot(nflat.T(), nout); + // backprop to previous layer + nflat = dot(nout, Wh2o.T()); + npool = reshape(nflat, npool.shape_); + // backprop pooling layer + nhiddenbak = unpool(nhiddenbak, npoolbak, npool, psize, psize, psize); + // calculate gradient of relu layer + nhidden = F(nhidden) * nhiddenbak; + // calc grad of layer 1 + g_hbias = sumall_except_dim<1>(nhidden); + ConvBackWard(nhidden, Ki2h, g_Ki2h, ninput, ksize, kstride, tmp_col, tmp_dst); + } + // update weight + virtual void Update(void) { + // run SGD + const float eta = 0.1; + const float wd = 0.00001; + // update weight + Ki2h -= eta * (wd * Ki2h + g_Ki2h); + Wh2o -= eta * (wd * Wh2o + g_Wh2o); + // no regularization for bias + hbias-= eta * g_hbias; + obias-= eta * g_obias; + } + private: + // forward convolution, tmp_col and tmp_dst 
are helper structure + inline static void ConvForward(const Tensor &in, + const Tensor &kernel, + Tensor &out, + int ksize, int kstride, + TensorContainer &tmp_col, + TensorContainer &tmp_dst) { + index_t oheight = (in.size(2) - ksize)/kstride + 1; + index_t owidth = (in.size(3) - ksize)/kstride + 1; + index_t nbatch = in.size(0); + index_t nchannel = out.size(1); + // we directly unpack all local patches and do a dot product + // this cost lots of memory, normally for large image, only unpack several image at a time + tmp_col.Resize(Shape2(in.size(1)*ksize*ksize, nbatch*oheight*owidth)); + tmp_dst.Resize(Shape2(nchannel, nbatch*oheight*owidth)); + // unpack local patches , stride=1 + tmp_col = unpack_patch2col(in, ksize, ksize, kstride); + tmp_dst = dot(kernel, tmp_col); + // reshape, then swap axis, we chain equations together + out = swapaxis<1,0>(reshape(tmp_dst, Shape4(nchannel, nbatch, oheight, owidth))); + } + // backward convolution, calculate gradient of kernel, and backprop back to in + inline static void ConvBackWard(const Tensor &out, + const Tensor &kernel, + Tensor &g_kernel, + Tensor &in, + int ksize, int kstride, + TensorContainer &tmp_col, + TensorContainer &tmp_dst) { + index_t oheight = (in.size(2) - ksize)/kstride + 1; + index_t owidth = (in.size(3) - ksize)/kstride + 1; + index_t nbatch = in.size(0); + index_t nchannel = out.size(1); + // we directly unpack all local patches and do a dot product + // this cost lots of memory, normally for large image, only unpack several image at a time + tmp_col.Resize(Shape2(in.size(1) * ksize * ksize, + nbatch * oheight * owidth)); + tmp_dst.Resize(Shape2(nchannel, nbatch * oheight * owidth)); + // unpack local patches + tmp_col = unpack_patch2col(in, ksize, ksize, kstride); + tmp_dst = reshape(swapaxis<1,0>(out), tmp_dst.shape_); + g_kernel = dot(tmp_dst, tmp_col.T()); + // backpropgation: not necessary for first layer, but included anyway + tmp_col = dot(kernel.T(), tmp_dst); + in = pack_col2patch(tmp_col, in.shape_, ksize, ksize, kstride); + } + private: + // random seed generator + Random rnd; + // kernel size, pooling size + int ksize, kstride, psize; + // nodes in neural net + TensorContainer ninput, nhidden, nhiddenbak, npool, npoolbak; + TensorContainer nflat, nout; + // temp helper structure + TensorContainer tmp_col, tmp_dst; + // hidden bias, gradient + TensorContainer hbias, obias, g_hbias, g_obias; + // weight, gradient: Ki2h is actually convoltuion kernel, with shape=(num_channel,ksize*ksize) + TensorContainer Ki2h, Wh2o, g_Ki2h, g_Wh2o; +}; + +// helper function to get the max inde +inline int MaxIndex(Tensor pred) { + int maxidx = 0; + for (index_t i = 1; i < pred.size(0); ++i) { + if(pred[i] > pred[maxidx]) maxidx = (int)i; + } + return maxidx; +} + +int main(int argc, char *argv[]) { + if(argc < 2) { + printf("Usage: cpu or gpu\n"); return 0; + } + srand(0); + // settings + int batch_size = 100; + int insize = 28; + int nchannel = 10; + int ksize = 5; + int kstride = 1; + int psize = 2; + int num_out = 10; + + // choose which version to use + INNet *net; + if (!strcmp(argv[1], "gpu")) { + InitTensorEngine(); + net = new ConvNet(batch_size, insize, nchannel, ksize, kstride, psize, num_out); + } else { + InitTensorEngine(); + net = new ConvNet(batch_size, insize, nchannel, ksize, kstride, psize, num_out); + } + + // temp output layer + TensorContainer pred; + pred.Resize(Shape2(batch_size, num_out)); + + // label + std::vector ytrain, ytest; + // data + TensorContainer xtrain_, xtest_; + 
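+  // LoadMNIST (from util.h) fills the label vector and a 2-D image tensor, one row of 28*28 pixels per image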
LoadMNIST("train-images-idx3-ubyte", "train-labels-idx1-ubyte", ytrain, xtrain_, true); + LoadMNIST("t10k-images-idx3-ubyte", "t10k-labels-idx1-ubyte", ytest, xtest_, false); + + TensorContainer xtrain(Shape4(xtrain_.size(0), 1, insize, insize)); + TensorContainer xtest(Shape4(xtest_.size(0), 1, insize, insize)); + xtrain = reshape(xtrain_, xtrain.shape_); + xtest = reshape(xtest_, xtest.shape_); + + int num_iter = 20; + + for (int i = 0; i < num_iter; ++ i) { + // training + for (index_t j = 0; j + batch_size <= xtrain.size(0); j += batch_size) { + net->Forward(xtrain.Slice(j, j + batch_size), pred); + // set gradient into pred + for (int k = 0; k < batch_size; ++ k) { + pred[k][ ytrain[k+j] ] -= 1.0f; + } + // scale gradient by batchs zie + pred *= 1.0f / batch_size; + // run backprop + net->Backprop(pred); + // update net parameters + net->Update(); + } + // evaluation + long nerr = 0; + for (index_t j = 0; j + batch_size <= xtest.size(0); j += batch_size) { + net->Forward(xtest.Slice(j, j + batch_size), pred); + for (int k = 0; k < batch_size; ++ k) { + nerr += MaxIndex(pred[k]) != ytest[j+k]; + } + } + printf("round %d: test-err=%f\n", i, (float)nerr/xtest.size(0)); + } + delete net; + + if (!strcmp(argv[1], "gpu")) { + ShutdownTensorEngine(); + } else { + ShutdownTensorEngine(); + } + return 0; +} diff --git a/guide/neuralnet/nnet.cu b/guide/neuralnet/nnet.cu new file mode 100644 index 000000000000..8e79cf608f3c --- /dev/null +++ b/guide/neuralnet/nnet.cu @@ -0,0 +1,188 @@ +// this implements a simple two layer neural net +#include +#include +// header file to use mshadow +#include "mshadow/tensor.h" +// helper function to load mnist dataset +#include "util.h" +// this namespace contains all data structures, functions +using namespace mshadow; +// this namespace contains all operator overloads +using namespace mshadow::expr; + +// define sigmoid operation +struct sigmoid{ + MSHADOW_XINLINE static real_t Map(real_t a) { + return 1.0f/(1.0f+expf(-a)); + } +}; + +/*! \brief interface for nnet, interfacd allows use to use GPU/CPU implementation in a unified way */ +class INNet{ + public: + virtual void Forward(const Tensor& inbatch, Tensor &oubatch) = 0; + virtual void Backprop(const Tensor& gradout) = 0; + virtual void Update(void) = 0; + virtual ~INNet() {} +}; + +/*! 
+ * \brief simple two layer neural net + * this implementation is device invariant + */ +template +class NNet : public INNet { + public: + // initialize the network + NNet(int batch_size, int num_in, int num_hidden, int num_out) : rnd(0) { + // setup nodes + ninput.Resize(Shape2(batch_size, num_in)); + nhidden.Resize(Shape2(batch_size, num_hidden)); + nhiddenbak.Resize(nhidden.shape_); + nout.Resize(Shape2(batch_size, num_out)); + // setup bias + hbias.Resize(Shape1(num_hidden)); g_hbias.Resize(hbias.shape_); + obias.Resize(Shape1(num_out)); g_obias.Resize(obias.shape_); + hbias = 0.0f; obias = 0.0f; + // setup weights + Wi2h.Resize(Shape2(num_in, num_hidden)); g_Wi2h.Resize(Wi2h.shape_); + Wh2o.Resize(Shape2(num_hidden, num_out)); g_Wh2o.Resize(Wh2o.shape_); + rnd.SampleGaussian(&Wi2h, 0, 0.01f); + rnd.SampleGaussian(&Wh2o, 0, 0.01f); + } + virtual ~NNet() {} + // forward propagation + virtual void Forward(const Tensor& inbatch, + Tensor &oubatch) { + // size is same conventsion as numpy + index_t batch_size = inbatch.size(0); + // copy data to input layer + Copy(ninput, inbatch); + // first layer, fullc + nhidden = dot(ninput, Wi2h); + nhidden+= repmat(hbias, batch_size); + // activation, sigmloid, backup activation in nhidden + nhidden = F(nhidden); + Copy(nhiddenbak, nhidden); + // second layer fullc + nout = dot(nhiddenbak, Wh2o); + nout += repmat(obias, batch_size); + // softmax calculation + Softmax(nout, nout); + // copy result out + Copy(oubatch, nout); + } + // back propagation + virtual void Backprop(const Tensor& gradout) { + // copy gradient to output layer + Copy(nout, gradout); + // calc grad of layer 2 + g_obias = sum_rows(nout); + g_Wh2o = dot(nhiddenbak.T(), nout); + // backprop to layer 1 + nhiddenbak = dot(nout, Wh2o.T()); + // calculate gradient of sigmoid layer + nhidden = nhidden * (1.0f-nhidden) * nhiddenbak; + // calc grad of layer 1 + g_hbias = sum_rows(nhidden); + g_Wi2h = dot(ninput.T(), nhidden); + } + // update weight + virtual void Update(void) { + // run SGD + const float eta = 0.8; + const float wd = 0.00001; + // update weight + Wi2h -= eta * (wd * Wi2h + g_Wi2h); + Wh2o -= eta * (wd * Wh2o + g_Wh2o); + // no regularization for bias + hbias-= eta * g_hbias; + obias-= eta * g_obias; + } + private: + // random seed generator + Random rnd; + // nodes in neural net + TensorContainer ninput, nhidden, nhiddenbak, nout; + // hidden bias, gradient + TensorContainer hbias, obias, g_hbias, g_obias; + // weight gradient + TensorContainer Wi2h, Wh2o, g_Wi2h, g_Wh2o; +}; +// helper function to get the max inde +inline int MaxIndex(Tensor pred) { + int maxidx = 0; + for(index_t i = 1; i < pred.size(0); ++i) { + if(pred[i] > pred[maxidx]) maxidx = (int)i; + } + return maxidx; +} + +int main(int argc, char *argv[]) { + if(argc < 2) { + printf("Usage: cpu or gpu\n"); return 0; + } + srand(0); + + // settings + int batch_size = 100; + int num_in = 28 * 28; + int num_hidden = 100; + int num_out = 10; + // choose which version to use + INNet *net; + if (!strcmp(argv[1], "gpu")) { + InitTensorEngine(); + net = new NNet(batch_size, num_in, num_hidden, num_out); + } else { + InitTensorEngine(); + net = new NNet(batch_size, num_in, num_hidden, num_out); + } + + // temp output layer + TensorContainer pred; + pred.Resize(Shape2(batch_size, num_out)); + + // label + std::vector ytrain, ytest; + // data + TensorContainer xtrain, xtest; + LoadMNIST("train-images-idx3-ubyte", "train-labels-idx1-ubyte", ytrain, xtrain, true); + LoadMNIST("t10k-images-idx3-ubyte", 
"t10k-labels-idx1-ubyte", ytest, xtest, false); + + int num_iter = 20; + + for (int i = 0; i < num_iter; ++ i) { + // training + for (index_t j = 0; j + batch_size <= xtrain.size(0); j += batch_size) { + net->Forward(xtrain.Slice(j, j + batch_size), pred); + // set gradient into pred + for (int k = 0; k < batch_size; ++ k) { + pred[k][ ytrain[k+j] ] -= 1.0f; + } + // scale gradient by batchs zie + pred *= 1.0f / batch_size; + // run backprop + net->Backprop(pred); + // update net parameters + net->Update(); + } + // evaluation + long nerr = 0; + for (index_t j = 0; j + batch_size <= xtest.size(0); j += batch_size) { + net->Forward(xtest.Slice(j, j + batch_size), pred); + for (int k = 0; k < batch_size; ++ k) { + nerr += MaxIndex(pred[k]) != ytest[j+k]; + + } + } + printf("round %d: test-err=%f\n", i, (float)nerr/xtest.size(0)); + } + delete net; + if (!strcmp(argv[1], "gpu")) { + ShutdownTensorEngine(); + } else { + ShutdownTensorEngine(); + } + return 0; +} diff --git a/guide/neuralnet/nnet_ps.cu b/guide/neuralnet/nnet_ps.cu new file mode 100644 index 000000000000..996bbe266d7b --- /dev/null +++ b/guide/neuralnet/nnet_ps.cu @@ -0,0 +1,312 @@ +// this implements a simple two layer Multi-GPU neural net +// this implementation uses mshadow-ps to get gradient aggregation +// between cards +// this code is modified from nnet.cu +#include +#include +#include +// header file to use mshadow +#include +#include +// helper function to load mnist dataset +#include "./util.h" +// this namespace contains all data structures, functions +using namespace mshadow; +// this namespace contains all operator overloads +using namespace mshadow::expr; + +// define sigmoid operation +struct sigmoid { + MSHADOW_XINLINE static real_t Map(real_t a) { + return 1.0f / (1.0f + expf(-a)); + } +}; + +/*! \brief interface for nnet, interfacd allows use to use GPU/CPU implementation in a unified way */ +class INNet{ + public: + virtual void Forward(const Tensor& inbatch, + Tensor &oubatch) = 0; + virtual void Backprop(const Tensor& gradout) = 0; + virtual ~INNet() {} +}; + +/*! 
+ * \brief simple two layer neural net + * this implementation is device invariant + */ +template +class NNet : public INNet { + public: + // initialize the network + NNet(int batch_size, int num_in, int num_hidden, int num_out, + int devid, mshadow::ps::ISharedModel *ps) + : rnd(0), devid(devid), ps(ps) { + mshadow::SetDevice(devid); + stream = mshadow::NewStream(); + // set the computing streams + ninput.set_stream(stream); + nhidden.set_stream(stream); + nhiddenbak.set_stream(stream); + nout.set_stream(stream); + hbias.set_stream(stream); + obias.set_stream(stream); + g_hbias.set_stream(stream); + g_obias.set_stream(stream); + Wi2h.set_stream(stream); + Wh2o.set_stream(stream); + g_Wi2h.set_stream(stream); + g_Wh2o.set_stream(stream); + rnd.set_stream(stream); + // setup nodes + ninput.Resize(Shape2(batch_size, num_in)); + nhidden.Resize(Shape2(batch_size, num_hidden)); + nhiddenbak.Resize(nhidden.shape_); + nout.Resize(Shape2(batch_size, num_out)); + // setup bias + hbias.Resize(Shape1(num_hidden)); g_hbias.Resize(hbias.shape_); + obias.Resize(Shape1(num_out)); g_obias.Resize(obias.shape_); + hbias = 0.0f; obias = 0.0f; + // setup weights + Wi2h.Resize(Shape2(num_in, num_hidden)); g_Wi2h.Resize(Wi2h.shape_); + Wh2o.Resize(Shape2(num_hidden, num_out)); g_Wh2o.Resize(Wh2o.shape_); + rnd.SampleGaussian(&Wi2h, 0, 0.01f); + rnd.SampleGaussian(&Wh2o, 0, 0.01f); + // initialize the key + ps->InitKey(Wi2h.shape_, 0, devid); + ps->InitKey(hbias.shape_, 1, devid); + ps->InitKey(Wh2o.shape_, 2, devid); + ps->InitKey(obias.shape_, 3, devid); + } + virtual ~NNet() { + mshadow::SetDevice(devid); + mshadow::DeleteStream(stream); + } + // forward propagation + virtual void Forward(const Tensor &inbatch, + Tensor &oubatch) { + // size is same conventsion as numpy + index_t batch_size = inbatch.size(0); + // copy data to input layer + Copy(ninput, inbatch, stream); + // wait the last pull requst on layer to complete + ps->PullWait(0, devid); + // first layer, fullc + nhidden = dot(ninput, Wi2h); + // wait the pull request on hbias to complete + ps->PullWait(1, devid); + nhidden+= repmat(hbias, batch_size); + // activation, sigmloid, backup activation in nhidden + nhidden = F(nhidden); + Copy(nhiddenbak, nhidden, stream); + // second layer fullc + ps->PullWait(2, devid); + nout = dot(nhiddenbak, Wh2o); + ps->PullWait(3, devid); + nout += repmat(obias, batch_size); + // softmax calculation + Softmax(nout, nout); + // copy result out + Copy(oubatch, nout, stream); + // Copy with stream is non-blocking, use wait to wait until copy finishes + stream->Wait(); + } + // back propagation + virtual void Backprop(const Tensor &gradout) { + // copy gradient to output layer + Copy(nout, gradout, stream); + // calc grad of layer 2 + g_obias = sum_rows(nout); + // sync proc defines the synchronization step + this->SyncProc(obias, g_obias, 3); + // update second layer weights + g_Wh2o = dot(nhiddenbak.T(), nout); + // backprop to layer 1 + nhiddenbak = dot(nout, Wh2o.T()); + this->SyncProc(Wh2o, g_Wh2o, 2); + // calculate gradient of sigmoid layer + nhidden = nhidden * (1.0f-nhidden) * nhiddenbak; + // calc grad of layer 1 + g_hbias = sum_rows(nhidden); + this->SyncProc(hbias, g_hbias, 1); + g_Wi2h = dot(ninput.T(), nhidden); + this->SyncProc(Wi2h, g_Wi2h, 0); + } + // synchronization function + template + inline void SyncProc(mshadow::Tensor weight, + mshadow::Tensor grad, + int data_key) { + // wait till last computation finishes + stream->Wait(); + ps->Push(grad, data_key, devid, -data_key); + ps->PullReq(grad, 
data_key, devid, -data_key, + UpdateEntry::ApplyUpdate, + new UpdateEntry(weight.FlatTo2D(), grad.FlatTo2D(), dim == 1)); + } + // data structure defined to help using callback function + struct UpdateEntry { + mshadow::Tensor weight; + mshadow::Tensor grad; + bool is_bias; + // constructor + UpdateEntry(mshadow::Tensor weight, + mshadow::Tensor grad, + bool is_bias) + : weight(weight), grad(grad), + is_bias(is_bias) {} + inline void Update(mshadow::Stream *stream) { + weight.set_stream(stream); + const float wd = 0.00001; + const float eta = 0.8; + if (!is_bias) { + weight -= eta * (wd * weight + grad); + } else { + weight -= eta * grad; + } + } + // callback function to apply update + inline static void ApplyUpdate(mshadow::Stream *stream, void *arg) { + UpdateEntry *e = static_cast(arg); + e->Update(stream); + delete e; + } + }; + + private: + // computing stream + mshadow::Stream *stream; + // device id + int devid; + // parameter server interface + mshadow::ps::ISharedModel *ps; + // random seed generator + Random rnd; + // nodes in neural net + TensorContainer ninput, nhidden, nhiddenbak, nout; + // hidden bias, gradient + TensorContainer hbias, obias, g_hbias, g_obias; + // weight gradient + TensorContainer Wi2h, Wh2o, g_Wi2h, g_Wh2o; +}; + +// helper function to get the max inde +inline int MaxIndex(Tensor pred) { + int maxidx = 0; + for(index_t i = 1; i < pred.size(0); ++i) { + if(pred[i] > pred[maxidx]) maxidx = (int)i; + } + return maxidx; +} + +namespace mshadow { +namespace ps { +// model updater is used when update is happening on server side +// if we only use parameter server for sum aggregation +// this is not needed, but we must declare this function to return NULL +template<> +IModelUpdater *CreateModelUpdater(void) { + return NULL; +} +} +} + +template +inline int Run(int argc, char *argv[]) { + srand(0); + // settings + int batch_size = 100; + int num_in = 28 * 28; + int num_hidden = 100; + int num_out = 10; + int ndev = argc - 2; + if (batch_size % ndev != 0) { + fprintf(stderr, "choose number of devices ndev such that 100 MOD ndev == 0\n"); + return 0; + } + // choose which version to use + std::vector devs; + for (int i = 2; i < argc; ++i) { + devs.push_back(atoi(argv[i])); + } + mshadow::ps::ISharedModel + *ps = mshadow::ps::CreateSharedModel("local"); + ps->Init(devs); + + std::vector nets(ndev); + for (int i = 0; i < ndev; ++i) { + mshadow::InitTensorEngine(devs[i]); + nets[i] = new NNet(batch_size / ndev, num_in, num_hidden, num_out, devs[i], ps); + } + + // label + std::vector ytrain, ytest; + // data + TensorContainer xtrain, xtest; + LoadMNIST("train-images-idx3-ubyte", "train-labels-idx1-ubyte", ytrain, xtrain, true); + LoadMNIST("t10k-images-idx3-ubyte", "t10k-labels-idx1-ubyte", ytest, xtest, false); + int num_iter = 20; + + for (int i = 0; i < num_iter; ++ i) { + // mini-batch per device + int step = batch_size / ndev; + // running parallel threads + #pragma omp parallel num_threads(ndev) + { + // temp output layer + TensorContainer pred; + pred.Resize(Shape2(step, num_out)); + int tid = omp_get_thread_num(); + mshadow::SetDevice(devs[tid]); + for (index_t j = 0; j + batch_size <= xtrain.size(0); j += batch_size) { + nets[tid]->Forward(xtrain.Slice(j + tid * step, j + (tid + 1) * step), pred); + // set gradient into pred + for (int k = 0; k < step; ++ k) { + pred[k][ytrain[j + tid * step + k]] -= 1.0f; + } + // scale gradient by batchs zie + pred *= 1.0f / batch_size; + // run backprop + nets[tid]->Backprop(pred); + } + } + // evaluation + long nerr = 0; 
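+    // evaluation mirrors training: each thread scores its own slice of the
+    // test batch on its own device, and the per-thread error counts are
+    // combined through the OpenMP reduction(+:nerr) clause below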
+ #pragma omp parallel num_threads(ndev) reduction(+:nerr) + { + // temp output layer + TensorContainer pred; + pred.Resize(Shape2(step, num_out)); + int tid = omp_get_thread_num(); + mshadow::SetDevice(devs[tid]); + for (index_t j = 0; j + batch_size <= xtest.size(0); j += batch_size) { + nets[tid]->Forward(xtest.Slice(j + tid * step, j + (tid + 1) * step), pred); + for (int k = 0; k < step; ++ k) { + nerr += MaxIndex(pred[k]) != ytest[j + tid * step + k]; + } + } + } + printf("round %d: test-err=%f\n", i, (float)nerr/xtest.size(0)); + } + + for(int i = 0; i < ndev; ++i) { + mshadow::SetDevice(devs[i]); + delete nets[i]; + ShutdownTensorEngine(); + } + return 0; +} +int main(int argc, char *argv[]) { + if (argc < 3) { + printf("Usage: devicelist\n"\ + "\tExample1: ./nnet_ps cpu 1 2 3\n"\ + "\tExample2: ./nnet_ps gpu 0 1\n"); + return 0; + } + if (!strcmp(argv[1], "cpu")) { + Run(argc, argv); + } else { + Run(argc, argv); + } + return 0; +} diff --git a/guide/neuralnet/util.h b/guide/neuralnet/util.h new file mode 100644 index 000000000000..f58203c7667a --- /dev/null +++ b/guide/neuralnet/util.h @@ -0,0 +1,86 @@ +#pragma once +#include +#include +#include +#include "mshadow/tensor.h" + +typedef float real_t; + +using namespace mshadow; + +int pack(unsigned char zz[4]){ + return (int)(zz[3]) + | (((int)(zz[2])) << 8) + | (((int)(zz[1])) << 16) + | (((int)(zz[0])) << 24); +} + +template +inline void shuffle(T *data, size_t sz){ + if(sz == 0) return; + for(size_t i = sz - 1; i > 0; i--){ + std::swap(data[i], data[rand() % (i+1)]); + } +} +// random shuffle the data inside, require PRNG +template +inline void shuffle(std::vector &data){ + shuffle(&data[0], data.size()); +} + +// simple function to load in mnist +inline void LoadMNIST(const char *path_img, const char *path_label, + std::vector &ylabel, + TensorContainer &xdata, + bool do_shuffle){ + // load in data + FILE *fi = fopen(path_img, "rb"); + if (fi == NULL) { + printf("cannot open %s\n", path_img); + exit(-1); + } + unsigned char zz[4]; + unsigned char *t_data, *l_data; + int num_image, width, height, nlabel; + assert(fread(zz, 4 , 1, fi)); + assert(fread(zz, 4 , 1, fi)); + num_image = pack(zz); + assert(fread(zz, 4 , 1, fi)); + width = pack(zz); + assert(fread(zz, 4 , 1, fi)); + height = pack(zz); + + int step = width * height; + t_data = new unsigned char[num_image * step]; + assert(fread(t_data, step*num_image , 1 , fi)); + fclose(fi); + + // load in label + fi = fopen(path_label, "rb"); + assert(fread(zz, 4 , 1, fi)); + assert(fread(zz, 4 , 1, fi)); + nlabel = pack(zz); + assert(num_image == nlabel); + l_data = new unsigned char[num_image]; + assert(fread(l_data, num_image , 1 , fi)); + // try to do shuffle + std::vector rindex; + for (int i = 0; i < num_image; ++ i) { + rindex.push_back(i); + } + if (do_shuffle) { + shuffle(rindex); + } + + // save out result + ylabel.resize(num_image); + xdata.Resize(Shape2(num_image, width * height)); + for (int i = 0 ; i < num_image ; ++i) { + for(int j = 0; j < step; ++j) { + xdata[i][j] = (float)(t_data[rindex[i]*step + j]) / 256.0f; + } + ylabel[i] = l_data[rindex[i]]; + } + delete[] t_data; delete [] l_data; + printf("finish loading %dx%d matrix from %s, shuffle=%d\n", num_image, step, path_img, (int)do_shuffle); +} diff --git a/make/README.md b/make/README.md new file mode 100644 index 000000000000..6ef24d6d467c --- /dev/null +++ b/make/README.md @@ -0,0 +1,18 @@ +Makefile Configuration of MShadow +===== +MShadow is a template library, you only need to include mshadow to use it. 
So this folder is not used to build mshadow library file. + +However, mshadow is a flexible library that allows you to compile with different configurations. For example, +you can compile mshadow without CUDA, and specify your own choice of BLAS. +There are different compile flags that you might need to set in your own configuration. +This folder provides a Makefile script to help you do that. + +Usage +===== +* Set the configurations via variables in your Makefile, see example in [../guide/config.mk](../guide/config.mk) +* include [mshadow.mk](mshadow.mk) in your Makefile +* mshadow.mk will give you compiler variables that you can include when compiling + - Add MSHADOW_CFLAGS to the compile flags + - Add MSHADOW_LDFLAGS to the linker flags + - Add MSHADOW_NVCCFLAGS to the nvcc compile flags +* For example Makefile, see [../guide/Makefile](../guide/Makefile) diff --git a/make/mshadow.mk b/make/mshadow.mk new file mode 100644 index 000000000000..6e7b68b7b989 --- /dev/null +++ b/make/mshadow.mk @@ -0,0 +1,62 @@ +#--------------------------------------------------------------------------------------- +# mshadow configuration script +# +# include mshadow.mk after the variables are set +# +# Add MSHADOW_CFLAGS to the compile flags +# Add MSHADOW_LDFLAGS to the linker flags +# Add MSHADOW_NVCCFLAGS to the nvcc compile flags +#---------------------------------------------------------------------------------------- + +MSHADOW_CFLAGS = -msse3 -funroll-loops -Wno-unused-parameter -Wno-unknown-pragmas +MSHADOW_LDFLAGS = -lm +MSHADOW_NVCCFLAGS = + +ifeq ($(USE_CUDA), 0) + MSHADOW_CFLAGS += -DMSHADOW_USE_CUDA=0 +else + MSHADOW_LDFLAGS += -lcudart -lcublas -lcurand +endif +ifneq ($(USE_CUDA_PATH), NONE) + MSHADOW_CFLAGS += -I$(USE_CUDA_PATH)/include + MSHADOW_LDFLAGS += -L$(USE_CUDA_PATH)/lib64 +endif + +ifeq ($(USE_BLAS), mkl) +ifneq ($(USE_INTEL_PATH), NONE) + MSHADOW_LDFLAGS += -L$(USE_INTEL_PATH)/mkl/lib/intel64 + MSHADOW_LDFLAGS += -L$(USE_INTEL_PATH)/lib/intel64 + MSHADOW_CFLAGS += -I$(USE_INTEL_PATH)/mkl/include +endif + MSHADOW_LDFLAGS += -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 +else + MSHADOW_CFLAGS += -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0 +endif +ifeq ($(USE_BLAS), openblas) + MSHADOW_LDFLAGS += -lopenblas +else ifeq ($(USE_BLAS), atlas) + MSHADOW_LDFLAGS += -lcblas +else ifeq ($(USE_BLAS), blas) + MSHADOW_LDFLAGS += -lblas +else ifeq ($(USE_BLAS), apple) + MSHADOW_CFLAGS += -I/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers/ + MSHADOW_LDFLAGS += -framework Accelerate +endif + +ifeq ($(PS_PATH), NONE) +PS_PATH = .. 
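+# default to the parent directory when no parameter server checkout is configured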
+endif +ifeq ($(PS_THIRD_PATH), NONE) +PS_THIRD_PATH = $(PS_PATH)/third_party +endif + +ifeq ($(USE_DIST_PS),1) +MSHADOW_CFLAGS += -DMSHADOW_DIST_PS=1 -std=c++11 \ + -I$(PS_PATH)/src -I$(PS_THIRD_PATH)/include +PS_LIB = $(addprefix $(PS_PATH)/build/, libps.a libpsmain.a) \ + $(addprefix $(PS_THIRD_PATH)/lib/, libgflags.a libzmq.a libprotobuf.a \ + libglog.a libz.a libsnappy.a) +MSHADOW_NVCCFLAGS += --std=c++11 +else + MSHADOW_CFLAGS+= -DMSHADOW_DIST_PS=0 +endif diff --git a/mshadow-ps/.gitignore b/mshadow-ps/.gitignore new file mode 100644 index 000000000000..076c1aa82e8b --- /dev/null +++ b/mshadow-ps/.gitignore @@ -0,0 +1,3 @@ +Makefile +test +test.cpp diff --git a/mshadow-ps/README.md b/mshadow-ps/README.md new file mode 100644 index 000000000000..9c90cc9f3c9d --- /dev/null +++ b/mshadow-ps/README.md @@ -0,0 +1,4 @@ +mshadow-ps +==== +This folder contains mshadow-ps parameter server interface for mshadow GPU/CPU Tensor. See [guide on mshadow-ps](../guide/mshadow-ps) for introduction of the interface. + diff --git a/mshadow-ps/kv_array.h b/mshadow-ps/kv_array.h new file mode 100644 index 000000000000..8f9c96e2807c --- /dev/null +++ b/mshadow-ps/kv_array.h @@ -0,0 +1,125 @@ +#pragma once +#include "parameter/shared_parameter.h" +#include "ps.h" +namespace PS { + +DECLARE_string(app_name); + +template +class KVArray : public SharedParameter { + public: + KVArray(const string& my_name = FLAGS_app_name + "_model", + const string& parent_name = FLAGS_app_name) : + SharedParameter(my_name, parent_name) { } + virtual ~KVArray() { } + + void setArray(int key, V* data, size_t size) { + val_[key] = SArray(data, size, false); + } + void setUpdater(mshadow::ps::IModelUpdater* updater) { + updater_ = updater; + } + + // SArray& array(int key) { return val_[key]; } + + // funcs will be called by the system + MessagePtrList slice(const MessagePtr& msg, const KeyRangeList& krs); + void getValue(const MessagePtr& msg); + void setValue(const MessagePtr& msg); + protected: + std::unordered_map> val_; + // an array is placed into multiple servers only if its length > min_slice_size + size_t min_slice_size_ = 1000; + mshadow::ps::IModelUpdater* updater_ = nullptr; +}; + + +template +void KVArray::setValue(const MessagePtr& msg) { + CHECK_EQ(msg->value.size(), 1); + SArray recv_data(msg->value[0]); + Range kr(msg->task.key_range()); + CHECK_EQ(kr.size(), recv_data.size()); + int key = msg->task.key_channel(); + auto& my_val = val_[key]; + + if (isWorker()) { + if (my_val.empty()) my_val.resize(kr.size(), 0); + CHECK_GE(my_val.size(), kr.end()); + my_val.segment(kr).copyFrom(recv_data); + } else if (isServer()) { + // TODO this server can do flexible consistency control here + + if (my_val.empty()) { + // initialize weight + my_val.resize(kr.size(), 0); + CHECK_NOTNULL(updater_)->InitModel(key, my_val.data(), my_val.size()); + } + + // update weight + CHECK_GE(my_val.size(), kr.size()); + CHECK_NOTNULL(updater_)->Update(key, recv_data.data(), recv_data.size()); + } +} + +// only be called at servers, namely a worker pull data from this server +template +void KVArray::getValue(const MessagePtr& msg) { + auto& my_val = val_[msg->task.key_channel()]; + Range kr(msg->task.key_range()); + if (my_val.empty()) { + // initialize weight + my_val.resize(kr.size(), 0); + CHECK_NOTNULL(updater_)->InitModel(msg->task.key_channel(), my_val.data(), my_val.size()); + } + + // TODO store the kr in memory + CHECK_EQ(my_val.size(), kr.size()); + SArray send_data(kr.size()); + send_data.copyFrom(my_val); + 
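+  // attach the copied weights to the reply message, so the pulling worker
+  // receives the current server-side model for this key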
msg->addValue(send_data); +} + +// divide a message into n part, where part i goes to server i. it's a zero-copy +// implementation +template +MessagePtrList KVArray::slice(const MessagePtr& msg, const KeyRangeList& krs) { + // divide the key range + size_t n = krs.size(); + MessagePtrList ret(n); + Range kr(msg->task.key_range()); + for (size_t i = 0; i < n; ++i) { + ret[i] = MessagePtr(new Message()); + ret[i]->miniCopyFrom(*msg); + ret[i]->valid = true; + auto mut_kr = ret[i]->task.mutable_key_range(); + if (kr.size() < min_slice_size_) { + if (i == 0) { + // server 0 get all data + kr.to(mut_kr); + } else { + Range(0,0).to(mut_kr); + // do not sent to server 1 - n + ret[i]->valid = false; + } + } else { + kr.evenDivide(n, i).to(mut_kr); + } + } + + // divide the data + for (size_t i = 0; i < msg->value.size(); ++i) { + SArray data(msg->value[i]); + CHECK_EQ(data.size(), kr.size()); + for (size_t j = 0; j < n; ++j) { + if (ret[j]->valid) { + Range kr(ret[j]->task.key_range()); + ret[j]->addValue(data.segment(kr)); + } + } + } + return ret; +} + + +} // namespace PS diff --git a/mshadow-ps/ps.h b/mshadow-ps/ps.h new file mode 100644 index 000000000000..6e6b08d2bd64 --- /dev/null +++ b/mshadow-ps/ps.h @@ -0,0 +1,303 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file ps.h + * \brief parameter server abstraction for mshadow tensor + * this is a plugin of mshadow that can be used to syncrhonize + * parameters across device and machines + * + * \author Tianqi Chen, Mu Li + */ +#ifndef MSHADOW_PS_H_ +#define MSHADOW_PS_H_ +#include +// optionally support of lambda function in C++11, if available +#if __cplusplus >= 201103L +#include +#endif // C++11 +#include "../mshadow/tensor.h" + +/*! \brief whether to adapt distributed PS from parameter-server */ +#ifndef MSHADOW_DIST_PS +#define MSHADOW_DIST_PS 1 +#endif + +namespace mshadow { +/*! \brief namespace of mshadow-ps */ +namespace ps { +/*! + * \brief interface of parameter server + * \tparam xpu the device of the data lies + * \tparam DType the type of element in the tensor + */ +template +class ISharedModel { + public: + /*! + * \brief callback function that will be executed when pull request finishes + * before calling the callback, the thread context is already switched + * to the device of pullrequest + * \param stream the stream of callback thread, it is recommended to operate using this stream + * \param arg the argument of callback function + */ + typedef void (CallbackFunction) (Stream *stream, void *arg); + /*! \brief virtual destructor */ + virtual ~ISharedModel(void) {} + /*! + * \brief Set param for the layer from string + * \param name parameter name + * \param val string for configuration + */ + virtual void SetParam(const char *name, const char *val) {} + /*! + * \brief initialize the paramerver server client + * \param devices specifies the possible device id + * to be input from Push and Pull, + */ + virtual void Init(const std::vector &devices) {} + /*! + * \brief initialize the paramerver server client + * without specifying the devices, only device 0 is allowed + */ + inline void Init(void) { + std::vector dev; + dev.push_back(0); + this->Init(dev); + } + /*! 
+ * \brief initialize a key with certain shape + * must be called before using Push/PullReq/PullWait + * on the corresponding key + * \param shape the shape content of the key + * \param key the unique key to indicate the tensor + * this is unique per device + * \param devid the device id this tensor lies in + */ + template + inline void InitKey(Shape shape, + int key, int devid) { + this->InitKey_(shape.FlatTo2D(), key, devid); + } + /*! + * \brief wait until the pull event finishes + * if there was no pull request, wait will directly returns + * \param key the unique key to indicate the tensor + * this is unique per device + * \param devid the device id this tensor lies in + */ + virtual void PullWait(int key, int devid = 0) = 0; + /*! + * \brief push out a tensor to parameter server + * this call is asynchronize and returns immediately + * + * \param data the data + * \param key the unique key to indicate the tensor + * this is unique per device + * \param devid the device id this tensor lies in + * \param priority the priority of this operation, + * the bigger the number is the higher the priority will be + */ + template + inline void Push(Tensor data, + int key, + int devid = 0, + int priority = 0) { + this->Push_(data.FlatTo2D(), key, devid, priority); + } + /*! + * \brief send a pull request, to pull parameter into data + * this call is asynchronize and returns immediately + * use PullWait to wait the event of copy finish + * + * \param data the data + * \param key the unique key to indicate the tensor, + * this is unique per device + * \param devid the device id this tensor lies in + * \param priority the priority of this operation, + * the bigger the number is the higher the priority will be + * \param callback the callback function that will + * be invoked when the request finishes + * \param callback_arg the argument to pass to callback + */ + template + inline void PullReq(Tensor data, + int key, + int devid = 0, + int priority = 0, + CallbackFunction callback = NULL, + void *callback_arg = NULL) { + this->PullReq_(data.FlatTo2D(), key, + devid, priority, callback, callback_arg); + } +#if __cplusplus >= 201103L + /*! + * \brief send a pull request, to pull parameter into data + * this call is asynchronize and returns immediately + * use PullWait to wait the event of copy finish + * this is the c++11 version that allows lambda function as callback + * \param data the data + * \param key the unique key to indicate the tensor, + * this is unique per device + * \param devid the device id this tensor lies in + * \param priority the priority of this operation, + * the bigger the number is the higher the priority will be + * \param callback the callback function + */ + template + inline void PullReq(Tensor data, + int key, + int devid, + int priority, + std::function *stream)> callback) { + // need to allocate space, because callback can happen latter.. + auto calbk = new std::function *stream)>(); + *calbk = callback; + this->PullReq(data, key, devid, priority, InvokeLambda_, calbk); + } +#endif // C++11 + protected: + /*! + * \brief initialize a key with certain shape + * \param shape the shape content of the key + * \param key the unique key to indicate the tensor + * this is unique per device + * \param devid the device id this tensor lies in + */ + virtual void InitKey_(Shape<2> shape, + int key, int devid) = 0; + /*! 
+ * \brief push out a tensor to parameter server + * this call is asynchronize and returns immediately + * + * \param data the data + * \param key the unique key to indicate the tensor + * this is unique per device + * \param devid the device id this tensor lies in + * \param priority the priority of this operation, + * the bigger the number is the higher the priority will be + */ + virtual void Push_(Tensor data, + int key, + int devid = 0, + int priority = 0) = 0; + /*! + * \brief send a pull request, to pull parameter into data + * this call is asynchronize and returns immediately + * use PullWait to wait the event of copy finish + * + * \param data the data + * \param key the unique key to indicate the tensor, + * this is unique per device + * \param devid the device id this tensor lies in + * \param priority the priority of this operation, + * the bigger the number is the higher the priority will be + * \param callback the callback function that will + * be invoked when the request finishes + * \param callback_arg the argument to pass to callback + */ + virtual void PullReq_(Tensor data, + int key, + int devid, + int priority, + CallbackFunction callback, + void *callback_arg) = 0; + + private: +// C++11 support for lambda prepare function +#if __cplusplus >= 201103L + /*! \brief hack function to convert lambda to callback function */ + inline static void InvokeLambda_(Stream *stream, void *fun) { + auto *fp = static_cast *stream)>*>(fun); + (*fp)(stream); + delete fp; + } +#endif // C++11 +}; +/*! \brief interface for customized mshadow server */ +template +class IModelUpdater { + public: + virtual ~IModelUpdater(void) {} + /*! + * \brief set parameters from outside + * \param name name of parameter + * \param val value of parameter + */ + virtual void SetParam(const char *name, const char *val) {} + /*! + * \brief init the model updater + * \param rank the rank of the node + * \param conf configuration + */ + virtual void InitUpdater(int rank, const std::string &conf) {} + /*! + * \brief initialize the model + * \param key the key of data we point to + * \param dptr the data pointer + * \param size size of the parameter key + */ + virtual void InitModel(int key, DType *dptr, size_t size) { + this->InitModel_(key, Tensor(dptr, Shape1(size))); + } + /*! + * update the model + * \param key the key of data we point to + * \param dptr the data pointer + * \param size size of the parameter key + */ + virtual void Update(int key, DType *dptr, size_t size) { + this->Update_(key, Tensor(dptr, Shape1(size))); + } + + protected: + /*! + * \brief initialize the model, user can implement this one + * to take advantage of tensor operations + * \param key the key of data we point to + * \param data the tensor data corresponding to the data we want to initialize + */ + virtual void InitModel_(int key, Tensor data) { + utils::Error("InitModel: not implemented"); + } + /*! + * \brief update the model, user can implement this one + * to take advantage of tensor operations + * \param key the key of data we point to + * \param data the tensor data corresponding to the data we want to initialize + */ + virtual void Update_(int key, Tensor data) { + utils::Error("InitModel: not implemented"); + } +}; +/*! + * \brief create customized server + * this is a server defined by user + * \return new server + */ +template +IModelUpdater *CreateModelUpdater(void); +} // namespace ps +} // namespace mshadow + +#include "./ps_local-inl.h" +#include "./ps_dist-inl.h" +namespace mshadow { +namespace ps { +/*! 
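+ * A minimal worker-side sketch, not a complete program (assuming `xpu` stands
+ * for mshadow::cpu or mshadow::gpu, `grad` is a 2D tensor living on device
+ * `devid`, `key` is an integer chosen by the caller, and `devices` is the
+ * std::vector of device ids passed to Init):
+ *
+ *   ISharedModel<xpu, float> *ps = CreateSharedModel<xpu, float>("local");
+ *   ps->Init(devices);
+ *   ps->InitKey(grad.shape_, key, devid);
+ *   ps->Push(grad, key, devid);
+ *   ps->PullReq(grad, key, devid);
+ *   ps->PullWait(key, devid);
+ *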
+ * \brief create a parameter server implementation + * \param type the type of paramerver server + * can either be "local" or "dist" + * \return the ISharedModel that can be used to synchronize weights + */ +template +inline ISharedModel *CreateSharedModel(const char *type) { + if (!strcmp("local", type)) return new LocalModel(); +#if MSHADOW_DIST_PS + if (!strcmp("dist", type)) return new DistModel(); +#endif + utils::Error("unknown server type %s\n", type); + return NULL; +} +} // namespace ps +} // namespace mshadow +#endif diff --git a/mshadow-ps/ps_dist-inl.h b/mshadow-ps/ps_dist-inl.h new file mode 100644 index 000000000000..ed955e9da6a1 --- /dev/null +++ b/mshadow-ps/ps_dist-inl.h @@ -0,0 +1,117 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file ps_local-inl.h + * \brief local multi-threading implementation of PS abstraction + * + * \author Tianqi Chen, Mu Li + */ +#ifndef MSHADOW_PS_DIST_INL_H_ +#define MSHADOW_PS_DIST_INL_H_ +#include "./ps.h" +#include "./ps_local-inl.h" + +#if MSHADOW_DIST_PS +#include "./kv_array.h" +#include "system/app.h" +namespace mshadow { +namespace ps { +template +class DistModel : public LocalModel { + public: + // parent type + typedef LocalModel Parent; + + // initialize the parameter server + virtual void Init(const std::vector &devices) { + Parent::Init(devices); + shared_model_ = new PS::KVArray(); + if (this->custom_server != NULL) { + delete this->custom_server; + this->custom_server = NULL; + } + } + virtual ~DistModel(void) { + } + + protected: + // do nothing + virtual void InitCustomerServer(void) { + } + virtual void ServerInitKey(Tensor weight, int key) { + // this is called when key get initialized for the first time + // weight can be used to hold the model that pulled back + // use this to initialize the key on serverside + using namespace PS; + MessagePtr pull_msg(new Message(kServerGroup)); + pull_msg->task.set_key_channel(key); + Range(0, weight.MSize()).to(pull_msg->task.mutable_key_range()); + shared_model_->setArray(key, weight.dptr_, weight.MSize()); + pull_msg->fin_handle = [this, weight, key]() { + // call PullReady to notify LocalServer pulling is ready + this->PullReady(weight, key); + }; + shared_model_->pull(pull_msg); + } + // override this function, to use parameter server + virtual void HandlePushFinish(Tensor data, + int key) { + // here we only use sum reduction, can change to others + for (index_t i = 1; i < data.size(0); ++i) { + data[0] += data[i]; + } + + // push + Tensor sendrecv = data[0]; + using namespace PS; + utils::Assert(data[0].CheckContiguous(), "data must be contiguous"); + SArray val; val.copyFrom(sendrecv.dptr_, sendrecv.MSize()); + MessagePtr push_msg(new Message(kServerGroup)); + push_msg->addValue(val); + // LL << val; + push_msg->task.set_key_channel(key); + Range(0, val.size()).to(push_msg->task.mutable_key_range()); + int push_time = CHECK_NOTNULL(shared_model_)->push(push_msg); + + // pull + MessagePtr pull_msg(new Message(kServerGroup, -1, push_time)); + pull_msg->task.set_key_channel(key); + Range(0, sendrecv.MSize()).to(pull_msg->task.mutable_key_range()); + shared_model_->setArray(key, sendrecv.dptr_, sendrecv.MSize()); + pull_msg->fin_handle = [this, sendrecv, key]() { + // call PullReady to notify LocalServer pulling is ready + this->PullReady(sendrecv, key); + }; + shared_model_->pull(pull_msg); + } + + private: + PS::KVArray* shared_model_ = nullptr; +}; + +template +class MShadowServerNode : public PS::App { + public: + // conf: get from the flag -app_conf + 
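+  // the constructor creates the user-defined IModelUpdater via
+  // CreateModelUpdater and hands it to the KVArray, so updates for
+  // pushed gradients are applied on the server side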
MShadowServerNode(const std::string &conf) : App() { + updater_ = CreateModelUpdater(); + + updater_->InitServer(myRank(), conf); + shared_model_ = new PS::KVArray(); + shared_model_->setUpdater(updater_); + } + virtual ~MShadowServerNode() { + delete updater_; + delete shared_model_; + } + private: + IModelUpdater *updater_; + PS::KVArray* shared_model_; +}; + +// NOTE: do not add PS::CreateServer here add it in the program that uses +// mshadow-ps + +} // namespace ps +} // namespace msahdow +#endif +#endif diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h new file mode 100644 index 000000000000..fa092dc68bce --- /dev/null +++ b/mshadow-ps/ps_local-inl.h @@ -0,0 +1,734 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file ps_local-inl.h + * \brief local multi-threading implementation of PS abstraction + * + * \author Tianqi Chen, Mu Li + */ +#ifndef MSHADOW_PS_LOCAL_INL_H_ +#define MSHADOW_PS_LOCAL_INL_H_ +#include +#include +#if defined(_OPENMP) +#include +#ifdef _MSC_VER +typedef int ms_omp_uint; +#else +typedef unsigned ms_omp_uint; +#endif +#endif + +#include "./thread.h" +#include "./thread_util.h" + +namespace mshadow { +namespace ps { +// multi-threaded implementation of +template +class LocalModel : public ISharedModel { + public: + // redefine callback function + typedef typename ISharedModel::CallbackFunction + CallbackFunction; + // constructor + LocalModel(void) { + init_end = 0; + perdev_pull_thread = 1; + perdev_push_thread = 1; + bigarray_bound = 1000 * 1000; + nthread_reduction = 8; + use_pin_memory = 1; + destroy_signal = false; + custom_server = NULL; + } + // destructor + virtual ~LocalModel(void) { + if (init_end != 0) { + destroy_signal = true; + for (size_t i = 0; i < push_queues.size(); ++i) { + push_queues[i].Abort(1); + } + for (size_t i = 0; i < pull_queues.size(); ++i) { + pull_queues[i].Abort(1); + } + for (size_t i = 0; i < thread_push_handler.size(); ++i) { + thread_push_handler[i].Join(); + } + for (size_t i = 0; i < thread_pull_handler.size(); ++i) { + thread_pull_handler[i].Join(); + } + for (size_t i = 0; i < push_queues.size(); ++i) { + push_queues[i].Destroy(); + } + push_map.Destroy(); + push_lock.Destroy(); + for (size_t i = 0; i < pull_queues.size(); ++i) { + pull_queues[i].Destroy(); + } + pull_map.Destroy(); + request_lock.Destroy(); + wait_lock.Destroy(); + wait_cond.Destroy(); + } + if (custom_server != NULL) delete custom_server; + } + virtual void SetParam(const char *name, const char *val) { + int key; + if (sscanf(name, "push_op[%d]", &key) == 1) { + if (!strcmp(val, "gather")) { + request_lock.Lock(); + push_operation[key] = kGather; + request_lock.Unlock(); + return; + } + if (!strcmp(val, "sum")) { + push_operation[key] = kSum; return; + } + utils::Error("unknown push operation %s", val); + } + if (!strcmp(name, "reduce_thread")) { + nthread_reduction = atoi(val); + } + if (!strcmp(name, "use_pin_memory")) { + use_pin_memory = atoi(val); + } + if (!strcmp(name, "bigarray_bound")) { + bigarray_bound = static_cast(atol(val)); + } + if (!strcmp(name, "pull_thread")) { + if (!strcmp(val, "ndev")) { + perdev_pull_thread = 1; + } else if (!strcmp(val, "one")) { + perdev_pull_thread = 0; + } else { + utils::Error("invalid value for parameter pull_thread,"\ + " can only be ndev or one"); + } + } + if (!strcmp(name, "push_thread")) { + if (!strcmp(val, "ndev")) { + perdev_push_thread = 1; + } else if (!strcmp(val, "one")) { + perdev_push_thread = 0; + } else { + utils::Error("invalid value for parameter push_thread,"\ + " can 
only be ndev or one"); + } + } + if (!strcmp(name, "update_on_server")) { + update_on_server = atoi(val); + } + cfgvec.push_back(std::make_pair(std::string(name), + std::string(val))); + } + virtual void PullWait(int key, int devid) { + const int wid = GetWorkIndex(devid); + PullEntry *p = pull_map.Get(key); + if (p == NULL || p->wait.size() == 0) return; + PullEntry &e = *p; + // wake up waiters if any + utils::Assert(e.wait.size() == devices.size(), + "PullWait: must initialize the wait"); + PullWaitRecord &w = e.wait[wid]; + if (!w.finished) { + wait_lock.Lock(); + w.nwait += 1; + while (!w.finished) { + wait_cond.Wait(&wait_lock); + } + w.nwait -= 1; + utils::Assert(w.nwait >= 0, "boundary check"); + wait_lock.Unlock(); + } + } + virtual void Init(const std::vector &devices) { + utils::Check(init_end == 0, + "LocalServer.Init can only call Init once"); + utils::Check(devices.size() != 0, + "LocalServer.Init: must at least contain 1 devices"); + this->devices = devices; + destroy_signal = false; + // initialize device id to local index + dev2index.clear(); + for (size_t i = 0; i < devices.size(); ++i) { + int devid = devices[i]; + utils::Assert(devid >= 0, "device id must be bigger than 0"); + if (devid >= static_cast(dev2index.size())) { + dev2index.resize(devid + 1, -1); + } + dev2index[devid] = static_cast(i); + } + // allocate space + pull_stream.resize(devices.size()); + push_stream.resize(devices.size()); + // initialize all the thread related things + if (perdev_push_thread != 0) { + push_queues.resize(devices.size()); + } else { + push_queues.resize(1); + } + for (size_t i = 0; i < push_queues.size(); ++i) { + push_queues[i].Init(); + } + push_map.Init(); + push_lock.Init(); + pull_map.Init(); + request_lock.Init(); + wait_lock.Init(); + wait_cond.Init(); + if (perdev_pull_thread != 0) { + pull_queues.resize(devices.size()); + } else { + pull_queues.resize(1); + } + for (size_t i = 0; i < pull_queues.size(); ++i) { + pull_queues[i].Init(); + } + // initialize the thread + if (perdev_push_thread != 0) { + thread_push_handler.resize(devices.size()); + for (size_t i = 0; i < devices.size(); ++i) { + std::pair *p + = new std::pair(); + *p = std::make_pair(this, i); + thread_push_handler[i].Start(PushLocalThread, p); + } + } else { + thread_push_handler.resize(1); + thread_push_handler[0].Start(PushGlobalThread, this); + } + // initialize pull handler + if (perdev_pull_thread != 0) { + thread_pull_handler.resize(devices.size()); + for (size_t i = 0; i < devices.size(); ++i) { + std::pair *p + = new std::pair(); + *p = std::make_pair(this, i); + thread_pull_handler[i].Start(PullLocalThread, p); + } + } else { + thread_pull_handler.resize(1); + thread_pull_handler[0].Start(PullGlobalThread, this); + } + this->InitCustomerServer(); + this->init_end = 1; + } + + protected: + /*! \brief operation performed locally in PS */ + enum LocalOp { + /*! \brief take sum of all devices over the same key */ + kSum = 0, + /*! 
+ * \brief concatenate(gather), + * the tensors in all devices with same key + */ + kGather = 1 + }; + virtual void InitKey_(Shape<2> shape, + int key, int devid) { + this->InitPullMap(key); + this->InitPushMap(key, shape); + } + + virtual void Push_(Tensor data, + int key, int devid, int priority) { + PullEntry &e = pull_map.GetRef(key); + e.req[GetWorkIndex(devid)].ready = false; + if (perdev_push_thread != 0) { + int wid = GetWorkIndex(devid); + push_queues[wid].Push(PullTask(data, key, devid), priority); + } else { + push_queues[0].Push(PullTask(data, key, devid), priority); + } + } + virtual void PullReq_(Tensor data, + int key, int devid, int priority, + CallbackFunction callback, + void *callback_arg) { + PullEntry &e = pull_map.GetRef(key); + utils::Assert(e.req.size() == devices.size(), + "PullReq: must initialize the key, req"); + utils::Assert(e.wait.size() == devices.size(), + "PullReq: must initialize the key, wait"); + const int wid = GetWorkIndex(devid); + PullReqRecord &r = e.req[wid]; + r.dest = data; + r.priority = priority; + r.callback = callback; + r.callback_arg = callback_arg; + // reset pull request finish mark + wait_lock.Lock(); + e.wait[wid].finished = false; + wait_lock.Unlock(); + // check ready event + request_lock.Lock(); + utils::Check(!r.pending, + "key = %d, cannot send duplicate pull request before it finishes", + key); + if (e.req[wid].ready) { + if (perdev_pull_thread != 0) { + pull_queues[wid].Push(std::make_pair(key, devid)); + } else { + pull_queues[0].Push(std::make_pair(key, devid)); + } + } else { + r.pending = true; + } + request_lock.Unlock(); + } + /*! + * \brief called to notify that the data is ready for pull + * \param data the data that can be pulled back + * \param the key of the data + */ + virtual void PullReady(Tensor data, int key) { + PullEntry &e = pull_map.GetRef(key); + utils::Assert(e.req.size() == devices.size(), + "PullReady: must initialize the key, req"); + request_lock.Lock(); + e.src = data; + for (index_t i = 0; i < e.req.size(); ++i) { + e.req[i].ready = true; + if (e.req[i].pending) { + if (perdev_pull_thread != 0) { + pull_queues[i].Push(std::make_pair(key, devices[i])); + } else { + pull_queues[0].Push(std::make_pair(key, devices[i])); + } + e.req[i].pending = false; + } + } + request_lock.Unlock(); + } + virtual void ServerInitKey(Tensor weight, int key) { + if (custom_server != NULL) { + // intialize server, and ready for pullback + custom_server->InitModel(key, weight.dptr_, weight.MSize()); + this->PullReady(weight, key); + } + } + /*! 
+ * \brief event handler for push finish + * called when all the data with same key comes int + * \param data the buffer holds the data in all devices + * \param result_buffer temporal buffer to hold the reduction result + * \param key the key of the data + */ + virtual void HandlePushFinish(Tensor data, + int key) { + LocalOp op = kSum; + typename std::map::const_iterator + it = push_operation.find(key); + if (it != push_operation.end() && it->first == key) { + op = it->second; + } + // customized server + if (custom_server != NULL) { + this->ReduceSum(data); + custom_server->Update(key, data[0].dptr_, data[0].MSize()); + PushEntry &e = push_map.GetRef(key); + this->PullReady(e.weight, key); + return; + } + switch (op) { + case kSum: { + this->ReduceSum(data); + this->PullReady(data[0], key); + return; + } + case kGather: { + this->PullReady(data.FlatTo2D(), key); + return; + } + default: utils::Error("unknown LocalOp"); + } + } + + virtual void InitCustomerServer(void) { + if (update_on_server != 0) { + custom_server = CreateModelUpdater(); + for (size_t j = 0; j < cfgvec.size(); ++j) { + custom_server->SetParam(cfgvec[j].first.c_str(), + cfgvec[j].second.c_str()); + } + custom_server->InitUpdater(0, std::string()); + } + } + protected: + // customized server + IModelUpdater *custom_server; + private: + /*! \brief task running */ + struct PullTask { + /*! \brief the task data source */ + Tensor data; + /*! \brief the key to the tensor */ + int key; + /*! + * \brief the device id, (key,devid), + * uniquely identifies a mem location + */ + int devid; + PullTask(void) {} + PullTask(Tensor data, int key, int devid) + : data(data), key(key), devid(devid) {} + }; + /*! \brief data structure to hold temporal push result */ + struct PushEntry { + // temporal space to hold input data + Tensor data; + // temporal space to hold weight, if needed + Tensor weight; + // indicator whether the certain devices is already copied in + std::vector copied; + // number of data copied in + int num_copied; + // version number of data used to hold incomming data in push + int copyin_version; + // use pinned memory + bool pin_memory; + // constructor + PushEntry(void) + : copyin_version(0) { + weight.dptr_ = NULL; + } + ~PushEntry(void) { + if (data.dptr_ != NULL) { + if (pin_memory) { + mshadow::FreeHost(&data); + if (weight.dptr_ != NULL) { + mshadow::FreeHost(&weight); + } + } else { + mshadow::FreeSpace(&data); + if (weight.dptr_ != NULL) { + mshadow::FreeSpace(&weight); + } + } + } + } + // constructor + inline void Init(int ndevice, Shape<2> shape, + bool pin_memory, bool need_weight) { + this->pin_memory = pin_memory; + data.shape_ = Shape4(2, ndevice, shape[0], shape[1]); + weight.shape_ = shape; + if (pin_memory) { + mshadow::AllocHost(&data); + if (need_weight) mshadow::AllocHost(&weight); + } else { + mshadow::AllocSpace(&data, false); + if (need_weight) mshadow::AllocSpace(&weight); + } + utils::Assert(data.CheckContiguous(), "Init"); + utils::Assert(!need_weight || weight.CheckContiguous(), "Init"); + num_copied = 0; + copied.resize(ndevice, false); + } + }; + // a record to remember things related to pull request + struct PullReqRecord { + // whether this record contains a pending request + // whether pull is ready to go + bool ready; + // waiting for pull ready + bool pending; + // the destination to pull data into + Tensor dest; + // the priority of the + int priority; + // callback function + CallbackFunction *callback; + // argument for callback + void *callback_arg; + PullReqRecord(void) : 
ready(false), pending(false) { + } + }; + // a record to help handle pullwait + struct PullWaitRecord { + // number of thread that waits for the request to finish + int nwait; + // the request was finished + bool finished; + PullWaitRecord(void) + : nwait(0), finished(true) { + // set finished to true so pull without pull request returns + } + }; + /*! \brief data structure to hold pull request */ + struct PullEntry { + // data to be pulled back + Tensor src; + // pullrequest record + std::vector req; + // whether there is thread waiting on this event + std::vector wait; + PullEntry(void) { + } + }; + // signal to notify all the thread about class destruction + bool destroy_signal; + // vector of devices + std::vector devices; + // device index to local index + std::vector dev2index; + //----- data structure used to support push ---- + // stream used by push thread each device for memcpy + std::vector*> push_stream; + // the queue used for push task + std::vector > push_queues; + // thread to handle push task + std::vector thread_push_handler; + // lock to lock push field + utils::Mutex push_lock; + // the map of push buffer + utils::ThreadSafeMap push_map; + // customized local reduction operation + std::map push_operation; + //----- data structure used to support pull ---- + // the queue used for pull task + std::vector > > pull_queues; + // stream used by pull thread each device for memcpy + std::vector*> pull_stream; + // the map to store pull status + utils::ThreadSafeMap pull_map; + // thread to handle pull task + std::vector thread_pull_handler; + // lock to lock request field + utils::Mutex request_lock; + // lock to lock wait field + utils::Mutex wait_lock; + // conditional variable to do waiting + utils::ConditionVariable wait_cond; + //---------configurations of server------- + int init_end; + // whether perform update on serverside + int update_on_server; + // use pinned memory + int use_pin_memory; + // number of reduction thread + int nthread_reduction; + // the threshold for big array + size_t bigarray_bound; + // whether use pull thread per device + int perdev_pull_thread; + // whether use push thread per device + int perdev_push_thread; + /*! 
\brief history of configurations */ + std::vector< std::pair > cfgvec; + // perform sum reduction + inline void ReduceSum(Tensor data) { + #if defined(_OPENMP) + if (data[0].MSize() >= bigarray_bound && + nthread_reduction != 0) { + ms_omp_uint ntask = static_cast(data.size(1)); + #pragma omp parallel for schedule(static) num_threads(nthread_reduction) + for (ms_omp_uint j = 0; j < ntask; ++j) { + for (index_t i = 1; i < data.size(0); ++i) { + data[0][j] += data[i][j]; + } + } + } else + #endif + { + for (index_t i = 1; i < data.size(0); ++i) { + data[0] += data[i]; + } + } + } + // push handler + inline void PushProc(utils::ThreadPQueue *queue) { + while (!destroy_signal) { + PullTask tsk; + if (queue->Pop(&tsk)) { + const int wid = GetWorkIndex(tsk.devid); + PushEntry &e = push_map.GetRef(tsk.key); + utils::Check(e.data[0][0].shape_ == tsk.data.shape_, + "Tensor with same key must share same shape"); + utils::Assert(!e.copied[wid], "data inconsistency"); + // start copy + SetDevice(tsk.devid); + Copy(e.data[e.copyin_version][wid], tsk.data, push_stream[wid]); + // wait till the copy finishes + push_stream[wid]->Wait(); + // mark copied + e.copied[wid] = true; + push_lock.Lock(); + e.num_copied += 1; + int cp_version = e.copyin_version; + bool push_finish = e.num_copied >= static_cast(devices.size()); + if (push_finish) { + // switch version + e.copyin_version = (e.copyin_version + 1) % e.data.size(0); + std::fill(e.copied.begin(), e.copied.end(), false); + e.num_copied = 0; + } + push_lock.Unlock(); + if (push_finish) { + this->HandlePushFinish(e.data[cp_version], tsk.key); + } + } else { + utils::Assert(destroy_signal, "abort but not destroy"); + } + } + } + inline void PushHandlerGlobal(void) { + // allocate stream resources + for (size_t i = 0; i < devices.size(); ++i) { + SetDevice(devices[i]); + push_stream[i] = NewStream(); + } + this->PushProc(&push_queues[0]); + // free resources + for (size_t i = 0; i < devices.size(); ++i) { + SetDevice(devices[i]); + DeleteStream(push_stream[i]); + } + } + inline void PushHandlerLocal(size_t tid) { + utils::Assert(tid < devices.size(), "threadid exceed boundary"); + utils::Assert(push_queues.size() == devices.size(), + "must have one pull_queue per device"); + // allocate stream resources + SetDevice(devices[tid]); + push_stream[tid] = NewStream(); + this->PushProc(&push_queues[tid]); + SetDevice(devices[tid]); + DeleteStream(push_stream[tid]); + } + /*!\brief entry point of loader thread */ + inline static MSHADOW_THREAD_PREFIX PushGlobalThread(void *pthread) { + static_cast(pthread)->PushHandlerGlobal(); + utils::ThreadExit(NULL); + return NULL; + } + inline static MSHADOW_THREAD_PREFIX PushLocalThread(void *arg) { + std::pair *p + = static_cast*>(arg); + p->first->PushHandlerLocal(p->second); + delete p; + utils::ThreadExit(NULL); + return NULL; + } + // push handler procedure + inline void PullProc(utils::ThreadPQueue > *queue) { + while (!destroy_signal) { + std::pair tsk; + if (queue->Pop(&tsk)) { + const int key = tsk.first; + const int devid = tsk.second; + const int wid = GetWorkIndex(devid); + PullEntry &e = pull_map.GetRef(key); + { + // handle request + utils::Assert(e.req.size() == devices.size(), + "PullHandler: must initialize the key, req"); + PullReqRecord &r = e.req[wid]; + SetDevice(devid); + Copy(r.dest, e.src, pull_stream[wid]); + // callback, if any + if (r.callback != NULL) { + (*r.callback)(pull_stream[wid], r.callback_arg); + } + // wait till the operation finishes + pull_stream[wid]->Wait(); + } + { + // wake up 
waiters if any + utils::Assert(e.wait.size() == devices.size(), + "PullHandler, must initialize the key, req"); + PullWaitRecord &w = e.wait[wid]; + wait_lock.Lock(); + w.finished = true; + if (w.nwait != 0) { + wait_cond.Broadcast(); + } + wait_lock.Unlock(); + } + } else { + utils::Assert(destroy_signal, "abort but not destroy"); + } + } + } + // use one thread for all pull actions + inline void PullHandlerGlobal(void) { + // allocate stream resources + for (size_t i = 0; i < devices.size(); ++i) { + SetDevice(devices[i]); + pull_stream[i] = NewStream(); + } + this->PullProc(&pull_queues[0]); + // free resources + for (size_t i = 0; i < devices.size(); ++i) { + SetDevice(devices[i]); + DeleteStream(pull_stream[i]); + } + } + inline void PullHandlerLocal(size_t tid) { + utils::Assert(tid < devices.size(), "threadid exceed boundary"); + utils::Assert(pull_queues.size() == devices.size(), + "must have one pull_queue per device"); + // allocate stream resources + SetDevice(devices[tid]); + pull_stream[tid] = NewStream(); + this->PullProc(&pull_queues[tid]); + SetDevice(devices[tid]); + DeleteStream(pull_stream[tid]); + } + /*!\brief entry point of pull thread, one thread for all devices */ + inline static MSHADOW_THREAD_PREFIX PullGlobalThread(void *arg) { + static_cast(arg)->PullHandlerGlobal(); + utils::ThreadExit(NULL); + return NULL; + } + inline static MSHADOW_THREAD_PREFIX PullLocalThread(void *arg) { + std::pair *p + = static_cast*>(arg); + p->first->PullHandlerLocal(p->second); + delete p; + utils::ThreadExit(NULL); + return NULL; + } + // get internal index of device + inline int GetWorkIndex(int devid) const { + utils::Check(devid >= 0 && + devid < static_cast(dev2index.size()) && + dev2index[devid] >= 0, + "Push: invalid devid"); + return dev2index[devid]; + } + // functions to handle pull + inline void InitPullMap(int key) { + pull_map.Init(key); + PullEntry &e = pull_map.GetRef(key); + request_lock.Lock(); + // must recheck after lock + if (e.req.size() == 0) { + e.req.resize(devices.size(), PullReqRecord()); + } + request_lock.Unlock(); + // check wait map + wait_lock.Lock(); + // must recheck after lock + if (e.wait.size() == 0) { + e.wait.resize(devices.size(), PullWaitRecord()); + } + wait_lock.Unlock(); + } + // functions to handle pull + inline void InitPushMap(int key, Shape<2> shape) { + push_map.Init(key); + PushEntry &e = push_map.GetRef(key); + push_lock.Lock(); + if (e.copied.size() == 0) { + e.Init(devices.size(), shape, + use_pin_memory != 0, update_on_server != 0); + } + this->ServerInitKey(e.weight, key); + push_lock.Unlock(); + } +}; +} // namespace ps +} // namespace mshadow +#endif // MSHADOW_PS_LOCAL_INL_H_ diff --git a/mshadow-ps/thread.h b/mshadow-ps/thread.h new file mode 100644 index 000000000000..382e17a447bf --- /dev/null +++ b/mshadow-ps/thread.h @@ -0,0 +1,251 @@ +#ifndef MSHADOW_PS_THREAD_H_ +#define MSHADOW_PS_THREAD_H_ +/*! + * \file thread.h + * \brief this header include the minimum necessary resource for multi-threading that can be compiled in windows, linux, mac + * \author Tianqi Chen + */ +#ifdef _MSC_VER +#include "../mshadow/utils.h" +#include +#include +namespace mshadow { +namespace utils { +/*! 
\brief simple semaphore used for synchronization */ +class Semaphore { + public : + inline void Init(int init_val) { + sem = CreateSemaphore(NULL, init_val, 10, NULL); + utils::Check(sem != NULL, "create Semaphore error"); + } + inline void Destroy(void) { + CloseHandle(sem); + } + inline void Wait(void) { + utils::Check(WaitForSingleObject(sem, INFINITE) == WAIT_OBJECT_0, "WaitForSingleObject error"); + } + inline void Post(void) { + utils::Check(ReleaseSemaphore(sem, 1, NULL) != 0, "ReleaseSemaphore error"); + } + private: + HANDLE sem; +}; + +/*! \brief mutex under windows */ +class Mutex { + public: + inline void Init(void) { + utils::Check(InitializeCriticalSectionAndSpinCount(&mutex, 0x00000400) != 0, + "Mutex::Init fail"); + } + inline void Lock(void) { + EnterCriticalSection(&mutex); + } + inline void Unlock(void) { + LeaveCriticalSection(&mutex); + } + inline void Destroy(void) { + DeleteCriticalSection(&mutex); + } + + private: + friend class ConditionVariable; + CRITICAL_SECTION mutex; +}; + +// conditional variable that uses pthread +class ConditionVariable { + public: + // initialize conditional variable + inline void Init(void) { + InitializeConditionVariable(&cond); + } + // destroy the thread + inline void Destroy(void) { + //DeleteConditionVariable(&cond); + } + // wait on the conditional variable + inline void Wait(Mutex *mutex) { + utils::Check(SleepConditionVariableCS(&cond, &(mutex->mutex), INFINITE) != 0, + "ConditionVariable:Wait fail"); + } + inline void Broadcast(void) { + WakeAllConditionVariable(&cond); + } + inline void Signal(void) { + WakeConditionVariable(&cond); + } + + private: + CONDITION_VARIABLE cond; +}; + +/*! \brief simple thread that wraps windows thread */ +class Thread { + private: + HANDLE thread_handle; + unsigned thread_id; + public: + inline void Start(unsigned int __stdcall entry(void*), void *param) { + thread_handle = (HANDLE)_beginthreadex(NULL, 0, entry, param, 0, &thread_id); + } + inline int Join(void) { + WaitForSingleObject(thread_handle, INFINITE); + return 0; + } +}; +/*! 
\brief exit function called from thread */ +inline void ThreadExit(void *status) { + _endthreadex(0); +} +#define MSHADOW_THREAD_PREFIX unsigned int __stdcall +} // namespace utils +} // namespace mshadow +#else +// thread interface using g++ +#include +#include +#include +namespace mshadow { +namespace utils { +/*!\brief semaphore class */ +class Semaphore { + #ifdef __APPLE__ + private: + sem_t* semPtr; + char sema_name[20]; + private: + inline void GenRandomString(char *s, const int len) { + static const char alphanum[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" ; + for (int i = 0; i < len; ++i) { + s[i] = alphanum[rand() % (sizeof(alphanum) - 1)]; + } + s[len] = 0; + } + public: + inline void Init(int init_val) { + sema_name[0]='/'; + sema_name[1]='s'; + sema_name[2]='e'; + sema_name[3]='/'; + GenRandomString(&sema_name[4], 16); + if((semPtr = sem_open(sema_name, O_CREAT, 0644, init_val)) == SEM_FAILED) { + perror("sem_open"); + exit(1); + } + utils::Check(semPtr != NULL, "create Semaphore error"); + } + inline void Destroy(void) { + if (sem_close(semPtr) == -1) { + perror("sem_close"); + exit(EXIT_FAILURE); + } + if (sem_unlink(sema_name) == -1) { + perror("sem_unlink"); + exit(EXIT_FAILURE); + } + } + inline void Wait(void) { + sem_wait(semPtr); + } + inline void Post(void) { + sem_post(semPtr); + } + #else + private: + sem_t sem; + public: + inline void Init(int init_val) { + if (sem_init(&sem, 0, init_val) != 0) { + utils::Error("Semaphore.Init:%s", strerror(errno)); + } + } + inline void Destroy(void) { + if (sem_destroy(&sem) != 0) { + utils::Error("Semaphore.Destroy:%s", strerror(errno)); + } + } + inline void Wait(void) { + if (sem_wait(&sem) != 0) { + utils::Error("Semaphore.Wait:%s", strerror(errno)); + } + } + inline void Post(void) { + if (sem_post(&sem) != 0) { + utils::Error("Semaphore.Post:%s", strerror(errno)); + } + } + #endif +}; + +// mutex that works with pthread +class Mutex { + public: + inline void Init(void) { + pthread_mutex_init(&mutex, NULL); + } + inline void Lock(void) { + pthread_mutex_lock(&mutex); + } + inline void Unlock(void) { + pthread_mutex_unlock(&mutex); + } + inline void Destroy(void) { + pthread_mutex_destroy(&mutex); + } + + private: + friend class ConditionVariable; + pthread_mutex_t mutex; +}; + +// conditional variable that uses pthread +class ConditionVariable { + public: + // initialize conditional variable + inline void Init(void) { + pthread_cond_init(&cond, NULL); + } + // destroy the thread + inline void Destroy(void) { + pthread_cond_destroy(&cond); + } + // wait on the conditional variable + inline void Wait(Mutex *mutex) { + pthread_cond_wait(&cond, &(mutex->mutex)); + } + inline void Broadcast(void) { + pthread_cond_broadcast(&cond); + } + inline void Signal(void) { + pthread_cond_signal(&cond); + } + + private: + pthread_cond_t cond; +}; + +/*!\brief simple thread class */ +class Thread { + private: + pthread_t thread; + public : + inline void Start(void * entry(void*), void *param) { + pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + pthread_create(&thread, &attr, entry, param); + } + inline int Join(void) { + void *status; + return pthread_join(thread, &status); + } +}; +inline void ThreadExit(void *status) { + pthread_exit(status); +} +} // namespace utils +} // namespace mshadow +#define MSHADOW_THREAD_PREFIX void * +#endif // Linux +#endif // MSHADOW_PS_THREAD_H_ diff --git a/mshadow-ps/thread_util.h b/mshadow-ps/thread_util.h new file mode 100644 index 
000000000000..607d69f83c3a --- /dev/null +++ b/mshadow-ps/thread_util.h @@ -0,0 +1,143 @@ +#ifndef MSHADOW_PS_THREAD_UTIL_H_ +#define MSHADOW_PS_THREAD_UTIL_H_ +/*! + * \file thread_util.h + * \brief data structures for multi-threading communication + * \author Tianqi Chen + */ +#include +#include +#include "./thread.h" +namespace mshadow { +namespace utils { +/*! + * \brief thread safe queue that can be used for customer consumer model + * in the future, it will support priority scheduling + * \tparam DType the content of the queue + */ +template +class ThreadPQueue { + public: + /*! \brief intitialize the queue, must call this before use */ + inline void Init(void) { + lock_.Init(); + counter_.Init(0); + } + /*! \brief destroy the resources on the queue */ + inline void Destroy(void) { + lock_.Destroy(); + counter_.Destroy(); + } + /*! + * \brief Destroy the queue + * wake up all the threads waits on pop + * this is usually used in class destructor + * \param max_nthread the maximum number of thread that + * could be waiting on the queue + */ + inline void Abort(int max_nthread = 1) { + for (int i = 0; i < max_nthread; ++ i) { + counter_.Post(); + } + } + /*! + * \brief push an element to the queue + * \param data the data to be puhed into queue + * \param optionally priority level to hint which + * element should be poped first + */ + inline void Push(const DType &data, int priority = 0) { + lock_.Lock(); + queue_.push(Entry(data, priority)); + lock_.Unlock(); + counter_.Post(); + } + /*! + * \brief pop an element from the queue + * this will block the thread if the queue is empty + * \param data_out the address to put output of the queue + * \return true if a correct element is returned + * false if abort is called and no element was left in queue + */ + inline bool Pop(DType *data_out) { + counter_.Wait(); + lock_.Lock(); + if (queue_.size() == 0) { + lock_.Unlock(); return false; + } + utils::Assert(queue_.size() != 0, "Queue.Pop"); + *data_out = queue_.top().data; + queue_.pop(); + lock_.Unlock(); + return true; + } + + private: + // entry in the queue + struct Entry { + DType data; + int priority; + Entry(const DType &data, int priority) + : data(data), priority(priority) {} + inline bool operator<(const Entry &b) const { + return priority < b.priority; + } + }; + + // the queue to push + std::priority_queue queue_; + // lock for accessing the queue + utils::Mutex lock_; + // counter to count number of push tasks + utils::Semaphore counter_; +}; + +// naive implementation of threadsafe map +template +class ThreadSafeMap { + public: + inline void Init(void) { + lock_.Init(); + } + inline void Destroy(void) { + for (typename std::map::iterator + it = map_.begin(); it != map_.end(); ++it) { + delete it->second; + } + lock_.Destroy(); + } + inline TValue *Get(int key) { + TValue *ret; + lock_.Lock(); + typename std::map::const_iterator + it = map_.find(key); + if (it == map_.end() || it->first != key) { + ret = NULL; + } else { + ret = it->second; + } + lock_.Unlock(); + return ret; + } + inline TValue &GetRef(int key) { + TValue *ret = this->Get(key); + utils::Assert(ret != NULL, "key does not exist"); + return *ret; + } + inline void Init(int key) { + lock_.Lock(); + if (map_.count(key) == 0) { + map_[key] = new TValue(); + } + lock_.Unlock(); + } + + private: + // lock for accessing the queue + utils::Mutex lock_; + std::map map_; +}; + +} // namespace utils +} // namespace mshadow +#endif // MSHADOW_PS_THREAD_UTIL_H_ diff --git a/mshadow/README.md b/mshadow/README.md new file mode 
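`ThreadPQueue` is the producer/consumer channel used by mshadow-ps. A minimal sketch of the intended usage, with a made-up `Task` payload and arbitrary priorities:

```
#include "mshadow-ps/thread_util.h"

// made-up payload type; any copyable, default-constructible type works
struct Task {
  int id;
  Task(void) : id(0) {}
  explicit Task(int id) : id(id) {}
};

void QueueExample(void) {
  mshadow::utils::ThreadPQueue<Task> queue;
  queue.Init();
  // producer side: a larger priority value is popped first
  queue.Push(Task(1), 0);
  queue.Push(Task(2), 10);
  // consumer side: Pop blocks until data arrives or Abort is called
  Task task;
  while (queue.Pop(&task)) {
    if (task.id == 1) break;  // Task(2) is returned before Task(1)
  }
  // wake any consumer still blocked in Pop before tearing down
  queue.Abort(1);
  queue.Destroy();
}
```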
100644 index 000000000000..86276af013e2 --- /dev/null +++ b/mshadow/README.md @@ -0,0 +1,8 @@ +Code Guide +==== +This readme contains notes about code in mshadow. MShadow generally follows Google's C++ Style. + +Convention +==== +* Basically, all the files ends in ```-inl.h, -inl.cuh``` are implementations, and can be ignored if only using mshadow +* The files ends in ```.h``` are heavily commented with [doxyen format](http://www.doxygen.org/), and can be used to generate the corresponding document. diff --git a/mshadow/base.h b/mshadow/base.h new file mode 100644 index 000000000000..6336dfa023bc --- /dev/null +++ b/mshadow/base.h @@ -0,0 +1,359 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file base.h + * \brief definitions of base types, operators, macros functions + * + * \author Bing Xu, Tianqi Chen + */ +#ifndef MSHADOW_BASE_H_ +#define MSHADOW_BASE_H_ +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_DEPRECATE +#define NOMINMAX +#endif +#include +#include +#include +#include +#include +// macro defintiions +/*! + * \brief if this macro is define to be 1, + * mshadow should compile without any of other libs + */ +#ifndef MSHADOW_STAND_ALONE +#define MSHADOW_STAND_ALONE 0 +#endif +/*! \brief whether do padding during allocation */ +#ifndef MSHADOW_ALLOC_PAD +#define MSHADOW_ALLOC_PAD true +#endif +/*! + * \brief + * x dimension of data must be bigger pad_size * ratio to be alloced padded memory, + * otherwise use tide allocation + * for example, if pad_ratio=2, GPU memory alignement size is 32, + * then we will only allocate padded memory if x dimension > 64 + * set it to 0 then we will always allocate padded memory + */ +#ifndef MSHADOW_MIN_PAD_RATIO + #define MSHADOW_MIN_PAD_RATIO 2 +#endif + +#if MSHADOW_STAND_ALONE + #define MSHADOW_USE_CBLAS 0 + #define MSHADOW_USE_MKL 0 + #define MSHADOW_USE_CUDA 0 +#endif + +/*! + * \brief force user to use GPU stream during computation + * error will be shot when default stream NULL is used + */ +#ifndef MSHADOW_FORCE_STREAM +#define MSHADOW_FORCE_STREAM 0 +#endif + +/*! \brief use CBLAS for CBLAS */ +#ifndef MSHADOW_USE_CBLAS + #define MSHADOW_USE_CBLAS 0 +#endif +/*! \brief use MKL for BLAS */ +#ifndef MSHADOW_USE_MKL + #define MSHADOW_USE_MKL 1 +#endif +/*! + * \brief use CUDA support, must ensure that the cuda include path is correct, + * or directly compile using nvcc + */ +#ifndef MSHADOW_USE_CUDA + #define MSHADOW_USE_CUDA 1 +#endif + +/*! + * \brief seems CUDAARCH is deprecated in future NVCC + * set this to 1 if you want to use CUDA version smaller than 2.0 + */ +#ifndef MSHADOW_OLD_CUDA +#define MSHADOW_OLD_CUDA 0 +#endif + +/*! \brief whether use SSE */ +#ifndef MSHADOW_USE_SSE + #define MSHADOW_USE_SSE 1 +#endif +/*! 
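The macros above are compile-time switches and must be defined before any mshadow header is included. A sketch of one hypothetical CPU-only configuration; the chosen values and the tiny example are illustrative and assume a local CBLAS installation:

```
// example.cpp -- one hypothetical CPU-only configuration; the macro values
// are an illustration, pick whatever matches the local BLAS setup
#define MSHADOW_USE_CBLAS 1
#define MSHADOW_USE_MKL   0
#define MSHADOW_USE_CUDA  0
#include "mshadow/tensor.h"

int main(void) {
  float data[6] = {0.0f};
  // wrap existing memory: no allocation happens inside mshadow,
  // stride_ defaults to the lowest dimension (3 here)
  mshadow::Tensor<mshadow::cpu, 2, float> mat(data, mshadow::Shape2(2, 3));
  mat = 1.0f;  // element-wise expression assignment
  return 0;
}
```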
\brief whether use NVML to get dynamic info */ +#ifndef MSHADOW_USE_NVML + #define MSHADOW_USE_NVML 0 +#endif +// SSE is conflict with cudacc +#ifdef __CUDACC__ + #undef MSHADOW_USE_SSE + #define MSHADOW_USE_SSE 0 +#endif + +#if MSHADOW_USE_CBLAS +extern "C" { + #include +} +#elif MSHADOW_USE_MKL + #include + #include + #include + #include +#endif + +#if MSHADOW_USE_CUDA + #include + #include +#endif + +#if MSHADOW_USE_NVML + #include +#endif +// -------------------------------- +// MSHADOW_XINLINE is used for inlining template code for both CUDA and CPU code +#ifdef MSHADOW_XINLINE + #error "MSHADOW_XINLINE must not be defined" +#endif +#ifdef _MSC_VER +#define MSHADOW_FORCE_INLINE __forceinline +#pragma warning( disable : 4068 ) +#else +#define MSHADOW_FORCE_INLINE inline __attribute__((always_inline)) +#endif +#ifdef __CUDACC__ + #define MSHADOW_XINLINE MSHADOW_FORCE_INLINE __device__ __host__ +#else + #define MSHADOW_XINLINE MSHADOW_FORCE_INLINE +#endif +/*! \brief cpu force inline */ +#define MSHADOW_CINLINE MSHADOW_FORCE_INLINE + +#if defined(__GXX_EXPERIMENTAL_CXX0X) ||\ + defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L + #define MSHADOW_CONSTEXPR constexpr +#else + #define MSHADOW_CONSTEXPR const +#endif + +/*! + * \brief default data type for tensor string + * in code release, change it to default_real_t + * during development, change it to empty string so that missing + * template arguments can be detected + */ +#ifndef MSHADOW_DEFAULT_DTYPE +#define MSHADOW_DEFAULT_DTYPE = default_real_t +//#define MSHADOW_DEFAULT_DTYPE +#endif + +/*! \brief namespace for mshadow */ +namespace mshadow { +/*! \brief buffer size for each random number generator */ +const unsigned kRandBufferSize = 1000000; +/*! \brief pi */ +const float kPi = 3.1415926f; +/*! \brief type that will be used for index */ +typedef unsigned index_t; +/*! \brief float point type that will be used in default by mshadow */ +typedef float default_real_t; + +/*! \brief namespace for operators */ +namespace op { +// binary operator +/*! \brief mul operator */ +struct mul{ + /*! \brief map a, b to result using defined operation */ + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + return a * b; + } +}; +/*! \brief plus operator */ +struct plus { + /*! \brief map a, b to result using defined operation */ + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + return a + b; + } +}; +/*! \brief minus operator */ +struct minus { + /*! \brief map a, b to result using defined operation */ + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + return a - b; + } +}; +/*! \brief divide operator */ +struct div { + /*! \brief map a, b to result using defined operation */ + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + return a / b; + } +}; +/*! \brief get rhs */ +struct right { + /*! \brief map a, b to result using defined operation */ + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + return b; + } +}; +// unary operator/ function: example +// these operators can be defined by user, +// in the same style as binary and unary operator +// to use, simply write F( src ) +/*! \brief identity function that maps a real number to it self */ +struct identity{ + /*! \brief map a to result using defined operation */ + template + MSHADOW_XINLINE static DType Map(DType a) { + return a; + } +}; +} // namespace op +/*! \brief namespace for savers */ +namespace sv { +/*! \brief save to saver: = */ +struct saveto { + /*! 
\brief save b to a using save method */ + template + MSHADOW_XINLINE static void Save(DType &a, DType b) { + a = b; + } + /*! \brief helper constant to use BLAS, alpha */ + inline static default_real_t AlphaBLAS(void) { return 1.0f; } + /*! \brief helper constant to use BLAS, beta */ + inline static default_real_t BetaBLAS(void) { return 0.0f; } + /*! \brief corresponding binary operator type */ + typedef op::right OPType; +}; +/*! \brief save to saver: += */ +struct plusto { + /*! \brief save b to a using save method */ + template + MSHADOW_XINLINE static void Save(DType &a, DType b) { + a += b; + } + /*! \brief helper constant to use BLAS, alpha */ + inline static default_real_t AlphaBLAS(void) { return 1.0f; } + /*! \brief helper constant to use BLAS, beta */ + inline static default_real_t BetaBLAS(void) { return 1.0f; } + /*! \brief corresponding binary operator type */ + typedef op::plus OPType; +}; +/*! \brief minus to saver: -= */ +struct minusto { + /*! \brief save b to a using save method */ + template + MSHADOW_XINLINE static void Save(DType &a, DType b) { + a -= b; + } + /*! \brief helper constant to use BLAS, alpha */ + inline static default_real_t AlphaBLAS(void) { return -1.0f; } + /*! \brief helper constant to use BLAS, beta */ + inline static default_real_t BetaBLAS(void) { return 1.0f; } + /*! \brief corresponding binary operator type */ + typedef op::minus OPType; +}; +/*! \brief multiply to saver: *= */ +struct multo { + /*! \brief save b to a using save method */ + template + MSHADOW_XINLINE static void Save(DType &a, DType b) { + a *= b; + } + /*! \brief corresponding binary operator type */ + typedef op::mul OPType; +}; +/*! \brief divide to saver: /= */ +struct divto { + /*! \brief save b to a using save method */ + template + MSHADOW_XINLINE static void Save(DType& a, DType b) { + a /= b; + } + /*! \brief corresponding binary operator type */ + typedef op::div OPType; +}; +} // namespace sv +/*! \brief namespace for potential reducer operations */ +namespace red { +namespace limits { +/*! + * \brief minimum value of certain types + * \tparam DType data type + */ +template +MSHADOW_XINLINE DType MinValue(void); +/*! \brief minimum value of float */ +template<> +MSHADOW_XINLINE float MinValue(void) { + return -FLT_MAX; +} +/*! \brief minimum value of double */ +template<> +MSHADOW_XINLINE double MinValue(void) { + return -DBL_MAX; +} +/*! \brief minimum value of int */ +template<> +MSHADOW_XINLINE int MinValue(void) { + return INT_MIN; +} +} // namespace limits + +/*! \brief sum reducer */ +struct sum { + /*! \brief do reduction into dst */ + template + MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src) { + dst += src; + } + /*! + *\brief calculate gradient of redres with respect to redsrc, + * redres: reduced result, redsrc: one of reduction element + */ + template + MSHADOW_XINLINE static DType PartialGrad(DType redres, DType redsrc) { + return 1; + } + /*! + *\brief set the initial value during reduction + */ + template + MSHADOW_XINLINE static void SetInitValue(DType &initv) { + initv = 0; + } +}; +/*! \brief maximum reducer */ +struct maximum { + /*! \brief do reduction into dst */ + template + MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src) { + using namespace std; + dst = max(dst, src); + } + /*! 
+ * \brief calculate gradient of redres with respect to redsrc, + * redres: reduced result, redsrc: one of reduction element + */ + template + MSHADOW_XINLINE static DType PartialGrad(DType redres, DType redsrc) { + return redres == redsrc ? 1: 0; + } + /*! + *\brief set the initial value during reduction + */ + template + MSHADOW_XINLINE static void SetInitValue(DType &initv) { + initv = limits::MinValue(); + } +}; +} // namespace red +} // namespace mshadow +#endif // MSHADOW_BASE_H_ diff --git a/mshadow/cuda/cuda_reduce.cuh b/mshadow/cuda/cuda_reduce.cuh deleted file mode 100644 index b7808a6ffa30..000000000000 --- a/mshadow/cuda/cuda_reduce.cuh +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef MSHADOW_CUDA_REDUCE_CUH -#define MSHADOW_CUDA_REDUCE_CUH -/*! - * \file cuda_reduce.cuh - * \brief helper functions to do reduction - * \author Tianqi Chen - */ -namespace mshadow{ - namespace cuda{ - /* - * \brief reduce over the dimension x - * \tparam Reducer reducer - * \tparam x_bits dimension = 1< - inline __device__ void Reduce1D( volatile real_t buf[1< - inline __device__ void Reduce1DNotAlign( volatile real_t buf[1< - inline __device__ void ReduceX( volatile real_t buf[], int tid ){ - if( x_bits >= 10 ){ - if( tid < 512 ) Reducer::Reduce( buf[tid] , buf[tid + 512] ); - __syncthreads(); - } - if( x_bits >= 9 ){ - if( tid < 256 ) Reducer::Reduce( buf[tid] , buf[tid + 256] ); - __syncthreads(); - } - if( x_bits >= 8 ){ - if( tid < 128 ) Reducer::Reduce( buf[tid] , buf[tid + 128] ); - __syncthreads(); - } - if( x_bits >= 7 ){ - if( tid < 64 ) Reducer::Reduce( buf[tid] , buf[tid + 64 ] ); - __syncthreads(); - } - if( x_bits >= 6 ){ - if( tid < 32 ) Reducer::Reduce( buf[tid] , buf[tid + 32] ); - __syncthreads(); - } - // in warp optimization - if( x_bits >= 5 ){ - if( tid < 16 ) Reducer::Reduce( buf[tid] , buf[tid + 16] ); - __MSHADOW_EMUSYNC__; - } - if( x_bits >= 4 ){ - if( tid < 8 ) Reducer::Reduce( buf[tid] , buf[tid + 8 ] ); - __MSHADOW_EMUSYNC__; - } - if( x_bits >= 3 ){ - if( tid < 4 ) Reducer::Reduce( buf[tid] , buf[tid + 4 ] ); - __MSHADOW_EMUSYNC__; - } - if( x_bits >= 2 ){ - if( tid < 2 ) Reducer::Reduce( buf[tid] , buf[tid + 2 ] ); - __MSHADOW_EMUSYNC__; - } - if( x_bits >= 1 ){ - if( tid < 1 ) Reducer::Reduce( buf[tid] , buf[tid + 1 ] ); - __MSHADOW_EMUSYNC__; - } - }; - - template - inline __device__ void Reduce1D( volatile real_t buf[1<( buf, threadIdx.x ); - } - - // reduce with a upper bound - #define __RD_NON_ALIGN(els,x_bits) \ - els \ - if( xmax_bits >= x_bits && x_size >= (1 << x_bits) ){ \ - if( tid < (1 << x_bits) && tid + (1<( buf, tid ); \ - } \ - - template - inline __device__ void Reduce1DNotAlign( volatile real_t buf[], int x_size ){ - int tid = threadIdx.x; - __RD_NON_ALIGN(, 8) - __RD_NON_ALIGN(else, 7) - __RD_NON_ALIGN(else, 6) - __RD_NON_ALIGN(else, 5) - __RD_NON_ALIGN(else, 4) - __RD_NON_ALIGN(else, 3) - __RD_NON_ALIGN(else, 2) - __RD_NON_ALIGN(else, 1) - } - }; -}; - -#endif // MSHADOW_CUDA_REDUCE_CUH - diff --git a/mshadow/cuda/reduce.cuh b/mshadow/cuda/reduce.cuh new file mode 100644 index 000000000000..8fa0cf1dc061 --- /dev/null +++ b/mshadow/cuda/reduce.cuh @@ -0,0 +1,118 @@ +/*! 
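The operator structs above are the extension point the comments describe: a user-defined struct with a static `Map` can be passed to `F<op>(...)` and reused on both CPU and GPU. A sketch with a hypothetical `sigmoid` op (not part of mshadow):

```
#include <math.h>
#include "mshadow/tensor.h"

// user-defined unary operator in the same style as op::identity;
// MSHADOW_XINLINE lets the very same struct be used inside GPU kernels
struct sigmoid {
  template<typename DType>
  MSHADOW_XINLINE static DType Map(DType a) {
    return DType(1.0f / (1.0f + expf(-a)));
  }
};

void Activate(mshadow::Tensor<mshadow::cpu, 2, float> out,
              mshadow::Tensor<mshadow::cpu, 2, float> in) {
  using namespace mshadow::expr;
  // element-wise map: out[i][j] = sigmoid(in[i][j])
  out = F<sigmoid>(in);
}
```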
+ * Copyright (c) 2014 by Contributors + * \file reduce.cuh + * \brief helper functions to do reduction + * \author Tianqi Chen + */ +#ifndef MSHADOW_CUDA_REDUCE_CUH_ +#define MSHADOW_CUDA_REDUCE_CUH_ + +namespace mshadow { +namespace cuda { +/* + * \brief reduce over the dimension x + * \tparam Reducer reducer + * \tparam x_bits dimension = 1< +inline __device__ void Reduce1D(volatile DType buf[1 << x_bits]); +/* + * \brief reduce over the dimension x + * \tparam Reducer reducer + * \tparam xmax_bits maximum size of buffer + * \tparam DType content data type + * \param xsize size of x dimension, not sure if aligned + */ +template +inline __device__ void +Reduce1DNotAlign(volatile DType buf[1 << xmax_bits], int xsize); +// ===============================================x=== +// implementations afterwards, +// no need to read if only use the functions +// -------------------------------------------------- +#ifdef __DEVICE_EMULATION__ +#define __MSHADOW_EMUSYNC__ __syncthreads() +#else +#define __MSHADOW_EMUSYNC__ +#endif + +template +inline __device__ void ReduceX(volatile DType buf[], int tid) { + if (x_bits >= 10) { + if (tid < 512) Reducer::Reduce(buf[tid] , buf[tid + 512]); + __syncthreads(); + } + if (x_bits >= 9) { + if (tid < 256) Reducer::Reduce(buf[tid] , buf[tid + 256]); + __syncthreads(); + } + if (x_bits >= 8) { + if (tid < 128) Reducer::Reduce(buf[tid] , buf[tid + 128]); + __syncthreads(); + } + if (x_bits >= 7) { + if (tid < 64) Reducer::Reduce(buf[tid] , buf[tid + 64]); + __syncthreads(); + } + if (x_bits >= 6) { + if (tid < 32) Reducer::Reduce(buf[tid] , buf[tid + 32]); + __syncthreads(); + } + // in warp optimization + if (x_bits >= 5) { + if (tid < 16) Reducer::Reduce(buf[tid] , buf[tid + 16]); +#if MSHADOW_OLD_CUDA + __syncthreads(); +#else + __MSHADOW_EMUSYNC__; +#endif + } + if (x_bits >= 4) { + if (tid < 8) Reducer::Reduce(buf[tid] , buf[tid + 8]); + __MSHADOW_EMUSYNC__; + } + if (x_bits >= 3) { + if (tid < 4) Reducer::Reduce(buf[tid] , buf[tid + 4]); + __MSHADOW_EMUSYNC__; + } + if (x_bits >= 2) { + if (tid < 2) Reducer::Reduce(buf[tid] , buf[tid + 2]); + __MSHADOW_EMUSYNC__; + } + if (x_bits >= 1) { + if (tid < 1) Reducer::Reduce(buf[tid] , buf[tid + 1]); + __MSHADOW_EMUSYNC__; + } +} +template +inline __device__ void Reduce1D(volatile DType buf[1 << x_bits]) { + ReduceX(buf, threadIdx.x); +} +// reduce with a upper bound +#define __RD_NON_ALIGN(els, x_bits) \ + els \ + if (xmax_bits >= x_bits && x_size >= (1 << x_bits)) { \ + if (tid < (1 << x_bits) && tid + (1 << x_bits) < x_size) { \ + Reducer::Reduce(buf[tid] , buf[tid + (1 << x_bits)]); \ + } \ + __syncthreads(); \ + ReduceX(buf, tid); \ + } \ + +template +inline __device__ void Reduce1DNotAlign(volatile DType buf[], int x_size) { + int tid = threadIdx.x; + __RD_NON_ALIGN(, 8) + __RD_NON_ALIGN(else, 7) + __RD_NON_ALIGN(else, 6) + __RD_NON_ALIGN(else, 5) + __RD_NON_ALIGN(else, 4) + __RD_NON_ALIGN(else, 3) + __RD_NON_ALIGN(else, 2) + __RD_NON_ALIGN(else, 1) +} +} // namespace cuda +} // namespace mshadow +#endif // MSHADOW_CUDA_REDUCE_CUH_ + diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh index 61e477cf531b..a65add5237a7 100644 --- a/mshadow/cuda/tensor_gpu-inl.cuh +++ b/mshadow/cuda/tensor_gpu-inl.cuh @@ -1,231 +1,253 @@ -#ifndef MSHADOW_TENSOR_GPU_INL_CUH -#define MSHADOW_TENSOR_GPU_INL_CUH /*! 
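`Reduce1D` expects each block to be launched with exactly `1 << x_bits` threads and a shared buffer of the same size. An illustrative kernel (not part of mshadow) that sums one row per block using the `sum` reducer:

```
#include "mshadow/tensor.h"  // pulls in mshadow::cuda when compiled by nvcc

// illustrative kernel: each block sums one row of `n` floats into
// out[blockIdx.x]; launch with exactly (1 << x_bits) threads,
// e.g. BlockSumKernel<8><<<num_rows, 256>>>(d_out, d_in, n);
template<int x_bits>
__global__ void BlockSumKernel(float *out, const float *in, int n) {
  __shared__ float buf[1 << x_bits];
  float v = 0.0f;
  // strided load: every thread first accumulates its own share
  for (int i = threadIdx.x; i < n; i += (1 << x_bits)) {
    v += in[blockIdx.x * n + i];
  }
  buf[threadIdx.x] = v;
  __syncthreads();
  // cooperative tree reduction over the shared buffer
  mshadow::cuda::Reduce1D<mshadow::red::sum, x_bits>(buf);
  if (threadIdx.x == 0) out[blockIdx.x] = buf[0];
}
```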
+ * Copyright (c) 2014 by Contributors * \file tensor_gpu-inl.cuh * \brief implementation of GPU code using CUDA * \author Bing Xu, Tianqi Chen */ +#ifndef MSHADOW_CUDA_TENSOR_GPU_INL_CUH_ +#define MSHADOW_CUDA_TENSOR_GPU_INL_CUH_ #include "../tensor.h" -#include "cuda_reduce.cuh" +#include "./reduce.cuh" -namespace mshadow{ - namespace cuda{ - #ifndef __CUDA_ARCH__ - #warning "__CUDA_ARCH__ is not defined, I will assume compiling with CUDA verion greater than 2.0" - #endif - /* load unit for memory access */ - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200 - const int kMemUnitBits = 5; - const int kMaxThreadsPerBlock = 1024; - #else - const int kMemUnitBits = 4; - const int kMaxThreadsPerBlock = 512; - #endif - /*! \brief number of units that can do synchronized update, half warp size */ - const int kMemUnit = 1 << kMemUnitBits; - /*! \brief mask that could be helpful sometime */ - const int kMemUnitMask = kMemUnit - 1; - /*! \brief suggested thread number(logscale) for mapping kernel */ - const int kBaseThreadBits = 8; - /*! \brief suggested thread number for mapping kernel */ - const int kBaseThreadNum = 1 << kBaseThreadBits; - /*! \brief maximum value of grid */ - const int kMaxGridNum = 65535; - /*! \brief suggested grid number for mapping kernel */ - const int kBaseGridNum = 1024; - - /*! \brief get align stride for given size in x dimension */ - inline index_t GetAlignStride( index_t xsize, index_t xstride ){ - if( (xstride & (kMemUnit-1)) == 0 ){ - return ( (xsize + kMemUnit - 1) >> kMemUnitBits) << kMemUnitBits; - }else{ - // if originally space is not aligned, no necessary to to alligned thread allocation - return xsize; - } - } - inline void CheckLaunchParam( dim3 dimGrid, dim3 dimBlock, const char *estr = "" ){ - if( dimBlock.x*dimBlock.y*dimBlock.z > (unsigned)kMaxThreadsPerBlock || - dimGrid.x > 65535 || dimGrid.y > 65535 ){ - fprintf( stderr, "%s[%u,%u,%u]:", estr, dimBlock.x, dimBlock.y, dimBlock.z ); - utils::Error( "too large launch parameter\n"); - } - } - }; +namespace mshadow { +namespace cuda { +/* load unit for memory access, if CUDAARCH not defined, this is advanced nvcc */ +#if MSHADOW_OLD_CUDA +const int kMemUnitBits = 4; +const int kMaxThreadsPerBlock = 512; +#else +const int kMemUnitBits = 5; +const int kMaxThreadsPerBlock = 1024; +#endif +/*! \brief number of units that can do synchronized update, half warp size */ +const int kMemUnit = 1 << kMemUnitBits; +/*! \brief mask that could be helpful sometime */ +const int kMemUnitMask = kMemUnit - 1; +/*! \brief suggested thread number(logscale) for mapping kernel */ +const int kBaseThreadBits = 8; +/*! \brief suggested thread number for mapping kernel */ +const int kBaseThreadNum = 1 << kBaseThreadBits; +/*! \brief maximum value of grid */ +const int kMaxGridNum = 65535; +/*! \brief suggested grid number for mapping kernel */ +const int kBaseGridNum = 1024; +/*! 
\brief get align stride for given size in x dimension */ +inline index_t GetAlignStride(index_t xsize) { + if (xsize >= MSHADOW_MIN_PAD_RATIO * 32) { + return ((xsize + kMemUnit - 1) >> kMemUnitBits) << kMemUnitBits; + } else { + // if originally space is not aligned, no necessary to to alligned thread allocation + return xsize; + } +} +inline void CheckLaunchParam(dim3 dimGrid, dim3 dimBlock, const char *estr = "") { + if (dimBlock.x * dimBlock.y * dimBlock.z > static_cast(kMaxThreadsPerBlock) || + dimGrid.x > 65535 || dimGrid.y > 65535) { + fprintf(stderr, "%s[%u,%u,%u]:", estr, dimBlock.x, dimBlock.y, dimBlock.z); + utils::Error("too large launch parameter\n"); + } +} +template +__device__ void MapPlanProc(DstPlan dst, index_t xstride, + Shape<2> dshape, const Plan exp, int block_idx) { + const index_t tid = (block_idx << block_dim_bits) + threadIdx.x; + const int y = tid / xstride; + const int x = tid % xstride; + if (y < dshape[0] && x < dshape[1]) { + Saver::Save(dst.REval(y, x), exp.Eval(y,x)); + } +} +template +__global__ void MapPlanKernel(DstPlan dst, index_t xstride, + Shape<2> dshape, const Plan exp) { + MapPlanProc + (dst, xstride, dshape, exp, blockIdx.x); +} +template +__global__ void MapPlanLargeKernel(DstPlan dst, index_t xstride, + Shape<2> dshape, const Plan exp, int repeat) { + for (int i = 0; i < repeat; ++i) { + MapPlanProc + (dst, xstride, dshape, exp, blockIdx.x + i * grid_size); + } +} - namespace cuda { - template - __device__ void MapPlanProc( Tensor dst, const index_t xstride, const Plan exp, int block_idx ){ - const index_t tid = (block_idx << block_dim_bits) + threadIdx.x; - const int y = tid / xstride; - const int x = tid % xstride; - if (y < dst.shape[1] && x < dst.shape[0]) { - Saver::Save(dst[y][x], exp.Eval(y,x)); - } - } - template - __global__ void MapPlanKernel( Tensor dst, const index_t xstride, const Plan exp ){ - MapPlanProc( dst, xstride, exp, blockIdx.x ); - } - template - __global__ void MapPlanLargeKernel( Tensor dst, const index_t xstride, const Plan exp, int repeat ){ - for( int i = 0; i < repeat; ++i ){ - MapPlanProc( dst, xstride, exp, blockIdx.x + i*grid_size ); - } - } - - template - inline void MapPlan( Tensor dst, const expr::Plan &plan ){ - const index_t xstride = GetAlignStride( dst.shape[0], dst.shape.stride_ ); - const int num_block = ( dst.shape[1]*xstride + kBaseThreadNum-1) / kBaseThreadNum; - dim3 dimBlock(kBaseThreadNum, 1, 1); +template +inline void MapPlan(expr::Plan dst, + const expr::Plan &plan, + Shape<2> dshape, + cudaStream_t stream) { + const index_t xstride = GetAlignStride(dshape[1]); + const int num_block = (dshape[0] * xstride + kBaseThreadNum-1) / kBaseThreadNum; + dim3 dimBlock(kBaseThreadNum, 1, 1); + + if (num_block < kMaxGridNum) { + dim3 dimGrid(num_block, 1, 1); + MapPlanKernel, + expr::Plan > + <<>>(dst, xstride, dshape, plan); + } else { + int repeat = (num_block + kBaseGridNum-1) / kBaseGridNum; + dim3 dimGrid(kBaseGridNum, 1 , 1); + MapPlanLargeKernel, + expr::Plan > + <<>>(dst, xstride, dshape, plan, repeat); + } +} - if (num_block < kMaxGridNum) { - dim3 dimGrid(num_block, 1, 1); - MapPlanKernel, kBaseThreadBits> \ - <<>>(dst, xstride, plan); - } else { - int repeat = (num_block + kBaseGridNum-1) / kBaseGridNum; - dim3 dimGrid( kBaseGridNum, 1 , 1 ); - MapPlanLargeKernel, kBaseThreadBits, kBaseGridNum> \ - <<>>(dst, xstride, plan, repeat ); - } - } - }; // namespace cuda - - namespace cuda{ - template - __global__ void MapRedKeepLowestKernel( Tensor dst, Plan plan, real_t scale, Shape<2> eshape ){ - 
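`MapPlanKernel` assigns one thread per output element; each thread evaluates the right-hand-side plan at its own coordinate and stores through the saver. A CPU reference loop showing the same contract (a sketch, not mshadow code):

```
#include "mshadow/tensor.h"

// CPU reference of what one MapPlan launch computes:
// Saver is a saver from namespace sv (sv::saveto for '=', sv::plusto for '+='),
// dst is the destination plan and plan evaluates the right-hand side.
template<typename Saver, typename DstPlan, typename SrcPlan>
inline void MapPlanReference(DstPlan dst, mshadow::Shape<2> dshape,
                             const SrcPlan &plan) {
  for (mshadow::index_t y = 0; y < dshape[0]; ++y) {
    for (mshadow::index_t x = 0; x < dshape[1]; ++x) {
      // same contract as MapPlanProc: one saver call per output element
      Saver::Save(dst.REval(y, x), plan.Eval(y, x));
    }
  }
}
```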
const unsigned warp_size = 1 << warp_bits; - const unsigned x = (blockIdx.x< +__global__ void MapRedKeepLowestKernel(DstPlan dst, Plan plan, + DType scale, Shape<2> eshape) { + const unsigned warp_size = 1 << warp_bits; + const unsigned x = (blockIdx.x << warp_bits) + threadIdx.x; + // to avoid bank conflict + __shared__ DType s_res[warp_size][warp_size + 1]; + // note: reverse store [y][x], so that we can reduce over threadIdx.x, use warp optimization + if (threadIdx.y < eshape[0] && x < eshape[1]) { + s_res[threadIdx.x][threadIdx.y] = plan.Eval(threadIdx.y, x); + } + for (unsigned y = warp_size; y < eshape[0]; y += warp_size) { + if (threadIdx.y + y < eshape[0] && x < eshape[1]) { + Reducer::Reduce(s_res[threadIdx.x][threadIdx.y], plan.Eval(threadIdx.y + y, x)); + } + } + __syncthreads(); + if (eshape[0] >= warp_size) { + Reduce1D(s_res[threadIdx.y]); + } else { + Reduce1DNotAlign(s_res[threadIdx.y], eshape[0]); + } + __syncthreads(); - // note: reverse store [y][x], so that we can reduce over threadIdx.x, use warp optimization - if( threadIdx.y < eshape[1] && x < eshape[0] ){ - s_res[ threadIdx.x ][ threadIdx.y ] = plan.Eval( threadIdx.y, x ); - } - for( unsigned y = warp_size; y < eshape[1]; y += warp_size ){ - if( threadIdx.y + y < eshape[1] && x < eshape[0] ){ - Reducer::Reduce( s_res[ threadIdx.x ][ threadIdx.y ], plan.Eval( threadIdx.y + y, x ) ); - } - } - __syncthreads(); - if( eshape[1] >= warp_size ){ - Reduce1D( s_res[ threadIdx.y ] ); - }else{ - Reduce1DNotAlign( s_res[ threadIdx.y ], eshape[1] ); - } - __syncthreads(); - - if( threadIdx.y == 0 && x < eshape[0] ){ - Saver::Save( dst[x], s_res[ threadIdx.x ][ 0 ] * scale ); - } - } - - template - inline void MapReduceKeepLowest( Tensor dst, const expr::Plan &plan, real_t scale, Shape<2> eshape ){ - dim3 dimBlock( kMemUnit, kMemUnit ); - dim3 dimGrid ( (eshape[0]+kMemUnit-1) >> kMemUnitBits ); - CheckLaunchParam( dimGrid, dimBlock, "MapRedKeepLowestKernel" ); - MapRedKeepLowestKernel<<>>( dst, plan, scale, eshape ); - } - }; // namespace cuda - - namespace cuda{ - template - __global__ void MapReduceKeepDim2Kernel( Tensor dst, Plan plan, real_t scale, Shape<4> pshape ){ - const int block_size = 1 << block_dim_bits; - __shared__ real_t s_rec[ block_size ]; - const int c = blockIdx.x; - const index_t tot = pshape[0]*pshape[1]*pshape[3]; + if (threadIdx.y == 0 && x < eshape[1]) { + Saver::Save(dst.REval(0, x), s_res[threadIdx.x][0] * scale); + } +} - real_t res = Reducer::kInitV; - for( index_t i_offset = 0; i_offset < tot; i_offset += block_size ){ - index_t i = i_offset + threadIdx.x; - if( i< tot ){ - const index_t x = i % pshape[0]; - i /= pshape[0]; - const index_t y = i % pshape[1]; - const index_t n = i / pshape[1]; - Reducer::Reduce( res, plan.Eval( (n*pshape[2] + c) * pshape[1] + y, x ) ); - } - } - s_rec[ threadIdx.x ] = res; - __syncthreads(); - Reduce1D( s_rec ); - if( threadIdx.x == 0 ){ - Saver::Save( dst[c], s_rec[0]*scale ); - } - } +template +inline void MapReduceKeepLowest(expr::Plan dst, + const expr::Plan &plan, + DType scale, Shape<2> eshape, + cudaStream_t stream) { + dim3 dimBlock(kMemUnit, kMemUnit); + dim3 dimGrid((eshape[1] + kMemUnit - 1) >> kMemUnitBits); + CheckLaunchParam(dimGrid, dimBlock, "MapRedKeepLowestKernel"); + MapRedKeepLowestKernel, + expr::Plan > + <<>>(dst, plan, scale, eshape); +} - template - inline void MapReduceKeepDim2( Tensor dst, const Plan &plan, real_t scale, Shape<4> pshape ){ - dim3 dimBlock( kBaseThreadNum ); - dim3 dimGrid ( dst.shape[0] ); - CheckLaunchParam( dimGrid, 
dimBlock, "MapReduceKeepDim2" ); - MapReduceKeepDim2Kernel - <<>>( dst, plan, scale, pshape ); - } - }; - - namespace cuda{ - template - __global__ void SoftmaxKernel( Tensor dst, Tensor src ){ - const unsigned x_size = 1 << x_bits; - const int y = blockIdx.x; - __shared__ real_t s_rec[ x_size ]; - - // step 1: get max - if( threadIdx.x < dst.shape[ 0 ] ){ - s_rec[ threadIdx.x ] = src[ y ][ threadIdx.x ] ; - } - for( unsigned x = x_size; x < dst.shape[0]; x += x_size ){ - if( x + threadIdx.x < dst.shape[0] ){ - real_t a = src[ y ][ x + threadIdx.x ]; - s_rec[ threadIdx.x ] = max( a, s_rec[ threadIdx.x] ); - } - } - __syncthreads(); - if( threadIdx.x >= dst.shape[0] ){ - s_rec[ threadIdx.x ] = s_rec[0]; - } - __syncthreads(); - Reduce1D( s_rec ); - __syncthreads(); - real_t smax = s_rec[0]; - __syncthreads(); - s_rec[ threadIdx.x ] = 0.0f; - __syncthreads(); +template +__global__ void MapReduceKeepDim1Kernel(DstPlan dst, Plan plan, DType scale, Shape<4> pshape) { + const int block_size = 1 << block_dim_bits; + __shared__ DType s_rec[block_size]; + const int c = blockIdx.x; + const index_t tot = pshape[3] * pshape[2] * pshape[0]; + + DType res; Reducer::SetInitValue(res); + for (index_t i_offset = 0; i_offset < tot; i_offset += block_size) { + index_t i = i_offset + threadIdx.x; + if (i< tot) { + const index_t x = i % pshape[3]; + i /= pshape[3]; + const index_t y = i % pshape[2]; + const index_t n = i / pshape[2]; + Reducer::Reduce(res, plan.Eval((n * pshape[1] + c) * pshape[2] + y, x)); + } + } + s_rec[threadIdx.x] = res; + __syncthreads(); + Reduce1D(s_rec); + if (threadIdx.x == 0) { + Saver::Save(dst.REval(0, c), s_rec[0] * scale); + } +} - // calculate normalizer, with writeback - for( unsigned x = 0; x < dst.shape[0]; x += x_size ){ - if( x + threadIdx.x < dst.shape[0] ){ - real_t p = expf( src[ y ][ x + threadIdx.x ] - smax ); - s_rec[ threadIdx.x ] += p; - // write back first, will fetch later - dst[ y ][ x + threadIdx.x ] = p; - } - } - // calculate normalizer - __syncthreads(); - Reduce1D( s_rec ); - __syncthreads(); - real_t ssum = s_rec[0]; +template +inline void MapReduceKeepDim1(expr::Plan dst, + const expr::Plan &plan, + DType scale, Shape<4> pshape, + cudaStream_t stream) { + dim3 dimBlock(kBaseThreadNum); + dim3 dimGrid (pshape[1]); + CheckLaunchParam(dimGrid, dimBlock, "MapReduceKeepDim1"); + MapReduceKeepDim1Kernel, + expr::Plan > + <<>>(dst, plan, scale, pshape); +} - for( unsigned x = 0; x < dst.shape[0]; x += x_size ){ - if( x + threadIdx.x < dst.shape[0] ){ - dst[ y ][ x + threadIdx.x ] /= ssum; - } - } - } - - inline void Softmax( Tensor &dst, const Tensor &src ){ - dim3 dimBlock( kBaseThreadNum ); - dim3 dimGrid ( dst.shape[1] ); - utils::Assert( dst.shape == src.shape, "Softmax: shape mismatch" ); - CheckLaunchParam( dimGrid, dimBlock, "Softmax" ); - SoftmaxKernel<<>>( dst, src ); - } - }; // namespace cuda -}; // namespace mshadow -#endif // TENSOR_GPU_INL_H +template +__global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax) { + const unsigned x_size = 1 << x_bits; + const int y = blockIdx.x; + __shared__ DType s_rec[x_size]; + // step 1: get max + if (threadIdx.x < xmax) { + s_rec[threadIdx.x] = src.Eval(y, threadIdx.x); + } + for (unsigned x = x_size; x < xmax; x += x_size) { + if (x + threadIdx.x < xmax) { + DType a = src.Eval(y, x + threadIdx.x); + s_rec[threadIdx.x] = max(a, s_rec[threadIdx.x]); + } + } + __syncthreads(); + if (threadIdx.x >= xmax) { + s_rec[threadIdx.x] = s_rec[0]; + } + __syncthreads(); + Reduce1D(s_rec); + __syncthreads(); + 
DType smax = s_rec[0]; + __syncthreads(); + s_rec[threadIdx.x] = 0.0f; + __syncthreads(); + + // calculate normalizer, with writeback + for (unsigned x = 0; x < xmax; x += x_size) { + if (x + threadIdx.x < xmax) { + DType p = expf(src.Eval(y, x + threadIdx.x) - smax); + s_rec[threadIdx.x] += p; + // write back first, will fetch later + dst.REval(y, x + threadIdx.x) = p; + } + } + // calculate normalizer + __syncthreads(); + Reduce1D(s_rec); + __syncthreads(); + DType ssum = s_rec[0]; + + for (unsigned x = 0; x < xmax; x += x_size) { + if (x + threadIdx.x < xmax) { + dst.REval(y, x + threadIdx.x) /= ssum; + } + } +} +template +inline void Softmax(Tensor &dst, + const Tensor &src) { + dim3 dimBlock(kBaseThreadNum); + dim3 dimGrid(dst.size(0)); + utils::Check(dst.shape_ == src.shape_, "Softmax: shape mismatch"); + CheckLaunchParam(dimGrid, dimBlock, "Softmax"); + cudaStream_t stream = Stream::GetStream(dst.stream_); + SoftmaxKernel + <<>> + (expr::MakePlan(dst), + expr::MakePlan(src), + dst.size(1)); +} +} // namespace cuda +} // namespace mshadow +#endif // MSHADOW_CUDA_TENSOR_GPU_INL_CUH_ diff --git a/mshadow/dot_engine-inl.h b/mshadow/dot_engine-inl.h new file mode 100644 index 000000000000..168441657baa --- /dev/null +++ b/mshadow/dot_engine-inl.h @@ -0,0 +1,200 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file dot_engine-inl.h + * \brief definitions of how Matrix Multiplications can be evaluated + * \author Tianqi Chen + */ +#ifndef MSHADOW_DOT_ENGINE_INL_H_ +#define MSHADOW_DOT_ENGINE_INL_H_ +namespace mshadow { +namespace expr { +//--------------------------------------------------------------------- +// Matrix Multiplications, depends on BLAS Engine +//--------------------------------------------------------------------- +template +struct DotEngine { + inline static void Eval(Tensor *p_dst, + const Tensor &lhs, + const Tensor &rhs, + DType scale); +}; +// handles the dot +template +struct BLASEngine; +#if (MSHADOW_USE_CBLAS || MSHADOW_USE_MKL) +template<> +struct BLASEngine { + inline static CBLAS_TRANSPOSE GetT(bool t) { + return t ? 
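For reference, a plain CPU sketch of the per-row computation `SoftmaxKernel` performs: subtract the row maximum for numerical stability, exponentiate, then normalize. The function below is illustrative only:

```
#include <cmath>
#include <algorithm>
#include "mshadow/tensor.h"

// illustrative CPU reference of the row-wise softmax computed by SoftmaxKernel
inline void SoftmaxReference(mshadow::Tensor<mshadow::cpu, 2, float> dst,
                             mshadow::Tensor<mshadow::cpu, 2, float> src) {
  for (mshadow::index_t y = 0; y < src.size(0); ++y) {
    // step 1: row maximum, for numerical stability
    float smax = src[y][0];
    for (mshadow::index_t x = 1; x < src.size(1); ++x) {
      smax = std::max(smax, src[y][x]);
    }
    // step 2: exponentiate and accumulate the normalizer
    float ssum = 0.0f;
    for (mshadow::index_t x = 0; x < src.size(1); ++x) {
      dst[y][x] = std::exp(src[y][x] - smax);
      ssum += dst[y][x];
    }
    // step 3: normalize
    for (mshadow::index_t x = 0; x < src.size(1); ++x) {
      dst[y][x] /= ssum;
    }
  }
}
```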
CblasTrans : CblasNoTrans; + } + inline static void SetStream(Stream *stream) { + } + inline static void gemm(bool transa, bool transb, + int m, int n, int k, float alpha, + const float *A, int lda, const float *B, int ldb, + float beta, float *C, int ldc) { + cblas_sgemm(CblasColMajor, GetT(transa), GetT(transb), + m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + } + inline static void gemm(bool transa, bool transb, + int m, int n, int k, double alpha, + const double *A, int lda, const double *B, int ldb, + double beta, double *C, int ldc) { + cblas_dgemm(CblasColMajor, GetT(transa), GetT(transb), + m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + } + inline static void gemv(bool trans, int m, int n, + float alpha, const float *A, int lda, + const float *X, int incX, + float beta, float *Y, int incY) { + cblas_sgemv(CblasColMajor, GetT(trans), m, n, alpha, + A, lda, X, incX, beta, Y, incY); + } + inline static void gemv(bool trans, int m, int n, double alpha, + const double *A, int lda, + const double *X, int incX, + double beta, double *Y, int incY) { + cblas_dgemv(CblasColMajor, GetT(trans), m, n, alpha, + A, lda, X, incX, beta, Y, incY); + } + inline static void ger(int m, int n, float alpha, + const float *X, int incX, + const float *Y, int incY, float *A, int lda) { + cblas_sger(CblasColMajor, m, n, alpha, X, incX, Y, incY, A, lda); + } + inline static void ger(int m, int n, double alpha, + const double *X, int incX, + const double *Y, int incY, double *A, int lda) { + cblas_dger(CblasColMajor, m, n, alpha, X, incX, Y, incY, A, lda); + } +}; +#endif // MSHADOW_USE_CBLAS || MSHADOW_USE_MKL +// CuBLAS redirect code +#if MSHADOW_USE_CUDA +// All CuBLAS goes to here, use legacy API: not threadsafe +template<> +struct BLASEngine { + inline static char GetT(bool t) { + return t ? 'T' : 'N'; + } + inline static void SetStream(Stream *stream) { + cublasSetKernelStream(Stream::GetStream(stream)); + } + inline static void gemm(bool transa, bool transb, + int m, int n, int k, float alpha, + const float *A, int lda, + const float *B, int ldb, float beta, + float *C, int ldc) { + cublasSgemm(GetT(transa), GetT(transb), m, n, k, alpha, + A, lda, B, ldb, beta, C, ldc); + } + inline static void gemm(bool transa, bool transb, + int m, int n, int k, double alpha, + const double *A, int lda, + const double *B, int ldb, + double beta, double *C, int ldc) { + cublasDgemm(GetT(transa), GetT(transb), m, n, k, alpha, + A, lda, B, ldb, beta, C, ldc); + } + inline static void gemv(bool trans, int m, int n, float alpha, + const float *A, int lda, + const float *X, int incX, float beta, + float *Y, int incY) { + cublasSgemv(GetT(trans), m, n, alpha, A, lda, X, incX, beta, Y, incY); + } + inline static void gemv(bool trans, int m, int n, double alpha, + const double *A, int lda, + const double *X, int incX, + double beta, double *Y, int incY) { + cublasDgemv(GetT(trans), m, n, alpha, A, lda, X, incX, beta, Y, incY); + } + inline static void ger(int m, int n, float alpha, + const float *X, int incX, + const float *Y, int incY, float *A, int lda) { + cublasSger(m, n, alpha, X, incX, Y, incY, A, lda); + } + inline static void ger(int m, int n, double alpha, + const double *X, int incX, + const double *Y, int incY, double *A, int lda) { + cublasDger(m, n, alpha, X, incX, Y, incY, A, lda); + } +}; +#endif // MSHADOW_USE_CUDA +// helper function to decide which shape we are in +inline static Shape<2> GetShape(const Shape<2> &shape, bool transpose) { + return transpose ? 
Shape2(shape[1], shape[0]) : shape; +} +// dst = dot(lhs[.T], rhs[.T]) +template +struct DotEngine { + inline static void Eval(Tensor *p_dst, + const Tensor &lhs, + const Tensor &rhs, + DType scale) { + Tensor &dst = *p_dst; + // set kernel stream + BLASEngine::SetStream(dst.stream_); + Shape<2> sleft = GetShape(lhs.shape_, transpose_left); + Shape<2> sright = GetShape(rhs.shape_, transpose_right); + utils::Check(dst.size(0) == sleft[0] && dst.size(1) == sright[1] \ + && sleft[1] == sright[0] , + "dot-gemm: matrix shape mismatch"); + // use column major argument to compatible with most BLAS + BLASEngine::gemm + (transpose_right , transpose_left, + transpose_right ? rhs.size(0) : rhs.size(1), + transpose_left ? lhs.size(1) : lhs.size(0), + transpose_right ? rhs.size(1) : rhs.size(0), + scale * SV::AlphaBLAS(), + rhs.dptr_, rhs.stride_, + lhs.dptr_, lhs.stride_, + SV::BetaBLAS(), + dst.dptr_, dst.stride_); + } +}; +template +struct DotEngine { + inline static void Eval(Tensor *p_dst, + const Tensor &lhs, + const Tensor &rhs, + DType scale) { + Tensor &dst = *p_dst; + // set kernel stream + BLASEngine::SetStream(dst.stream_); + Shape<2> sright = GetShape(rhs.shape, transpose_right); + utils::Check(dst.size(0) == sright[1] && lhs.size(0) == sright[0], + "dot-gemv: matrix shape mismatch"); + BLASEngine::gemv + (transpose_right, + rhs.size(1), rhs.size(0), scale * SV::AlphaBLAS(), + rhs.dptr_, rhs.stride_, + lhs.dptr_, 1, SV::BetaBLAS(), + dst.dptr_, 1); + } +}; +template +struct DotEngine { + inline static void Eval(Tensor *p_dst, + const Tensor &lhs, + const Tensor &rhs, + DType scale) { + Tensor &dst = *p_dst; + // set kernel stream + BLASEngine::SetStream(dst.stream_); + utils::Check(dst.size(0) == lhs.size(0) && dst.size(1) == rhs.size(0), + "dot-ger: matrix shape mismatch"); + if (SV::kBetaBLAS == 0.0f) { + BLASEngine::ger + (rhs.size(0), lhs.size(0), scale * SV::AlphaBLAS(), + rhs.dptr_, 1, lhs.dptr_, 1, dst.dptr_, dst.stride_); + } else { + DotEngine::Eval(dst, lhs.FlatTo2D(), rhs.FlatTo2D(), scale); + } + } +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_DOT_ENGINE_INL_H_ diff --git a/mshadow/expr_engine-inl.h b/mshadow/expr_engine-inl.h new file mode 100644 index 000000000000..b6ed59048a82 --- /dev/null +++ b/mshadow/expr_engine-inl.h @@ -0,0 +1,423 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file expr_engine-inl.h + * \brief definitions of how expressions should be evaluated + * \author Tianqi Chen, Bing Xu + */ +#ifndef MSHADOW_EXPR_ENGINE_INL_H_ +#define MSHADOW_EXPR_ENGINE_INL_H_ +#include +#include +#include "./utils.h" +#include "./expression.h" +#include "./tensor.h" + +namespace mshadow { +namespace expr { +/*! + * \brief a general class that allows extension that makes tensors of some shape + * \tparam SubType type of subclass + * \tparam SrcExp source expression of the MakeTensorExp, the source of operation + * \tparam dim dimension of the expression + * \tparam DType the type of elements + */ +template +struct MakeTensorExp + : public Exp, + DType, type::kChainer> { + /*! \brief the shape of this expression */ + Shape shape_; + /*! \brief true self of subtype */ + inline const SubType& real_self(void) const{ + return *static_cast(this); + } +}; +//---------------------------------------------------------------------- +// This part of code gives plan that can be used to carry out execution +//--------------------------------------------------------------------- +// Declarations of plans +template +class Plan { + public: + /*! 
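A usage sketch of the dot expressions that end up in `DotEngine`; the shapes are assumed as commented and mshadow must be built with a BLAS back-end:

```
#include "mshadow/tensor.h"

// assumed shapes: dst is (n, m), data is (n, k), weight is (m, k)
void FullyConnectedForward(mshadow::Tensor<mshadow::cpu, 2, float> dst,
                           mshadow::Tensor<mshadow::cpu, 2, float> data,
                           mshadow::Tensor<mshadow::cpu, 2, float> weight) {
  using namespace mshadow::expr;
  // overwrite: dispatched through DotEngine<sv::saveto, ...> to gemm
  dst = dot(data, weight.T());
  // accumulate: sv::plusto sets the BLAS beta argument to 1
  dst += dot(data, weight.T());
}
```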
+ * \brief evaluate the expression at index [y][x] + * to be implemented by SubType, for RValue, the return type will be DType & + */ + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const; +}; +// tensor plan +template +class Plan, DType> { + public: + explicit Plan(const Tensor &t) + : dptr_(t.dptr_), stride_(t.stride_) {} + // for RValue, the return type should be reference + MSHADOW_XINLINE DType &REval(index_t y, index_t x) { + return dptr_[y * stride_ + x]; + } + // const evaluation + MSHADOW_XINLINE const DType &Eval(index_t y, index_t x) const { + return dptr_[y * stride_ + x]; + } + + private: + DType *dptr_; + index_t stride_; +}; +// special evaluation case for 1d tensor, no stride +template +class Plan, DType> { + public: + explicit Plan(const Tensor &t) : dptr_(t.dptr_) {} + MSHADOW_XINLINE DType &REval(index_t y, index_t x) { + return dptr_[x]; + } + MSHADOW_XINLINE const DType &Eval(index_t y, index_t x) const { + return dptr_[x]; + } + + private: + DType *dptr_; +}; +// scalar +template +class Plan, DType> { + public: + explicit Plan(DType scalar) : scalar_(scalar) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return scalar_; + } + + private: + DType scalar_; +}; +// unary expression +template +class Plan, DstDType> { + public: + explicit Plan(const Plan &src) : src_(src) {} + MSHADOW_XINLINE DstDType Eval(index_t y, index_t x) const { + return static_cast(src_.Eval(y, x)); + } + + private: + Plan src_; +}; +// binary expression +template +class Plan, DType> { + public: + explicit Plan(const Plan &lhs, const Plan &rhs) + : lhs_(lhs), rhs_(rhs) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return OP::Map(lhs_.Eval(y, x), rhs_.Eval(y, x)); + } + + private: + Plan lhs_; + Plan rhs_; +}; +// unary expression +template +class Plan, DType> { + public: + explicit Plan(const Plan &src) : src_(src) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return OP::Map(src_.Eval(y, x)); + } + + private: + Plan src_; +}; +// remaps map tensor expression to subtype's plan +template +struct Plan, DType> { + public: + Plan(const Plan &src) : src_(src) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return src_.Eval(y, x); + } + + private: + Plan src_; +}; +// tranpsoe +template +class Plan, DType> { + public: + explicit Plan(const Plan &src) : src_(src) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return src_.Eval(x, y); + } + + private: + Plan src_; +}; +//---------------------------------------------------------------------- +// Mappings from expression to plans +//--------------------------------------------------------------------- +template +inline Plan, DType> +MakePlan(const BinaryMapExp &e); + +template +inline Plan, DType> MakePlan(const ScalarExp &e) { + return Plan, DType>(e.scalar_); +} + +template +inline Plan, DstDType> +MakePlan(const TypecastExp &e) { + return Plan, DstDType>(MakePlan(e.exp)); +} + +template +inline Plan MakePlan(const RValueExp &e) { + return Plan(e.self()); +} + +template +inline Plan, DType> +MakePlan(const TransposeExp &e) { + return Plan, DType>(MakePlan(e.exp)); +} + +template +inline Plan +MakePlan(const MakeTensorExp &e) { + return Plan(e.real_self()); +} + +template +inline Plan, DType> +MakePlan(const UnaryMapExp &e) { + return Plan, DType>(MakePlan(e.src_)); +} + +template +inline Plan, DType> +MakePlan(const BinaryMapExp &e) { + return Plan, + DType>(MakePlan(e.lhs_), MakePlan(e.rhs_)); +} +//---------------------------------------------------------------- +// 
Static Type inference and Type Checking +//---------------------------------------------------------------- +/*! + * \brief static type inference template, + * used to get the dimension of each expression, + * if ExpInfo::kDim == -1, this means here are mismatch in expression + * if (ExpInfo::kDevMask & cpu::kDevMask) != 0, this means this expression can be assigned to cpu + * \tparam E expression + */ +template +struct ExpInfo { + static const int kDim = -1; + static const int kDevMask = 0; +}; +template +struct ExpInfo< ScalarExp > { + static const int kDim = 0; + static const int kDevMask = 0xffff; +}; +template +struct ExpInfo > { + static const int kDim = ExpInfo::kDim; + static const int kDevMask = ExpInfo::kDevMask; +}; +template +struct ExpInfo > { + static const int kDim = ExpInfo::kDim; + static const int kDevMask = ExpInfo::kDevMask; +}; +template +struct ExpInfo > { + static const int kDim = dim; + static const int kDevMask = Device::kDevMask; +}; +template +struct ExpInfo > { + static const int kDimSrc = ExpInfo::kDim; + static const int kDim = kDimSrc >= 0 ? dim : -1; + static const int kDevMask = ExpInfo::kDevMask; +}; +template +struct ExpInfo > { + static const int kDim = ExpInfo::kDim; + static const int kDevMask = ExpInfo::kDevMask; +}; +template +struct ExpInfo > { + static const int kDimLhs = ExpInfo::kDim; + static const int kDimRhs = ExpInfo::kDim; + static const int kDim = (kDimLhs >= 0 && kDimRhs >= 0) ?\ + (kDimLhs == 0 ?\ + kDimRhs :\ + ((kDimRhs == 0 || kDimLhs == kDimRhs) ? kDimLhs : -1)) : -1; + static const int kDevMask = ExpInfo::kDevMask & ExpInfo::kDevMask; +}; +/*! \brief template to do type check */ +template +struct TypeCheck { + /*! \brief dimension of expression*/ + static const int kExpDim = ExpInfo::kDim; + /*! \brief whether the expression device type matches */ + static const bool kDevPass = (ExpInfo::kDevMask & Device::kDevMask) != 0; + /*! \brief whether the expression can be mapped to expression of dim */ + static const bool kMapPass = (kExpDim == 0 || kExpDim == dim) && kDevPass; + /*! \brief whether the expression can be reduced to expression of dim */ + static const bool kRedPass = (kExpDim > dim) && kDevPass; +}; +/*! \brief used to help static type check*/ +template +struct TypeCheckPass; +// Todo : add static assert using C++11 +template<> +struct TypeCheckPass {}; +template<> +struct TypeCheckPass { + inline static void Error_All_Tensor_in_Exp_Must_Have_Same_Type(void) {} + inline static void Error_TypeCheck_Not_Pass_For_Reduce_Exp(void) {} + inline static void Error_Expression_Does_Not_Meet_Dimension_Req(void) {} +}; + +//---------------------------------------------------------------- +// Runtime Stream Getting +//---------------------------------------------------------------- +template +struct StreamInfo { + inline static Stream *Get(const E &t); +}; +template +struct StreamInfo > { + inline static Stream *Get(const Tensor &t) { + return t.stream_; + } +}; +//---------------------------------------------------------------- +// Runtime Shape Checking +//---------------------------------------------------------------- +/*! 
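`TypeCheckPass` acts as a pre-C++11 static assert: evaluation routines instantiate it so that an expression with a mismatched device or dimension fails to compile with a readable function name. A sketch of the idiom; `MyEval` is hypothetical and the template-parameter order of `TypeCheck` is an assumption based on this header:

```
#include "mshadow/tensor.h"

// hypothetical evaluator: reject anything that cannot be mapped onto a
// 2-D CPU tensor at compile time (TypeCheck parameters assumed to be
// <Device, dim, DType, E>)
template<typename E, typename DType, int etype>
inline void MyEval(mshadow::Tensor<mshadow::cpu, 2, DType> *dst,
                   const mshadow::expr::Exp<E, DType, etype> &exp) {
  using namespace mshadow::expr;
  TypeCheckPass<TypeCheck<mshadow::cpu, 2, DType, E>::kMapPass>
      ::Error_All_Tensor_in_Exp_Must_Have_Same_Type();
  // ... the actual element-wise evaluation would go here ...
}
```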
+ * \brief runtime shape checking template + * get the shape of an expression, report error if shape mismatch + * \tparam dim the dimension of the shape + * \tparam E expression + */ +template +struct ShapeCheck { + inline static Shape Check(const E &t); +}; +template +struct ShapeCheck > { + inline static Shape Check(const ScalarExp &exp) { + // use lowest dimension to mark scalar exp + Shape shape; shape[0] = 0; + return shape; + } +}; +template +struct ShapeCheck > { + inline static Shape + Check(const TypecastExp &exp) { + return ShapeCheck::Check(exp.exp); + } +}; +template +struct ShapeCheck > { + inline static Shape Check(const TransposeExp &e) { + // swap the lowest two dimensions + Shape s = ShapeCheck::Check(e.exp); + std::swap(s[0], s[1]); + return s; + } +}; +template +struct ShapeCheck > { + inline static Shape Check(const Tensor &t) { + return t.shape_; + } +}; +template +struct ShapeCheck > { + inline static Shape + Check(const MakeTensorExp &t) { + return t.shape_; + } +}; +template +struct ShapeCheck > { + inline static Shape Check(const UnaryMapExp &t) { + Shape s = ShapeCheck::Check(t.src_); + return s; + } +}; +template +struct ShapeCheck > { + inline static Shape + Check(const BinaryMapExp &t) { + Shape shape1 = ShapeCheck::Check(t.lhs_); + Shape shape2 = ShapeCheck::Check(t.rhs_); + if (shape1[0] == 0) return shape2; + if (shape2[0] == 0) return shape1; + utils::Check(shape1 == shape2, + "BinaryMapExp: Shapes of operands are not the same"); + return shape1; + } +}; +} // namespace expr +} // namespace mshadow +// include definition of dot engine +#include "./dot_engine-inl.h" + +namespace mshadow { +namespace expr { +/*! \brief some engine that evaluate complex expression */ +template +struct ExpComplexEngine { + inline static void Eval(RV *dst, const E &exp); +}; +/*! \brief the engine that dispatches simple operations*/ +template +struct ExpEngine { + template + inline static void Eval(RV *dst, + const Exp &exp) { + MapExp(dst, exp); + } + template + inline static void Eval(RV *dst, + const Exp &exp) { + MapExp(dst, exp); + } + template + inline static void Eval(RV *dst, + const Exp &exp) { + MapExp(dst, exp); + } + template + inline static void Eval(RV *dst, + const Exp &exp) { + ExpComplexEngine::Eval(dst->ptrself(), exp.self()); + } +}; +template +struct ExpComplexEngine, + DotExp, + Tensor, + ltrans, rtrans, DType>, + DType> { + inline static void Eval(Tensor *dst, + const DotExp, + Tensor, + ltrans, rtrans, DType> &exp) { + DotEngine::Eval(dst, exp.lhs_, exp.rhs_, exp.scale_); + } +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXPR_ENGINE_INL_H_ diff --git a/mshadow/expr_scalar-inl.h b/mshadow/expr_scalar-inl.h new file mode 100644 index 000000000000..a0efdc1ab649 --- /dev/null +++ b/mshadow/expr_scalar-inl.h @@ -0,0 +1,123 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file expression-inl.h + * \brief definitions of operators in expression with respect to scalar + * this file will be included several times, each time with MACRO MSHADOW_SCALAR_ to be different types + * + * DO NOT add pragma once or macro guard + * \author Tianqi Chen, Bing Xu + */ +namespace mshadow { +namespace expr { +// DotExp +/*! \brief dot operator def */ +template +inline DotExp +operator*(const DotExp &lhs, + MSHADOW_SCALAR_ rhs) { + return DotExp(lhs.lhs_, lhs.rhs_, lhs.scale_ * rhs); +} +/*! 
\brief scale of dot operation */ +template +inline DotExp +operator*(MSHADOW_SCALAR_ lhs, + const DotExp &rhs) { + return DotExp(rhs.lhs_, rhs.rhs_, rhs.scale_ * lhs); +} + +/*! \brief operator overload */ +template +inline ReduceTo1DExp +operator*(const ReduceTo1DExp &e, MSHADOW_SCALAR_ scale) { + return ReduceTo1DExp(e.src_, e.scale_ * scale); +} +/*! \brief operator overload */ +template +inline ReduceTo1DExp +operator*(MSHADOW_SCALAR_ scale, const ReduceTo1DExp &e) { + return ReduceTo1DExp(e.src_, e.scale_ * scale); +} + +/*! \brief operator overload for const */ +template +inline BinaryMapExp, + MSHADOW_SCALAR_, (ta|type::kMapper)> +F(const Exp &lhs, const ScalarExp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload for const */ +template +inline BinaryMapExp, TB, + MSHADOW_SCALAR_, (tb|type::kMapper)> +F(const ScalarExp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +// constant operators +/*! \brief operator overload */ +template +inline BinaryMapExp, + MSHADOW_SCALAR_, (ta|type::kMapper)> +operator+(const Exp &lhs, + const ScalarExp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, + MSHADOW_SCALAR_, (ta|type::kMapper)> +operator-(const Exp &lhs, + const ScalarExp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, + MSHADOW_SCALAR_, (ta|type::kMapper)> +operator*(const Exp &lhs, + const ScalarExp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, + MSHADOW_SCALAR_, (ta|type::kMapper)> +operator/(const Exp &lhs, + const ScalarExp &rhs) { + return MakeExp(lhs, rhs); +} +// constant operators 2 +/*! \brief operator overload */ +template +inline BinaryMapExp, TB, + MSHADOW_SCALAR_, (tb|type::kMapper)> +operator+(const ScalarExp &lhs, + const Exp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, TB, + MSHADOW_SCALAR_, (tb|type::kMapper)> +operator-(const ScalarExp &lhs, + const Exp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, TB, + MSHADOW_SCALAR_, (tb|type::kMapper)> +operator*(const ScalarExp &lhs, + const Exp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, TB, + MSHADOW_SCALAR_, (tb|type::kMapper)> +operator/(const ScalarExp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +} // namespace expr +} // namespace mshadow diff --git a/mshadow/expression.h b/mshadow/expression.h new file mode 100644 index 000000000000..d73c11f7d40c --- /dev/null +++ b/mshadow/expression.h @@ -0,0 +1,355 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file expression.h + * \brief definitions of abstract expressions and expressions template + * \author Tianqi Chen, Bing Xu + */ +#ifndef MSHADOW_EXPRESSION_H_ +#define MSHADOW_EXPRESSION_H_ +#include "./base.h" + +namespace mshadow { +/*! + * \brief namespace for abstract expressions and expressions template, + * have no dependecy on tensor.h, + * These data structure takes no charge in computations, + * they are only used to define operations and represent expression in a symbolic way + */ +namespace expr { +/*! \brief type of expressions */ +namespace type { +// type expression type are defined as bitmask +// subtype relationshop kRValue < kMapper < kPull < kComplex +/*! + * \brief this expression directly correspnds to a data class, + * can be used to assign data + */ +const int kRValue = 0; +/*! 
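The scalar-overload file above is deliberately written without an include guard: it is included once per scalar type with `MSHADOW_SCALAR_` redefined each time, and the operators it stamps out are what make mixed scalar/expression arithmetic legal. A sketch of the kind of code they enable, with assumed shapes dst(n, m), lhs(n, k), rhs(m, k):

```
#include "mshadow/tensor.h"

// assumed shapes: dst is (n, m), lhs is (n, k), rhs is (m, k)
void ScaledDot(mshadow::Tensor<mshadow::cpu, 2, float> dst,
               mshadow::Tensor<mshadow::cpu, 2, float> lhs,
               mshadow::Tensor<mshadow::cpu, 2, float> rhs) {
  using namespace mshadow::expr;
  // DotExp * scalar: the factor is folded into the BLAS alpha argument
  dst = dot(lhs, rhs.T()) * 0.5f;
  // expression op scalar: becomes an element-wise Mapper expression
  dst = dst * 2.0f + 1.0f;
}
```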
+ * \brief expression contains element-wise tensor operations, + * map a expression to same shape + */ +const int kMapper = 1; +/*! + * \brief expression that can be chained with other expressiones + * Usually it have function Eval(i,j) defined, which pulls the result (i, j) from input + * expression and output the result at certain position. + */ +const int kChainer = 3; +/*! \brief othercase: e.g dot product */ +const int kComplex = 7; +} // namespace type +/*! + * \brief expression engine that actually interprets these expressions + * this is a function template that needed to be implemented for specific expressions + * \tparam Saver the save method + * \tparam RValue the type of RValue to be saved + * \sa namespace sv + */ +template +struct ExpEngine; +/*! \brief defines how expression exp can be evaluated and stored into dst */ +//template +//inline static void Eval(RValue *dst, const EType &exp); +/*! + * \brief base class for expression + * \tparam SubType inheritated class must put their type into this parameter + * \tparam DType the data type of each element in the expression + * \tparam exp_type expression type, see namespace type + */ +template +struct Exp { + public: + /*! \return subtype instance of current class */ + inline const SubType& self(void) const { + return *static_cast(this); + } + /*! \return reference of subtype instance of current class */ + inline SubType* ptrself(void) { + return static_cast(this); + } +}; +/*! + * \brief scalar expression + * \tparam DType the data type of the scalar + */ +template +struct ScalarExp: public Exp, DType, type::kMapper> { + /*! \brief scalar value */ + DType scalar_; + /*! \brief implicit constructor, MUST NOT BE explicit */ + ScalarExp(DType scalar) : scalar_(scalar) {} +}; +/*! \brief create an scalar expression */ +template +inline ScalarExp scalar(DType s) { + return ScalarExp(s); +} +/*! + * \brief typecast expression, cast the type of elements + * \tparam DstDType the target type we want to cast into + * \tparam SrcDType the target type we want to cast from + * \tparam EType the type of the source expression + * \tparam etype the type of expression after cast + */ +template +struct TypecastExp: + public Exp, + DstDType, etype> { + /*! \brief expression to be typecasted */ + const EType &exp; + /*! \brief constructor */ + explicit TypecastExp(const EType &e) : exp(e) {} +}; +/*! \brief create an scalar expression */ +template +inline TypecastExp +tcast(const Exp &exp) { + return TypecastExp(exp.self()); +} +/*! \brief represent a transpose expression of a container */ +template +struct TransposeExp: public Exp, + DType, type::kChainer> { + /*! \brief expression to be transposed */ + const EType &exp; + /*! \brief constructor */ + explicit TransposeExp(const EType &e) : exp(e) {} + /*! \brief transpose expression */ + inline const EType &T(void) const { + return exp; + } +}; +/*! + * \brief base class of all rvalues + * \tparam Container the actually class of data container, e.g. Tensor1D + * \tparam DataType the element data type of each element in the container + */ +template +class RValueExp: public Exp { + public: + /*! + *\brief transpose of a matrix + *\return transpose of current expression + */ + inline const TransposeExp T(void) const { + return TransposeExp(this->self()); + } + /*! \brief operator overload */ + inline Container &operator+=(DType s) { + ExpEngine::Eval(this->ptrself(), scalar(s)); + return *(this->ptrself()); + } + /*! 
\brief operator overload */ + inline Container &operator-=(DType s) { + ExpEngine::Eval(this->ptrself(), scalar(s)); + return *(this->ptrself()); + } + /*! \brief operator overload */ + inline Container &operator*=(DType s) { + ExpEngine::Eval(this->ptrself(), scalar(s)); + return *(this->ptrself()); + } + /*! \brief operator overload */ + inline Container &operator/=(DType s) { + ExpEngine::Eval(this->ptrself(), scalar(s)); + return *(this->ptrself()); + } + /*! \brief operator overload */ + inline Container &__assign(DType s) { + ExpEngine::Eval(this->ptrself(), scalar(s)); + return *(this->ptrself()); + } + /*! \brief we can not define container = container */ + template + inline Container &__assign(const Exp &exp) { + ExpEngine::Eval(this->ptrself(), exp.self()); + return *(this->ptrself()); + } + /*! \brief operator overload, assign */ + inline Container &__assign(const Exp &exp); + /*! \brief implementation of operator+= */ + template + inline Container &operator+=(const Exp &exp) { + ExpEngine::Eval(this->ptrself(), exp.self()); + return *(this->ptrself()); + } + /*! \brief implementation of operator-= */ + template + inline Container &operator-=(const Exp &exp) { + ExpEngine::Eval(this->ptrself(), exp.self()); + return *(this->ptrself()); + } + /*! \brief implementation of operator*= */ + template + inline Container &operator*=(const Exp &exp) { + ExpEngine::Eval(this->ptrself(), exp.self()); + return *(this->ptrself()); + } + /*! \brief implementation of operator/= */ + template + inline Container &operator/=(const Exp &exp) { + ExpEngine::Eval(this->ptrself(), exp.self()); + return *(this->ptrself()); + } +}; +/*! + * \brief matrix multiplication expression dot(lhs[.T], rhs[.T]) + * \tparam TA type of lhs + * \tparam TB type of rhs + * \tparam ltrans whether lhs is transposed + * \tparam rtrans whether rhs is transposed + * \tparam DType the data type of the scalar + */ +template +struct DotExp: public Exp, + DType, type::kComplex> { + /*! \brief left operand */ + const TA &lhs_; + /*! \brief right operand */ + const TB &rhs_; + /*! \brief scale over result */ + DType scale_; + /*! \brief constructor */ + explicit DotExp(const TA &lhs, const TB &rhs, DType scale) + : lhs_(lhs), rhs_(rhs), scale_(scale) {} +}; +// definition of dot expression +/*! \brief dot operator def */ +template +inline DotExp +dot(const RValueExp &lhs, const RValueExp &rhs) { + return DotExp(lhs.self(), rhs.self(), 1.0f); +} +/*! \brief dot operator def */ +template +inline DotExp +dot(const TransposeExp &lhs, const RValueExp &rhs) { + return DotExp(lhs.exp, rhs.self(), 1.0f); +} +/*! \brief dot operator def */ +template +inline DotExp +dot(const RValueExp &lhs, const TransposeExp &rhs) { + return DotExp(lhs.self(), rhs.exp, 1.0f); +} +/*! \brief dot operator def */ +template +inline DotExp +dot(const TransposeExp &lhs, const TransposeExp &rhs) { + return DotExp(lhs.exp, rhs.exp, 1.0f); +} +//--------------- +// BinaryMapExp +// -------------- +/*! + * \brief binary map expression lhs [op] rhs + * \tparam OP operator + * \tparam TA type of lhs + * \tparam TB type of rhs + * \tparam etype expression type, sa namespace::type + */ +template +struct BinaryMapExp: public Exp, + DType, etype> { + /*! \brief left operand */ + const TA &lhs_; + /*! \brief right operand */ + const TB &rhs_; + /*! \brief constructor */ + explicit BinaryMapExp(const TA &lhs, const TB &rhs) + :lhs_(lhs), rhs_(rhs) {} +}; + +/*! 
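A short sketch of how the `DotExp` builders above might be used, assuming the library is compiled with a CPU BLAS backend so that `dot()` can be evaluated. The layer convention below (forward pass `y = dot(x, W)`) and the variable names are illustrative only.

```
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

// Backward pass of a fully connected layer, assuming forward y = dot(x, W)
// with x: (n, in), W: (in, out), y and dy: (n, out).
void fullc_backward(Tensor<cpu, 2, float> dx,
                    Tensor<cpu, 2, float> dW,
                    Tensor<cpu, 2, float> x,
                    Tensor<cpu, 2, float> dy,
                    Tensor<cpu, 2, float> W) {
  // dx = dy * W^T via the transpose form of dot
  dx = dot(dy, W.T());
  // a DotExp can be scaled by a constant and accumulated with +=
  dW += 0.5f * dot(x.T(), dy);
}
```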
\brief make expression */ +template +inline BinaryMapExp +MakeExp(const Exp &lhs, const Exp &rhs) { + return BinaryMapExp(lhs.self(), rhs.self()); +} +/*! + * \brief short hand for MakeExp, usage F(lhs, rhs). create a binary operation expression + * \param lhs left operand + * \param rhs right operand + * \return the result expression + * \tparam binary operator + * \tparam TA lhs expression + * \tparam ta lhs expression type + * \tparam TB rhs expression + * \tparam tb rhs expression type + * \sa mshadow::op + */ +template +inline BinaryMapExp +F(const Exp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +// operator rules +/*! \brief operator overload */ +template +inline BinaryMapExp +operator+(const Exp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp +operator-(const Exp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp +operator*(const Exp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp +operator/(const Exp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +//--------------- +// UnaryMapExp +// -------------- +/*! + * \brief unary map expression op(src) + * \tparam OP operator + * \tparam TA type of src + * \tparam etype expression type, sa namespace::type + */ +template +struct UnaryMapExp: public Exp, + DType, etype> { + /*! \brief source expression */ + const TA &src_; + /*! \brief constructor */ + explicit UnaryMapExp(const TA &src) : src_(src) {} +}; + +/*! \brief make expression */ +template +inline UnaryMapExp +MakeExp(const Exp &src) { + return UnaryMapExp(src.self()); +} +/*! + * \brief short hand for MakeExp, usage F(src), create a unary operation expression + * \param src source expression + * \return the result expression + * \tparam operator + * \tparam TA source expression + * \tparam ta source expression type + * \sa mshadow::op + */ +template +inline UnaryMapExp +F(const Exp &src) { + return MakeExp(src); +} +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXPRESSION_H_ diff --git a/mshadow/extension.h b/mshadow/extension.h new file mode 100644 index 000000000000..882d367a72dd --- /dev/null +++ b/mshadow/extension.h @@ -0,0 +1,26 @@ +/*! + * + * \file extension.h + * \brief some extension of expressions, + * used to support something beyond elementwise op + * \author Tianqi Chen, Bing Xu + */ +#ifndef MSHADOW_EXTENSION_H_ +#define MSHADOW_EXTENSION_H_ +#include "./expr_engine-inl.h" +#include "./extension/broadcast.h" +#include "./extension/unpack_patch2col.h" +#include "./extension/pack_col2patch.h" +#include "./extension/reshape.h" +#include "./extension/swapaxis.h" +#include "./extension/reduceto1d.h" +#include "./extension/spatial_pool.h" +#include "./extension/spatial_unpool.h" +#include "./extension/channel_pool.h" +#include "./extension/channel_unpool.h" +#include "./extension/pad.h" +#include "./extension/crop.h" +#include "./extension/mirror.h" +#include "./extension/concat.h" +#endif + diff --git a/mshadow/extension/broadcast.h b/mshadow/extension/broadcast.h new file mode 100644 index 000000000000..9a8b57bffc7d --- /dev/null +++ b/mshadow/extension/broadcast.h @@ -0,0 +1,107 @@ +/*! 
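Sketch of plugging user-defined operators into `F<op>` through the `UnaryMapExp` and `BinaryMapExp` machinery above. The `clipval` and `takemax` structs are hypothetical examples written for this note; an operator only needs a static `Map` function.

```
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

// hypothetical unary op: clip each element to [-1, 1]
struct clipval {
  MSHADOW_XINLINE static float Map(float a) {
    return a > 1.0f ? 1.0f : (a < -1.0f ? -1.0f : a);
  }
};
// hypothetical binary op: element-wise maximum of two inputs
struct takemax {
  MSHADOW_XINLINE static float Map(float a, float b) {
    return a > b ? a : b;
  }
};

void apply_ops(Tensor<cpu, 2, float> dst,
               Tensor<cpu, 2, float> lhs,
               Tensor<cpu, 2, float> rhs) {
  dst = F<clipval>(lhs);              // UnaryMapExp
  dst = F<takemax>(lhs, rhs);         // BinaryMapExp
  dst = F<takemax>(lhs, rhs) * 2.0f;  // still a Mapper expression, composes freely
}
```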
+ * Copyright (c) 2014 by Contributors + * \file broadcast.h + * \brief support for broadcast and repmat + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_BROADCAST_H_ +#define MSHADOW_EXTENSION_BROADCAST_H_ +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief broadcast Tensor1D into a higher dimension Tensor + * input: Tensor: ishape[0] + * output: Tensor : oshape[dimcast] = ishape[0] + * \tparam SrcExp type of input expression + * \tparam DType the type of elements + * \tparam dimdst target tensor dimension + * \tparam dimcast_m_dst dimcast - dimdst + */ +template +struct Broadcast1DExp: + public MakeTensorExp, + SrcExp, dimdst, DType> { + /*! \brief source operand */ + const SrcExp &src_; + /*! \brief constructor */ + Broadcast1DExp(const SrcExp &src, Shape shape) + : src_(src) { + this->shape_ = shape; + } +}; +/*! + * \brief a expression that replicate a 1 dimension tensor in dimension dimcast + * \param src Tensor: shape[0] + * \param shape shape of output + * \return a expresion with type Tensor + * \tparam dimcast target dimension where the 1D tensor will be broadcasted + * \tparam SrcExp type of input expression + * \tparam DType the type of elements + * \tparam dimdst dimension of destination tensor + * \tparam dimcast_lowest the dimension we want to cast the data into + */ +template +inline Broadcast1DExp +broadcast(const expr::Exp &src, Shape shape) { + TypeCheckPass::kDim == 1> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + utils::Check(ShapeCheck<1, SrcExp>::Check(src.self())[0] == shape[dimcast], + "broadcast, shape mismatch"); + return Broadcast1DExp(src.self(), shape); +} +// short cut functions +/*! + * \brief a expression that replicate a 1 dimension tensor for nrow times + * \param src Tensor: shape[0] + * \param nrow number of rows to replicate + * \return a expresion with type Tensor size(1), size(0) = nrow + * \tparam Device which device it lies + */ +template +inline Broadcast1DExp +repmat(const expr::Exp &src, index_t nrow) { + return broadcast<1> + (src, Shape2(nrow, ShapeCheck<1, SrcExp>::Check(src.self())[0])); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + static const int dimcast = dimdst - dimdst_m_cast; + explicit Plan(const Broadcast1DExp &e) + : src_(MakePlan(e.src_)), + ystride_(e.shape_.ProdShape(dimcast + 1, dimdst - 1)), + length_(e.shape_[dimcast]) { + TypeCheckPass + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + } + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return src_.Eval(0, (y / ystride_) % length_); + } + + private: + expr::Plan src_; + const index_t ystride_, length_; +}; + +/*! \brief execution plan of Broadcast1DExp */ +template +struct Plan, DType>{ + public: + explicit Plan(const Broadcast1DExp &e) + : src_(MakePlan(e.src_)) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return src_.Eval(0, x); + } + + private: + expr::Plan src_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_BROADCAST_H_ diff --git a/mshadow/extension/channel_pool.h b/mshadow/extension/channel_pool.h new file mode 100644 index 000000000000..4039d1d6303a --- /dev/null +++ b/mshadow/extension/channel_pool.h @@ -0,0 +1,108 @@ +/*! 
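Sketch of the two entry points defined in this header, `repmat` and `broadcast<dimcast>`. The layouts assumed below (2D activations as batch by hidden, 4D activations as batch, channel, height, width) are illustrative.

```
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

void add_bias(Tensor<cpu, 2, float> out, Tensor<cpu, 1, float> bias) {
  // out: (nbatch, nhidden), bias: (nhidden); replicate bias for every row
  out += repmat(bias, out.size(0));
}

void add_channel_bias(Tensor<cpu, 4, float> out, Tensor<cpu, 1, float> bias) {
  // out: (batch, channel, height, width); bias has one entry per channel,
  // so broadcast it into dimension 1
  out += broadcast<1>(bias, out.shape_);
}
```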
+ * Copyright (c) 2014 by Contributors + * \file channel_pool.h + * \brief support for chpool + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_CHANNEL_POOL_H_ +#define MSHADOW_EXTENSION_CHANNEL_POOL_H_ +#include +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief channel pooling expression, do reduction over (local nearby) channels, + * used to implement local response normalization + * \tparam Reducer reduction method during pooling + * \tparam SrcExp source expression to be pooled from + * \tparam DType the type of elements + * \tparam srcdim dimension of src + */ +template +struct ChannelPoolingExp: + public MakeTensorExp, + SrcExp, srcdim, DType> { + /*! \brief source operand */ + const SrcExp &src_; + /*! \brief neighbor size */ + index_t nsize_; + /*! \brief stride of pooling */ + index_t stride_; + /*! \brief pad of pooling of each side */ + index_t pad_; + index_t src_channel_; + /*! \brief constructor */ + ChannelPoolingExp(const SrcExp &src, index_t nsize, index_t stride, index_t pad) + : src_(src), nsize_(nsize), stride_(stride), pad_(pad) { + this->shape_ = ShapeCheck::Check(src_); + this->src_channel_ = this->shape_[srcdim - 3]; + utils::Check(this->shape_[srcdim - 3] >= nsize_, + "chpool: local size must be smaller than nchannels"); + this->shape_[srcdim - 3] = (this->src_channel_ - nsize + pad * 2 + 1) / stride; + } +}; +/*! + * \brief channel pooling, do reduction over (local nearby) channels, + * used to implement local response normalization + * \param src source data + * \param nsize neighbor size + * \return expression of pooled result + * \tparam Reducer reducer type + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype type of expression + */ +template +inline ChannelPoolingExp::kDim> +chpool(const Exp &src, index_t nsize) { + TypeCheckPass::kDim >= 3> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + utils::Check(nsize % 2 == 1, + "chpool: if no pad is specified, local size must be odd"); + return ChannelPoolingExp::kDim>(src.self(), nsize, 1, nsize / 2); +} + +template +inline ChannelPoolingExp::kDim> +chpool(const Exp &src, index_t nsize, index_t stride, index_t pad) { + TypeCheckPass::kDim >= 3> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return ChannelPoolingExp::kDim>(src.self(), nsize, stride, pad); +} + +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const ChannelPoolingExp &e) + : src_(MakePlan(e.src_)), channel_(e.shape_[srcdim - 3]), + height_(e.shape_[srcdim - 2]), width_(e.shape_[srcdim - 1]), + hnsize_(e.nsize_), stride_(e.stride_), pad_(e.pad_), + src_channel_(e.src_channel_){} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + using namespace std; + const index_t y = i % height_; + i /= height_; + const index_t c = i % channel_; + const index_t n = i / channel_; + const index_t x = j; + const index_t cstart = c * stride_ < pad_ ? 
0 : c * stride_ - pad_; + const index_t cend = min(cstart + hnsize_, channel_); + DType res; Reducer::SetInitValue(res); + for (index_t cc = cstart; cc < cend; ++cc) { + Reducer::Reduce(res, src_.Eval((n * src_channel_ + cc) * height_ + y, x)); + } + return res; + } + private: + Plan src_; + const index_t channel_, height_, width_, hnsize_, stride_, pad_, src_channel_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_CHANNEL_POOL_H_ + diff --git a/mshadow/extension/channel_unpool.h b/mshadow/extension/channel_unpool.h new file mode 100644 index 000000000000..6257391d2fd0 --- /dev/null +++ b/mshadow/extension/channel_unpool.h @@ -0,0 +1,132 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file channel_pool.h + * \brief support for chpool + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_CHANNEL_UNPOOL_H_ +#define MSHADOW_EXTENSION_CHANNEL_UNPOOL_H_ +#include +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief channel pooling expression, do reduction over (local nearby) channels, + * used to implement local response normalization + * \tparam Reducer reduction method during pooling + * \tparam SrcExp source expression to be pooled from + * \tparam DType the type of elements + * \tparam srcdim dimension of src + */ +template +struct ChannelUnpoolingExp: + public MakeTensorExp, + SrcExp, srcdim, DType> { + /*! \brief source input, corresponds to src in pooling */ + const SrcExp &data_src_; + /*! \brief result of pooled data, corresponds to result of pooling */ + const SrcExp &data_pooled_; + /*! \brief gradient data of pooled part, to be propgate down */ + const SrcExp &grad_pooled_; + /*! \brief channel of pooled expression */ + index_t pchannel_; + /*! \brief kernel size in height */ + index_t nsize_; + /*! \brief kernel size in width */ + index_t kstride_; + /*! \brief pad */ + index_t pad_; + /*! \brief constructor */ + ChannelUnpoolingExp(const SrcExp &data_src, + const SrcExp &data_pooled, + const SrcExp &grad_pooled, + index_t nsize, index_t kstride, index_t pad) + : data_src_(data_src), data_pooled_(data_pooled), + grad_pooled_(grad_pooled), + nsize_(nsize), kstride_(kstride), pad_(pad) { + Shape pshape = ShapeCheck::Check(grad_pooled); + utils::Check(pshape == ShapeCheck::Check(data_pooled), + "ChannelUnPoolingExp: data and grad shape mismatch"); + Shape sshape = ShapeCheck::Check(data_src); + for (int k = 0; k < srcdim; ++k) { + if (k == 1){ + continue; + } + utils::Check(pshape[k] == sshape[k], + "ChannelUnPoolingExp: pooled tensor and src tensor shape mismatch"); + } + pchannel_ = pshape[1]; + this->shape_ = sshape; + } +}; +/*! 
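A sketch of the local response normalization pattern that the channel pooling comment refers to. It assumes the built-in `red::sum` reducer; the `square` and `invpow` operators (and the fixed exponent in `invpow`) are defined locally for illustration and are not part of mshadow.

```
#include <cmath>
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

struct square {
  MSHADOW_XINLINE static float Map(float a) { return a * a; }
};
struct invpow {  // a * b^(-beta) with a fixed beta, illustrative only
  MSHADOW_XINLINE static float Map(float a, float b) {
    return a * powf(b, -0.75f);
  }
};

void lrn_forward(Tensor<cpu, 4, float> out,
                 Tensor<cpu, 4, float> in,
                 Tensor<cpu, 4, float> tmp_norm,
                 index_t nsize) {
  // sum of squares over a window of nsize nearby channels (nsize must be odd
  // for the two-argument chpool, which pads by nsize / 2)
  tmp_norm = chpool<red::sum>(F<square>(in), nsize) + 1.0f;
  out = F<invpow>(in, tmp_norm);
}
```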
+ * \brief channel unpooling, do unroll over (local nearby) channels + * \param src source data + * \param nsize neighbor size + * \param stride stride of the pooling + * \param pad number of padding at each side + * \return expression of pooled result + * \tparam Reducer reducer type + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype type of expression + */ +template +inline ChannelUnpoolingExp::kDim> +ch_unpool(const Exp &data_src, + const Exp &data_pooled, + const Exp &grad_pooled, + index_t nsize, index_t stride, index_t pad) { + TypeCheckPass::kDim >= 3> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return ChannelUnpoolingExp::kDim> + (data_src.self(), data_pooled.self(), grad_pooled.self(), nsize, stride, pad); +} + +template +inline ChannelUnpoolingExp::kDim> +ch_unpool(const Exp &data_src, + const Exp &data_pooled, + const Exp &grad_pooled, index_t nsize) { + return ch_unpool(data_src, data_pooled, grad_pooled, nsize, 1, nsize / 2); +} + + +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const ChannelUnpoolingExp &e) + : data_src_(e.data_src_), data_pooled_(e.data_pooled_), + grad_pooled_(e.grad_pooled_), channel_(e.shape_[srcdim - 3]), + height_(e.shape_[srcdim - 2]), pchannel_(e.pchannel_), + hnsize_(e.nsize_), stride_(e.kstride_), pad_(e.pad_){} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + using namespace std; + const DType vsrc = data_src_.Eval(i, j); + const index_t y = i % height_; + i /= height_; + const index_t c = i % channel_; + const index_t n = i / channel_; + const index_t x = j; + const index_t cstart = c < hnsize_ - pad_ ? 0 + : (c - (hnsize_ - pad_) + stride_) / stride_; + const index_t cend = min((c + pad_ + stride_) / stride_, channel_); + DType val = static_cast(0); + for (index_t cc = cstart; cc < cend; ++cc) { + val += Reducer::PartialGrad(vsrc, + data_pooled_.Eval((n * pchannel_ + cc) * height_ + y, x)) * + grad_pooled_.Eval((n * pchannel_ + cc) * height_ + y, x); + } + return val; + } + private: + Plan data_src_, data_pooled_, grad_pooled_; + const index_t channel_, height_, pchannel_, hnsize_, stride_, pad_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_CHANNEL_POOL_H_ + diff --git a/mshadow/extension/concat.h b/mshadow/extension/concat.h new file mode 100644 index 000000000000..e7ae27735a0f --- /dev/null +++ b/mshadow/extension/concat.h @@ -0,0 +1,177 @@ +#ifndef MSHADOW_EXTENSION_CONCAT_H_ +#define MSHADOW_EXTENSION_CONCAT_H_ + +#include "../extension.h" + +namespace mshadow { +namespace expr { +/*! 
+ * \brief concat expression, concat two tensor's channel + * \tparam LhsExp left expression + * \tparam RhsExp right expression + * \tparam DType the type of elements + * \tparam srcdim dimension of src + * \tparam dimsrc_m_cat dimsrc - dimcat + */ +template +struct ConcatExp : public TRValue, + Device, srcdim, DType> { + static const int dimcat = srcdim - dimsrc_m_cat; + const LhsExp &src1_; + const RhsExp &src2_; + index_t dcat_src1_; + index_t dcat_src2_; + Shape<4> shape_; + ConcatExp(const LhsExp &src1, const RhsExp &src2) : src1_(src1), src2_(src2) { + Shape sshape1 = ShapeCheck::Check(src1_); + Shape sshape2 = ShapeCheck::Check(src2_); + #pragma unroll + for (int i = 0; i < srcdim; ++i) { + if (i != dimcat) { + utils::Check(sshape1[i] == sshape2[i], + "ConcatExp: shape mismatch"); + } + } + this->shape_ = sshape1; + this->shape_[dimcat] = sshape1[dimcat] + sshape2[dimcat]; + this->dcat_src1_ = sshape1[dimcat]; + this->dcat_src2_ = sshape2[dimcat]; + } + template + inline void + operator=(const expr::Exp &exp) { + this->__assign(exp); + } + inline void + operator=(const DType &exp) { + this->__assign(exp); + } +}; // struct ConcatExp +/*! + * \brief concat two 4D tensor + * \param src1 source tensor1 + * \param src2 source tensor2 + * \return concated 4D tensor + * \tparam cdim the dimension to concatnate on + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype type of expression + */ +template +inline ConcatExp +concat(const TRValue &src1, + const TRValue &src2) { + TypeCheckPass::kDim == ExpInfo::kDim> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + TypeCheckPass::kDim == srcdim> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return ConcatExp + (src1.self(), src2.self()); +} +//------------------------ +// engine plugin +//------------------------ +// runtime shapecheck +template +struct ShapeCheck >{ + inline static Shape Check(const ConcatExp &t) { + return t.shape_; + } +}; +template +struct StreamInfo >{ + inline static Stream * + Get(const ConcatExp &t) { + Stream *lhs = StreamInfo::Get(t.src1_); + Stream *rhs = StreamInfo::Get(t.src2_); + if (lhs != rhs) return NULL; + return lhs; + } +}; +// static typecheck +template +struct ExpInfo >{ + static const int kDimLhs = ExpInfo::kDim; + static const int kDimRhs = ExpInfo::kDim; + // copy from binarymap + static const int kDim = (kDimLhs >= 0 && kDimRhs >= 0) ?\ + (kDimLhs == 0 ?\ + kDimRhs :\ + ((kDimRhs == 0 || kDimLhs == kDimRhs) ? 
kDimLhs : -1)) : -1; + static const int kDevMask = ExpInfo::kDevMask & ExpInfo::kDevMask; +}; +//---------------------- +// Execution plan +//--------------------- +template +struct Plan, DType> { + public: + static const int dimcat = srcdim - dimsrc_m_cat; + explicit Plan(const ConcatExp &e) + : src1_(MakePlan(e.src1_)), src2_(MakePlan(e.src2_)), + height_(e.shape_.ProdShape(dimcat + 1, srcdim - 1)), + ch_src1_(e.dcat_src1_), ch_src2_(e.dcat_src2_), ch_(e.shape_[dimcat]) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + const index_t y = i % height_; + i /= height_; + const index_t c = i % ch_; + const index_t b = i / ch_; + const index_t x = j; + if (c < ch_src1_) return src1_.Eval((b * ch_src1_ + c) * height_ + y, x); + else return src2_.Eval((b * ch_src2_ + c - ch_src1_) * height_ + y, x); + } + MSHADOW_XINLINE DType &REval(index_t i, index_t j) { + const index_t y = i % height_; + i /= height_; + const index_t c = i % ch_; + const index_t b = i / ch_; + const index_t x = j; + if (c < ch_src1_) return src1_.REval((b * ch_src1_ + c) * height_ + y, x); + else return src2_.REval((b * ch_src2_ + c - ch_src1_) * height_ + y, x); + } + + private: + Plan src1_; + Plan src2_; + const index_t height_, ch_src1_, ch_src2_, ch_; +}; // struct Plan + +// specialize for concat in x +template +struct Plan, DType> { + public: + explicit Plan(const ConcatExp &e) + : src1_(MakePlan(e.src1_)), src2_(MakePlan(e.src2_)), + width_src1_(e.dcat_src1_) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + if (x < width_src1_) return src1_.Eval(y, x); + else return src2_.Eval(y, x - width_src1_); + } + MSHADOW_XINLINE DType &REval(index_t y, index_t x) { + if (x < width_src1_) return src1_.REval(y, x); + else return src2_.REval(y, x - width_src1_); + } + + private: + Plan src1_; + Plan src2_; + const index_t width_src1_; +}; +}// namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_CONCAT_H_ diff --git a/mshadow/extension/crop.h b/mshadow/extension/crop.h new file mode 100644 index 000000000000..d740d7bb18c9 --- /dev/null +++ b/mshadow/extension/crop.h @@ -0,0 +1,121 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file crop.h + * \brief support for crop + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_CROP_H_ +#define MSHADOW_EXTENSION_CROP_H_ +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief crop expression, cut off the boundary region, reverse operation of padding + * \tparam SrcExp source expression to be pooled from + * \tparam DType the type of elements + * \tparam srcdim dimension of src + */ +template +struct CroppingExp: + public MakeTensorExp, + SrcExp, srcdim, DType> { + /*! \brief source operand */ + const SrcExp &src_; + /*! \brief pad height */ + index_t pad_height_; + /*! \brief pad height */ + index_t pad_width_; + /*! \brief src height */ + index_t src_height_; + /*! \brief constructor */ + explicit CroppingExp(const SrcExp &src, Shape<2> cshape) + : src_(src) { + this->shape_ = ShapeCheck::Check(src_); + utils::Check(this->shape_[srcdim - 2] >= cshape[0], + "CroppingExp: height requirement not met"); + utils::Check(this->shape_[srcdim - 1] >= cshape[1], + "CroppingExp: width requirement not met"); + pad_height_ = (this->shape_[srcdim - 2] - cshape[0]) / 2; + pad_width_ = (this->shape_[srcdim - 1] - cshape[1]) / 2; + src_height_ = this->shape_[srcdim - 2]; + this->shape_[srcdim - 2] = cshape[0]; // height + this->shape_[srcdim - 1] = cshape[1]; // width + } + /*! 
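Sketch of the `concat<cdim>` expression defined above, assuming the batch, channel, height, width layout used elsewhere in these headers so that the channel axis is dimension 1.

```
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

void merge_features(Tensor<cpu, 4, float> out,
                    Tensor<cpu, 4, float> feat_a,
                    Tensor<cpu, 4, float> feat_b) {
  // all non-channel dimensions of feat_a and feat_b must match;
  // out must have feat_a channels + feat_b channels
  out = concat<1>(feat_a, feat_b);
}
```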
\brief constructor */ + explicit CroppingExp(const SrcExp &src, Shape<2> cshape, + index_t start_height, index_t start_width) + : src_(src), pad_height_(start_height), pad_width_(start_width) { + this->shape_ = ShapeCheck::Check(src_); + utils::Check(this->shape_[srcdim - 2] >= cshape[0] + start_height, + "CroppingExp: height requirement not met"); + utils::Check(this->shape_[srcdim - 1] >= cshape[1] + start_width, + "CroppingExp: width requirement not met"); + src_height_ = this->shape_[srcdim - 2]; + this->shape_[srcdim - 2] = cshape[0]; // height + this->shape_[srcdim - 1] = cshape[1]; // width + } +}; // struct CroppingExp +/*! + * \brief revserse operationg of padding, cut off boundaries, + * crop output from center of input + * \param src original image batches + * \param oshape output shape to be cropped + * \return expression corresponding to padded result + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype type of expression + */ +template +inline CroppingExp::kDim> +crop(const Exp &src, Shape<2> oshape) { + TypeCheckPass::kDim >= 2> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return CroppingExp::kDim>(src.self(), oshape); +} +/*! + * \brief same as crop, but can specify starting position to do cropping + * \param src original image batches + * \param oshape output shape to be cropped + * \param start_height start height position to do cropping + * \param start_width start width position to do cropping + * \return expression corresponding to padded result + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype type of expression + */ +template +inline CroppingExp::kDim> +crop(const Exp &src, Shape<2> oshape, + index_t start_height, index_t start_width) { + TypeCheckPass::kDim >= 2> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return CroppingExp::kDim> + (src.self(), oshape, start_height, start_width); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const CroppingExp &e) + : src_(MakePlan(e.src_)), + pad_height_(e.pad_height_), pad_width_(e.pad_width_), + new_height_(e.shape_[srcdim - 2]), src_height_(e.src_height_) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + const index_t x = j; + const index_t y = i % new_height_; + const index_t c = i / new_height_; + const index_t h = y + pad_height_; + const index_t w = x + pad_width_; + return src_.Eval(c * src_height_ + h, w); + } + private: + Plan src_; + const index_t pad_height_, pad_width_; + const index_t new_height_; + const index_t src_height_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_CROP_H_ diff --git a/mshadow/extension/mirror.h b/mshadow/extension/mirror.h new file mode 100644 index 000000000000..9e9edc9b6f70 --- /dev/null +++ b/mshadow/extension/mirror.h @@ -0,0 +1,62 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file mirror.h + * \brief support for mirror + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_MIRROR_H_ +#define MSHADOW_EXTENSION_MIRROR_H_ +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief mirror expression, mirror a image in width + * \tparam SrcExp source expression to be mirrored + * \tparam DType the type of elements + * \tparam srcdim dimension of src + */ +template +struct MirroringExp: + public MakeTensorExp, + SrcExp, srcdim, DType> { + /*! \brief source operand */ + const SrcExp &src_; + /*! 
\brief constructor */ + explicit MirroringExp(const SrcExp &src) : src_(src) { + this->shape_ = ShapeCheck::Check(src_); + } +}; +/*! + * \brief mirroring expression, mirror images in width + * \param src original image batches + * \return expression corresponding to mirrored result + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype type of expression + */ +template +inline MirroringExp::kDim> +mirror(const Exp &src) { + TypeCheckPass::kDim >= 2> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return MirroringExp::kDim>(src.self()); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const MirroringExp &e) + : src_(MakePlan(e.src_)), width_(e.shape_[srcdim - 1]) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + return src_.Eval(i, width_ - j - 1); + } + + private: + Plan src_; + const index_t width_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_MIRROR_H_ diff --git a/mshadow/extension/pack_col2patch.h b/mshadow/extension/pack_col2patch.h new file mode 100644 index 000000000000..28001b42c9e6 --- /dev/null +++ b/mshadow/extension/pack_col2patch.h @@ -0,0 +1,119 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file pack_col2patch.h + * \brief support for pack + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_PACK_COL2PATCH_H_ +#define MSHADOW_EXTENSION_PACK_COL2PATCH_H_ +#include +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief reverse operation of UnpackPatchToCol, + * used to backprop gradient back + * this is a version supporting multiple images + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam dstdim destination dimension + */ +template +struct PackColToPatchXExp: + public MakeTensorExp, + SrcExp, dstdim, DType> { + /*! \brief source operand */ + const SrcExp &src_; + /*! \brief patch height */ + index_t psize_y_; + /*! \brief patch height */ + index_t psize_x_; + /*! \brief patch stride */ + index_t pstride_; + /*! \brief constructor */ + PackColToPatchXExp(const SrcExp &src, Shape imshape, + index_t psize_y, index_t psize_x, index_t pstride) + :src_(src), psize_y_(psize_y), psize_x_(psize_x), pstride_(pstride){ + this->shape_ = imshape; + const index_t o_height = (imshape[dstdim - 2] - psize_y) / pstride + 1; + const index_t o_width = (imshape[dstdim - 1] - psize_x) / pstride + 1; + Shape<2> sshape = ShapeCheck<2, SrcExp>::Check(src_); + utils::Check(sshape[1] == o_height * o_width * + imshape.ProdShape(0, dstdim - 3), + "PackColToPatchExp: src.size(1) mismatch"); + utils::Check(sshape[0] == psize_y * psize_x * imshape[dstdim - 3], + "PackColToPatchExp: src.size(0) mismatch"); + } +}; +/*! 
+ * \brief reverse operation of pack_col2patch, can be used to implement deconvolution + * \return packed img expression + * \param mat source matrix + * \param imshape shape of target img + * \param psize_y height of each patch + * \param psize_x height of each patch + * \param pstride stride of each patch + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam dstdim destination dimension + * \tparam etype type of expression + */ +template +inline PackColToPatchXExp +pack_col2patch(const expr::Exp &src, + Shape imshape, index_t psize_y, + index_t psize_x, index_t pstride) { + TypeCheckPass::kDim == 2> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + utils::Check(imshape[dstdim - 1] >= psize_x && + imshape[dstdim - 2] >= psize_y, + "PackColToPatch:image shape smaller than patch size"); + return PackColToPatchXExp(src.self(), imshape, + psize_y, psize_x, pstride); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const PackColToPatchXExp &e) + :src_(MakePlan(e.src_)), psize_y_(e.psize_y_), + psize_x_(e.psize_x_), pstride_(e.pstride_), + i_channel_(e.shape_[dstdim - 3]), i_height_(e.shape_[dstdim - 2]), + o_height_((e.shape_[dstdim - 2] - psize_y_) / pstride_ + 1), + o_width_((e.shape_[dstdim - 1] - psize_x_) / pstride_ + 1) { + // note: i/o convention are same as unpack + } + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + using namespace std; + const index_t y = i % i_height_; + const index_t idivh = i / i_height_; + const index_t c = idivh % i_channel_; + const index_t n = idivh / i_channel_; + const index_t x = j; + const index_t py_min = + y < psize_y_ ? 0 : (y-psize_y_ + pstride_) / pstride_; + const index_t px_min = + x < psize_x_ ? 0 : (x-psize_x_ + pstride_) / pstride_; + const index_t py_max = min((y + pstride_) / pstride_, o_height_); + const index_t px_max = min((x + pstride_) / pstride_, o_width_); + DType res = static_cast(0); + for (index_t py = py_min; py < py_max; ++py) { + for (index_t px = px_min; px < px_max; ++px) { + res += src_.Eval(((c * psize_y_ + y - py*pstride_) * psize_x_ + + x - px * pstride_), + (n * o_height_ + py) * o_width_ + px); + } + } + return res; + } + + private: + Plan src_; + const index_t psize_y_, psize_x_, pstride_, i_channel_; + const index_t i_height_, o_height_, o_width_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_PACK_COL2PATCH_H_ diff --git a/mshadow/extension/pad.h b/mshadow/extension/pad.h new file mode 100644 index 000000000000..6622a022acc8 --- /dev/null +++ b/mshadow/extension/pad.h @@ -0,0 +1,111 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file pad.h + * \brief support for pad + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_PAD_H_ +#define MSHADOW_EXTENSION_PAD_H_ +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief padding expression, pad a image with zeros + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam srcdim dimension of src + */ +template +struct PaddingExp: + public MakeTensorExp, + SrcExp, srcdim, DType> { + /*! \brief source operand */ + const SrcExp &src_; + /*! \brief pad size in y */ + index_t pad_y_; + /*! \brief pad size in x */ + index_t pad_x_; + /*! \brief source tensor height */ + index_t src_height_; + /*! \brief source tensor width */ + index_t src_width_; + /*! 
\brief constructor */ + PaddingExp(const SrcExp &src, index_t pad_y, index_t pad_x) + : src_(src), pad_y_(pad_y), pad_x_(pad_x) { + this->shape_ = ShapeCheck::Check(src_); + src_height_ = this->shape_[srcdim - 2]; + src_width_ = this->shape_[srcdim - 1]; + this->shape_[srcdim - 2] += pad_y * 2; // height + this->shape_[srcdim - 1] += pad_x * 2; // width + } +}; +/*! + * \brief padding expression, pad a image with zeros on boundaries, padding affects shape[0], and shape[1] + * \param src original image batches + * \param pad padding size + * \return expression corresponding to padded result + * \tparam SrcExp source expression + * \tparam DType the content data type + * \tparam etype type of expression + */ +template +inline PaddingExp::kDim> +pad(const Exp &src, index_t pad) { + TypeCheckPass::kDim >= 2> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return PaddingExp::kDim>(src.self(), pad, pad); +} +/*! + * \brief padding expression, pad a image with zeros on boundaries, padding affects shape[0], and shape[1] + * \param src original image batches + * \param pad_y padding size in y + * \param pad_x padding size in x + * \return expression corresponding to padded result + * \tparam SrcExp source expression + * \tparam DType the content data type + * \tparam etype type of expression + */ +template +inline PaddingExp::kDim> +pad(const Exp &src, index_t pad_y, index_t pad_x) { + TypeCheckPass::kDim >= 2> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return PaddingExp::kDim> + (src.self(), pad_y, pad_x); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const PaddingExp &e) + : src_(MakePlan(e.src_)), + pad_y_(e.pad_y_), pad_x_(e.pad_x_), + new_height_(e.shape_[srcdim - 2]), + src_height_(e.src_height_), src_width_(e.src_width_) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + const index_t x = j; + const index_t y = i % new_height_; + const index_t c = i / new_height_; + if (y < pad_y_ || x < pad_x_) return static_cast(0); + const index_t h = y - pad_y_; + const index_t w = x - pad_x_; + if (h < src_height_ && w < src_width_) { + return src_.Eval(c * src_height_ + h, w); + } else { + return static_cast(0); + } + } + + private: + Plan src_; + const index_t pad_y_; + const index_t pad_x_; + const index_t new_height_; + const index_t src_height_; + const index_t src_width_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_PAD_H_ diff --git a/mshadow/extension/reduceto1d.h b/mshadow/extension/reduceto1d.h new file mode 100644 index 000000000000..b35e88c3153f --- /dev/null +++ b/mshadow/extension/reduceto1d.h @@ -0,0 +1,89 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file reduceto1d.h + * \brief support for sum_rows and sumall_except_dim + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_REDUCETO1D_H_ +#define MSHADOW_EXTENSION_REDUCETO1D_H_ +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief reduction to 1 dimension tensor + * input: Tensor: ishape + * output: Tensor shape[0] = ishape[dimkeep]; + * + * \tparam SrcExp type of expression to be reduced + * \tparam DType the data type of the scalar + * \tparam Reducer which reducer to use + * \tparam m_dimkeep which dimension to be kept, encoded with dimsrc - dimkeep + */ +template +struct ReduceTo1DExp: + public Exp, + DType, type::kComplex> { + /*! \brief source operand */ + const SrcExp &src_; + /*! \brief source operand, scale of the */ + DType scale_; + /*! 
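Sketch combining `pad`, `crop` and `mirror` from the extensions above on an image batch with layout (batch, channel, y, x). The shapes and the pad-then-crop round trip are illustrative.

```
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

void pad_then_crop(Tensor<cpu, 4, float> out,
                   Tensor<cpu, 4, float> img,
                   index_t pad_size) {
  // zero-pad the border, then cut a window of the original size from the
  // center; with equal pad on both sides this restores the input shape
  out = crop(pad(img, pad_size), Shape2(img.size(2), img.size(3)));
}

void flip_horizontal(Tensor<cpu, 4, float> out, Tensor<cpu, 4, float> img) {
  // mirror flips the last (width) dimension
  out = mirror(img);
}
```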
\brief construct a repmat expression from src and nrow */ + ReduceTo1DExp(const SrcExp& src, DType scale) : src_(src), scale_(scale) {} +}; +/*! + * \brief a sum over all dimensions, except dimkeep + * \param exp input expression that must be a matrix Tensor + * \return a expresion with type Tensor + * \tparam dimkeep the dimension that will be kept + * \tparam SrcExp expression + * \tparam etype type of expression + */ +template +inline ReduceTo1DExp::kDim - dimkeep> +sumall_except_dim(const Exp &exp) { + return ReduceTo1DExp::kDim - dimkeep>(exp.self(), 1); +} +/*! + * \brief a expression that sum over rows of a matrix + * \param exp input expression that must be a matrix Tensor + * \return a expresion with type Tensor + * \tparam SrcExp expression + * \tparam etype type of expression + */ +template +inline ReduceTo1DExp +sum_rows(const Exp &exp) { + TypeCheckPass::kDim ==2> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return sumall_except_dim<1>(exp); +} +template +struct ExpComplexEngine, + ReduceTo1DExp, + DType> { + static const int dimkeep = ExpInfo::kDim - m_dimkeep; + inline static void Eval(Tensor *dst, + const ReduceTo1DExp &exp) { + TypeCheckPass + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + MapReduceKeepHighDim(dst, exp.src_, exp.scale_); + } +}; +template +struct ExpComplexEngine, + ReduceTo1DExp, DType> { + inline static void Eval(Tensor *dst, + const ReduceTo1DExp &exp) { + MapReduceKeepLowest(dst, exp.src_, exp.scale_); + } +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_REDUCETO1D_H_ diff --git a/mshadow/extension/reshape.h b/mshadow/extension/reshape.h new file mode 100644 index 000000000000..738e98f0e2c9 --- /dev/null +++ b/mshadow/extension/reshape.h @@ -0,0 +1,87 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file reshape.h + * \brief support for reshape + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_RESHAPE_H_ +#define MSHADOW_EXTENSION_RESHAPE_H_ +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief reshape the content to another shape + * input: Tensor: ishape + * output: Tensor ishape.Size() == oshape.Size() + * \tparam SrcExp source expression + * \tparam dimdst target dimension + * \tparam dimsrc source dimension + */ +template +struct ReshapeExp: + public MakeTensorExp, + SrcExp, dimdst, DType> { + /*! \brief source expression */ + const SrcExp &src_; + /*! \brief smallest dimension of input */ + index_t ishapex_; + /*! \brief constructor */ + ReshapeExp(const SrcExp &src, Shape shape) + : src_(src) { + Shape ishape = ShapeCheck::Check(src_); + utils::Check(ishape.Size() == shape.Size(), "reshape size must match"); + ishapex_ = ishape[dimsrc - 1]; + this->shape_ = shape; + } +}; +/*! 
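Sketch of the two reductions defined above, in the bias-gradient role the comments describe. The layer shapes are illustrative; the scaled accumulation at the end relies on the `ReduceTo1DExp` scaling overload defined earlier in this patch.

```
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

void bias_gradients(Tensor<cpu, 1, float> gbias_fc,
                    Tensor<cpu, 2, float> grad_fc,
                    Tensor<cpu, 1, float> gbias_conv,
                    Tensor<cpu, 4, float> grad_conv) {
  // fully connected layer: sum over rows, keep the column dimension
  gbias_fc = sum_rows(grad_fc);
  // convolution layer: keep the channel dimension (dim 1), sum everything else
  gbias_conv = sumall_except_dim<1>(grad_conv);
  // the reduction is itself an expression, so it can be scaled and accumulated
  gbias_conv += 0.5f * sumall_except_dim<1>(grad_conv);
}
```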
+ * \brief a expression that reshapes a tensor to another shape + * \param src Tensor: + * \param oshape target shape + * \return a expresion with type Tensor + * \tparam SrcExp source expression + * \tparam etype source expression type + * \tparam dimdst target dimension + */ +template +inline ReshapeExp::kDim> +reshape(const Exp &src, Shape oshape) { + return ReshapeExp::kDim> + (src.self(), oshape); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const ReshapeExp &e) + : src_(MakePlan(e.src_)), + oshapex_(e.shape_[dimdst - 1]), ishapex_(e.ishapex_) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + const index_t idx = y * oshapex_ + x; + return src_.Eval(idx / ishapex_, idx % ishapex_); + } + + private: + Plan src_; + const index_t oshapex_, ishapex_; +}; +// special work plan for 1 dimensional data +template +struct Plan, DType> { + public: + explicit Plan(const ReshapeExp &e) + : src_(MakePlan(e.src_)), oshapex_(e.shape_[dimdst - 1]) { + } + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return src_.Eval(0, y * oshapex_ + x); + } + + private: + Plan src_; + const index_t oshapex_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_RESHAPE_H_ diff --git a/mshadow/extension/spatial_pool.h b/mshadow/extension/spatial_pool.h new file mode 100644 index 000000000000..07f8433cca27 --- /dev/null +++ b/mshadow/extension/spatial_pool.h @@ -0,0 +1,146 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file spatial_pool.h + * \brief support for spatial pooling + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_SPATIAL_POOL_H_ +#define MSHADOW_EXTENSION_SPATIAL_POOL_H_ +#include +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief pooling expression, do reduction over local patches of a image + * \tparam Reducer reduction method during pooling + * \tparam SrcExp source expression to be pooled from + * \tparam DType the content data type + * \tparam srcdim dimension of src + */ +template +struct PoolingExp: + public MakeTensorExp, + SrcExp, srcdim, DType> { + /*! \brief source operand */ + const SrcExp &src_; + /*! \brief kernel size in height */ + index_t ksize_y_; + /*! \brief kernel size in width */ + index_t ksize_x_; + /*! \brief kernel stride */ + index_t kstride_; + /*! \brief source height shape[1] */ + index_t src_height_; + /*! \brief source width shape[0] */ + index_t src_width_; + /*! \brief constructor */ + PoolingExp(const SrcExp &src, + index_t ksize_y, index_t ksize_x, index_t kstride) + : src_(src), ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) { + Shape sshape = ShapeCheck::Check(src_); + utils::Check(sshape[srcdim - 1] >= ksize_x && sshape[srcdim - 2] >= ksize_y, + "PoolingExp: kernel must be smaller than image"); + this->src_height_ = sshape[srcdim - 2]; + this->src_width_ = sshape[srcdim - 1]; + this->shape_ = sshape; + this->shape_[srcdim - 2] = (src_height_ - ksize_y) / kstride + 1; + this->shape_[srcdim - 1] = (src_width_ - ksize_x) / kstride + 1; + } + /*! 
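Sketch of `reshape` used to flatten a 4D activation into a 2D matrix, for example before a fully connected layer. The element count of the target shape must match the source, as checked in the constructor above.

```
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

void flatten(Tensor<cpu, 2, float> out, Tensor<cpu, 4, float> in) {
  const index_t nbatch = in.size(0);
  const index_t rest = in.size(1) * in.size(2) * in.size(3);
  // reinterpret (nbatch, c, h, w) as (nbatch, c * h * w)
  out = reshape(in, Shape2(nbatch, rest));
}
```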
\brief constructor, specify shape */ + PoolingExp(const SrcExp &src, Shape<2> pshape, + index_t ksize_y, index_t ksize_x, index_t kstride) + : src_(src), ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) { + Shape sshape = ShapeCheck::Check(src_); + utils::Check(sshape[srcdim - 1] >= ksize_x && + sshape[srcdim - 2] >= ksize_y, + "PoolingExp: kernel must be smaller than image"); + this->src_height_ = sshape[srcdim - 2]; + this->src_width_ = sshape[srcdim - 1]; + this->shape_ = sshape; + this->shape_[srcdim - 2] = pshape[0]; + this->shape_[srcdim - 1] = pshape[1]; + } +}; +/*! + * \brief pooling subregion results together + * \param src source image, shape: (batch, channel, height, width) + * \param ksize_y kernel size in height + * \param ksize_x kernel size in width + * \param kstride stride for each kernel + * \return expression of pooled result + * \tparam Reducer reducer type + * \tparam SrcExp source expression + * \tparam DType the content data type + * \tparam etype type of expression + */ +template +inline PoolingExp::kDim> +pool(const Exp &src, + index_t ksize_y, index_t ksize_x, index_t kstride) { + TypeCheckPass::kDim >= 2> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return PoolingExp::kDim> + (src.self(), ksize_y, ksize_x, kstride); +} +/*! + * \brief same as pool, except the output shape is specified by pshape + * \param src source image + * \param pshape ouput shape + * \param ksize_y kernel size in y + * \param ksize_x kernel size in x + * \param kstride stride for each kernel + * \return expression of pooled result + * \tparam Reducer reducer type + * \tparam SrcExp source expression + * \tparam DType the content data type + * \tparam etype type of expression + */ +template +inline PoolingExp::kDim> +pool(const Exp &src, Shape<2> pshape, + index_t ksize_y, index_t ksize_x, index_t kstride) { + TypeCheckPass::kDim >= 2> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return PoolingExp::kDim> + (src.self(), pshape, ksize_y, ksize_x, kstride); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const PoolingExp &e) + : src_(MakePlan(e.src_)), + ksize_y_(e.ksize_y_), ksize_x_(e.ksize_x_), kstride_(e.kstride_), + src_height_(e.src_height_), src_width_(e.src_width_), + new_height_(e.shape_[srcdim - 2]) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + using namespace std; + const index_t py = i % new_height_; + const index_t y_start = py * kstride_; + const index_t y_end = min(y_start + ksize_y_, src_height_); + const index_t px = j; + const index_t x_start = px * kstride_; + const index_t x_end = min(x_start + ksize_x_, src_width_); + const index_t c = i / new_height_; + + DType res; Reducer::SetInitValue(res); + for (index_t y = y_start; y < y_end; ++y) { + for (index_t x = x_start; x < x_end; ++x) { + Reducer::Reduce(res, src_.Eval(c * src_height_ + y, x)); + } + } + return res; + } + + private: + Plan src_; + const index_t ksize_y_, ksize_x_, kstride_; + const index_t src_height_, src_width_; + const index_t new_height_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_SPATIAL_POOL_H_ diff --git a/mshadow/extension/spatial_unpool.h b/mshadow/extension/spatial_unpool.h new file mode 100644 index 000000000000..848b77bb39fa --- /dev/null +++ b/mshadow/extension/spatial_unpool.h @@ -0,0 +1,130 @@ +/*! 
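Sketch of max pooling with the `pool` expression above and the built-in `red::maximum` reducer; the 2x2 kernel and the shapes are illustrative.

```
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

void max_pool_forward(Tensor<cpu, 4, float> pooled,
                      Tensor<cpu, 4, float> img) {
  const index_t ksize = 2, kstride = 2;
  // output spatial size is (in - ksize) / kstride + 1 in each dimension
  pooled = pool<red::maximum>(img, ksize, ksize, kstride);
  // alternatively, force a particular output shape with the second overload
  pooled = pool<red::maximum>(img, Shape2(pooled.size(2), pooled.size(3)),
                              ksize, ksize, kstride);
}
```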
+ * Copyright (c) 2014 by Contributors + * \file spatial_unpool.h + * \brief support for unpool + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_SPATIAL_UNPOOL_H_ +#define MSHADOW_EXTENSION_SPATIAL_UNPOOL_H_ +#include +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief unpooling expr reverse operation of pooling, used to pass gradient back + * \tparam Reducer reduction method during pooling + * \tparam SrcExp source expression to be pooled from + * \tparam DType the content data type + * \tparam srcdim dimension of src + */ +template +struct UnPoolingExp: + public MakeTensorExp, + SrcExp, srcdim, DType> { + /*! \brief source input, corresponds to src in pooling */ + const SrcExp &data_src_; + /*! \brief result of pooled data, corresponds to result of pooling */ + const SrcExp &data_pooled_; + /*! \brief gradient data of pooled part, to be propgate down */ + const SrcExp &grad_pooled_; + /*! \brief shape of pooled expression */ + index_t pshape_y_; + /*! \brief shape of pooled expression */ + index_t pshape_x_; + /*! \brief kernel size in height */ + index_t ksize_y_; + /*! \brief kernel size in width */ + index_t ksize_x_; + /*! \brief kernel stride */ + index_t kstride_; + /*! \brief constructor */ + UnPoolingExp(const SrcExp &data_src, + const SrcExp &data_pooled, + const SrcExp &grad_pooled, + index_t ksize_y, index_t ksize_x, index_t kstride) + : data_src_(data_src), data_pooled_(data_pooled), + grad_pooled_(grad_pooled), + ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) { + Shape pshape = ShapeCheck::Check(grad_pooled); + utils::Check(pshape == ShapeCheck::Check(data_pooled), + "UnPoolingExp: pooled shape mismatch"); + Shape sshape = ShapeCheck::Check(data_src); + for (int k = 0; k < srcdim - 2; ++k) { + utils::Check(pshape[k] == sshape[k], + "UnPoolingExp: pool and src shape mismatch"); + } + pshape_x_ = pshape[srcdim - 1]; + pshape_y_ = pshape[srcdim - 2]; + this->shape_ = sshape; + } +}; +/*! 
+ * \brief unpooling gradient for 4D, backprop gradient value back, revserse operation of pooling, + * same as unpooling, but allows unequal size of kernel + * \param data_src source input, corresponds to src in pooling + * \param data_pooled result of pooled data, corresponds to result of pooling + * \param grad_pooled gradient data of pooled part, to be propgate down + * \param ksize_y kernel height + * \param ksize_x kernel width + * \param kstride stride for each kernel + * \return expression corresponding to unpooled 4D Tensor, storing backproped gradient + * \tparam Reducer reducer type + * \tparam SrcExp source expression + * \tparam DType the content data type + * \tparam etype type of expression + */ +template +inline UnPoolingExp::kDim> +unpool(const Exp &data_src, + const Exp &data_pooled, + const Exp &grad_pooled, + index_t ksize_y, index_t ksize_x, index_t kstride) { + return UnPoolingExp::kDim> + (data_src.self(), data_pooled.self(), grad_pooled.self(), + ksize_y, ksize_x, kstride); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const UnPoolingExp &e) + : data_src_(e.data_src_), data_pooled_(e.data_pooled_), + grad_pooled_(e.grad_pooled_), sshape_y_(e.shape_[srcdim - 2]), + pshape_y_(e.pshape_y_), pshape_x_(e.pshape_x_), + ksize_y_(e.ksize_y_), ksize_x_(e.ksize_x_), kstride_(e.kstride_) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + using namespace std; + const index_t x = j; + const index_t y = i % sshape_y_; + const index_t c = i / sshape_y_; + const DType vsrc = data_src_.Eval(i, j); + const index_t py_min = + y < ksize_y_ ? 0 : (y - ksize_y_ + kstride_) / kstride_; + const index_t px_min = + x < ksize_x_ ? 0 : (x - ksize_x_ + kstride_) / kstride_; + const index_t py_max = min((y + kstride_) / kstride_, pshape_y_); + const index_t px_max = min((x + kstride_) / kstride_, pshape_x_); + + DType val = static_cast(0); + for (index_t py = py_min; py < py_max; ++py) { + for (index_t px = px_min; px < px_max; ++px) { + val += Reducer::PartialGrad(vsrc, + data_pooled_.Eval(c * pshape_y_ + py, px)) * + grad_pooled_.Eval(c * pshape_y_ + py, px); + } + } + + return val; + } + + private: + Plan data_src_, data_pooled_, grad_pooled_; + const index_t sshape_y_, pshape_y_, pshape_x_; + const index_t ksize_y_, ksize_x_; + const index_t kstride_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_SPATIAL_UNPOOL_H_ diff --git a/mshadow/extension/swapaxis.h b/mshadow/extension/swapaxis.h new file mode 100644 index 000000000000..3fcda22b527e --- /dev/null +++ b/mshadow/extension/swapaxis.h @@ -0,0 +1,109 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file swapaxis.h + * \brief support for swapaxis + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_SWAPAXIS_H_ +#define MSHADOW_EXTENSION_SWAPAXIS_H_ +#include +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief swap two axis of a tensor + * input: Tensor: ishape + * output: Tensor oshape[a1],oshape[a2] = ishape[a2],oshape[a1] + * + * \tparam SrcExp type of source expression + * \tparam DType the type of elements + * \tparam dimsrc source dimension, assert a1 > a2 + * \tparam m_a1 one dimension to be swapped, encoded by dimsrc - a1 + * \tparam a2 second dimension to be swapped, encoded by a2 + */ +template +struct SwapAxisExp: + public MakeTensorExp, + SrcExp, dimsrc, DType> { + // decode the a1, a2 + static const int a1 = dimsrc - m_a1; + /*! 
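Sketch of the matching backward pass using `unpool`; the three inputs mirror the forward pooling call (input image, pooled output, incoming gradient), and the kernel settings must match the forward pass.

```
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

void max_pool_backward(Tensor<cpu, 4, float> grad_in,
                       Tensor<cpu, 4, float> img,
                       Tensor<cpu, 4, float> pooled,
                       Tensor<cpu, 4, float> grad_pooled) {
  const index_t ksize = 2, kstride = 2;
  // routes each pooled gradient back to the input positions selected by
  // red::maximum during the forward pass
  grad_in = unpool<red::maximum>(img, pooled, grad_pooled,
                                 ksize, ksize, kstride);
}
```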
\brief source expression */ + const SrcExp &src_; + /*! \brief constructor */ + explicit SwapAxisExp(const SrcExp &src) : src_(src) { + this->shape_ = ShapeCheck::Check(src); + std::swap(this->shape_[a1], this->shape_[a2]); + } +}; +/*! + * \brief a expression that reshapes a tensor to another shape + * \param src Tensor: + * \return a expresion with type Tensor + * \tparam a1 higher dimension to be swapped, assert a1 > a2 + * \tparam a2 lower dimension to be swapped + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype source expression type + */ +template +inline SwapAxisExp::kDim, + ExpInfo::kDim - a1, a2> +swapaxis(const Exp &src) { + typedef ExpInfo Info; + TypeCheckPass= a1 + 1 && Info::kDim >= a2 + 1 && + a2 < a1>::Error_Expression_Does_Not_Meet_Dimension_Req(); + return SwapAxisExp::kDim, + ExpInfo::kDim - a1, a2>(src.self()); +} +template +struct Plan, DType> { + public: + // decode the a1 + static const int a1 = dimsrc - m_a1; + explicit Plan(const SwapAxisExp &e) + : src_(MakePlan(e.src_)), + shapey_(e.shape_.ProdShape(a1 + 1, dimsrc - 1)), + shapez_(e.shape_[a1]), + shapec_(e.shape_.ProdShape(a2 + 1, a1)), + shapen_(e.shape_[a2]) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + const index_t y = i % shapey_; + i /= shapey_; + const index_t z = i % shapez_; + i /= shapez_; + const index_t c = i % shapec_; + i /= shapec_; + const index_t n = i % shapen_; + // swap z and n + return src_.Eval(((((i / shapen_) * shapez_ + z) * shapec_ + + c) * shapen_ + n) * shapey_ + y, j); + } + + private: + Plan src_; + const index_t shapey_, shapez_, shapec_, shapen_; +}; +template +struct Plan, DType> { + public: + explicit Plan(const SwapAxisExp &e) + : src_(MakePlan(e.src_)), + shapex_(e.shape_[dimsrc - 1]), + shapey_(e.shape_.ProdShape(a2 + 1, dimsrc - 1)), + shapez_(e.shape_[a2]) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t x) const { + // swap x and z + const index_t y = i % shapey_; + i /= shapey_; + const index_t z = i % shapez_; + const index_t n = i / shapez_; + return src_.Eval((n * shapex_ + x) * shapey_ + y , z); + } + + private: + Plan src_; + const index_t shapex_, shapey_, shapez_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_SWAPAXIS_H_ diff --git a/mshadow/extension/unpack_patch2col.h b/mshadow/extension/unpack_patch2col.h new file mode 100644 index 000000000000..619baf26bd2a --- /dev/null +++ b/mshadow/extension/unpack_patch2col.h @@ -0,0 +1,123 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file unpack_patch2col.h + * \brief support for unpack + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_UNPACK_PATCH2COL_H_ +#define MSHADOW_EXTENSION_UNPACK_PATCH2COL_H_ +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief unpack local (overlap) patches of image to column of mat, + * can be used to implement convolution, this expression allow unpack of a batch + * this is a version support unpacking multiple images + * after getting unpacked mat, we can use: output = dot(weight, mat) to get covolved results, the relations: + * \tparam SrcExp source expression + * \tparam dstdim destination dimension + */ +template +struct UnpackPatchToColXExp: + public MakeTensorExp, + SrcExp, 2, DType>{ + /*! \brief source operand */ + const SrcExp &img_; + /*! \brief patch height */ + index_t psize_y_; + /*! \brief patch width */ + index_t psize_x_; + /*! \brief patch stride */ + index_t pstride_; + /*! \brief number of input channel */ + index_t i_channel_; + /*! 
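A sketch of the convolution pattern described by the unpack comment (unpack patches to columns, multiply by the filter matrix, regroup the result), together with the reverse `pack_col2patch` step for the data gradient. It assumes a BLAS backend for `dot()`, assumes the temporaries are pre-allocated with the sizes noted in the comments, and uses `swapaxis` from the header above to reorder the regrouped output; the function and variable names are illustrative.

```
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

void conv_forward(Tensor<cpu, 4, float> out,     // (num, out_channel, o_h, o_w)
                  Tensor<cpu, 4, float> img,     // (num, in_channel, height, width)
                  Tensor<cpu, 2, float> weight,  // (out_channel, in_channel*ky*kx)
                  Tensor<cpu, 2, float> tmp_col, // (in_channel*ky*kx, o_h*o_w*num)
                  Tensor<cpu, 2, float> tmp_dst, // (out_channel, o_h*o_w*num)
                  index_t ksize_y, index_t ksize_x, index_t kstride) {
  tmp_col = unpack_patch2col(img, ksize_y, ksize_x, kstride);
  tmp_dst = dot(weight, tmp_col);
  // columns are grouped image by image, so regroup and swap the first two axes
  out = swapaxis<1, 0>(reshape(tmp_dst, Shape4(out.size(1), out.size(0),
                                               out.size(2), out.size(3))));
}

void conv_backward_data(Tensor<cpu, 4, float> grad_in,
                        Tensor<cpu, 2, float> weight,
                        Tensor<cpu, 2, float> grad_dst, // (out_channel, o_h*o_w*num)
                        Tensor<cpu, 2, float> tmp_col,  // (in_channel*ky*kx, o_h*o_w*num)
                        index_t ksize_y, index_t ksize_x, index_t kstride) {
  tmp_col = dot(weight.T(), grad_dst);
  // reverse of unpack_patch2col: accumulate columns back into overlapping patches
  grad_in = pack_col2patch(tmp_col, grad_in.shape_, ksize_y, ksize_x, kstride);
}
```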
\brief height of img */ + index_t i_height_; + /*! \brief width of img */ + index_t i_width_; + /*! \brief constructor */ + UnpackPatchToColXExp(const SrcExp &img, + index_t psize_y, + index_t psize_x, + index_t pstride) + : img_(img), psize_y_(psize_y), + psize_x_(psize_x), pstride_(pstride) { + Shape imshape = ShapeCheck::Check(img_); + utils::Check(imshape[srcdim - 1] >= psize_x && + imshape[srcdim - 2] >= psize_y, + "UnpackPatchToCol:image shape smaller than patch size"); + this->i_channel_ = imshape[srcdim - 3]; + this->i_height_ = imshape[srcdim - 2]; + this->i_width_ = imshape[srcdim - 1]; + // calculate number of batches + const index_t num = imshape.ProdShape(0, srcdim - 3); + const index_t o_height = (i_height_ - psize_y) / pstride + 1; + const index_t o_width = (i_width_ - psize_x) / pstride + 1; + this->shape_[1] = o_height * o_width * num; + this->shape_[0] = psize_y * psize_x * i_channel_; + } +}; + +/*! + * \brief unpack local (overlap) patches of image to column of mat, can be used to implement convolution + * after getting unpacked mat, we can use: output = dot(weight, mat) to get covolved results, the relations: + * + * weight; shape[0]: out_channel, shape[1]: ichannel * psize_y * psize_x + * output; shape[0]: out_channel, shape[1]: out_height * out_width * num_of_images + * out_height = (in_height - psize_y) / pstride + 1, this means we pad inperfect patch with 0 + * out_width = (in_width - psize_x) / pstride + 1 + * + * \return mat target matrix; shape[0]: in_channel*psize_y*psize_x shape[1]: out_height*out_width * num_of_images + * \param img source image; shape[-3]: in_channels, shape[-2]: in_height, shape[-1]: in_width, can be 3D or 4D tensor(multiple images) + * \param psize_y height of each patch + * \param psize_x width of each patch + * \param pstride stride of each patch + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype type of expression + */ +template +inline UnpackPatchToColXExp::kDim> +unpack_patch2col(const Exp &img, + index_t psize_y, index_t psize_x, index_t pstride) { + TypeCheckPass::kDim >= 3> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return UnpackPatchToColXExp::kDim> + (img.self(), psize_y, psize_x, pstride); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const UnpackPatchToColXExp &e) + :src_(MakePlan(e.img_)), + psize_y_(e.psize_y_), psize_x_(e.psize_x_), pstride_(e.pstride_), + i_channel_(e.i_channel_), i_height_(e.i_height_), i_width_(e.i_width_), + o_height_((i_height_ - psize_y_) / pstride_ + 1), + o_width_((i_width_ - psize_x_) / pstride_ + 1) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + const index_t x_offset = i % psize_x_; + const index_t idivp = i / psize_x_; + const index_t y_offset = idivp % psize_y_; + const index_t c = idivp / psize_y_; + const index_t x = (j % o_width_) * pstride_ + x_offset; + const index_t jdivw = j / o_width_; + const index_t y = (jdivw % o_height_) * pstride_ + y_offset; + const index_t n = jdivw / o_height_; + if (x < i_width_ && y < i_height_) { + return src_.Eval((n * i_channel_ + c) * i_height_ + y, x); + } else { + return 0.0f; + } + } + + private: + Plan src_; + const index_t psize_y_, psize_x_, pstride_, i_channel_; + const index_t i_height_, i_width_, o_height_, o_width_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_UNPACK_PATCH2COL_H_ diff --git a/mshadow/io.h b/mshadow/io.h new file mode 100644 index 
000000000000..5a298198123e --- /dev/null +++ b/mshadow/io.h @@ -0,0 +1,122 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file io.h + * \brief definitions of I/O functions for mshadow tensor + * \author Tianqi Chen + */ +#ifndef MSHADOW_IO_H_ +#define MSHADOW_IO_H_ +#include "./tensor.h" + +namespace mshadow { +namespace utils { +/*! + * \brief interface of stream I/O, used to serialize data, + * mshadow does not restricted to only this interface in SaveBinary/LoadBinary + * mshadow accept all class that implements Read and Write + */ +class IStream { + public: + /*! + * \brief read data from stream + * \param ptr pointer to memory buffer + * \param size size of block + * \return usually is the size of data readed + */ + virtual size_t Read(void *ptr, size_t size) = 0; + /*! + * \brief write data to stream + * \param ptr pointer to memory buffer + * \param size size of block + */ + virtual void Write(const void *ptr, size_t size) = 0; + /*! \brief virtual destructor */ + virtual ~IStream(void) {} +}; +} // namespace utils +/*! + * \brief CPU/GPU: save a tensor by binary format, for GPU version, a temp Tensor storage will be allocated + * \param fo output binary stream + * \param src source data file + * \tparam dim dimension of tensor + * \tparam DType type of element in tensor + * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. + */ +template +inline void SaveBinary(TStream &fo, const Tensor &src); +/*! \brief refer to comment of cpu ver \sa SaveBinary */ +template +inline void SaveBinary(TStream &fo, const Tensor &src); +/*! + * \brief CPU/GPU: load a tensor by binary format, for GPU version, a temp Tensor storage will be allocated + * if pre_alloc is true , then space in dst is preallocated, and must have same shape of the tensor loaded + * if pre_alloc is false, then dst originally does not have space allocated, LoadBinary will allocate space for dst + * \param fi output binary stream + * \param dst destination file + * \param pre_alloc whether space is pre-allocated, if false, space allocation will happen + * \tparam dim dimension of tensor + * \tparam DType type of element in tensor + * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. + */ +template +inline void LoadBinary(TStream &fi, + Tensor *dst, bool pre_alloc); +/*! 
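
Because SaveBinary/LoadBinary accept any class with matching Read and Write methods, a stdio-backed stream is enough. The FileStream class below is a hypothetical sketch, not part of the library.

```
#include <cstdio>
// hypothetical stdio-backed stream satisfying the Read/Write interface above
class FileStream : public mshadow::utils::IStream {
 public:
  explicit FileStream(FILE *fp) : fp_(fp) {}
  virtual size_t Read(void *ptr, size_t size) {
    return fread(ptr, 1, size, fp_);
  }
  virtual void Write(const void *ptr, size_t size) {
    fwrite(ptr, 1, size, fp_);
  }
 private:
  FILE *fp_;  // not owned; the caller opens and closes the file
};
```
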
\brief refer to comment of cpu ver \sa LoadBinary */ +template +inline void LoadBinary(TStream &fi, + Tensor *dst, bool pre_alloc); + +// implementations +template +inline void SaveBinary(TStream &fo, const Tensor &src_) { + fo.Write(&src_.shape_, sizeof(src_.shape_)); + Tensor src = src_.FlatTo2D(); + for (index_t i = 0; i < src.size(0); ++i) { + fo.Write(src[i].dptr_, sizeof(DType) * src.size(1)); + } +} +template +inline void SaveBinary(TStream &fo, const Tensor &src) { + // copy to CPU, then save + Tensor tmp(src.shape_); + AllocSpace(&tmp); + Stream stream; + Copy(tmp, src, &stream); + SaveBinary(fo, tmp); + FreeSpace(&tmp); +} +template +inline void LoadBinary(TStream &fi, + Tensor *dst_, bool pre_alloc) { + Shape shape; + utils::Check(fi.Read(&shape, sizeof(shape)) != 0, "mshadow::LoadBinary"); + if (pre_alloc) { + utils::Check(shape == dst_->shape_, + "LoadBinary, shape do not match pre-allocated shape"); + } else { + dst_->shape_ = shape; AllocSpace(dst_); + } + Tensor dst = dst_->FlatTo2D(); + if (dst.size(0) == 0) return; + for (index_t i = 0; i < dst.size(0); ++i) { + utils::Check(fi.Read(dst[i].dptr_, sizeof(DType) * dst.size(1)) != 0, + "mshadow::LoadBinary"); + } +} +template +inline void LoadBinary(TStream &fi, + Tensor *dst, bool pre_alloc) { + Tensor tmp; + LoadBinary(fi, &tmp, false); + if (pre_alloc) { + utils::Check(tmp.shape == dst->shape_, + "LoadBinary, shape do not match pre-allocated shape"); + } else { + dst->shape = tmp.shape; AllocSpace(dst); + } + Stream stream; + Copy(*dst, tmp, &stream); + FreeSpace(&tmp); +} +} // namespace mshadow +#endif // MSHADOW_IO_H_ diff --git a/mshadow/random.h b/mshadow/random.h new file mode 100644 index 000000000000..5213a69571f6 --- /dev/null +++ b/mshadow/random.h @@ -0,0 +1,358 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file random.h + * \brief Random inline functions for tensor. + * \author Bing Xu, Tianqi Chen + * Based on curand|MKL|stdlib + */ +#ifndef MSHADOW_RANDOM_H_ +#define MSHADOW_RANDOM_H_ +#include +#include "./tensor.h" +#include "./tensor_container.h" + +namespace mshadow { +/*! + * \brief random number generator + * \tparam Device the device of random number generator + * \tparam DType the target data type of random number can be float for double + */ +template +class Random {}; + +/*! \brief CPU random number generator */ +template +class Random { + public: + /*! + * \brief constructor of random engine + * \param seed random number seed + */ + explicit Random(int seed) { + this->Seed(seed); + buffer_.Resize(Shape1(kRandBufferSize)); + } + ~Random(void) { +#if MSHADOW_USE_MKL + vslDeleteStream(&vStream_); +#endif + } + /*! + * \brief seed random number generator using this seed + * \param seed seed of prng + */ + inline void Seed(int seed) { +#if MSHADOW_USE_MKL + int status = vslNewStream(&vStream_, VSL_BRNG_MT19937, seed); + utils::Check(status == VSL_STATUS_OK, + "MKL VSL Random engine failed to be initialized.\n"); +#else + this->rseed_ = static_cast(seed); +#endif + } + /*! + * \brief set the stream of computation + * \param stream computation stream + */ + inline void set_stream(Stream *stream) { + } + /*! 
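
A save/load round trip using the implementations above together with the hypothetical FileStream sketch; the file name and shapes are illustrative, `using namespace mshadow;` assumed.

```
Tensor<cpu, 2, float> W = NewTensor<cpu, float>(Shape2(10, 20), 0.5f);
FILE *fo = fopen("weight.bin", "wb");
FileStream so(fo);
SaveBinary(so, W);            // writes the shape followed by the rows
fclose(fo);

Tensor<cpu, 2, float> W2;     // no space allocated yet
FILE *fi = fopen("weight.bin", "rb");
FileStream si(fi);
LoadBinary(si, &W2, false);   // pre_alloc = false: LoadBinary allocates W2
fclose(fi);

FreeSpace(&W);
FreeSpace(&W2);
```
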
+ * \brief generate data from uniform [a,b) + * \param dst destination + * \param a lower bound of uniform + * \param b upper bound of uniform + * \tparam dim dimension of tensor + */ + template + inline void SampleUniform(Tensor *dst, + DType a = 0.0f, DType b = 1.0f) { + Tensor mat = dst->FlatTo2D(); + for (index_t i = 0; i < mat.size(0); ++i) { + this->GenUniform(mat[i].dptr_, mat.size(1), a, b); + } + } + /*! + * \brief generate data from standard gaussian + * \param dst destination + * \param mu mean variable + * \param sigma standard deviation + * \tparam dim dimension of tensor + */ + template + inline void SampleGaussian(Tensor *dst, + DType mu = 0.0f, DType sigma = 1.0f) { + if (sigma <= 0.0f) { + *dst = mu; return; + } + Tensor mat = dst->FlatTo2D(); + for (index_t i = 0; i < mat.size(0); ++i) { + this->GenGaussian(mat[i].dptr_, mat.size(1), mu, sigma); + } + } + /*! + * \brief return a temporal expression storing standard gaussian random variables + * the temporal tensor is only valid before next call of gaussian or uniform + * can be used as part of expression + * Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result, + * since second call of gaussian(s2) makes gaussian(s1) invalid + * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression + * \param shape shape of the tensor + * \return a temporal expression storing standard gaussian random variables + * \tparam dim dimension of tensor + */ + template + inline expr::ReshapeExp, DType, dim, 1> + gaussian(Shape shape) { + buffer_.Resize(Shape1(shape.Size())); + this->SampleGaussian(&buffer_, 0.0f, 1.0f); + return expr::reshape(buffer_, shape); + } + /*! + * \brief return a temporal expression storing standard uniform [0,1) + * the temporal tensor is only valid before next call of gaussian or uniform + * can be used as part of expression + * Caution: this means expression such as A = uniform(s1) * uniform(s2) will give invalid result, + * since second call of gaussian(s2) makes gaussian(s1) invalid + * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression + * \param shape shape of the tensor + * \return a temporal expression storing standard uniform [0,1) + * \tparam dim dimension of tensor + */ + template + inline expr::ReshapeExp, DType, dim, 1> + uniform(Shape shape) { + buffer_.Resize(Shape1(shape.Size())); + this->SampleUniform(&buffer_, 0.0f, 1.0f); + return expr::reshape(buffer_, shape); + } + + private: +#if MSHADOW_USE_MKL + /*! 
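
The caution above matters in practice because gaussian() and uniform() return views over one internal buffer. A sketch of safe versus unsafe use; shapes are illustrative, `using namespace mshadow;` assumed.

```
Random<cpu, float> rnd(0);
Tensor<cpu, 2, float> A = NewTensor<cpu, float>(Shape2(4, 5), 0.0f);
Tensor<cpu, 2, float> B = NewTensor<cpu, float>(Shape2(4, 5), 0.0f);
rnd.SampleGaussian(&B, 0.0f, 0.1f);         // fills B in place
A = rnd.gaussian(A.shape_) * 2.0f + B;      // fine: one random expression per statement
// A = rnd.gaussian(A.shape_) * rnd.gaussian(A.shape_);   // wrong: both terms
//                                                        // alias the same buffer
FreeSpace(&A);
FreeSpace(&B);
```
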
\brief stream used by MKL VSL */ + VSLStreamStatePtr vStream_; + // generate uniform distribution + inline void GenUniform(float *dptr, index_t size, float a, float b) { + int status = vsRngUniform(0, vStream_, size, dptr, a, b); + utils::Check(status == VSL_STATUS_OK, + "Failed to generate random number by MKL."); + } + inline void GenUniform(double *dptr, index_t size, double a, double b) { + int status = vdRngUniform(0, vStream_, size, dptr, a, b); + utils::Check(status == VSL_STATUS_OK, + "Failed to generate random number by MKL."); + } + inline void GenGaussian(float *dptr, index_t size, float mu, float sigma) { + int status = vsRngGaussian(0, vStream_, size, dptr, mu, sigma); + utils::Check(status == VSL_STATUS_OK, + "Failed to generate random number by MKL."); + } + inline void GenGaussian(double *dptr, index_t size, double mu, double sigma) { + int status = vdRngGaussian(0, vStream_, size, dptr, mu, sigma); + utils::Check(status == VSL_STATUS_OK, + "Failed to generate random number by MKL."); + } +#else + /*! \brief random number seed used by PRNG*/ + unsigned rseed_; + // functions + inline void GenUniform(float *dptr, index_t size, float a, float b) { + for (index_t j = 0; j < size; ++j) { + dptr[j] = static_cast(RandNext()) * (b - a) + a; + } + } + inline void GenUniform(double *dptr, index_t size, double a, double b) { + for (index_t j = 0; j < size; ++j) { + dptr[j] = static_cast(RandNext()) * (b - a) + a; + } + } + inline void GenGaussian(float *dptr, index_t size, float mu, float sigma) { + this->GenGaussianX(dptr, size, mu, sigma); + } + inline void GenGaussian(double *dptr, index_t size, double mu, double sigma) { + this->GenGaussianX(dptr, size, mu, sigma); + } + inline void GenGaussianX(DType *dptr, index_t size, DType mu, DType sigma) { + DType g1 = 0.0f, g2 = 0.0f; + for (index_t j = 0; j < size; ++j) { + if ((j & 1) == 0) { + this->SampleNormal2D(&g1, &g2); + dptr[j] = mu + g1 * sigma; + } else { + dptr[j] = mu + g2 * sigma; + } + } + } + /*! \brief get next random number from rand */ + inline DType RandNext(void) { + return static_cast(rand_r(&rseed_)) / + (static_cast(RAND_MAX) + 1.0f); + } + /*! \brief return a real numer uniform in (0,1) */ + inline DType RandNext2(void) { + return (static_cast(rand_r(&rseed_)) + 1.0f) / + (static_cast(RAND_MAX) + 2.0f); + } + /*! + * \brief sample iid xx,yy ~N(0,1) + * \param xx first gaussian output + * \param yy second gaussian output + */ + inline void SampleNormal2D(DType *xx_, DType *yy_) { + DType &xx = *xx_, &yy = *yy_; + DType x, y, s; + do { + x = 2.0f * RandNext2() - 1.0f; + y = 2.0f * RandNext2() - 1.0f; + s = x * x + y * y; + } while (s >= 1.0f || s == 0.0f); + DType t = std::sqrt(-2.0f * std::log(s) / s); + xx = x * t; yy = y * t; + } +#endif + /*! \brief temporal space used to store random numbers */ + TensorContainer buffer_; +}; // class Random +// only allow GPU PRNG in CUDACC +#ifdef __CUDACC__ +/*! \brief GPU random number generator */ +template +class Random { + public: + /*! + * \brief constructor of random engine + * \param seed random number seed + */ + Random(int seed) { + curandStatus_t status; + status = curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT); + utils::Check(status == CURAND_STATUS_SUCCESS, + "Can not create CURAND Generator"); + this->Seed(seed); + buffer_.Resize(Shape1(kRandBufferSize)); + } + + ~Random(void) { + curandStatus_t status; + status = curandDestroyGenerator(gen_); + utils::Check(status == CURAND_STATUS_SUCCESS, + "Destory CURAND Gen failed"); + } + /*! 
+ * \brief set the stream of computation + * \param stream computation stream + */ + inline void set_stream(Stream *stream) { + curandStatus_t status; + status = curandSetStream(gen_, Stream::GetStream(stream)); + utils::Check(status == CURAND_STATUS_SUCCESS, + "set_stream CURAND failed"); + } + /*! + * \brief seed random number generator using this seed + * \param seed seed of prng + */ + inline void Seed(int seed) { + curandStatus_t status; + status = curandSetPseudoRandomGeneratorSeed(gen_, seed); + utils::Check(status == CURAND_STATUS_SUCCESS, + "Set CURAND seed failed."); + } + /*! + * \brief generate data from uniform [a,b) + * \param dst destination + * \param a lower bound of uniform + * \param b upper bound of uniform + * \tparam dim dimension of tensor + */ + template + inline void SampleUniform(Tensor *dst, + DType a = 0.0f, DType b = 1.0f) { + if (a == 0.0f && b == 1.0f) { + *dst = this->uniform(dst->shape_); + } else { + *dst = this->uniform(dst->shape_) * (b - a) + a; + } + } + /*! + * \brief generate data from standard gaussian + * \param dst destination + * \param mu mean variable + * \param sigma standard deviation + * \tparam dim dimension of tensor + */ + template + inline void SampleGaussian(Tensor *dst, + DType mu = 0.0f, DType sigma = 1.0f) { + *dst = this->gaussian(dst->shape_, mu, sigma); + } + /*! + * \brief return a temporal expression storing standard gaussian random variables + * the temporal tensor is only valid before next call of gaussian or uniform + * can be used as part of expression + * Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result, + * since second call of gaussian(s2) makes gaussian(s1) invalid + * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression + * \param shape shape of the tensor + * \param mu mean + * \param sigma variance + * \return a temporal expression storing standard gaussian random variables + * \tparam dim dimension of tensor + */ + template + inline expr::ReshapeExp, DType, dim, 1> + gaussian(Shape shape, DType mu = 0.0f, DType sigma = 1.0f) { + size_t aligned_sz = ((shape.Size() + 1UL) >> 1) << 1; + // allocate alligned size + buffer_.Resize(Shape1(aligned_sz)); + buffer_.Resize(Shape1(shape.Size())); + this->GenGaussian(buffer_.dptr_, aligned_sz, mu, sigma); + return expr::reshape(buffer_, shape); + } + /*! 
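
A sketch of GPU sampling with an explicit stream; this requires a CUDA build compiled with nvcc, and the shapes and seed are illustrative (`using namespace mshadow;` assumed).

```
InitTensorEngine<gpu>(0);
Stream<gpu> *stream = NewStream<gpu>();
Random<gpu, float> rnd(42);
rnd.set_stream(stream);                 // curand calls are queued on this stream
Tensor<gpu, 2, float> W(Shape2(256, 128));
W.set_stream(stream);
AllocSpace(&W);
rnd.SampleGaussian(&W, 0.0f, 0.01f);    // mean 0, standard deviation 0.01, on device
Tensor<cpu, 2, float> host(Shape2(256, 128));
AllocSpace(&host);
Copy(host, W, stream);                  // device-to-host copy on the same stream
stream->Wait();                         // block until sampling and copy finish
FreeSpace(&W);
FreeSpace(&host);
DeleteStream(stream);
ShutdownTensorEngine<gpu>();
```
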
+ * \brief return a temporal expression storing standard uniform [0,1) + * the temporal tensor is only valid before next call of gaussian or uniform + * can be used as part of expression + * Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result, + * since second call of gaussian(s2) makes gaussian(s1) invalid + * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression + * \param shape shape of the tensor + * \return a temporal expression storing standard uniform [0,1) + * \tparam dim dimension of tensor + */ + template + inline expr::ReshapeExp, DType, dim, 1> + uniform(Shape shape) { + buffer_.Resize(Shape1(shape.Size())); + this->GenUniform(buffer_.dptr_, buffer_.size(0)); + return expr::reshape(buffer_, shape); + } + + private: + inline void GenGaussian(float *dptr, size_t size, float mu, float sigma) { + curandStatus_t status; + status = curandGenerateNormal(gen_, dptr, size, mu, sigma); + utils::Check(status == CURAND_STATUS_SUCCESS, "CURAND Gen Uniform failed"); + } + inline void GenGaussian(double *dptr, size_t size, double mu, double sigma) { + curandStatus_t status; + status = curandGenerateNormalDouble(gen_, dptr, size, mu, sigma); + utils::Check(status == CURAND_STATUS_SUCCESS, "CURAND Gen Uniform failed"); + } + inline void GenUniform(float *dptr, size_t size) { + curandStatus_t status; + status = curandGenerateUniform(gen_, dptr, size); + utils::Check(status == CURAND_STATUS_SUCCESS, "CURAND Gen Uniform failed"); + } + inline void GenUniform(double *dptr, size_t size) { + curandStatus_t status; + status = curandGenerateUniformDouble(gen_, dptr, size); + utils::Check(status == CURAND_STATUS_SUCCESS, "CURAND Gen Uniform failed"); + } + /*! \brief random numbeer generator */ + curandGenerator_t gen_; + /*! \brief templ buffer */ + TensorContainer buffer_; +}; // class Random +#endif +} // namespace mshadow +#endif // MSHADOW_RANDOM_H_ diff --git a/mshadow/sse-inl.h b/mshadow/sse-inl.h new file mode 100644 index 000000000000..9281c2a7d487 --- /dev/null +++ b/mshadow/sse-inl.h @@ -0,0 +1,435 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file sse-inl.h + * \brief support of sse2 optimization of some operations + * \author Tianqi Chen + */ +#ifndef MSHADOW_SSE_INL_H_ +#define MSHADOW_SSE_INL_H_ +#ifdef __APPLE__ +#include +#else +#include +#endif +#include "./expression.h" +#include "./tensor.h" + +namespace mshadow { +/*! \brief namespace to support sse2 vectorization */ +namespace sse2 { +/*! + * \brief analog to cudaMallocPitch, allocate a aligned space with num_line * lspace cells + * \param out_pitch output parameter, the actuall space allocated for each line + * \param lspace number of cells required for each line + * \param num_line number of lines to be allocated + */ +inline void* AlignedMallocPitch(size_t *out_pitch, + size_t lspace, size_t num_line) { + size_t pitch = ((lspace+15) >> 4) << 4; + *out_pitch = pitch; +#ifdef _MSC_VER + void *res = _aligned_malloc(pitch * num_line, 16); +#else +#ifdef __APPLE__ + void *res = malloc(pitch * num_line); +#else + void *res = memalign(16, pitch * num_line); +#endif +#endif + utils::Check(res != NULL, "AlignedMallocPitch failed"); + return res; +} +/*! + * \brief free aligned space + * \param ptr pointer to space to be freed + */ +inline void AlignedFree(void *ptr) { +#ifdef _MSC_VER + _aligned_free(ptr); +#else + free(ptr); +#endif +} +/*! \brief check if a pointer is aligned */ +inline bool CheckAlign(size_t pitch) { + return !(pitch & ((1 << 4) - 1)); +} +/*! 
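
A sketch of how the aligned allocator above behaves; the sizes are illustrative.

```
size_t pitch;
const size_t num_line = 100;
const size_t line_bytes = 30 * sizeof(float);   // 120 bytes requested per line
float *data = static_cast<float*>(
    mshadow::sse2::AlignedMallocPitch(&pitch, line_bytes, num_line));
// pitch is rounded up to a multiple of 16 bytes (here 120 -> 128),
// so line i starts at reinterpret_cast<char*>(data) + i * pitch
mshadow::sse2::AlignedFree(data);
```
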
\brief check if a pointer is aligned */ +inline bool CheckAlign(void *ptr) { + return CheckAlign(reinterpret_cast(ptr)); +} +/*! + * \brief get upper bound of aligned index of size + * \param size size of the array + * \param fsize size of float + */ +inline index_t UpperAlign(index_t size, size_t fsize) { + return (((size * fsize + 15) >> 4) << 4) / fsize; +} +/*! + * \brief get lower bound of aligned index of size + * \param size size of the array + * \param fsize size of float + */ +inline index_t LowerAlign(index_t size, size_t fsize) { + return (((size * fsize) >> 4) << 4) / fsize; +} +} // namespace sse2 +} // namespace mshadow +#if MSHADOW_USE_SSE +// sse types are not compatible with nvcc, only use them in cpu mode +#include + +namespace mshadow { +namespace sse2 { +/*! + * \brief float vector real type, used for vectorization + * \tparam FloatType double or float + */ +template +struct FVec { + // whether the vectorization is enabled + static const bool kEnabled = false; +}; +/*! \brief vector real type for float */ +template<> +struct FVec { + // type + typedef __m128 DType; + // whether the vectorization is enabled + static const bool kEnabled = true; + /*! \brief number of float in vector */ + static const index_t kSize = 4; + /*! \brief data content */ + DType data_; + // functions + /* constructors */ + FVec(void) {} + explicit FVec(DType data) : data_(data) {} + /* set the float */ + explicit FVec(const float &s) { + data_ = _mm_set1_ps(s); + } + /*!\brief load from pointer src */ + explicit FVec(const float *src) { + data_ = _mm_load_ps(src); + } + /*! \brief store data into dst space */ + inline void Store(float *dst) const { + return _mm_store_ps(dst, data_); + } + /*! \brief sum of all content */ + inline float Sum(void) const { + DType ans = _mm_add_ps(data_, _mm_movehl_ps(data_, data_)); + DType rst = _mm_add_ss(ans, _mm_shuffle_ps(ans, ans, 1)); +#if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64) + return rst.m128_f32[0]; +#else + float rr = _mm_cvtss_f32(rst); + return rr; +#endif + } +}; +/*! \brief vector real type for float */ +template<> +struct FVec { + // data type + typedef __m128d DType; + // whether the vectorization is enabled + static const bool kEnabled = true; + /*! \brief number of float in vector */ + static const index_t kSize = 2; + /*! \brief data content */ + DType data_; + /* constructors */ + FVec(void) {} + explicit FVec(DType data) : data_(data) {} + /* set the float */ + explicit FVec(const double &s) { + data_ = _mm_set1_pd(s); + } + /*!\brief load from pointer src */ + explicit FVec(const double *src) { + data_ = _mm_load_pd(src); + } + /*! \brief store data into dst space */ + inline void Store(double *dst) const { + return _mm_store_pd(dst, data_); + } + /*! \brief sum of all content */ + inline double Sum(void) const { + DType tmp = _mm_add_sd(data_, _mm_unpackhi_pd(data_, data_)); +#if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64) + return tmp.m128d_f64[0]; +#else + double ans = _mm_cvtsd_f64(tmp); + return ans; +#endif + } +}; +/*! 
\brief sse2 operator type of certain operator */ +template +struct SSEOp{ + static const bool kEnabled = false; +}; +template<> +struct SSEOp { + static const bool kEnabled = true; + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_add_ps(lhs.data_, rhs.data_)); + } + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_add_pd(lhs.data_, rhs.data_)); + } +}; +template<> +struct SSEOp { + static const bool kEnabled = true; + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_sub_ps(lhs.data_, rhs.data_)); + } + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_sub_pd(lhs.data_, rhs.data_)); + } +}; +template<> +struct SSEOp { + static const bool kEnabled = true; + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_mul_ps(lhs.data_, rhs.data_)); + } + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_mul_pd(lhs.data_, rhs.data_)); + } +}; +template<> +struct SSEOp { + static const bool kEnabled = true; + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_div_ps(lhs.data_, rhs.data_)); + } + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_div_pd(lhs.data_, rhs.data_)); + } +}; +template<> +struct SSEOp { + static const bool kEnabled = true; + MSHADOW_CINLINE static FVec Map(const FVec &src) { + return src; + } + MSHADOW_CINLINE static FVec Map(const FVec &src) { + return src; + } +}; +// savers to do storage +template +struct Saver{ + MSHADOW_CINLINE static void Save(TFloat *dst, const FVec &src) { + FVec lhs(dst); + FVec ans = SSEOp::Map(lhs, src); + ans.Store(dst); + } +}; +template +struct Saver { + MSHADOW_CINLINE static void Save(TFloat *dst, const FVec &src) { + src.Store(dst); + } +}; +} // namespace sse2 +namespace expr { +// same as plan, but use sse2 +template +class SSEPlan { + public: + /*! 
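
A sketch of the vectorized element-wise path these types implement: four floats per FVec<float>, combined through SSEOp. It assumes an SSE-enabled build (MSHADOW_USE_SSE) and the element-wise operator tag op::plus from the library's op namespace; the buffer contents are illustrative.

```
#include <cstdio>
size_t pitch;
float *buf = static_cast<float*>(
    mshadow::sse2::AlignedMallocPitch(&pitch, 8 * sizeof(float), 1));
for (int i = 0; i < 8; ++i) buf[i] = static_cast<float>(i);
mshadow::sse2::FVec<float> x(buf);        // aligned load of buf[0..3]
mshadow::sse2::FVec<float> y(buf + 4);    // aligned load of buf[4..7]
mshadow::sse2::FVec<float> z =
    mshadow::sse2::SSEOp<mshadow::op::plus>::Map(x, y);
z.Store(buf);                             // buf[0..3] is now {4, 6, 8, 10}
std::printf("%f\n", z.Sum());             // horizontal sum: 28
mshadow::sse2::AlignedFree(buf);
```
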
+ * \brief evaluate the expression at index [y][x], x will be aligned to 4 + * to be implemented by SubType + */ + MSHADOW_CINLINE sse2::FVec EvalSSE(index_t y, index_t x) const; + MSHADOW_CINLINE DType Eval(index_t y, index_t x) const; +}; +template +class SSEPlan, DType> { + public: + explicit SSEPlan(const Tensor &t) + :dptr_(t.dptr_), stride_(t.stride_) {} + MSHADOW_CINLINE sse2::FVec EvalSSE(index_t y, index_t x) const { + return sse2::FVec(&dptr_[y * stride_ + x]); + } + MSHADOW_CINLINE DType Eval(index_t y, index_t x) const { + return dptr_[y * stride_ + x]; + } + + private: + const DType *dptr_; + index_t stride_; +}; +template +class SSEPlan, DType> { + public: + explicit SSEPlan(DType scalar) : scalar_(scalar) {} + MSHADOW_CINLINE sse2::FVec EvalSSE(index_t y, index_t x) const { + return sse2::FVec(scalar_); + } + MSHADOW_CINLINE DType Eval(index_t y, index_t x) const { + return scalar_; + } + + private: + DType scalar_; +}; +template +class SSEPlan, DType> { + public: + SSEPlan(const SSEPlan &lhs, const SSEPlan &rhs) + : lhs_(lhs), rhs_(rhs) {} + MSHADOW_CINLINE sse2::FVec EvalSSE(index_t y, index_t x) const { + return sse2::SSEOp::Map(lhs_.EvalSSE(y, x), rhs_.EvalSSE(y, x)); + } + MSHADOW_CINLINE DType Eval(index_t y, index_t x) const { + return OP::Map(lhs_.Eval(y, x), rhs_.Eval(y, x)); + } + + private: + SSEPlan lhs_; + SSEPlan rhs_; +}; + +template +class SSEPlan, DType> { + public: + SSEPlan(const SSEPlan &src) : src_(src) {} + MSHADOW_CINLINE sse2::FVec EvalSSE(index_t y, index_t x) const { + return sse2::SSEOp::Map(src_.EvalSSE(y, x)); + } + MSHADOW_CINLINE DType Eval(index_t y, index_t x) const { + return OP::Map(src_.Eval(y, x)); + } + + private: + SSEPlan src_; +}; + +template +inline SSEPlan, DType> +MakeSSEPlan(const BinaryMapExp &e); + +template +inline SSEPlan, DType> MakeSSEPlan(const ScalarExp &e) { + return SSEPlan, DType>(e.scalar_); +} +template +inline SSEPlan MakeSSEPlan(const RValueExp &e) { + return SSEPlan(e.self()); +} +template +inline SSEPlan +MakeSSEPlan(const MakeTensorExp &e) { + return SSEPlan(e.real_self()); +} +template +inline SSEPlan, DType> +MakeSSEPlan(const UnaryMapExp &e) { + return SSEPlan, DType>(MakeSSEPlan(e.src_)); +} +template +inline SSEPlan, DType> +MakeSSEPlan(const BinaryMapExp &e) { + return SSEPlan, + DType>(MakeSSEPlan(e.lhs_), MakeSSEPlan(e.rhs_)); +} +/*! 
+ * \brief static check sse enable + * if a expression E can not be evaluated using sse, then kPass = false + * \tparam Device the type of Device + * \tparam dim dimension of the tensor + * \tparam E expression + */ +template +struct SSECheck{ + static const bool kPass = false; +}; +template +struct SSECheck > { + static const bool kPass = sse2::FVec::kEnabled; +}; +template +struct SSECheck > { + static const bool kPass = sse2::FVec::kEnabled; +}; +template +struct SSECheck > { + static const bool kPass = SSECheck::kPass && sse2::SSEOp::kEnabled; +}; +template +struct SSECheck< BinaryMapExp > { + static const bool kPass = SSECheck::kPass && + SSECheck::kPass && sse2::SSEOp::kEnabled; +}; +//------------------------------------------------- +// Check if data is aligned and allow sse operation +//------------------------------------------------- +template +struct SSEAlignCheck { + inline static bool Check(const E &exp) { + return false; + } +}; +template +struct SSEAlignCheck > { + inline static bool Check(const ScalarExp &exp) { + return true; + } +}; +template +struct SSEAlignCheck > { + inline static bool Check(const Tensor &t) { + return sse2::CheckAlign(t.dptr_) && + sse2::CheckAlign(t.stride_ * sizeof(DType)); + } +}; +template +struct SSEAlignCheck > { + inline static bool Check(const UnaryMapExp &t) { + return SSEAlignCheck::Check(t.src_); + } +}; +template +struct SSEAlignCheck > { + inline static bool Check(const BinaryMapExp &t) { + return SSEAlignCheck::Check(t.lhs_) && + SSEAlignCheck::Check(t.rhs_); + } +}; +/*! + * \brief use SSEPlan to compute result + */ +template +inline void MapSSEPlan(Tensor _dst, + const expr::SSEPlan &plan) { + Tensor dst = _dst.FlatTo2D(); + const index_t xlen = sse2::LowerAlign(dst.size(1), sizeof(DType)); + for (index_t y = 0; y < dst.size(0); ++y) { + for (index_t x = 0; x < xlen; x += sse2::FVec::kSize) { + sse2::Saver::Save(&dst[y][x], plan.EvalSSE(y, x)); + } + for (index_t x = xlen; x < dst.size(1); ++x) { + SV::Save(dst[y][x], plan.Eval(y, x)); + } + } +} +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_USE_SSE +#endif // MSHADOW_SSE_INL_H_ diff --git a/mshadow/stream_gpu-inl.h b/mshadow/stream_gpu-inl.h new file mode 100644 index 000000000000..1cd2e971fc52 --- /dev/null +++ b/mshadow/stream_gpu-inl.h @@ -0,0 +1,70 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file stream_gpu-inl.h + * \brief implementation of GPU code + * \author Bing Xu, Tianqi Chen + */ +#ifndef MSHADOW_STREAM_GPU_INL_H_ +#define MSHADOW_STREAM_GPU_INL_H_ +#include "./base.h" +#include "./tensor.h" +#include "./utils.h" + +namespace mshadow { +#if MSHADOW_USE_CUDA==1 +// Stream alocation +// actual implementation of GPU stream in CUDA +template<> +struct Stream { + /*! \brief cudaStream */ + cudaStream_t stream_; + Stream(void) : stream_(0) {} + /*! + * \brief wait for all the computation associated + * with this stream to complete + */ + inline void Wait(void) { + cudaError_t err = cudaStreamSynchronize(stream_); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + } + /*! + * \brief query whether the the stream is idle + * \return true if the stream is idle and all the job have been completed + */ + inline bool CheckIdle(void) { + cudaError_t err = cudaStreamQuery(stream_); + if (err == cudaSuccess) return true; + if (err == cudaErrorNotReady) return false; + utils::Error(cudaGetErrorString(err)); + return false; + } + /*! 
+ * \brief returns actual cudaStream_t given an input GPU stream pointer + * \param stream pointer to GPU stream + */ + inline static cudaStream_t GetStream(Stream *stream) { + if (stream == NULL) { +#if MSHADOW_FORCE_STREAM + utils::Error("Default GPU stream was used when MSHADOW_FORCE_STREAM was on"); +#endif + return 0; + } + else return stream->stream_; + } +}; +template<> +inline Stream *NewStream(void) { + Stream *st = new Stream(); + cudaError_t err = cudaStreamCreate(&st->stream_); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + return st; +} +template<> +inline void DeleteStream(Stream *stream) { + cudaError_t err = cudaStreamDestroy(stream->stream_); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + delete stream; +} +#endif +} +#endif // MSHADOW_STREAM_GPU_INL_H_ diff --git a/mshadow/tensor.h b/mshadow/tensor.h index d3979b7751a8..773094dd4637 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -1,485 +1,649 @@ -#ifndef MSHADOW_TENSOR_H -#define MSHADOW_TENSOR_H /*! + * Copyright (c) 2014 by Contributors * \file tensor.h * \brief header file of tensor data structure and functions - * covention: this lib requires explicit memory allocation and de-allocation - * all the data structure Tensor, Tensor are like handles(pointers), - * no memory allocation is happening during calculation + * This lib requires explicit memory allocation and de-allocation + * all the data structure Tensor, Tensor are like handles(pointers), + * no memory allocation is happening during calculation + * + * For STL style tensor, see tensor_container.h * \author Bing Xu, Tianqi Chen */ -#include "tensor_base.h" -#include "tensor_expr.h" +#ifndef MSHADOW_TENSOR_H_ +#define MSHADOW_TENSOR_H_ +#include "./base.h" +#include "./expression.h" namespace mshadow { - /*! - * \brief shape of a tensor - * IMPORTANT NOTE: this shape is different from numpy.shape - * shape[0] gives the lowest dimension, shape[dimension-1] gives the highest dimension - * shape[k] corresponds to k-th dimension of tensor - * \tparam dimension dimension of tensor - */ - template - struct Shape { - public: - /*! \brief maximum dimension of tensor */ - const static int kMaxShape = dimension; - /*! \brief maximum dimension minus 1 */ - const static int kSubShape = dimension - 1; - public: - /*! \brief default constructor, do nothing */ - MSHADOW_XINLINE Shape(void) {} - /*! \brief constuctor */ - MSHADOW_XINLINE Shape( const Shape &s ){ - #pragma unroll - for( int i = 0; i < kMaxShape; ++i ){ - this->shape_[i] = s[i]; - } - this->stride_ = s.stride_; - } - /*! - * \brief get corresponding index - * \param idx dimension index - * \return the corresponding dimension size - */ - MSHADOW_XINLINE index_t& operator[](index_t idx) { - return shape_[ idx ]; - } - /*! - * \brief get corresponding index - * \param idx dimension index - * \return the corresponding dimension size - */ - MSHADOW_XINLINE const index_t& operator[](index_t idx) const { - return shape_[ idx ]; - } - /*! \return whether two shape equals */ - MSHADOW_XINLINE bool operator==(const Shape &s) const { - #pragma unroll - for ( int i = 0; i < kMaxShape; ++i ) { - if (s.shape_[i] != this->shape_[i]) return false; - } - return true; - } - /*! 
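
A sketch of explicit stream management with the specialization above; note that with MSHADOW_FORCE_STREAM defined, touching a gpu tensor that has no stream attached raises the error shown in GetStream. A CUDA build is assumed, after InitTensorEngine<gpu>() and with `using namespace mshadow;`.

```
Stream<gpu> *s = NewStream<gpu>();
Tensor<gpu, 2, float> A(Shape2(128, 64));
A.set_stream(s);          // kernels generated for A are queued on s
AllocSpace(&A);
A = 1.0f;                 // asynchronous with respect to the host
if (!s->CheckIdle()) {
  s->Wait();              // block until the queued work completes
}
FreeSpace(&A);
DeleteStream(s);
```
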
- * flatten the higher dimension to second dimension, return a 2D shape - * \return the flat 2d shape - */ - MSHADOW_XINLINE Shape<2> FlatTo2D(void) const { - Shape<2> s; - s.stride_ = this->stride_; - s.shape_[ 0 ] = this->shape_[ 0 ]; - index_t ymax = 1; - - #pragma unroll - for (int i = 1; i < kMaxShape; ++i) { - ymax *= this->shape_[ i ]; - } - s.shape_[1] = ymax; - return s; - } - /*! \return number of valid elements */ - MSHADOW_XINLINE size_t Size(void) const{ - size_t memsz = this->shape_[ 0 ]; - #pragma unroll - for (int i = 1; i < kMaxShape; ++i) { - memsz *= this->shape_[ i ]; - } - return memsz; - } - /*! \return memory size, including the aligned x dimension */ - MSHADOW_XINLINE size_t MSize(void) const { - size_t memsz = this->stride_; - #pragma unroll - for (int i = 1; i < kMaxShape; ++i) { - memsz *= this->shape_[ i ]; - } - return memsz; - } - /*! - * \return product shape in [dimstart,dimend) - * \param dimstart start dimension - * \param dimend end dimension - */ - MSHADOW_XINLINE index_t ProdShape( int dimstart, int dimend ) const{ - index_t num = 1; - #pragma unroll - for (int i = dimstart; i < dimend; ++i) { - num *= this->shape_[ i ]; - } - return num; - } - /*! - * \brief get subshape - * \return subshape - */ - MSHADOW_XINLINE Shape SubShape(void) const { - Shape s; - s.stride_ = this->stride_; - // for cuda - #pragma unroll - for (int i = 0; i < kSubShape; ++i) { - s.shape_[ i ] = this->shape_[ i ]; - } - return s; - } - - public: - /*! \brief storing the dimension information */ - index_t shape_[ kMaxShape ]; - /*! - * \brief storing the stride information in x dimension - * this is used to deal with pitch allocation in gpu or sse(align x dimension to 64bit) for efficiency - */ - index_t stride_; - }; - // useful construction functions to generate shape - /*! - * \brief construct a one dimension shape, stride will equal s0 - * \param s0 size of dimension 0 - * \return the shape construction - */ - MSHADOW_XINLINE Shape<1> Shape1( index_t s0 ){ - Shape<1> s; s[0] = s0; s.stride_ = s0; - return s; +/*! \brief device name CPU */ +struct cpu { + /*! \brief whether this device is CPU or not */ + static const bool kDevCPU = true; + /*! \brief device flag number, identifies this device */ + static const int kDevMask = 1 << 0; +}; +/*! \brief device name CPU */ +struct gpu { + /*! \brief whether this device is CPU or not */ + static const bool kDevCPU = false; + /*! \brief device flag number, identifies this device */ + static const int kDevMask = 1 << 1; +}; +/*! + * \brief shape of a tensor + * IMPORTANT NOTE: this shape is different from numpy.shape + * shape[0] gives the lowest dimension, shape[dimension-1] gives the highest dimension + * shape[k] corresponds to k-th dimension of tensor + * \tparam dimension dimension of tensor + */ +template +struct Shape { + /*! \brief dimension of current shape */ + static const int kDimension = dimension; + /*! \brief dimension of current shape minus one */ + static const int kSubdim = dimension - 1; + /*! \brief storing the dimension information */ + index_t shape_[kDimension]; + /*! \brief default constructor, do nothing */ + MSHADOW_XINLINE Shape(void) {} + /*! \brief constuctor */ + MSHADOW_XINLINE Shape(const Shape &s) { + #pragma unroll + for (int i = 0; i < kDimension; ++i) { + this->shape_[i] = s[i]; } - /*! 
- * \brief construct a two dimension shape, stride will equal s0 - * \param s1 size of dimension 1 - * \param s0 size of dimension 0 - * \return the shape construction - */ - MSHADOW_XINLINE Shape<2> Shape2( index_t s1, index_t s0 ){ - Shape<2> s; s[0] = s0; s[1] = s1; s.stride_ = s0; - return s; + } + /*! + * \brief get corresponding index + * \param idx dimension index + * \return the corresponding dimension size + */ + MSHADOW_XINLINE index_t &operator[](index_t idx) { + return shape_[idx]; + } + /*! + * \brief get corresponding index + * \param idx dimension index + * \return the corresponding dimension size + */ + MSHADOW_XINLINE const index_t &operator[](index_t idx) const { + return shape_[idx]; + } + /*! + * \return whether two shape equals + * \param s the shape to compare against + */ + MSHADOW_XINLINE bool operator==(const Shape &s) const { + #pragma unroll + for (int i = 0; i < kDimension; ++i) { + if (s.shape_[i] != this->shape_[i]) return false; } - /*! - * \brief construct a three dimension shape, stride will equal s0 - * \param s2 size of dimension 2 - * \param s1 size of dimension 1 - * \param s0 size of dimension 0 - * \return the shape construction - */ - MSHADOW_XINLINE Shape<3> Shape3( index_t s2, index_t s1, index_t s0 ){ - Shape<3> s; - s[0] = s0; s[1] = s1; s[2] = s2; s.stride_ = s0; - return s; + return true; + } + /*! + * flatten the higher dimension to second dimension, return a 2D shape + * \return the flat 2d shape + */ + MSHADOW_XINLINE Shape<2> FlatTo2D(void) const { + Shape<2> s; + s.shape_[1] = this->shape_[kDimension - 1]; + index_t ymax = 1; + #pragma unroll + for (int i = 0; i < kDimension - 1; ++i) { + ymax *= this->shape_[i]; } - /*! - * \brief construct a four dimension shape, stride will equal s0 - * \param s3 size of dimension 3 - * \param s2 size of dimension 2 - * \param s1 size of dimension 1 - * \param s0 size of dimension 0 - * \return the shape construction - */ - MSHADOW_XINLINE Shape<4> Shape4( index_t s3, index_t s2, index_t s1, index_t s0 ){ - Shape<4> s; - s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; s.stride_ = s0; - return s; + s.shape_[0] = ymax; + return s; + } + /*! \return number of valid elements */ + MSHADOW_XINLINE size_t Size(void) const { + size_t size = this->shape_[0]; + #pragma unroll + for (int i = 1; i < kDimension; ++i) { + size *= this->shape_[i]; } -}; // namespace mshadow - -namespace mshadow { - /*! \brief device name CPU */ - struct cpu { - /*! \brief whether this device is CPU or not */ - const static bool kDevCPU = true; - /*! \brief device flag number, identifies this device */ - const static int kDevMask = 1<<0; - }; - /*! \brief device name CPU */ - struct gpu { - /*! \brief whether this device is CPU or not */ - const static bool kDevCPU = false; - /*! \brief device flag number, identifies this device */ - const static int kDevMask = 1<<1; - }; - - // more compact template - /*! - * \brief general tensor - * \tparam Device which device the tensor is on - * \tparam dimension dimension of the tensor - */ - template - struct Tensor: public expr::ContainerExp< Tensor >{ - public: - /*! \brief whether current type lies in cpu */ - const static bool kDevCPU = Device::kDevCPU; - /*! \brief dimension of subtype */ - const static int kSubdim = dimension - 1; - - public: - /*! \brief pointer to the data */ - real_t *dptr; - /*! \brief shape of the tensor */ - Shape shape; - public: - /*! \brief default constructor */ - MSHADOW_XINLINE Tensor(void) {} - /*! 
\brief constructor from shape */ - MSHADOW_XINLINE Tensor(const Shape &shape): shape(shape) {} - /*! \brief constructor from data pointer and shape */ - MSHADOW_XINLINE Tensor(real_t *dptr, const Shape &shape): dptr((real_t*)dptr), shape(shape) {} - /*! - * \brief return size of i-th dimension, start counting from highest dimension - * This meets the habit of normal usage of size of matrix. Note that mat.shape[0] gives lowest dimension, - * while mat.size(0) returns the highest dimension - * \param the dimension count from the highest dimensin - * \return the size - */ - MSHADOW_XINLINE index_t size(index_t i) const { - return shape[dimension - 1 - i]; - } - /*! - * \brief flatten the tensor to 2 dimension, collapse the higher dimensions together - * \return tensor after flatten - */ - MSHADOW_XINLINE Tensor FlatTo2D(void) const { - return Tensor(reinterpret_cast \ - (dptr), shape.FlatTo2D()); - } - /*! - * \brief get a element of dimension - 1 - * \param idx index - * \return the result tensor - */ - MSHADOW_XINLINE Tensor operator[](index_t idx) const { - Shape s = shape.SubShape(); - return Tensor(reinterpret_cast \ - (dptr) + s.MSize() * idx, s); - } - /*! - * \brief slice the tensor in highest dimension [begin,end) - * \param begin begin position of slice - * \param end end position of slice - * \return tensor after slice - */ - MSHADOW_XINLINE Tensor Slice(index_t begin, index_t end) const { - Shape s = this->shape; - s[ dimension - 1 ] = end - begin; - return Tensor(reinterpret_cast\ - (dptr) + s.SubShape().MSize() * begin, s); - } - public: - /*!\brief functions to fit expression template */ - inline Tensor& operator=( real_t s ){ - return this->__assign( s ); - } - /*!\brief functions to fit expression template */ - template - inline Tensor& operator=( const expr::Exp &exp ){ - return this->__assign( exp ); - } - /*!\brief functions to fit expression template */ - template - inline Tensor& operator=( const expr::Exp &exp ){ - return this->__assign( exp ); - } - }; - - /* - * respecialized class Tensor1D,thei is due to different implementation in operator[] - */ - template - struct Tensor: public expr::ContainerExp< Tensor >{ - public: - real_t *dptr; - Shape<1> shape; - public: - MSHADOW_XINLINE Tensor(void) {} - MSHADOW_XINLINE Tensor(const Shape<1> &shape): shape(shape) {} - MSHADOW_XINLINE Tensor(real_t *dptr, Shape<1> shape) :dptr(dptr), shape(shape) {} - - MSHADOW_XINLINE Tensor FlatTo2D(void) const { - return Tensor(reinterpret_cast \ - (dptr), shape.FlatTo2D()); - } - MSHADOW_XINLINE Tensor Slice(index_t begin, index_t end) const { - Shape<1> s; - s[0] = s.stride_ = end - begin; - return Tensor(reinterpret_cast \ - (dptr) + begin, s); - } - MSHADOW_XINLINE index_t size(index_t i) const { - return shape[0]; - } - MSHADOW_XINLINE real_t &operator[](index_t idx) { return dptr[ idx ]; } - MSHADOW_XINLINE const real_t &operator[](index_t idx)const { return dptr[ idx ]; } - public: - // functions to fit expression template - inline Tensor& operator=( double s ){ - return this->__assign( s ); - } - template - inline Tensor& operator=( const expr::Exp &exp ){ - return this->__assign( exp ); - } - template - inline Tensor& operator=( const expr::Exp &exp ){ - return this->__assign( exp ); - } - }; -}; // namespace mshadow - -// add unroll loops for the shape -namespace mshadow { - // function declarations - /*! 
- * \brief initialize tensor engine, used to call intialization functions of dependent libs - * this function should be called before all GPU tensor operations, - * for using tensors in CPU, this call is actually not needed - * \param device_id GPU device id to be choosed - */ - inline void InitTensorEngine( int device_id=0 ); - /*! - * \brief Shutdown tensor engine, - * this function should be called after all GPU tensor operations, - * for using tensors in CPU, this call is actually not needed - */ - inline void ShutdownTensorEngine( void ); - - /*! - * \brief CPU/CPU: allocate space for CTensor, according to the shape in the obj - * this function is responsible to set the stride_ in each obj.shape - * \tparam dim specify the dim of tensor - * \param obj the tensor object, with shape specified - * \param pad whether padding dimension 0, to make last dimension aligned, - * padding may help improve efficiency of matrix multiplications - * if true, will allocate space with stride_ that may not equals shape[0] - * if false, will allocate continuous space - */ - template - inline void AllocSpace(Tensor &obj, bool pad = MSHADOW_ALLOC_PAD); - /*! \brief refer to comment of cpu ver \sa AllocSpace */ - template - inline void AllocSpace(Tensor &obj, bool pad = MSHADOW_ALLOC_PAD); - - /*! - * \brief CPU/GPU: free the space of tensor, will set obj.dptr to NULL - * \tparam dim specify the dim of tensor - * \param obj the tensor object - */ - template - inline void FreeSpace(Tensor &obj); - /*! \brief refer to comment of cpu ver \sa FreeSpace */ - template - inline void FreeSpace(Tensor &obj); - - /*! - * \brief CPU/GPU: short cut to allocate and initialize a Tensor - * \tparam Device device of tensor - * \tparam dim dimention of tensor - * \param shape: shape of tensor - * \param initv: initialization value - * \param pad : padding option - * \sa AllocSpace - */ - template - inline Tensor NewTensor(const Shape &shape, real_t initv, bool pad = MSHADOW_ALLOC_PAD); - - /*! - * \brief copy data from one tensor to another, with same shape - * \tparam dim specify the dim of tensor - * \param dst target tensor - * \param src source tensor - */ - template - inline void Copy(Tensor dst, const Tensor &src ); - /*! \brief refer to comment of cpu ver \sa Copy */ - template - inline void Copy(Tensor dst, const Tensor &src ); - /*! \brief refer to comment of cpu ver \sa Copy */ - template - inline void Copy(Tensor dst, const Tensor &src ); - /*! \brief refer to comment of cpu ver \sa Copy */ - template - inline void Copy(Tensor dst, const Tensor &src ); - - - /*! - * \brief CPU/GPU: normalize softmax: dst[i][j] = exp( energy[i][j] ) /( sum_j exp( energy[i][j] ) ) - * \param dst destination - * \param energy input energy - */ - inline void Softmax( Tensor dst, const Tensor &energy ); - /*! \brief refer to comment of cpu ver \sa Softmax */ - inline void Softmax( Tensor dst, const Tensor &energy ); - -}; // namespace mshadow - - -namespace mshadow{ - // function declarations to support expression, no need to understand them - // these functions do not need to be directly used - - /*! 
- * \brief CPU/GPU: map a expression to a tensor, this function calls MapPlan - * \tparam Saver specify storage method - * \tparam dim dim of the tensor, during usage, there is no need to specify this parameter - * \tparam E specifies the expression type, not need to specify this parameter during usage - * \tparam etype expression type - * \param dst destination - * \param exp expression - * \sa namespace mshadow:sv, mshadow::op, mshadow::expr - */ - template - inline void MapExp(Tensor dst, const expr::Exp &exp ); - /*! \brief refer to comment of cpu ver \sa MapExp */ - template - inline void MapExp(Tensor dst, const expr::Exp &exp ); - - /*! - * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in lowest dimension (dimension 0) - * \tparam Saver specify storage method - * \tparam Reducer specify a reducer method - * \tparam E specifies the expression type, not need to specify this parameter during usage - * \tparam etype expression type - * \param dst destination - * \param exp expression - * \param scale scale the result before save - * \sa namespace mshadow:sv, mshadow::op, mshadow::red, mshadow::expr - */ - template - inline void MapReduceKeepLowest( Tensor dst, const expr::Exp &exp, real_t scale = 1.0f ); - /*! \brief refer to comment of cpu ver \sa MapReduceKeepLowest */ - template - inline void MapReduceKeepLowest( Tensor dst, const expr::Exp &exp, real_t scale = 1.0f ); - - - /*! - * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in third dimension (dimension 2) - * \tparam Saver specify storage method - * \tparam Reducer specify a reducer method - * \tparam E specifies the expression type, not need to specify this parameter during usage - * \tparam dimkeep the target dimension to be kept, should be larger than 0, for 0, use MapReduceKeepLowest - * \tparam etype expression type - * \param dst destination - * \param exp expression - * \param scale scale the result before save - * \sa namespace mshadow:sv, mshadow::op, mshadow::red, mshadow::expr - */ - template - inline void MapReduceKeepHighDim( Tensor dst, const expr::Exp &exp, real_t scale = 1.0f ); - /*! \brief refer to comment of cpu ver \sa MapReduceKeepHighDim */ - template - inline void MapReduceKeepHighDim( Tensor dst, const expr::Exp &exp, real_t scale = 1.0f ); - -};// namespace mshadow - -// execution implementation of expression evaluations -#include "tensor_expr_engine-inl.hpp" -// cpu implementation of functions -#include "tensor_cpu-inl.hpp" -// gpu implementation of functions -#include "tensor_gpu-inl.hpp" -// extension of expressions -#include "tensor_expr_ext.h" -// io -#include "tensor_io.h" -// container -#include "tensor_container.h" -// random number generator -#include "tensor_random.h" -#endif // TENSOR_H + return size; + } + /*! + * \return product shape in [dimstart,dimend) + * \param dimstart start dimension + * \param dimend end dimension + */ + MSHADOW_XINLINE index_t ProdShape(int dimstart, int dimend) const { + index_t num = 1; + #pragma unroll + for (int i = dimstart; i < dimend; ++i) { + num *= this->shape_[i]; + } + return num; + } + /*! + * \brief get subshape that takes off largest dimension +v * \return subshape + */ + MSHADOW_XINLINE Shape SubShape(void) const { + Shape s; + // for cuda + #pragma unroll + for (int i = 0; i < kSubdim; ++i) { + s.shape_[i] = this->shape_[i + 1]; + } + return s; + } + /*! 
+ * \brief slice the shape from start to end + * \tparam dimstart start dimension + * \tparam dimend end dimension + * \return the sliced shape + */ + template + MSHADOW_XINLINE Shape Slice(void) const { + Shape s; + #pragma unroll + for (int i = dimstart; i < dimend; ++i) { + s[i - dimstart] = this->shape_[i]; + } + return s; + } +}; // Shape +//------------------------------------------------ +// useful construction functions to generate shape +//------------------------------------------------- +/*! + * \brief construct a one dimension shape, stride will equal s0 + * \param s0 size of dimension 0 + * \return the shape construction + */ +MSHADOW_XINLINE Shape<1> Shape1(index_t s0) { + Shape<1> s; s[0] = s0; + return s; +} +/*! + * \brief construct a two dimension shape, stride will equal s0 + * \param s0 size of dimension 0 + * \param s1 size of dimension 1 + * \return the shape construction + */ +MSHADOW_XINLINE Shape<2> Shape2(index_t s0, index_t s1) { + Shape<2> s; s[0] = s0; s[1] = s1; + return s; +} +/*! + * \brief construct a three dimension shape, stride will equal s0 + * \param s0 size of dimension 0 + * \param s1 size of dimension 1 + * \param s2 size of dimension 2 + * \return the shape construction + */ +MSHADOW_XINLINE Shape<3> Shape3(index_t s0, index_t s1, index_t s2) { + Shape<3> s; + s[0] = s0; s[1] = s1; s[2] = s2; + return s; +} +/*! + * \brief construct a four dimension shape, stride will equal s0 + * \param s0 size of dimension 0 + * \param s1 size of dimension 1 + * \param s2 size of dimension 2 + * \param s3 size of dimension 3 + * \return the shape construction + */ +MSHADOW_XINLINE Shape<4> Shape4(index_t s0, index_t s1, + index_t s2, index_t s3) { + Shape<4> s; + s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; + return s; +} +/*! + * \brief computaion stream structure, used for asynchronize computation + */ +template +struct Stream { + // this is only a dummy implementation for CPU + // for GPU, the actual implementation will be specialized in tensor_gpu-inl.h + /*! + * \brief wait for all the computation associated + * with this stream to complete + */ + inline void Wait(void) {} + /*! + * \brief query whether the the stream is idle + * \return true if the stream is idle and all the job have been completed + */ + inline bool CheckIdle(void) { + return true; + } +}; +/*! + * \brief Tensor RValue, this is the super type of all kinds of possible tensors + * \tparam Container the tensor type + * \tparam Device which device the tensor is on + * \tparam dimension dimension of the tensor + * \tparam DType the type of elements in the tensor + */ +template +struct TRValue: public expr::RValueExp { +}; +// more compact template +/*! + * \brief general tensor + * \tparam Device which device the tensor is on + * \tparam dimension dimension of the tensor + * \tparam DType the type of elements in the tensor + */ +template +struct Tensor: public TRValue, + Device, dimension, DType> { + public: + //-------------------------------- + // struct memembers + //-------------------------------- + /*! \brief whether current type lies in cpu */ + static const bool kDevCPU = Device::kDevCPU; + /*! \brief dimension of subtype */ + static const int kSubdim = dimension - 1; + //-------------------------------- + // struct memembers + //-------------------------------- + /*! \brief pointer to the data */ + DType *dptr_; + /*! \brief shape of the tensor */ + Shape shape_; + /*! 
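
A few concrete shape manipulations matching the definitions above; the values are illustrative, `using namespace mshadow;` assumed.

```
Shape<2> s2 = Shape2(3, 5);       // s2[0] == 3, s2[1] == 5, s2.Size() == 15
Shape<3> s3 = Shape3(2, 3, 4);
Shape<2> flat = s3.FlatTo2D();    // collapses leading dims: flat[0] == 6, flat[1] == 4
Shape<2> sub = s3.SubShape();     // drops s3[0]: sub[0] == 3, sub[1] == 4
```
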
+ * \brief storing the stride information in x dimension + * this is used to deal with pitch allocation in gpu or sse(align x dimension to 64bit) for efficiency + */ + index_t stride_; + /*! + * \brief stream where the computation lies + * stream is a device dependency concept where each computation + */ + Stream *stream_; + //-------------------------------- + // functions + //-------------------------------- + /*! \brief default constructor */ + MSHADOW_XINLINE Tensor(void) : stream_(NULL) {} + /*! \brief constructor from shape */ + MSHADOW_XINLINE Tensor(const Shape &shape) + : shape_(shape), stream_(NULL) {} + /*! \brief constructor from data pointer and shape, without stride */ + MSHADOW_XINLINE Tensor(DType *dptr, const Shape &shape) + : dptr_(dptr), shape_(shape), stride_(shape[kSubdim]), stream_(NULL) {} + /*! \brief constructor from data pointer and shape */ + MSHADOW_XINLINE Tensor(DType *dptr, + const Shape &shape, + index_t stride, Stream *stream) + : dptr_(dptr), shape_(shape), stride_(stride), stream_(stream) {} + /*! + * \brief set the stream to do computation of current tensor + * \param stream the computation stream + */ + inline void set_stream(Stream *stream) { + this->stream_ = stream; + } + /*! + * \return memory cost of the tensor, including the aligned x dimension + * \tparam startdim the starting dimension + */ + template + MSHADOW_XINLINE size_t MemSize(void) const { + size_t memsz = this->stride_; + #pragma unroll + for (int i = startdim; i < kSubdim; ++i) { + memsz *= this->shape_[i]; + } + return memsz; + } + /*! + * \return whether the tensor's memory is continuous + * x dimension same as stride + */ + MSHADOW_XINLINE bool CheckContiguous(void) const { + return this->shape_[dimension - 1] == stride_; + } + /*! + * \return memory cost of the tensor, including the aligned x dimension + */ + MSHADOW_XINLINE size_t MSize(void) const { + return this->MemSize<0>(); + } + /*! + * \brief return size of i-th dimension, start counting from highest dimension + * \param idx the dimension count from the highest dimensin + * \return the size + */ + MSHADOW_XINLINE index_t size(index_t idx) const { + return shape_[idx]; + } + /*! + * \brief flatten the tensor to 2 dimension, collapse the higher dimensions together + * \return tensor after flatten + */ + MSHADOW_XINLINE Tensor FlatTo2D(void) const { + return Tensor(dptr_, shape_.FlatTo2D(), stride_, stream_); + } + /*! + * \brief get a element of dimension - 1 + * \param idx index + * \return the result tensor + */ + MSHADOW_XINLINE Tensor operator[](index_t idx) const { + return Tensor(dptr_ + this->MemSize<1>() * idx, + shape_.SubShape(), stride_, stream_); + } + /*! 
+ * \brief slice the tensor in highest dimension [begin,end) + * \param begin begin position of slice + * \param end end position of slice + * \return tensor after slice + */ + MSHADOW_XINLINE Tensor + Slice(index_t begin, index_t end) const { + Shape s = this->shape_; + s[0] = end - begin; + return Tensor(dptr_ + this->MemSize<1>() * begin, + s, stride_, stream_); + } + /*!\brief implement the assignment of same type */ + inline Tensor & + operator=(const Tensor &exp) { + dptr_ = exp.dptr_; + shape_ = exp.shape_; + stride_ = exp.stride_; + stream_ = exp.stream_; + return *this; + } + /*!\brief functions to fit expression template */ + template + inline Tensor & + operator=(const expr::Exp &exp) { + return this->__assign(exp); + } + /*!\brief functions to fit expression template */ + inline Tensor &operator=(const DType &exp) { + return this->__assign(exp); + } +}; +/* + * respecialized class Tensor1D, thei is due to different implementation in operator[] + */ +template +struct Tensor: + public TRValue, Device, 1, DType> { + public: + DType *dptr_; + Shape<1> shape_; + index_t stride_; + Stream *stream_; + // constructor + MSHADOW_XINLINE Tensor(void) : stream_(NULL) {} + MSHADOW_XINLINE Tensor(const Shape<1> &shape) + : shape_(shape), stream_(NULL) {} + MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape) + : dptr_(dptr), shape_(shape), stride_(shape[0]), stream_(NULL) {} + MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape, + index_t stride, Stream *stream) + : dptr_(dptr), shape_(shape), stride_(stride), stream_(stream) {} + inline void set_stream(Stream *stream) { + this->stream_ = stream; + } + MSHADOW_XINLINE Tensor FlatTo2D(void) const { + return Tensor(dptr_, shape_.FlatTo2D(), stride_, stream_); + } + MSHADOW_XINLINE Tensor Slice(index_t begin, index_t end) const { + Shape<1> s; + s[0] = end - begin; + return Tensor(dptr_ + begin, s, s[0], stream_); + } + MSHADOW_XINLINE bool CheckContiguous(void) const { + return true; + } + MSHADOW_XINLINE size_t MSize(void) const { + return shape_[0]; + } + MSHADOW_XINLINE index_t size(index_t i) const { + return shape_[0]; + } + MSHADOW_XINLINE DType &operator[](index_t idx) { + return dptr_[idx]; + } + MSHADOW_XINLINE const DType &operator[](index_t idx) const { + return dptr_[idx]; + } + /*!\brief implement the assignment of same type */ + inline Tensor & + operator=(const Tensor &exp) { + dptr_ = exp.dptr_; + shape_ = exp.shape_; + stride_ = exp.stride_; + stream_ = exp.stream_; + return *this; + } + template + inline Tensor & + operator=(const expr::Exp &exp) { + return this->__assign(exp); + } + inline Tensor &operator=(const DType &exp) { + return this->__assign(exp); + } +}; +//------------------------ +// Function Declarations +//----------------------- +/*! + * \brief initialize tensor engine, used to call intialization functions of dependent libs + * this function should be called before all GPU tensor operations, + * for using tensors in CPU, this call is actually not needed + * \param device_id GPU device id to be choosed + * \tparam Device the device type + */ +template +inline void InitTensorEngine(int device_id = 0); +/*! + * \brief Shutdown tensor engine on current device + * this function should be called after all GPU tensor operations, + * for using tensors in CPU, this call is actually not needed + * \tparam Device the device type + */ +template +inline void ShutdownTensorEngine(void); +/*! 
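
A sketch of indexing, slicing and flattening with the Tensor types above; shapes are illustrative, `using namespace mshadow;` assumed.

```
Tensor<cpu, 3, float> t = NewTensor<cpu, float>(Shape3(4, 3, 5), 0.0f);
Tensor<cpu, 2, float> page = t[2];           // drops the leading dimension: 3 x 5 view
page[1][4] = 2.5f;                           // element access via the 1D specialization
Tensor<cpu, 3, float> part = t.Slice(1, 3);  // views t[1] and t[2]: 2 x 3 x 5
Tensor<cpu, 2, float> flat = t.FlatTo2D();   // 12 x 5 view over the same memory
FreeSpace(&t);                               // frees the storage behind all the views
```
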
+ * \brief set the device of current thread to work on + * \param devid the device id + * \tparam Device the device type + */ +template +inline void SetDevice(int devid); +/*! + * \brief create a new stream from system + * \return a pointer to the created stream + * \tparam Device the device type + */ +template +inline Stream *NewStream(void); +/*! + * \brief delete the computing stream + * \param stream the stream parameter to be deleted + */ +template +inline void DeleteStream(Stream *stream); +/*! + * \brief CPU/CPU: allocate space for CTensor, according to the shape in the obj + * this function is responsible to set the stride_ in each obj.shape + * \param obj the tensor object, with shape specified + * \param pad whether padding dimension 0, to make last dimension aligned, + * padding may help improve efficiency of matrix multiplications + * if true, will allocate space with stride_ that may not equals shape[0] + * if false, will allocate continuous space + * \tparam dim specify the dim of tensor + * \tparam DType type of element in tensor + */ +template +inline void AllocSpace(Tensor *obj, + bool pad = MSHADOW_ALLOC_PAD); +/*! \brief refer to comment of cpu ver \sa AllocSpace */ +template +inline void AllocSpace(Tensor *obj, + bool pad = MSHADOW_ALLOC_PAD); +/*! + * \brief CPU/GPU: free the space of tensor, will set obj.dptr to NULL + * \param obj the tensor object + * \tparam dim specify the dim of tensor + * \tparam DType type of element in tensor + */ +template +inline void FreeSpace(Tensor *obj); +/*! \brief refer to comment of cpu ver \sa FreeSpace */ +template +inline void FreeSpace(Tensor *obj); +/*! + * \brief CPU/GPU: short cut to allocate and initialize a Tensor + * \param shape: shape of tensor + * \param initv: initialization value + * \param pad : padding option + * \tparam Device device of tensor + * \tparam DType type of element in tensor + * \tparam dim dimention of tensor + * \sa AllocSpace + */ +template +inline Tensor NewTensor(const Shape &shape, + DType initv, + bool pad = MSHADOW_ALLOC_PAD); +/*! + * \brief copy data from one tensor to another, with same shape + * \param dst target tensor + * \param src source tensor + * \param stream the stream, when specified, the copy can exhibit asynchronize behavior + * \tparam dim specify the dim of tensor + * \tparam DType type of element in tensor + */ +template +inline void Copy(Tensor dst, + const Tensor &src, + Stream *stream = NULL); +/*! \brief refer to comment of cpu ver \sa Copy */ +template +inline void Copy(Tensor dst, + const Tensor &src, + Stream *stream = NULL); +/*! \brief refer to comment of cpu ver \sa Copy */ +template +inline void Copy(Tensor dst, + const Tensor &src, + Stream *stream = NULL); +/*! \brief refer to comment of cpu ver \sa Copy */ +template +inline void Copy(Tensor dst, + const Tensor &src, + Stream *stream = NULL); +/*! + * \brief CPU/GPU: normalize softmax: dst[i][j] = exp(energy[i][j]) /(sum_j exp(energy[i][j])) + * \param dst destination + * \param energy input energy + */ +template +inline void Softmax(Tensor dst, const Tensor &energy); +/*! \brief refer to comment of cpu ver \sa Softmax */ +template +inline void Softmax(Tensor dst, const Tensor &energy); +// function declarations to support expression, no need to understand them +// these functions do not need to be directly used +/*! 
+ * \brief CPU/GPU: map a expression to a tensor, this function calls MapPlan + * \tparam Saver specify storage method + * \tparam R specifies the storage type of the tensor + * \tparam dim dim of the tensor, during usage, there is no need to specify this parameter + * \tparam DType the type of elements in the tensor + * \tparam E specifies the expression type, not need to specify this parameter during usage + * \tparam etype expression type + * \param dst destination + * \param exp expression + * \sa namespace mshadow:sv, mshadow::op, mshadow::expr + */ +template +inline void MapExp(TRValue *dst, + const expr::Exp &exp); +/*! \brief refer to comment of cpu ver \sa MapExp */ +template +inline void MapExp(TRValue *dst, + const expr::Exp &exp); +/*! + * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in lowest dimension (dimension 0) + * \tparam Saver specify storage method + * \tparam Reducer specify a reducer method + * \tparam R specifies the storage type of the tensor + * \tparam DType the type of elements in the tensor + * \tparam E specifies the expression type, not need to specify this parameter during usage + * \tparam etype expression type + * \param dst destination + * \param exp expression + * \param scale scale the result before save + * \sa namespace mshadow:sv, mshadow::op, mshadow::red, mshadow::expr + */ +template +inline void MapReduceKeepLowest(TRValue *dst, + const expr::Exp &exp, + DType scale = 1); +/*! \brief refer to comment of cpu ver \sa MapReduceKeepLowest */ +template +inline void MapReduceKeepLowest(TRValue *dst, + const expr::Exp &exp, + DType scale = 1); +/*! + * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in third dimension (dimension 2) + * \tparam Saver specify storage method + * \tparam Reducer specify a reducer method + * \tparam R specifies the storage type of the tensor + * \tparam DType the type of elements in the tensor + * \tparam dimkeep the target dimension to be kept, should be larger than 0, for 0, use MapReduceKeepLowest + * \tparam E specifies the expression type, not need to specify this parameter during usage + * \tparam etype expression type + * \param dst destination + * \param exp expression + * \param scale scale the result before save + * \sa namespace mshadow:sv, mshadow::op, mshadow::red, mshadow::expr + */ +template +inline void MapReduceKeepHighDim(TRValue *dst, + const expr::Exp &exp, + DType scale = 1); +/*! 
\brief refer to comment of cpu ver \sa MapReduceKeepHighDim */ +template +inline void MapReduceKeepHighDim(TRValue *dst, + const expr::Exp &exp, + DType scale = 1); +} // namespace mshadow +// include headers +#include "./stream_gpu-inl.h" +#include "./expr_engine-inl.h" +#include "./extension.h" +#include "./tensor_cpu-inl.h" +#include "./tensor_gpu-inl.h" +#include "./io.h" +#include "./tensor_container.h" +#include "./random.h" +// add definition of scalar related operators +#ifdef MSAHDOW_SCALAR_ + #error "MSHADOW_SCALAR_ must not be defined" +#endif +// enumerate all the scalar data type we aim to be good at +#define MSHADOW_SCALAR_ float +#include "./expr_scalar-inl.h" +#undef MSHADOW_SCALAR_ +#define MSHADOW_SCALAR_ double +#include "./expr_scalar-inl.h" +#undef MSHADOW_SCALAR_ +#define MSHADOW_SCALAR_ int +#include "./expr_scalar-inl.h" +#undef MSHADOW_SCALAR_ +#endif // MSHADOW_TENSOR_H_ diff --git a/mshadow/tensor_base.h b/mshadow/tensor_base.h deleted file mode 100644 index b251cbadf4fc..000000000000 --- a/mshadow/tensor_base.h +++ /dev/null @@ -1,298 +0,0 @@ -#ifndef MSHADOW_TENSOR_BASE_H -#define MSHADOW_TENSOR_BASE_H -/*! - * \file tensor_base.h - * \brief definitions of base types, macros functions - * - * \author Bing Xu, Tianqi Chen - */ -#include -#include -#include -#include -#include -// macro defintiions - -/*!\brief if this macro is define to be 1, mshadow should compile without any of other libs */ -#ifndef MSHADOW_STAND_ALONE - #define MSHADOW_STAND_ALONE 0 -#endif - -/*! \brief whether do padding during allocation */ -#ifndef MSHADOW_ALLOC_PAD - #define MSHADOW_ALLOC_PAD true -#endif - -/*! - * \brief x dimension of data must be bigger pad_size * ratio to be alloced padded memory, otherwise use tide allocation - * for example, if pad_ratio=2, GPU memory alignement size is 32, then we will only allocate padded memory if x dimension > 64 - * set it to 0 then we will always allocate padded memory - */ -#ifndef MSHADOW_MIN_PAD_RATIO - #define MSHADOW_MIN_PAD_RATIO 2 -#endif - -#if MSHADOW_STAND_ALONE - #define MSHADOW_USE_CBLAS 0 - #define MSHADOW_USE_MKL 0 - #define MSHADOW_USE_CUDA 0 -#endif - -/*! \brief use CBLAS for CBLAS */ -#ifndef MSHADOW_USE_CBLAS - #define MSHADOW_USE_CBLAS 0 -#endif -/*! \brief use MKL for BLAS */ -#ifndef MSHADOW_USE_MKL - #define MSHADOW_USE_MKL 1 -#endif -/*! \brief use CUDA support, must ensure that the cuda include path is correct, or directly compile using nvcc */ -#ifndef MSHADOW_USE_CUDA - #define MSHADOW_USE_CUDA 1 -#endif -/*! \brief use single precition float */ -#ifndef MSHADOW_SINGLE_PRECISION - #define MSHADOW_SINGLE_PRECISION 1 -#endif -/*! \brief whether use SSE */ -#ifndef MSHADOW_USE_SSE - #define MSHADOW_USE_SSE 1 -#endif -/*! \brief whether use NVML to get dynamic info */ -#ifndef MSHADOW_USE_NVML - #define MSHADOW_USE_NVML 0 -#endif -// SSE is conflict with cudacc -#ifdef __CUDACC__ - #undef MSHADOW_USE_SSE - #define MSHADOW_USE_SSE 0 -#endif - -#if MSHADOW_USE_CBLAS -extern "C"{ - #include -} -#elif MSHADOW_USE_MKL - #include - #include - #include - #include -#endif - -#if MSHADOW_USE_CUDA - #include - #include -#endif - -#if MSHADOW_USE_NVML - #include -#endif -// -------------------------------- -// MSHADOW_XINLINE is used for inlining template code for both CUDA and CPU code. 
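The declarations above in mshadow/tensor.h (InitTensorEngine, NewTensor, Copy, Softmax, MapExp and the MapReduce variants) are the public entry points of mshadow-2.x. A minimal CPU-only sketch of how they fit together follows; the template arguments are reconstructed here because the patch text lost its angle brackets, so treat the exact signatures as illustrative rather than authoritative.

```
// Minimal CPU-only usage sketch of the tensor.h API declared above.
// Template arguments are reconstructed, so check them against the header.
#include "mshadow/tensor.h"
using namespace mshadow;

int main(void) {
  InitTensorEngine<cpu>();
  // allocate two 2x3 float tensors, initialized to 0 and 1
  Tensor<cpu, 2, float> weight = NewTensor<cpu>(Shape2(2, 3), 0.0f);
  Tensor<cpu, 2, float> grad   = NewTensor<cpu>(Shape2(2, 3), 1.0f);
  const float eta = 0.1f, lambda = 0.01f;
  // element-wise expression: operator= dispatches to MapExp,
  // which evaluates the whole expression in one loop, no temporaries
  weight = -eta * (grad + lambda * weight);
  // copy into a freshly allocated tensor of the same shape
  Tensor<cpu, 2, float> snapshot = NewTensor<cpu>(Shape2(2, 3), 0.0f);
  Copy(snapshot, weight);
  FreeSpace(&weight);
  FreeSpace(&grad);
  FreeSpace(&snapshot);
  ShutdownTensorEngine<cpu>();
  return 0;
}
```

The same expression code is intended to compile for gpu tensors as well, with NewStream and set_stream supplying the computation stream declared above.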
-#ifdef MSHADOW_XINLINE - #error "MSHADOW_XINLINE must not be defined" -#endif -#ifdef __CUDACC__ - #define MSHADOW_XINLINE inline __attribute__((always_inline)) __device__ __host__ -#else - #define MSHADOW_XINLINE inline __attribute__((always_inline)) -#endif -/*! \brief cpu force inline */ -#define MSHADOW_CINLINE inline __attribute__((always_inline)) - -#if defined(__GXX_EXPERIMENTAL_CXX0X) || defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L - #define MSHADOW_CONSTEXPR constexpr -#else - #define MSHADOW_CONSTEXPR const -#endif - -/*! \brief namespace for mshadow */ -namespace mshadow { - /*! \brief buffer size for each random number generator */ - const unsigned kRandBufferSize = 1000000; - /*! \brief pi */ - const float kPi = 3.1415926f; - -#if MSHADOW_SINGLE_PRECISION - /*! \brief type that will be used for content */ - typedef float real_t; -#else - typedef double real_t; -#endif - /*! \brief type that will be used for index */ - typedef unsigned index_t; -}; // namespace mshadow - -namespace mshadow { - /*! \brief namespace for operators */ - namespace op { - // binary operator - /*! \brief mul operator */ - struct mul{ - /*! \brief map a, b to result using defined operation */ - MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { - return a * b; - } - }; - /*! \brief plus operator */ - struct plus { - /*! \brief map a, b to result using defined operation */ - MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { - return a + b; - } - }; - /*! \brief minus operator */ - struct minus { - /*! \brief map a, b to result using defined operation */ - MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { - return a - b; - } - }; - /*! \brief divide operator */ - struct div { - /*! \brief map a, b to result using defined operation */ - MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { - return a / b; - } - }; - /*! \brief get rhs */ - struct right { - /*! \brief map a, b to result using defined operation */ - MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { - return b; - } - }; - }; // namespace op - - /*! \brief namespace for savers */ - namespace sv { - /*! \brief save to saver: = */ - struct saveto { - /*! \brief save b to a using save method */ - MSHADOW_XINLINE static void Save(real_t& a, real_t b) { - a = b; - } - /*! \brief helper constant to use BLAS, alpha */ - MSHADOW_CONSTEXPR static real_t kAlphaBLAS = 1.0f; - /*! \brief helper constant to use BLAS, beta */ - MSHADOW_CONSTEXPR static real_t kBetaBLAS = 0.0f; - /*! \brief corresponding binary operator type */ - typedef op::right OPType; - }; - /*! \brief save to saver: += */ - struct plusto { - /*! \brief save b to a using save method */ - MSHADOW_XINLINE static void Save(real_t& a, real_t b) { - a += b; - } - /*! \brief helper constant to use BLAS, alpha */ - MSHADOW_CONSTEXPR static real_t kAlphaBLAS = 1.0f; - /*! \brief helper constant to use BLAS, beta */ - MSHADOW_CONSTEXPR static real_t kBetaBLAS = 1.0f; - /*! \brief corresponding binary operator type */ - typedef op::plus OPType; - }; - /*! \brief minus to saver: -= */ - struct minusto { - /*! \brief save b to a using save method */ - MSHADOW_XINLINE static void Save(real_t& a, real_t b) { - a -= b; - } - /*! \brief helper constant to use BLAS, alpha */ - MSHADOW_CONSTEXPR static real_t kAlphaBLAS = -1.0f; - /*! \brief helper constant to use BLAS, beta */ - MSHADOW_CONSTEXPR static real_t kBetaBLAS = 1.0f; - /*! \brief corresponding binary operator type */ - typedef op::minus OPType; - }; - /*! 
\brief multiply to saver: *= */ - struct multo { - /*! \brief save b to a using save method */ - MSHADOW_XINLINE static void Save(real_t& a, real_t b) { - a *= b; - } - /*! \brief corresponding binary operator type */ - typedef op::mul OPType; - }; - /*! \brief divide to saver: /= */ - struct divto { - /*! \brief save b to a using save method */ - MSHADOW_XINLINE static void Save(real_t& a, real_t b) { - a /= b; - } - /*! \brief corresponding binary operator type */ - typedef op::div OPType; - }; - }; // namespace sv - - - namespace op { - // unary operator/ function: example - // these operators can be defined by user, in the same style as binary and unary operator - // to use, simply write F( src ) - /*! \brief identity function that maps a real number to it self */ - struct identity{ - /*! \brief map a to result using defined operation */ - MSHADOW_XINLINE static real_t Map(real_t a) { - return a; - } - }; - }; // namespace op - - /*! \brief namespace for potential reducer operations */ - namespace red { - /*! \brief sum reducer */ - struct sum { - /*! \brief do reduction into dst */ - MSHADOW_XINLINE static void Reduce( volatile real_t& dst, volatile real_t src ) { - dst += src; - } - /*! \brief calculate gradient of redres with respect to redsrc, redres: reduced result, redsrc: one of reduction element */ - MSHADOW_XINLINE static real_t PartialGrad( real_t redres, real_t redsrc ) { - return 1.0f; - } - /*! \brief an intial value of reducer */ - MSHADOW_CONSTEXPR static real_t kInitV = 0.0f; - }; - /*! \brief maximum reducer */ - struct maximum { - /*! \brief do reduction into dst */ - MSHADOW_XINLINE static void Reduce( volatile real_t& dst, volatile real_t src ) { - using namespace std; - dst = max( dst, src ); - } - /*! \brief calculate gradient of redres with respect to redsrc, redres: reduced result, redsrc: one of reduction element */ - MSHADOW_XINLINE static real_t PartialGrad( real_t redres, real_t redsrc ) { - return redres == redsrc ? 1.0f: 0.0f; - } - /*! \brief an intial value of reducer */ -#if MSHADOW_SINGLE_PRECISION - MSHADOW_CONSTEXPR static real_t kInitV = -FLT_MAX; -#else - MSHADOW_CONSTEXPR static real_t kInitV = -DBL_MAX; -#endif - }; - }; - - /*! \brief namespace for helper utils of the project */ - namespace utils{ - /*! \brief send error message then exit */ - inline void Error( const char *msg ){ - fprintf( stderr, "Error:%s\n",msg ); - exit( -1 ); - } - /*! \brief assert a expression is true */ - inline void Assert( bool exp ){ - if( !exp ) Error( "AssertError" ); - } - /*! \brief assert a expression is true */ - inline void Assert( bool exp, const char *msg ){ - if( !exp ) Error( msg ); - } - /*! \brief warning */ - inline void Warning( const char *msg ){ - fprintf( stderr, "warning:%s\n",msg ); - } - }; // namespace utils -}; // namespace mshadow -#endif // TENSOR_BASE_H diff --git a/mshadow/tensor_container.h b/mshadow/tensor_container.h index f0699e735b0f..dbf250ceed28 100644 --- a/mshadow/tensor_container.h +++ b/mshadow/tensor_container.h @@ -1,152 +1,161 @@ -#ifndef MSHADOW_TENSOR_CONTAINER_H -#define MSHADOW_TENSOR_CONTAINER_H /*! + * Copyright (c) 2014 by Contributors * \file tensor_container.h * \brief tensor container that does memory allocation and resize like STL * \author Tianqi Chen */ -#include "tensor.h" -#include "tensor_io.h" +#ifndef MSHADOW_TENSOR_CONTAINER_H_ +#define MSHADOW_TENSOR_CONTAINER_H_ +#include "./tensor.h" +#include "./io.h" -namespace mshadow{ - /*! 
- * \brief tensor container that does memory allocation and resize like STL, - * use it to save the lines of FreeSpace in class. - * Do not abuse it, efficiency can come from pre-allocation and no re-allocation - * - * \tparam Device which device the tensor is on - * \tparam dimension dimension of the tensor - */ - template - class TensorContainer: public Tensor{ - public: - /*! - * \brief constructor - * \param pad whether use padding alignment in space allocation - */ - TensorContainer( bool pad = MSHADOW_ALLOC_PAD ){ - this->pad_ = pad; - this->dptr = data_.dptr = NULL; - this->shape[0] = 0; - this->shape.stride_ = 0; - this->data_.shape.stride_ = 0; - this->data_.shape[1] = 0; - } - /*! - * \brief constructor - * \param shape intial shape - */ - TensorContainer( const Shape &shape ){ - this->pad_ = MSHADOW_ALLOC_PAD; - data_.dptr = NULL; - this->AllocByShape( shape ); - } - /*! - * \brief constructor - * \param shape intial shape - * \param initv intial value - */ - TensorContainer( const Shape &shape, real_t initv ){ - this->pad_ = MSHADOW_ALLOC_PAD; - data_.dptr = NULL; - this->AllocByShape( shape ); - (*this) = initv; - } - ~TensorContainer( void ){ - this->FreeSpace(); - } - /*! - * \brief resize the container to given shape, content is NOT preserved - * \param shape target shape - */ - inline void Resize( const Shape &shape ){ - Shape<2> s2 = shape.FlatTo2D(); - if( s2.shape_[0] > data_.shape.stride_ || s2.shape_[1] > data_.shape[1] ){ - this->AllocByShape( shape ); - }else{ - this->shape = shape; - if( this->pad_ ){ - this->shape.stride_ = data_.shape.stride_; - }else{ - this->shape.stride_ = this->shape[ 0 ]; - } - } - } - /*! - * \brief resize the container to given shape, and initialize, content is NOT preserved - * \param shape target shape - * \param initv initialization value - */ - inline void Resize( const Shape &shape, real_t initv ){ - this->Resize( shape ); - (*this) = initv; - } - /*! \brief set whether padding is allowed in tensor */ - inline void set_pad( bool pad ){ - this->pad_ = pad; - } - /*! - * \brief save by binary format - * \param fo output binary stream - * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. - */ - template - inline void SaveBinary( TStream &fo ) const{ - mshadow::SaveBinary( fo, *this ); - } - /*! - * \brief load by binary format, a temp Tensor storage will be allocated - * \param fi input binary stream - * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. - */ - template - inline void LoadBinary( TStream &fi ) { - Tensor tmp; - mshadow::LoadBinary( fi, tmp, false ); - this->Resize( tmp.shape ); - Copy( *this, tmp ); - mshadow::FreeSpace( tmp ); - } - public: - // functions to fit exp template - inline Tensor& operator=( real_t s ){ - return this->__assign( s ); - } - template - inline Tensor& operator=( const expr::Exp &exp ){ - return this->__assign( exp ); - } - template - inline Tensor& operator=( const expr::Exp &exp ){ - return this->__assign( exp ); - } - private: - /*! \brief whether we do padding in the space */ - bool pad_; - /*! 
\brief the shape of data_ is actually current data space */ - Tensor data_; - private: - inline void FreeSpace (void){ - if( data_.dptr != NULL ){ - mshadow::FreeSpace( data_ ); - data_.dptr = this->dptr = NULL; - } - } - inline void AllocByShape (const Shape& shape){ - if( data_.dptr != NULL ){ - this->FreeSpace(); - } - data_.shape = shape.FlatTo2D(); - mshadow::AllocSpace( data_, pad_ ); - this->dptr = data_.dptr; - this->shape = shape; - if( this->pad_ ){ - this->shape.stride_ = data_.shape.stride_; - }else{ - this->shape.stride_ = shape[0]; - } - } - }; -};// namespace mshadow +namespace mshadow { +/*! + * \brief tensor container that does memory allocation and resize like STL, + * use it to save the lines of FreeSpace in class. + * Do not abuse it, efficiency can come from pre-allocation and no re-allocation + * + * \tparam Device which device the tensor is on + * \tparam dimension dimension of the tensor + */ +template +class TensorContainer: public Tensor { + public: + /*! + * \brief constructor + * \param pad whether use padding alignment in space allocation + */ + explicit TensorContainer(bool pad = MSHADOW_ALLOC_PAD) { + this->pad_ = pad; + this->dptr_ = data_.dptr_ = NULL; + this->shape_[0] = 0; + this->stride_ = 0; + this->data_.stride_ = 0; + this->data_.shape_[0] = 0; + } + /*! + * \brief constructor + * \param shape intial shape + */ + explicit TensorContainer(const Shape &shape) { + this->pad_ = MSHADOW_ALLOC_PAD; + data_.dptr_ = NULL; + this->AllocByShape(shape); + } + /*! + * \brief constructor + * \param shape intial shape + * \param initv intial value + */ + explicit TensorContainer(const Shape &shape, DType initv) { + this->pad_ = MSHADOW_ALLOC_PAD; + data_.dptr = NULL; + this->AllocByShape(shape); + (*this) = initv; + } + ~TensorContainer(void) { + this->FreeSpace(); + } + /*! + * \brief resize the container to given shape, content is NOT preserved + * \param shape target shape + */ + inline void Resize(const Shape &shape) { + Shape<2> s2 = shape.FlatTo2D(); + if (s2.shape_[1] > data_.stride_ || s2.shape_[0] > data_.size(0)) { + this->AllocByShape(shape); + } else { + this->shape_ = shape; + if (this->pad_) { + this->stride_ = data_.stride_; + } else { + this->stride_ = s2.shape_[1]; + } + } + } + /*! + * \brief resize the container to given shape, and initialize, content is NOT preserved + * \param shape target shape + * \param initv initialization value + */ + inline void Resize(const Shape &shape, DType initv) { + this->Resize(shape); + (*this) = initv; + } + /*! \brief set whether padding is allowed in tensor */ + inline void set_pad(bool pad) { + this->pad_ = pad; + } + /*! + * \brief save by binary format + * \param fo output binary stream + * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. + */ + template + inline void SaveBinary(TStream &fo) const { + mshadow::SaveBinary(fo, *this); + } + /*! + * \brief load by binary format, a temp Tensor storage will be allocated + * \param fi input binary stream + * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. 
+ */ + template + inline void LoadBinary(TStream &fi) { + Tensor tmp; + mshadow::LoadBinary(fi, &tmp, false); + this->Resize(tmp.shape_); + Stream stream; + Copy(*this, tmp, &stream); + mshadow::FreeSpace(&tmp); + } + /*!\brief functions to fit expression template */ + inline Tensor &operator=(DType s) { + return this->__assign(s); + } + /*!\brief functions to fit expression template */ + template + inline Tensor & + operator=(const expr::Exp &exp) { + return this->__assign(exp); + } + /*!\brief functions to fit expression template */ + template + inline Tensor & + operator=(const expr::Exp &exp) { + return this->__assign(exp); + } + /*!\brief functions to fit expression template */ + template + inline Tensor & + operator=(const expr::Exp &exp) { + return this->__assign(exp); + } -#endif + private: + /*! \brief whether we do padding in the space */ + bool pad_; + /*! \brief the shape of data_ is actually current data space */ + Tensor data_; + // freespace + inline void FreeSpace(void) { + if (data_.dptr_ != NULL) { + mshadow::FreeSpace(&data_); + data_.dptr_ = this->dptr_ = NULL; + } + } + inline void AllocByShape(const Shape& shape) { + if (data_.dptr_ != NULL) this->FreeSpace(); + data_.shape_ = shape.FlatTo2D(); + mshadow::AllocSpace(&data_, pad_); + this->dptr_ = data_.dptr_; + this->shape_ = shape; + if (this->pad_) { + this->stride_ = data_.stride_; + } else { + this->stride_ = data_.size(1); + } + } +}; +} // namespace mshadow +#endif // MSHADOW_TENSOR_CONTAINER_H_ diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h new file mode 100644 index 000000000000..240c65faffd6 --- /dev/null +++ b/mshadow/tensor_cpu-inl.h @@ -0,0 +1,273 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file tensor_cpu-inl.h + * \brief implementation of CPU host code + * \author Bing Xu, Tianqi Chen + */ +#ifndef MSHADOW_TENSOR_CPU_INL_H_ +#define MSHADOW_TENSOR_CPU_INL_H_ +#include +#include "./base.h" +#include "./tensor.h" +#include "./sse-inl.h" + +namespace mshadow { +template<> +inline void InitTensorEngine(int dev_id) { +} +template<> +inline void ShutdownTensorEngine(void) { +} + +template<> +inline void SetDevice(int devid) { +} +template<> +inline Stream *NewStream(void) { + return new Stream(); +} +template<> +inline void DeleteStream(Stream *stream) { + delete stream; +} + +template +inline void *AllocHost_(size_t size); +template +inline void FreeHost_(void * dptr); + +#ifdef __CUDACC__ +template<> +inline void *AllocHost_(size_t size) { + void *dptr; + utils::Check(cudaMallocHost(&dptr, size, + cudaHostAllocPortable) == cudaSuccess, + "AllocHost"); + return dptr; +} +template<> +inline void FreeHost_(void *dptr) { + cudaFreeHost(dptr); +} +#endif + +template<> +inline void *AllocHost_(size_t size) { + size_t pitch; + return sse2::AlignedMallocPitch(&pitch, size, 1); +} +template<> +inline void FreeHost_(void *dptr) { + sse2::AlignedFree(dptr); +} + +template +inline void AllocHost(Tensor *obj) { + obj->stride_ = obj->size(dim - 1); + utils::Assert(obj->CheckContiguous(), "AllocHost"); + void *dptr = AllocHost_(obj->MSize() * sizeof(DType)); + obj->dptr_ = reinterpret_cast(dptr); +} +template +inline void FreeHost(Tensor *obj) { + utils::Assert(obj->dptr_ != NULL, "FreeHost:: double free"); + FreeHost_(obj->dptr_); + obj->dptr_ = NULL; +} + +template +inline void AllocSpace(Tensor *obj, bool pad) { + size_t pitch; + void *dptr; + if (pad) { + dptr = sse2::AlignedMallocPitch + (&pitch, obj->size(dim - 1) * sizeof(DType), obj->shape_.FlatTo2D()[0]); + obj->stride_ = 
static_cast(pitch / sizeof(DType)); + } else { + obj->stride_ = obj->size(dim - 1); + dptr = sse2::AlignedMallocPitch + (&pitch, obj->shape_.Size() * sizeof(DType), 1); + } + obj->dptr_ = reinterpret_cast(dptr); +} +template +inline Tensor +NewTensor(const Shape &shape, DType initv, bool pad) { + Tensor obj(shape); + AllocSpace(&obj, pad); + MapExp(&obj, expr::ScalarExp(initv)); + return obj; +} +template +inline void FreeSpace(Tensor *obj) { + sse2::AlignedFree(obj->dptr_); + obj->dptr_ = NULL; +} +template +inline void Copy(Tensor _dst, + const Tensor &_src, + Stream *stream) { + utils::Check(_dst.shape_ == _src.shape_, "Copy:shape mismatch"); + Tensor dst = _dst.FlatTo2D(); + Tensor src = _src.FlatTo2D(); + for (index_t y = 0; y < dst.size(0); ++y) { + memcpy(dst[y].dptr_, src[y].dptr_, sizeof(DType) * dst.size(1)); + } +} +template +inline void MapPlan(TRValue *dst, + const expr::Plan &plan) { + Shape<2> shape = expr::ShapeCheck::Check(dst->self()).FlatTo2D(); + expr::Plan dplan = expr::MakePlan(dst->self()); + for (index_t y = 0; y < shape[0]; ++y) { + for (index_t x = 0; x < shape[1]; ++x) { + // trust your compiler! -_- they will optimize it + Saver::Save(dplan.REval(y, x), plan.Eval(y, x)); + } + } +} +// code to handle SSE optimization +template +struct MapExpCPUEngine { + inline static void Map(TRValue *dst, + const expr::Exp &exp) { + MapPlan(dst, MakePlan(exp.self())); + } +}; + +#if MSHADOW_USE_SSE +template +struct MapExpCPUEngine, + dim, DType, E, etype> { + inline static void Map(Tensor *dst, + const expr::Exp &exp) { + if (expr::SSEAlignCheck::Check(exp.self()) && + expr::SSEAlignCheck >::Check(*dst)) { + expr::MapSSEPlan(dst->self(), MakeSSEPlan(exp.self())); + } else { + MapPlan(dst, MakePlan(exp.self())); + } + } +}; +#endif + +template +inline void MapExp(TRValue *dst, + const expr::Exp &exp) { + expr::TypeCheckPass::kMapPass> + ::Error_All_Tensor_in_Exp_Must_Have_Same_Type(); + Shape eshape = expr::ShapeCheck::Check(exp.self()); + Shape dshape = expr::ShapeCheck::Check(dst->self()); + utils::Check(eshape[0] == 0 || eshape == dshape, + "Assignment: Shape of Tensors are not consistent with target"); +#if MSHADOW_USE_SSE + MapExpCPUEngine::kPass, Saver, R, dim, DType, E, etype> + ::Map(dst->ptrself(), exp); +#else + MapExpCPUEngine::Map(dst, exp); +#endif +} + +template +inline void MapReduceKeepLowest(TRValue *dst, + const expr::Exp &exp, + DType scale) { + expr::TypeCheckPass::kRedPass> + ::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); + Shape<2> eshape = expr::ShapeCheck::kDim, E> + ::Check(exp.self()).FlatTo2D(); + Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self()); + utils::Check(eshape[1] == dshape[0], + "MapReduceKeepLowest::reduction dimension do not match"); + utils::Check(eshape[0] != 0, "can not reduce over empty tensor"); + // execution + expr::Plan dplan = MakePlan(dst->self()); + expr::Plan splan = MakePlan(exp.self()); + for (index_t x = 0; x < eshape[1]; ++x) { + DType res = splan.Eval(0, x); + for (index_t y = 1; y < eshape[0]; ++y) { + Reducer::Reduce(res, splan.Eval(y, x)); + } + Saver::Save(dplan.REval(0, x), res * scale); + } +} + +template +inline void MapReduceKeepHighDim(TRValue *dst, + const expr::Exp &exp, + DType scale) { + expr::TypeCheckPass::kRedPass> + ::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); + typedef Shape::kDim> EShape; + EShape eshape = expr::ShapeCheck::kDim, E> + ::Check(exp.self()); + Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self()); + utils::Check(eshape[dimkeep] == dshape[0], + 
"MapReduceKeepHighDim::reduction dimension do not match"); + // use equvalent form + Shape<4> pshape = Shape4(eshape.ProdShape(0, dimkeep), + eshape[dimkeep], + eshape.ProdShape(dimkeep + 1, EShape::kSubdim), + eshape[EShape::kSubdim]); + // execution + expr::Plan dplan = MakePlan(dst->self()); + expr::Plan splan = MakePlan(exp.self()); + for (index_t c = 0; c < pshape[1]; ++c) { + DType res; Reducer::SetInitValue(res); + for (index_t n = 0; n < pshape[0]; ++n) { + DType tres; Reducer::SetInitValue(tres); + for (index_t y = 0; y < pshape[2]; ++y) { + for (index_t x = 0; x < pshape[3]; ++x) { + Reducer::Reduce(tres, + splan.Eval((n * pshape[1] + c) * pshape[2] + y, x)); + } + } + Reducer::Reduce(res, tres); + } + Saver::Save(dplan.REval(0, c), res * scale); + } +} + +template +inline void Softmax(Tensor dst, + const Tensor &energy) { + DType mmax = energy[0]; + for (index_t x = 1; x < dst.size(0); ++x) { + if (mmax < energy[x]) mmax = energy[x]; + } + DType sum = 0.0f; + for (index_t x = 0; x < dst.size(0); ++x) { + dst[x] = std::exp(energy[x] - mmax); + sum += dst[x]; + } + for (index_t x = 0; x < dst.size(0); ++x) { + dst[x] /= sum; + } +} +template +inline void Softmax(Tensor dst, + const Tensor &energy) { + utils::Check(dst.shape_ == energy.shape_, "Softmax: shape mismatch"); + for (index_t y = 0; y < dst.size(0); ++y) { + Softmax(dst[y], energy[y]); + } +} + +template +inline DType VDot(const Tensor &lhs, + const Tensor &rhs) { + utils::Check(lhs.shape_ == rhs.shape_, "VDot: shape mismatch"); + DType sum = static_cast(0); + for (index_t x = 0; x < lhs.size(0); ++x) { + sum += lhs[x] * rhs[x]; + } + return sum; +} +} // namespace mshadow +#endif // MSHADOW_TENSOR_CPU_INL_H_ diff --git a/mshadow/tensor_cpu-inl.hpp b/mshadow/tensor_cpu-inl.hpp deleted file mode 100644 index 0fa3cfa50306..000000000000 --- a/mshadow/tensor_cpu-inl.hpp +++ /dev/null @@ -1,168 +0,0 @@ -#ifndef MSHADOW_TENSOR_CPU_INL_HPP -#define MSHADOW_TENSOR_CPU_INL_HPP -/*! - * \file tensor_cpu-inl.hpp - * \brief implementation of CPU host code - * \author Bing Xu, Tianqi Chen - */ -#include -#include "tensor_base.h" -#include "tensor_sse-inl.hpp" - -namespace mshadow { - template - inline void AllocSpace(Tensor &obj, bool pad ){ - size_t pitch; - if( pad ){ - obj.dptr = (real_t*)sse2::AlignedMallocPitch - ( pitch, obj.shape[0] * sizeof(real_t), obj.FlatTo2D().shape[1] ); - obj.shape.stride_ = static_cast( pitch / sizeof(real_t) ); - }else{ - obj.shape.stride_ = obj.shape[0]; - obj.dptr = (real_t*)sse2::AlignedMallocPitch - ( pitch, obj.shape.Size() * sizeof(real_t), 1 ); - } - } - - template - inline Tensor NewTensor(const Shape &shape, real_t initv, bool pad ){ - Tensor obj( shape ); - AllocSpace( obj, pad ); - MapExp( obj, expr::ScalarExp( initv ) ); - return obj; - } - - template - inline void FreeSpace(Tensor &obj){ - sse2::AlignedFree( obj.dptr ); - obj.dptr = NULL; - } - - template - inline void Copy(Tensor _dst, const Tensor &_src ){ - utils::Assert( _dst.shape == _src.shape, "Copy:shape mismatch" ); - Tensor dst = _dst.FlatTo2D(); - Tensor src = _src.FlatTo2D(); - for (index_t y = 0; y < dst.shape[1]; ++y ) { - memcpy( dst[y].dptr, src[y].dptr, sizeof(real_t) * dst.shape[0] ); - } - } - - template - inline void MapPlan(Tensor _dst, const expr::Plan &plan){ - Tensor dst = _dst.FlatTo2D(); - for (index_t y = 0; y < dst.shape[1]; ++y ) { - for (index_t x = 0; x < dst.shape[0]; ++x ) { - // trust your compiler! 
-_- they will optimize it - Saver::Save(dst[y][x], plan.Eval( y, x ) ); - } - } - } - - // code to handle SSE optimization - template - struct MapExpCPUEngine; - template - struct MapExpCPUEngine{ - inline static void Map(Tensor dst, const expr::Exp &exp ){ - MapPlan( dst, MakePlan( exp.self() ) ); - } - }; - - #if MSHADOW_USE_SSE - template - struct MapExpCPUEngine{ - inline static void Map(Tensor dst, const expr::Exp &exp ){ - using namespace expr; - if( SSEAlignCheck::Check( exp.self() ) && SSEAlignCheck< dim,Tensor >::Check(dst) ){ - MapSSEPlan( dst, MakeSSEPlan( exp.self() ) ); - }else{ - MapPlan( dst, MakePlan( exp.self() ) ); - } - } - }; - #endif - - template - inline void MapExp(Tensor dst, const expr::Exp &exp ){ - using namespace expr; - TypeCheckPass< TypeCheck::kMapPass >::Error_All_Tensor_in_Exp_Must_Have_Same_Type(); - Shape eshape = ShapeCheck::Check( exp.self() ); - utils::Assert( eshape[0] == 0 || eshape == dst.shape, "Assignment: Shape of Tensors in expression is not consistent with target" ); - #if MSHADOW_USE_SSE - MapExpCPUEngine< SSECheck::kPass,Saver,dim,E,etype >::Map( dst, exp ); - #else - MapExpCPUEngine< false,Saver,dim,E,etype >::Map( dst, exp ); - #endif - } - - template - inline void MapReduceKeepLowest( Tensor dst, const expr::Exp &exp, real_t scale ){ - using namespace expr; - TypeCheckPass< TypeCheck::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); - Shape<2> eshape = ShapeCheck< ExpInfo::kDim, E >::Check( exp.self() ).FlatTo2D(); - - utils::Assert( eshape[0] == dst.shape[0], "reduction dimension do not match" ); - utils::Assert( eshape[1] != 0, "can not reduce over empty tensor" ); - // execution - expr::Plan plan = MakePlan( exp.self() ); - for( index_t x = 0; x < eshape[0]; ++x ){ - real_t res = plan.Eval( 0, x ); - for( index_t y = 1; y < eshape[1]; ++y ){ - Reducer::Reduce( res, plan.Eval( y, x ) ); - } - Saver::Save( dst[x], res*scale ); - } - } - - template - inline void MapReduceKeepHighDim( Tensor dst, const expr::Exp &exp, real_t scale ){ - using namespace expr; - TypeCheckPass< TypeCheck::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); - typedef Shape< ExpInfo::kDim > EShape; - EShape eshape = ShapeCheck< ExpInfo::kDim, E >::Check( exp.self() ); - utils::Assert( eshape[dimkeep] == dst.shape[0], "reduction dimension do not match" ); - // use equvalent form - Shape<4> pshape = Shape4( eshape.ProdShape(dimkeep+1,EShape::kMaxShape), eshape[dimkeep], - eshape.ProdShape(1,dimkeep), eshape[0] ); - - // execution - expr::Plan plan = MakePlan( exp.self() ); - - for( index_t c = 0; c < pshape[2]; ++c ){ - real_t res = Reducer::kInitV; - for( index_t n = 0; n < pshape[3]; ++n ){ - real_t tres = Reducer::kInitV; - for( index_t y = 0; y < pshape[1]; ++y ){ - for( index_t x = 0; x < pshape[0]; ++x ){ - Reducer::Reduce( tres, plan.Eval( (n*pshape[2] + c) * pshape[1] + y, x ) ); - } - } - Reducer::Reduce( res, tres ); - } - Saver::Save( dst[c], res*scale ); - } - } - - inline void Softmax( Tensor dst, const Tensor& energy ){ - real_t mmax = energy[0]; - for( real_t x = 1; x < dst.shape[0]; ++x ) - if( mmax < energy[x] ) mmax = energy[x]; - real_t sum = 0.0f; - for( index_t x = 0; x < dst.shape[0]; ++x ){ - dst[x] = std::exp( energy[x] - mmax ); - sum += dst[x]; - } - for( index_t x = 0; x < dst.shape[0]; ++x ){ - dst[x] /= sum; - } - } - inline void Softmax( Tensor dst, const Tensor& energy ){ - utils::Assert( dst.shape == energy.shape, "Softmax: shape mismatch" ); - for( index_t y = 0; y < dst.shape[1]; ++y ){ - Softmax( dst[y], energy[y] ); - } 
- } -}; // namespace mshadow - -#endif // TENSOR_CPU_INL_HPP diff --git a/mshadow/tensor_expr.h b/mshadow/tensor_expr.h deleted file mode 100644 index ac8fde79f1c6..000000000000 --- a/mshadow/tensor_expr.h +++ /dev/null @@ -1,367 +0,0 @@ -#ifndef MSHADOW_TENSOR_EXPR_H -#define MSHADOW_TENSOR_EXPR_H -/*! - * \file tensor_expr.h - * \brief definitions of abstract expressions and expressions template - * \author Tianqi Chen, Bing Xu - */ -#include "tensor_base.h" - -namespace mshadow{ - /*! - * \brief namespace for abstract expressions and expressions template, - * have no dependecy on tensor.h, - * These data structure takes no charge in computations, - * they are only used to define operations and represent expression in a symbolic way - */ - namespace expr{ - - /*! \brief type of expressions */ - namespace type{ - /*! \brief this expression directly correspnds to a data class */ - const int kContainer = 0; - /*! \brief this only contains element-wise vector operations */ - const int kMapper = 1; - /*! \brief othercase: e.g dot product */ - const int kComplex = 3; - }; - - /*! - * \brief expression engine that actually interprets these expressions - * this is a function template that needed to be implemented for specific expressions - */ - template - struct ExpEngine{ - template - inline static void Eval( Container& dst, const EType &exp ); - }; - - template - class ContainerExp; - class ScalarExp; - - /*! - * \brief base class for expression - * \tparam SubType inheritated class must put their type into this parameter - * \tparam exp_type expression type, see namespace type - */ - template - struct Exp{ - public: - /*! \return subtype instance of current class */ - inline const SubType& self( void ) const{ - return *static_cast(this); - } - /*! \return reference of subtype instance of current class */ - inline SubType& refself( void ){ - return *static_cast(this); - } - }; - - /*! \brief scalar expression */ - struct ScalarExp: public Exp{ - /*! \brief scalar value */ - real_t scalar_; - /*! \brief constructor */ - ScalarExp( real_t scalar ):scalar_(scalar){} - }; - - /*! \brief represent a transpose expression of a container */ - template - struct TransposeExp: public Exp< TransposeExp, type::kComplex >{ - public: - /*! \brief expression to be transposed */ - const EType &exp; - /*! \brief constructor */ - TransposeExp( const EType &e ):exp(e){} - /*! \brief transpose expression */ - inline const EType & T( void ) const{ - return exp; - } - }; - - /*! - * \brief base class of all variables, that can be assigned to values - * \tparam Container the actually class of data container, e.g. CTensor1D - */ - template - class ContainerExp: public Exp< Container, type::kContainer >{ - public: - /*! - *\brief transpose of a matrix - *\return transpose of current expression - */ - inline const TransposeExp T( void ) const{ - return TransposeExp( this->self() ); - } - public: - /*! \brief operator overload */ - inline Container &operator+=( real_t s ){ - ExpEngine::Eval( this->refself(), ScalarExp(s) ); - return this->refself(); - } - /*! \brief operator overload */ - inline Container &operator-=( real_t s ){ - ExpEngine::Eval( this->refself(), ScalarExp(s) ); - return this->refself(); - } - /*! \brief operator overload */ - inline Container &operator*=( real_t s ){ - ExpEngine::Eval( this->refself(), ScalarExp(s) ); - return this->refself(); - } - /*! 
\brief operator overload */ - inline Container &operator/=( real_t s ){ - ExpEngine::Eval( this->refself(), ScalarExp(s) ); - return this->refself(); - } - /*! \brief operator overload */ - inline Container &__assign( real_t s ){ - ExpEngine::Eval( this->refself(), ScalarExp(s) ); - return this->refself(); - } - public: - /*! \brief implementation of operator=, note that we can not define container = container */ - template - inline Container &__assign( const Exp &exp ){ - ExpEngine::Eval( this->refself(), exp.self() ); - return this->refself(); - } - /*! \brief implementation of operator=, note that we can not define container = container */ - template - inline Container &__assign( const Exp &exp ){ - ExpEngine::Eval( this->refself(), exp.self() ); - return this->refself(); - } - /*! \brief implementation of operator+= */ - template - inline Container &operator+=( const Exp &exp ){ - ExpEngine::Eval( this->refself(), exp.self() ); - return this->refself(); - } - /*! \brief implementation of operator-= */ - template - inline Container &operator-=( const Exp &exp ){ - ExpEngine::Eval( this->refself(), exp.self() ); - return this->refself(); - } - /*! \brief implementation of operator*= */ - template - inline Container &operator*=( const Exp &exp ){ - ExpEngine::Eval( this->refself(), exp.self() ); - return this->refself(); - } - /*! \brief implementation of operator/= */ - template - inline Container &operator/=( const Exp &exp ){ - ExpEngine::Eval( this->refself(), exp.self() ); - return this->refself(); - } - }; - }; // namespace expr - - namespace expr{ - /*! - * \brief matrix multiplication expression dot( lhs[.T], rhs[.T] ) - * \tparam TA type of lhs - * \tparam TB type of rhs - * \tparam ltrans whether lhs is transposed - * \tparam rtrans whether rhs is transposed - */ - template - struct DotExp: public Exp< DotExp, type::kComplex >{ - /*! \brief left operand */ - const TA& lhs_; - /*! \brief right operand */ - const TB& rhs_; - /*! \brief scale over result */ - real_t scale_; - /*! \brief constructor */ - DotExp( const TA &lhs, const TB &rhs, real_t scale ) - :lhs_(lhs),rhs_(rhs),scale_(scale){} - }; - - /*! \brief dot operator def */ - template - inline DotExp dot( const ContainerExp &lhs, const ContainerExp &rhs ){ - return DotExp( lhs.self(), rhs.self(), 1.0f ); - } - /*! \brief dot operator def */ - template - inline DotExp dot( const TransposeExp &lhs, const ContainerExp &rhs ){ - return DotExp( lhs.exp, rhs.self(), 1.0f ); - } - /*! \brief dot operator def */ - template - inline DotExp dot( const ContainerExp &lhs, const TransposeExp &rhs ){ - return DotExp( lhs.self(), rhs.exp, 1.0f ); - } - /*! \brief dot operator def */ - template - inline DotExp dot( const TransposeExp &lhs, const TransposeExp &rhs ){ - return DotExp( lhs.exp, rhs.exp, 1.0f ); - } - /*! \brief dot operator def */ - template - inline DotExp operator*( const DotExp &lhs, real_t rhs ){ - return DotExp( lhs.lhs_, lhs.rhs_, lhs.scale_ * rhs ); - } - /*! \brief scale of dot operation */ - template - inline DotExp operator*( real_t lhs, const DotExp &rhs ){ - return DotExp( rhs.lhs_, rhs.rhs_, rhs.scale_ * lhs ); - } - }; // namespace expr - - namespace expr{ - /*! - * \brief binary map expression lhs [op] rhs - * \tparam OP operator - * \tparam TA type of lhs - * \tparam TB type of rhs - * \tparam etype expression type, sa namespace::type - */ - template - struct BinaryMapExp: public Exp< BinaryMapExp, etype >{ - /*! \brief left operand */ - const TA& lhs_; - /*! \brief right operand */ - const TB& rhs_; - /*! 
\brief constructor */ - BinaryMapExp( const TA &lhs, const TB &rhs ) - :lhs_(lhs), rhs_(rhs){} - }; - - /*! \brief make expression */ - template - inline BinaryMapExp MakeExp( const Exp &lhs, const Exp &rhs ){ - return BinaryMapExp( lhs.self(), rhs.self() ); - } - - /*! - * \brief short hand for MakeExp, usage F(lhs, rhs). create a binary operation expression - * \param lhs left operand - * \param rhs right operand - * \tparam binary operator - * \tparam TA lhs expression - * \tparam ta lhs expression type - * \tparam TB rhs expression - * \tparam tb rhs expression type - * \sa mshadow::op - */ - template - inline BinaryMapExp F( const Exp &lhs, const Exp &rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload for const */ - template - inline BinaryMapExp F( const Exp &lhs, const ScalarExp &rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload for const */ - template - inline BinaryMapExp F( const ScalarExp &lhs, const Exp& rhs ){ - return MakeExp( lhs, rhs ); - } - - // operator rules - /*! \brief operator overload */ - template - inline BinaryMapExp operator+( const Exp &lhs, const Exp &rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator-( const Exp &lhs, const Exp &rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator*( const Exp &lhs, const Exp &rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator/( const Exp &lhs, const Exp &rhs ){ - return MakeExp( lhs, rhs ); - } - // constant operators - /*! \brief operator overload */ - template - inline BinaryMapExp operator+( const Exp& lhs, const ScalarExp& rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator-( const Exp& lhs, const ScalarExp& rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator*( const Exp& lhs, const ScalarExp& rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator/( const Exp& lhs, const ScalarExp& rhs ){ - return MakeExp( lhs, rhs ); - } - // constant operators 2 - /*! \brief operator overload */ - template - inline BinaryMapExp operator+( const ScalarExp& lhs, const Exp& rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator-( const ScalarExp& lhs, const Exp& rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator*( const ScalarExp& lhs, const Exp& rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator/( const ScalarExp& lhs, const Exp& rhs ){ - return MakeExp( lhs, rhs ); - } - }; - - namespace expr{ - /*! - * \brief unary map expression op(src) - * \tparam OP operator - * \tparam TA type of src - * \tparam etype expression type, sa namespace::type - */ - template - struct UnaryMapExp: public Exp< UnaryMapExp, etype >{ - /*! \brief source expression */ - const TA& src_; - /*! \brief constructor */ - UnaryMapExp( const TA &src ):src_(src){} - }; - - /*! \brief make expression */ - template - inline UnaryMapExp MakeExp( const Exp &src ){ - return UnaryMapExp( src.self() ); - } - - /*! 
- * \brief short hand for MakeExp, usage F(src), create a unary operation expression - * \param src source expression - * \tparam operator - * \tparam TA source expression - * \tparam ta source expression type - * \sa mshadow::op - */ - template - inline UnaryMapExp F( const Exp &src ){ - return MakeExp(src); - } - }; -}; -#endif diff --git a/mshadow/tensor_expr_engine-inl.hpp b/mshadow/tensor_expr_engine-inl.hpp deleted file mode 100644 index 9c5f2c7f7a86..000000000000 --- a/mshadow/tensor_expr_engine-inl.hpp +++ /dev/null @@ -1,416 +0,0 @@ -#ifndef MSHADOW_TENSOR_EXPR_ENGINE_INL_HPP -#define MSHADOW_TENSOR_EXPR_ENGINE_INL_HPP -/*! - * \file tensor_expr_engine-inl.hpp - * \brief definitions of how expressions should be evaluated - * \author Tianqi Chen, Bing Xu - */ -#include "tensor_expr.h" -#include "tensor.h" - -namespace mshadow{ - namespace expr{ - /*! - * \brief a general class that allows extension that makes tensors of some shape - * \tparam SubType type of subclass - * \tparam SrcExp source expression of the MakeTensorExp, the source of operation - * \tparam dim dimension of the expression - */ - template - struct MakeTensorExp: public Exp< MakeTensorExp, type::kMapper >{ - /*! \brief the shape of this expression */ - Shape shape_; - /*! \brief true self of subtype */ - inline const SubType& real_self( void ) const{ - return *static_cast(this); - } - }; - }; - - namespace expr{ - /*! \brief This part of code gives plan that can be used to carry out execution */ - template - class Plan{ - public: - /*! - * \brief evaluate the expression at index [y][x] - * to be implemented by SubType - */ - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const; - }; - - template - class Plan< Tensor >{ - public: - Plan( const Tensor &t ) - :dptr_(t.dptr),stride_(t.shape.stride_){} - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return dptr_[ y * stride_ + x ]; - } - private: - const real_t *dptr_; - index_t stride_; - }; - // special evaluation case for 1d tensor - template - class Plan< Tensor >{ - public: - Plan( const Tensor &t ):dptr_(t.dptr){} - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return dptr_[ x ]; - } - private: - const real_t *dptr_; - }; - - template<> - class Plan{ - public: - Plan( real_t scalar ):scalar_(scalar){} - /*! 
\brief evaluate at [y][x] */ - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return scalar_; - } - private: - real_t scalar_; - }; - - template - class Plan< BinaryMapExp >{ - public: - Plan( const Plan &lhs, const Plan &rhs ) - :lhs_(lhs), rhs_(rhs){} - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return OP::Map( lhs_.Eval( y, x ), rhs_.Eval( y, x ) ); - } - private: - Plan lhs_; - Plan rhs_; - }; - - template - class Plan< UnaryMapExp >{ - public: - Plan( const Plan &src ):src_(src){} - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return OP::Map( src_.Eval( y, x ) ); - } - private: - Plan src_; - }; - - - template - struct Plan< MakeTensorExp >{ - public: - Plan( const Plan &src ):src_(src){} - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return src_.Eval( y, x ); - } - private: - Plan src_; - }; - - // allow UnaryMap see the plan - template - inline Plan< BinaryMapExp > MakePlan( const BinaryMapExp &e ); - - // translate from exp to execution plan - inline Plan MakePlan( const ScalarExp &e ){ - return Plan( e.scalar_ ); - } - - template - inline Plan MakePlan( const ContainerExp &e ){ - return Plan( e.self() ); - } - - template - inline Plan< T > MakePlan( const MakeTensorExp &e ){ - return Plan< T >( e.real_self() ); - } - - template - inline Plan< UnaryMapExp > MakePlan( const UnaryMapExp &e ){ - return Plan< UnaryMapExp >( MakePlan(e.src_) ); - } - - template - inline Plan< BinaryMapExp > MakePlan( const BinaryMapExp &e ){ - return Plan< BinaryMapExp >( MakePlan(e.lhs_), MakePlan(e.rhs_) ); - } - }; // namespace expr - - namespace expr{ - /*! - * \brief static type inference template, - * used to get the dimension of each expression, - * if ExpInfo::kDim == -1, this means here are mismatch in expression - * if ( ExpInfo::kDevMask & cpu::kDevMask ) != 0, this means this expression can be assigned to cpu - * \tparam E expression - */ - template - struct ExpInfo{ - const static int kDim = -1; - const static int kDevMask = 0; - }; - template<> - struct ExpInfo{ - const static int kDim = 0; - const static int kDevMask = 0xffff; - }; - template - struct ExpInfo< Tensor >{ - const static int kDim = dim; - const static int kDevMask = Device::kDevMask; - }; - template - struct ExpInfo< MakeTensorExp >{ - const static int kDimSrc = ExpInfo::kDim; - const static int kDim = kDimSrc >= 0 ? dim : -1; - const static int kDevMask = ExpInfo::kDevMask; - }; - template - struct ExpInfo< UnaryMapExp >{ - const static int kDim = ExpInfo::kDim; - const static int kDevMask = ExpInfo::kDevMask; - }; - template - struct ExpInfo< BinaryMapExp >{ - const static int kDimLhs = ExpInfo::kDim; - const static int kDimRhs = ExpInfo::kDim; - const static int kDim = (kDimLhs>=0 && kDimRhs >= 0) ? \ - ( kDimLhs==0 ? kDimRhs : ( (kDimRhs==0||kDimLhs==kDimRhs) ? kDimLhs : -1 ) ):-1; - const static int kDevMask = ExpInfo::kDevMask & ExpInfo::kDevMask; - }; - - /*! \brief template to do type check */ - template - struct TypeCheck{ - /*! \brief dimension of expression*/ - const static int kExpDim = ExpInfo::kDim; - /*! \brief whether the expression device type matches */ - const static bool kDevPass = (ExpInfo::kDevMask & Device::kDevMask) != 0; - /*! \brief whether the expression can be mapped to expression of dim */ - const static bool kMapPass = (kExpDim == 0 || kExpDim == dim) && kDevPass; - /*! 
\brief whether the expression can be reduced to expression of dim */ - const static bool kRedPass = (kExpDim > dim) && kDevPass; - }; - - template - struct TypeCheckPass; - template<> - struct TypeCheckPass{}; - template<> - struct TypeCheckPass{ - inline static void Error_All_Tensor_in_Exp_Must_Have_Same_Type( void ){} - inline static void Error_TypeCheck_Not_Pass_For_Reduce_Exp( void ){} - inline static void Error_Expression_Does_Not_Meet_Dimension_Req( void ){} - }; - }; // namespace expr - - namespace expr{ - // check shape consistency - template - struct ShapeCheck{ - inline static Shape Check( const E &t ); - }; - - template - struct ShapeCheck{ - inline static Shape Check( const ScalarExp &exp ){ - // use lowest dimension to mark scalar exp - Shape shape; shape[0] = 0; - return shape; - } - }; - template - struct ShapeCheck >{ - inline static Shape Check( const Tensor &t ){ - return t.shape; - } - }; - template - struct ShapeCheck >{ - inline static Shape Check( const MakeTensorExp &t ){ - return t.shape_; - } - }; - template - struct ShapeCheck< dim,UnaryMapExp >{ - inline static Shape Check( const UnaryMapExp &t ){ - Shape s = ShapeCheck::Check( t.src_ ); - return s; - } - }; - template - struct ShapeCheck< dim, BinaryMapExp >{ - inline static Shape Check( const BinaryMapExp &t ){ - Shape shape1 = ShapeCheck::Check( t.lhs_ ); - Shape shape2 = ShapeCheck::Check( t.rhs_ ); - if( shape1[0] == 0 ) return shape2; - if( shape2[0] == 0 ) return shape1; - utils::Assert( shape1 == shape2, "BinaryMapExp: Shapes of two tensors in BinaryMapExp expression is not the same"); - return shape1; - } - }; - }; // namespace expr - - // the matrix OP depends on BLAS - namespace expr{ - template - struct DotEngine{ - inline static void Eval( Tensor &dst, const Tensor &lhs, const Tensor &rhs, real_t scale ); - }; - - // handles the dot - template - struct BLASEngine; - - #if (MSHADOW_USE_CBLAS||MSHADOW_USE_MKL) - template<> - struct BLASEngine{ - inline static CBLAS_TRANSPOSE GetT( bool t ){ - return t ? 
CblasTrans : CblasNoTrans; - } - inline static void gemm( bool transa, bool transb, int m, int n, int k, float alpha, \ - const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc ){ - cblas_sgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); - } - inline static void gemm( bool transa, bool transb, int m, int n, int k, double alpha, \ - const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc ){ - cblas_dgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); - } - inline static void gemv( bool trans, int m, int n, float alpha, const float *A, int lda, \ - const float *X, int incX, float beta, float *Y, int incY ){ - cblas_sgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); - } - inline static void gemv( bool trans, int m, int n, double alpha, const double *A, int lda, \ - const double *X, int incX, double beta, double *Y, int incY ){ - cblas_dgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); - } - inline static void ger( int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda ){ - cblas_sger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda); - } - inline static void ger( int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda ){ - cblas_dger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda); - } - }; - #endif // MSHADOW_USE_CBLAS || MSHADOW_USE_MKL - - #if MSHADOW_USE_CUDA - // All CuBLAS goes to here, use legacy API: not threadsafe - template<> - struct BLASEngine{ - inline static char GetT( bool t ){ - return t ? 'T' : 'N'; - } - inline static void gemm( bool transa, bool transb, int m, int n, int k, float alpha, - const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc ){ - cublasSgemm(GetT(transa),GetT(transb),m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); - } - inline static void gemm( bool transa, bool transb, int m, int n, int k, double alpha, - const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc ){ - cublasDgemm(GetT(transa),GetT(transb),m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); - } - inline static void gemv( bool trans, int m, int n, float alpha, const float *A, int lda, \ - const float *X, int incX, float beta, float *Y, int incY ){ - cublasSgemv(GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); - } - inline static void gemv( bool trans, int m, int n, double alpha, const double *A, int lda, \ - const double *X, int incX, double beta, double *Y, int incY ){ - cublasDgemv(GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); - } - inline static void ger( int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda ){ - cublasSger(m,n,alpha,X,incX,Y,incY,A,lda); - } - inline static void ger( int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda ){ - cublasDger(m,n,alpha,X,incX,Y,incY,A,lda); - } - }; - #endif - - // helper function to decide which shape we are in - inline static Shape<2> GetShape( const Shape<2> &shape, bool transpose ){ - return transpose ? 
Shape2(shape[0],shape[1]) : shape; - } - // dst = dot( lhs[.T], rhs[.T] ) - template - struct DotEngine{ - inline static void Eval( Tensor &dst, const Tensor &lhs, const Tensor &rhs, real_t scale ) { - Shape<2> sleft = GetShape( lhs.shape, transpose_left ); - Shape<2> sright = GetShape( rhs.shape, transpose_right ); - utils::Assert( dst.shape[1] == sleft[1] && dst.shape[0] == sright[0] \ - && sleft[0] == sright[1] , "dot-gemm: matrix shape mismatch" ); - // use column major argument to compatible with most BLAS - BLASEngine::gemm - ( transpose_right , transpose_left, - transpose_right ? rhs.shape[1] : rhs.shape[0], - transpose_left ? lhs.shape[0] : lhs.shape[1], - transpose_right ? rhs.shape[0] : rhs.shape[1], - scale * SV::kAlphaBLAS, - rhs.dptr, rhs.shape.stride_, - lhs.dptr, lhs.shape.stride_, - SV::kBetaBLAS, - dst.dptr, dst.shape.stride_ ); - } - }; - template - struct DotEngine{ - inline static void Eval( Tensor &dst, const Tensor &lhs, const Tensor &rhs, real_t scale ) { - Shape<2> sright = GetShape( rhs.shape, transpose_right ); - utils::Assert( dst.shape[0] == sright[0] && lhs.shape[0] == sright[1], "dot-gemv: matrix shape mismatch"); - BLASEngine::gemv - ( transpose_right, - rhs.shape[0], rhs.shape[1], scale * SV::kAlphaBLAS, - rhs.dptr, rhs.shape.stride_, - lhs.dptr, 1, SV::kBetaBLAS, - dst.dptr, 1 ); - } - }; - template - struct DotEngine{ - inline static void Eval( Tensor &dst, const Tensor &lhs, const Tensor &rhs, real_t scale ) { - utils::Assert( dst.shape[1] == lhs.shape[0] && dst.shape[0] == rhs.shape[0], "dot-ger: matrix shape mismatch" ); - if( SV::kBetaBLAS < 1e-6f ){ - BLASEngine::ger - ( rhs.shape[0], lhs.shape[0], scale * SV::kAlphaBLAS, - rhs.dptr, 1, lhs.dptr, 1, dst.dptr, dst.shape.stride_ ); - }else{ - DotEngine::Eval( dst, lhs.FlatTo2D(), rhs.FlatTo2D(), scale ); - } - } - }; - - }; // namespace expr - - namespace expr{ - /*! \brief some engine that evaluate complex expression */ - template - struct ExpComplexEngine{ - inline static void Eval( Tensor& dst, const E &exp ); - }; - template - struct ExpEngine >{ - template - inline static void Eval( Tensor& dst, const Exp &exp ){ - MapExp( dst, exp ); - } - template - inline static void Eval( Tensor& dst, const Exp &exp ){ - MapExp( dst, exp ); - } - template - inline static void Eval( Tensor& dst, const Exp &exp ){ - ExpComplexEngine::Eval( dst, exp.self() ); - } - }; - template - struct ExpComplexEngine< SV, Device, dim, DotExp< Tensor, Tensor, ltrans, rtrans > >{ - inline static void Eval( Tensor &dst, const DotExp< Tensor, Tensor, ltrans, rtrans > &exp ){ - DotEngine::Eval( dst, exp.lhs_, exp.rhs_, exp.scale_ ); - } - }; - }; // namespace expr -}; -#endif diff --git a/mshadow/tensor_expr_ext.h b/mshadow/tensor_expr_ext.h deleted file mode 100644 index 8399b1b7a26b..000000000000 --- a/mshadow/tensor_expr_ext.h +++ /dev/null @@ -1,978 +0,0 @@ -#ifndef MSHADOW_TENSOR_EXPR_EXT_H -#define MSHADOW_TENSOR_EXPR_EXT_H -/*! - * \file tensor_expr_ext.h - * \brief some extension of expressions, used to support something beyond elementwise op - * \author Tianqi Chen, Bing Xu - */ -#include "tensor_expr_engine-inl.hpp" -namespace mshadow{ - // Declaration of expressions goes here - namespace expr{ - /*! 
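// Illustrative sketch of the operand-swap trick used by the DotEngine::Eval
// specializations above: mshadow stores tensors row-major, while the BLAS call
// is issued with CblasColMajor.  A row-major matrix viewed as column-major is
// its transpose, so C = A * B (all row-major) is computed as C^T = B^T * A^T,
// which is why rhs is passed to gemm before lhs.  Standalone example assuming
// a CBLAS header, not mshadow code:
#include <cblas.h>
inline void RowMajorGemm(const float *A, const float *B, float *C,
                         int m, int n, int k) {
  // A: m x k, B: k x n, C: m x n, all row-major and densely packed.
  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
              n, m, k, 1.0f,
              B, n,   // B viewed column-major is B^T (n x k), ld = n
              A, k,   // A viewed column-major is A^T (k x m), ld = k
              0.0f,
              C, n);  // C viewed column-major is C^T (n x m), ld = n
}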
- * \brief broadcast Tensor1D into a higher dimension Tensor - * input: Tensor: ishape[0] - * output: Tensor : oshape[dimcast] = ishape[0] - * \tparam Device which device it lies - * \tparam dimdst target tensor dimension - * \tparam dimcast the dimension where the 1D tensor fills in by index - */ - template - struct Broadcast1DExp: public MakeTensorExp< Broadcast1DExp,Tensor,dimdst>{ - /*! \brief source operand */ - const Tensor src_; - /*! \brief constructor */ - Broadcast1DExp( const Tensor &src, Shape shape ):src_(src){ - this->shape_ = shape; - } - }; - - /*! - * \brief unpack local (overlap) patches of image to column of mat, can be used to implement convolution, this expression allow unpack of a batch - * this is a version support unpacking multiple images - * after getting unpacked mat, we can use: output = dot( weight, mat ) to get covolved results, the relations: - * \tparam SrcExp source expression - * \tparam dstdim destination dimension - */ - template - struct UnpackPatchToColXExp: public MakeTensorExp< UnpackPatchToColXExp, SrcExp, 2>{ - /*! \brief source operand */ - const SrcExp& img_; - /*! \brief patch size */ - index_t psize_; - /*! \brief patch stride */ - index_t pstride_; - /*! \brief number of input channel */ - index_t i_channel_; - /*! \brief height of img */ - index_t i_height_; - /*! \brief width of img */ - index_t i_width_; - /*! \brief constructor */ - UnpackPatchToColXExp( const SrcExp &img, index_t psize, index_t pstride ) - :img_(img), psize_(psize), pstride_(pstride){ - Shape imshape = ShapeCheck::Check( img_ ); - utils::Assert( imshape[0] >= psize && imshape[1] >= psize, "UnpackPatchToCol:image shape smaller than patch size"); - this->i_channel_ = imshape[2]; - this->i_height_ = imshape[1]; - this->i_width_ = imshape[0]; - // calculate number of batches - const index_t num = imshape.ProdShape( 3, srcdim ); - const index_t o_height = ( i_height_ - psize ) / pstride + 1; - const index_t o_width = ( i_width_ - psize ) / pstride + 1; - this->shape_[0] = o_height * o_width * num; - this->shape_[1] = psize * psize * imshape[2]; - } - }; - - /*! - * \brief reverse operation of UnpackPatchToCol, used to backprop gradient back - * this is a version supporting multiple images - * \tparam Device which device it lies - * \tparam dstdim destination dimension - */ - template - struct PackColToPatchXExp: public MakeTensorExp< PackColToPatchXExp, Tensor, dstdim>{ - /*! \brief source operand */ - const Tensor& mat_; - /*! \brief patch size */ - index_t psize_; - /*! \brief patch stride */ - index_t pstride_; - /*! \brief constructor */ - PackColToPatchXExp( const Tensor &mat, Shape imshape, index_t psize, index_t pstride ) - :mat_(mat), psize_(psize), pstride_(pstride){ - this->shape_ = imshape; - const index_t o_height = ( imshape[1] - psize ) / pstride + 1; - const index_t o_width = ( imshape[0] - psize ) / pstride + 1; - utils::Assert( mat.shape[0] == o_height * o_width * imshape.ProdShape(3,dstdim), "PackColToPatchExp: mat.shape[0] mismatch" ); - utils::Assert( mat.shape[1] == psize * psize * imshape[2], "PackColToPatchExp: mat.shape[1] mismatch" ); - } - }; - - /*! - * \brief reshape the content to another shape - * input: Tensor: ishape - * output: Tensor ishape.Size() == oshape.Size() - * \tparam SrcExp source expression - * \tparam dimdst target dimension - * \tparam dimsrc source dimension - */ - template - struct ReshapeExp: public MakeTensorExp< ReshapeExp, SrcExp, dimdst>{ - /*! \brief source expression */ - const SrcExp& src_; - /*! 
\brief smallest dimension of input */ - index_t ishape0_; - /*! \brief constructor */ - ReshapeExp( const SrcExp &src, Shape shape ):src_(src){ - Shape ishape = ShapeCheck::Check( src_ ); - utils::Assert( ishape.Size() == shape.Size(), "reshape size must match" ); - ishape0_ = ishape[0]; - this->shape_ = shape; - } - }; - - /*! - * \brief swap two axis of a tensor - * input: Tensor: ishape - * output: Tensor oshape[a1],oshape[a2] = ishape[a2],oshape[a1] - * - * \tparam SrcExp type of source expression - * \tparam dimsrc source dimension - * \tparam a1 smaller dimension to be swapped - * \tparam a2 larger dimension to be swapped - */ - template - struct SwapAxisExp: public MakeTensorExp< SwapAxisExp, SrcExp, dimsrc>{ - /*! \brief source expression */ - const SrcExp& src_; - /*! \brief constructor */ - SwapAxisExp( const SrcExp &src ):src_(src){ - this->shape_ = ShapeCheck::Check(src); - std::swap( this->shape_[a1], this->shape_[a2] ); - } - }; - - /*! - * \brief reduction to 1 dimension tensor - * input: Tensor: ishape - * output: Tensor shape[0] = ishape[dimkeep]; - * - * \tparam EType type of expression to be reduced - * \tparam Reducer which reducer to use - * \tparam srcdim dimension of source - * \tparam dimkeep which dimension to be kept, - */ - template - struct ReduceTo1DExp: public Exp< ReduceTo1DExp, type::kComplex >{ - /*! \brief source operand */ - const EType& src_; - /*! \brief source operand, scale of the */ - real_t scale_; - /*! \brief construct a repmat expression from src and nrow */ - ReduceTo1DExp( const EType& src, real_t scale ):src_(src),scale_(scale){} - }; - - /*! - * \brief pooling expression, do reduction over local patches of a image - * \tparam Reducer reduction method during pooling - * \tparam SrcExp source expression to be pooled from - * \tparam srcdim dimension of src - */ - template - struct PoolingExp: public MakeTensorExp< PoolingExp, SrcExp, srcdim> { - /*! \brief source operand */ - const SrcExp& src_; - /*! \brief kernel size */ - index_t ksize_; - /*! \brief kernel stride */ - index_t kstride_; - /*! \brief source height shape[1] */ - index_t src_height_; - /*! \brief source width shape[0] */ - index_t src_width_; - /*! \brief constructor */ - PoolingExp( const SrcExp &src, index_t ksize, index_t kstride ) - : src_(src), ksize_(ksize), kstride_(kstride) { - Shape< srcdim > sshape = ShapeCheck< srcdim,SrcExp>::Check( src_ ); - utils::Assert( sshape[0] >= ksize && sshape[1] >= ksize, "pool: kernel must be smaller than image" ); - this->src_height_ = sshape[1]; - this->src_width_ = sshape[0]; - this->shape_ = sshape; - this->shape_[1] = (src_height_ - ksize) / kstride + 1; - this->shape_[0] = (src_width_ - ksize) / kstride + 1; - } - /*! \brief constructor, specify shape */ - PoolingExp( const SrcExp &src, Shape<2> pshape, index_t ksize, index_t kstride ) - : src_(src), ksize_(ksize), kstride_(kstride) { - Shape< srcdim > sshape = ShapeCheck< srcdim,SrcExp>::Check( src_ ); - utils::Assert( sshape[0] >= ksize && sshape[1] >= ksize, "pool: kernel must be smaller than image" ); - this->src_height_ = sshape[1]; - this->src_width_ = sshape[0]; - this->shape_ = sshape; - this->shape_[1] = pshape[1]; - this->shape_[0] = pshape[0]; - } - }; - - /*! - * \brief unpooling expr reverse operation of pooling, used to pass gradient back - * \tparam Reducer specifies reduction operation during pooling - * \tparam Device which device it lies - */ - template - struct UnPoolingExp: public MakeTensorExp< UnPoolingExp, Tensor, 4> { - /*! 
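// Illustrative sketch of the output-shape rule used by PoolingExp above
// (valid-style pooling, no padding).  Pure arithmetic, no mshadow types:
inline int PooledSize(int src, int ksize, int kstride) {
  return (src - ksize) / kstride + 1;  // e.g. src=28, ksize=2, kstride=2 -> 14
}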
\brief source input, corresponds to src in pooling */ - const Tensor& data_src_; - /*! \brief result of pooled data, corresponds to result of pooling */ - const Tensor& data_pooled_; - /*! \brief gradient data of pooled part, to be propgate down */ - const Tensor& grad_pooled_; - /*! \brief kernel size */ - index_t ksize_; - /*! \brief kernel stride */ - index_t kstride_; - /*! \brief constructor */ - UnPoolingExp( const Tensor &data_src, const Tensor &data_pooled, - const Tensor &grad_pooled, index_t ksize, index_t kstride ) - : data_src_(data_src), data_pooled_(data_pooled), grad_pooled_(grad_pooled), - ksize_(ksize), kstride_(kstride) { - utils::Assert( grad_pooled.shape == data_pooled.shape, "UnPoolingExp: pooled shape mismatch" ); - utils::Assert( grad_pooled.shape[2] == data_src.shape[2], "UnPoolingExp: pool and src shape mismatch" ); - utils::Assert( grad_pooled.shape[3] == data_src.shape[3], "UnPoolingExp: pool and src shape mismatch" ); - this->shape_ = data_src_.shape; - } - }; - - /*! - * \brief padding expression, pad a image with zeros - * \tparam SrcExp source expression to be pooled from - * \tparam srcdim dimension of src - */ - template - struct PaddingExp : public MakeTensorExp, SrcExp, srcdim> { - /*! \brief source operand */ - const SrcExp& src_; - /*! \brief pad size */ - index_t pad_; - /*! \brief source tensor height */ - index_t src_height_; - /*! \brief source tensor width */ - index_t src_width_; - /*! \brief constructor */ - PaddingExp( const SrcExp &src, index_t pad ) - : src_(src), pad_(pad) { - this->shape_ = ShapeCheck::Check( src_ ); - src_height_ = this->shape_[1]; - src_width_ = this->shape_[0]; - this->shape_[1] += pad * 2; // height - this->shape_[0] += pad * 2; // width - } - }; - - /*! - * \brief crop expression, cut off the boundary region, reverse operation of padding - * \tparam SrcExp source expression to be pooled from - * \tparam srcdim dimension of src - */ - template - struct CroppingExp : public MakeTensorExp< CroppingExp, SrcExp, srcdim> { - /*! \brief source operand */ - const SrcExp& src_; - /*! \brief pad height */ - index_t pad_height_; - /*! \brief pad height */ - index_t pad_width_; - /*! \brief src height */ - index_t src_height_; - /*! \brief constructor */ - CroppingExp(const SrcExp &src, Shape<2> cshape ): src_(src) { - this->shape_ = ShapeCheck::Check( src_ ); - utils::Assert(this->shape_[1] >= cshape[1], "CroppingExp: height requirement not met"); - utils::Assert(this->shape_[0] >= cshape[0], "CroppingExp: width requirement not met"); - pad_height_ = (this->shape_[1] - cshape[1]) / 2; - pad_width_ = (this->shape_[0] - cshape[0]) / 2; - src_height_ = this->shape_[1]; - this->shape_[1] = cshape[1]; // width - this->shape_[0] = cshape[0]; // height - } - /*! \brief constructor */ - CroppingExp(const SrcExp &src, Shape<2> cshape, index_t start_height, index_t start_width ) - : src_(src), pad_height_(start_height), pad_width_(start_width) { - this->shape_ = ShapeCheck::Check( src_ ); - utils::Assert(this->shape_[1] >= cshape[1], "CroppingExp: height requirement not met"); - utils::Assert(this->shape_[0] >= cshape[0], "CroppingExp: width requirement not met"); - src_height_ = this->shape_[1]; - this->shape_[1] = cshape[1]; // width - this->shape_[0] = cshape[0]; // height - } - - }; // struct CroppingExp - - - /*! 
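// Illustrative sketch of the shape bookkeeping in PaddingExp / CroppingExp
// above: pad() grows height and width by 2*pad, and the default crop()
// removes a border so that the kept region is centered.  Pure arithmetic:
inline int PaddedSize(int src, int pad) { return src + 2 * pad; }
inline int CropOffset(int src, int dst) { return (src - dst) / 2; }
// e.g. a 24x24 map padded by 2 becomes 28x28; cropping it back to 24x24
// uses offset (28 - 24) / 2 = 2 on each axis.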
- * \brief mirror expression, mirror a image in width - * \tparam SrcExp source expression to be mirrored - * \tparam srcdim dimension of src - */ - template - struct MirroringExp : public MakeTensorExp, SrcExp, srcdim> { - /*! \brief source operand */ - const SrcExp& src_; - /*! \brief constructor */ - MirroringExp( const SrcExp &src ): src_(src) { - this->shape_ = ShapeCheck::Check( src_ ); - } - }; - - /*! - * \brief channel pooling expression, do reduction over (local nearby) channels, used to implement local response normalization - * \tparam Reducer reduction method during pooling - * \tparam SrcExp source expression to be pooled from - * \tparam srcdim dimension of src - */ - template - struct ChannelPoolingExp: public MakeTensorExp< ChannelPoolingExp, SrcExp, srcdim> { - /*! \brief source operand */ - const SrcExp& src_; - /*! \brief neighbor size */ - index_t nsize_; - /*! \brief constructor */ - ChannelPoolingExp( const SrcExp &src, index_t nsize ): src_(src), nsize_(nsize){ - utils::Assert( nsize % 2 == 1, "ChannelPoolingExp: local size must be odd, to make it symmetric" ); - this->shape_ = ShapeCheck::Check( src_ ); - utils::Assert( this->shape_[2] >= nsize_, "ChannelPoolingExp: local size need to be smaller than number of channels" ); - } - }; - }; // namespace expr - - - // Declaration of all functions go here - namespace expr{ - /*! \brief operator overload */ - template - inline ReduceTo1DExp operator*( const ReduceTo1DExp &e, real_t scale ){ - return ReduceTo1DExp( e.src_, e.scale_*scale ); - } - /*! \brief operator overload */ - template - inline ReduceTo1DExp operator*( real_t scale, const ReduceTo1DExp &e ){ - return ReduceTo1DExp( e.src_, e.scale_*scale ); - } - - /*! - * \brief a expression that replicate a 1 dimension tensor in dimension dimcast - * \param src Tensor: shape[0] - * \param shape shape of output - * \return a expresion with type Tensor - * \tparam dimcast target dimension where the 1D tensor will be broadcasted - * \tparam Device which device it lies - * \tparam dimdst dimension of destination tensor - */ - template - inline Broadcast1DExp broadcast( const Tensor &src, Shape shape ){ - TypeCheckPass< dimcast::Error_Expression_Does_Not_Meet_Dimension_Req(); - utils::Assert( src.shape[0] == shape[dimcast], "broadcast, shape mismatch" ); - return Broadcast1DExp( src, shape ); - } - - /*! 
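// Illustrative sketch of the channel window reduced by ChannelPoolingExp
// above (nsize must be odd, hnsize = nsize / 2), as used for local response
// normalization; it mirrors the clamping done in the evaluation plan later
// in this file.  Pure arithmetic only:
inline void ChannelWindow(int c, int channels, int nsize,
                          int *cstart, int *cend) {
  const int hnsize = nsize / 2;
  *cstart = c < hnsize ? 0 : c - hnsize;
  *cend = c + hnsize + 1 < channels ? c + hnsize + 1 : channels;
  // e.g. c=0, channels=16, nsize=5 -> window [0, 3); c=7 -> window [5, 10)
}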
- * \brief unpack local (overlap) patches of image to column of mat, can be used to implement convolution - * after getting unpacked mat, we can use: output = dot( weight, mat ) to get covolved results, the relations: - * - * weight; shape[1]: out_channel, shape[0]: ichannel*psize*psize - * output; shape[1]: out_channel, shape[0]: out_height*out_width * num_of_images - * out_height = ( in_height - psize ) / pstride + 1, this means we pad inperfect patch with 0 - * out_width = ( in_width - psize ) / pstride + 1 - * - * \return mat target matrix; shape[1]: in_channel*psize*psize shape[0]: out_height*out_width * num_of_images - * \param img source image; shape[2]: in_channels, shape[1]: in_height, shape[0]: in_width, can be 3D or 4D tensor(multiple images) - * \param psize height and width of each patch - * \param pstride stride of each patch - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline UnpackPatchToColXExp::kDim > unpack_patch2col( const Exp &img, index_t psize, index_t pstride ){ - TypeCheckPass< ExpInfo::kDim >= 3 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return UnpackPatchToColXExp::kDim >( img.self(), psize, pstride ); - } - - /*! - * \brief reverse operation of pack_col2patch, can be used to implement deconvolution - * \return packed img expression - * \param mat source matrix - * \param imshape shape of target img - * \param psize height and width of each patch - * \param pstride stride of each patch - * \tparam Device the Device where input data lies - */ - template - inline PackColToPatchXExp pack_col2patch( const Tensor &mat, Shape imshape, index_t psize, index_t pstride ){ - utils::Assert( imshape[0] >= psize && imshape[1] >= psize, "PackColToPatch:image shape smaller than patch size"); - return PackColToPatchXExp( mat, imshape, psize, pstride ); - } - /*! - * \brief a expression that reshapes a tensor to another shape - * \param src Tensor: - * \param oshape target shape - * \return a expresion with type Tensor - * \tparam SrcExp source expression - * \tparam etype source expression type - * \tparam dimdst target dimension - */ - template - inline ReshapeExp< SrcExp,dimdst, ExpInfo::kDim > reshape( const Exp &src, Shape oshape ){ - return ReshapeExp< SrcExp,dimdst, ExpInfo::kDim >( src.self(), oshape ); - } - - /*! - * \brief a expression that reshapes a tensor to another shape - * \param src Tensor: - * \return a expresion with type Tensor - * \tparam a1 smaller dimension to be swapped - * \tparam a2 larger dimension to be swapped - * \tparam SrcExp source expression - * \tparam etype source expression type - */ - template - inline SwapAxisExp< SrcExp, ExpInfo::kDim, a1,a2> swapaxis( const Exp &src ){ - typedef ExpInfo Info; - TypeCheckPass< Info::kDim>=a1+1 && Info::kDim >= a2+1 && a1+1 <= a2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return SwapAxisExp< SrcExp,Info::kDim,a1,a2>( src.self() ); - } - - /*! - * \brief a sum over all dimensions, except dimkeep - * \param exp input expression that must be a matrix Tensor - * \return a expresion with type Tensor - * \tparam dimkeep the dimension that will be kept - * \tparam SrcExp expression - * \tparam etype type of expression - */ - template - inline ReduceTo1DExp sumall_except_dim( const Exp &exp ){ - return ReduceTo1DExp( exp.self(), 1.0f ); - } - - /*! 
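// Illustrative sketch of the shape relations documented for unpack_patch2col
// above, i.e. convolution written as output = dot(weight, mat):
inline int ConvOutSize(int in_size, int psize, int pstride) {
  return (in_size - psize) / pstride + 1;
}
// Concrete example: in_c=3, in_h=in_w=32, psize=5, pstride=1, num_img=1
//   -> out_h = out_w = ConvOutSize(32, 5, 1) = 28,
//      mat    is (3*5*5) x (28*28*1) = 75 x 784   (shape[1] x shape[0]),
//      weight is out_channel x 75, so dot(weight, mat) is out_channel x 784.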
- * \brief pooling subregion results together - * \param src source image, shape[3]: batch, shape[2]: channel shape[1]: height shape[0]:width - * \param ksize kernel size - * \param kstride stride for each kernel - * \return expression of pooled result - * \tparam Reducer reducer type - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline PoolingExp::kDim > pool( const Exp &src, index_t ksize, index_t kstride ) { - TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return PoolingExp::kDim >(src.self(), ksize, kstride); - } - /*! - * \brief same as pool, except the output shape is specified by pshape - * \param src source image - * \param pshape ouput shape - * \param ksize kernel size - * \param kstride stride for each kernel - * \return expression of pooled result - * \tparam Reducer reducer type - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline PoolingExp::kDim > pool( const Exp &src, Shape<2> pshape, index_t ksize, index_t kstride ) { - TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return PoolingExp::kDim >(src.self(), pshape, ksize, kstride); - } - /*! - * \brief unpooling gradient for 4D, backprop gradient value back, revserse operation of pooling - * \param data_src source input, corresponds to src in pooling - * \param data_pooled result of pooled data, corresponds to result of pooling - * \param grad_pooled gradient data of pooled part, to be propgate down - * \param ksize kernel size - * \param kstride stride for each kernel - * \return expression corresponding to unpooled 4D Tensor, storing backproped gradient - * \tparam Reducer reducer type - * \tparam Device device where data lies - */ - template - inline UnPoolingExp unpool( const Tensor&data_src, const Tensor &data_pooled, - const Tensor &grad_pooled, index_t ksize, index_t kstride ) { - return UnPoolingExp(data_src, data_pooled, grad_pooled,ksize, kstride); - } - - /*! - * \brief padding expression, pad a image with zeros on boundaries, padding affects shape[0], and shape[1] - * \param src original image batches - * \param pad padding size - * \return expression corresponding to padded result - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline PaddingExp::kDim> pad(const Exp &src, index_t pad) { - TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return PaddingExp::kDim>(src.self(), pad); - } - - /*! - * \brief revserse operationg of padding, cut off boundaries, crop output from center of input - * \param src original image batches - * \param oshape output shape to be cropped - * \return expression corresponding to padded result - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline CroppingExp::kDim> crop( const Exp &src, Shape<2> oshape ) { - TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return CroppingExp::kDim>(src.self(), oshape); - } - /*! 
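// Illustrative sketch of how unpool (declared above) routes gradients back:
// a source index y lies inside every pooling window [py*kstride,
// py*kstride + ksize), and the index range of those windows (mirroring the
// UnPoolingExp evaluation plan later in this file) is:
inline void UnpoolWindow(int y, int ksize, int kstride, int pooled_size,
                         int *py_min, int *py_max) {
  *py_min = y < ksize ? 0 : (y - ksize + kstride) / kstride;
  *py_max = (y + kstride) / kstride < pooled_size
                ? (y + kstride) / kstride : pooled_size;
  // e.g. ksize=3, kstride=2, y=4 -> py in [1, 3): the windows starting at
  // source index 2 and 4 both contain index 4.
}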
- * \brief same as crop, but can specify starting position to do cropping - * \param src original image batches - * \param oshape output shape to be cropped - * \param start_height start height position to do cropping - * \param start_width start width position to do cropping - * \return expression corresponding to padded result - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline CroppingExp::kDim> crop( const Exp &src, Shape<2> oshape, index_t start_height, index_t start_width ) { - TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return CroppingExp::kDim>(src.self(), oshape, start_height, start_width); - } - - /*! - * \brief mirroring expression, mirror images in width - * \param src original image batches - * \return expression corresponding to mirrored result - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline MirroringExp::kDim> mirror(const Exp &src) { - TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return MirroringExp::kDim>(src.self()); - } - - /*! - * \brief channel pooling, do reduction over (local nearby) channels, used to implement local response normalization - * \param src source data - * \param nsize neighbor size - * \return expression of pooled result - * \tparam Reducer reducer type - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline ChannelPoolingExp::kDim > chpool( const Exp &src, index_t nsize ) { - TypeCheckPass< ExpInfo::kDim >= 3 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return ChannelPoolingExp::kDim >(src.self(),nsize); - } - // short cut functions - /*! - * \brief a expression that replicate a 1 dimension tensor for nrow times - * \param src Tensor: shape[0] - * \param nrow number of rows to replicate - * \return a expresion with type Tensor shape[0], shape[1] = nrow - * \tparam Device which device it lies - */ - template - inline Broadcast1DExp repmat( const Tensor &src, index_t nrow ){ - return broadcast<0>( src, Shape2( nrow, src.shape[0] ) ); - } - /*! - * \brief a expression that sum over rows of a matrix - * \param exp input expression that must be a matrix Tensor - * \return a expresion with type Tensor - * \tparam SrcExp expression - * \tparam etype type of expression - */ - template - inline ReduceTo1DExp sum_rows( const Exp &exp ){ - return sumall_except_dim<0>( exp ); - } - - }; // namespace expr -}; // namespace mshadow - -// ================================================== -// implementations afterwards, -// no need to read if only use the functions -// -------------------------------------------------- -namespace mshadow{ - namespace expr{ - template - struct ExpComplexEngine< SV, Device, 1, ReduceTo1DExp >{ - inline static void Eval( Tensor &dst, const ReduceTo1DExp &exp ){ - TypeCheckPass< dimkeep!=0 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - MapReduceKeepHighDim( dst, exp.src_, exp.scale_ ); - } - }; - - template - struct ExpComplexEngine< SV, Device, 1, ReduceTo1DExp >{ - inline static void Eval( Tensor &dst, const ReduceTo1DExp &exp ){ - MapReduceKeepLowest( dst, exp.src_, exp.scale_ ); - } - }; - }; // namespace expr - - namespace expr{ - /*! 
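// Illustrative sketch (mshadow-1.x style, real_t = float assumed; the
// Tensor<Device, dim> template arguments shown here were stripped from the
// listing above) of the repmat / sum_rows shortcuts, as typically used for
// the bias of a fully connected layer:
inline void BiasForwardBackward(Tensor<cpu, 2> out, Tensor<cpu, 1> bias,
                                Tensor<cpu, 2> gout, Tensor<cpu, 1> gbias) {
  out += repmat(bias, out.shape[1]);  // broadcast bias to every row of out
  gbias = sum_rows(gout);             // bias gradient: reduce over the rows
}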
\brief execution plan of Broadcast1DExp */ - template - struct Plan< Broadcast1DExp >{ - public: - Plan( const Broadcast1DExp &e ) - : dptr_( e.src_.dptr ), - ystride_( e.shape_.ProdShape(1,dimcast) ), - length_(e.shape_[dimcast]){ - TypeCheckPass< dimcast!=0 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - } - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return dptr_[ (y / ystride_) % length_ ]; - } - private: - const real_t *dptr_; - const index_t ystride_, length_; - }; - - /*! \brief execution plan of Broadcast1DExp */ - template - struct Plan< Broadcast1DExp >{ - public: - Plan( const Broadcast1DExp &e ): dptr_( e.src_.dptr ){} - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return dptr_[ x ]; - } - private: - const real_t *dptr_; - }; - }; // namespace expr - - namespace expr{ - template - struct Plan< UnpackPatchToColXExp >{ - public: - Plan( const UnpackPatchToColXExp &e ) - :src_(MakePlan(e.img_)),psize_(e.psize_), pstride_(e.pstride_), - i_channel_(e.i_channel_), i_height_(e.i_height_), i_width_(e.i_width_), - o_height_(( i_height_ - psize_ ) / pstride_ + 1), - o_width_ (( i_width_ - psize_ ) / pstride_ + 1){ - } - MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ - const index_t x_offset = i % psize_; - const index_t idivp = i / psize_; - const index_t y_offset = idivp % psize_; - const index_t c = idivp / psize_; - const index_t x = (j % o_width_) * pstride_ + x_offset; - const index_t jdivw = j / o_width_; - const index_t y = (jdivw % o_height_) * pstride_ + y_offset; - const index_t n = jdivw / o_height_; - - if( x < i_width_ && y < i_height_ ){ - return src_.Eval( ( n * i_channel_ + c ) * i_height_ + y, x ); - }else{ - return 0.0f; - } - } - private: - Plan src_; - const index_t psize_, pstride_, i_channel_, i_height_, i_width_, o_height_, o_width_; - }; - - template - struct Plan< PackColToPatchXExp >{ - public: - Plan( const PackColToPatchXExp &e ) - :mat_(e.mat_), psize_(e.psize_), pstride_(e.pstride_), - i_channel_(e.shape_[2]), i_height_(e.shape_[1]), - o_width_(( e.shape_[0] - psize_ ) / pstride_ + 1), - o_height_(( e.shape_[1] - psize_ ) / pstride_ + 1){ - // note: i/o convention are same as unpack - } - MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ - using namespace std; - const index_t y = i % i_height_; - const index_t idivh = i / i_height_; - const index_t c = idivh % i_channel_; - const index_t n = idivh / i_channel_; - const index_t x = j; - const index_t py_min = y < psize_ ? 0 : (y-psize_+pstride_)/pstride_; - const index_t px_min = x < psize_ ? 
0 : (x-psize_+pstride_)/pstride_; - const index_t py_max = min( (y+pstride_)/pstride_, o_height_); - const index_t px_max = min( (x+pstride_)/pstride_, o_width_ ); - real_t res = 0.0f; - for( index_t py = py_min; py < py_max; ++py ){ - for( index_t px = px_min; px < px_max; ++px ){ - res += mat_[ (c * psize_ + y - py*pstride_) * psize_ + x - px*pstride_ ][ (n * o_height_ + py) * o_width_+px ]; - } - } - return res; - } - private: - Tensor mat_; - const index_t psize_, pstride_, i_channel_, i_height_, o_width_, o_height_; - }; - }; - - namespace expr{ - template - struct Plan< ReshapeExp >{ - public: - Plan( const ReshapeExp &e ) - : src_(MakePlan(e.src_)), oshape0_(e.shape_[0]), ishape0_(e.ishape0_){ - } - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - const index_t idx = y * oshape0_ + x; - return src_.Eval( idx / ishape0_, idx % ishape0_ ); - } - private: - Plan src_; - const index_t oshape0_, ishape0_; - }; - // special work plan for 1 dimensional data - template - struct Plan< ReshapeExp >{ - public: - Plan( const ReshapeExp &e ) - : src_(MakePlan(e.src_)), oshape0_(e.shape_[0]){ - } - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return src_.Eval( 0, y * oshape0_ + x ); - } - private: - Plan src_; - const index_t oshape0_; - }; - }; - - namespace expr{ - template - struct Plan< SwapAxisExp >{ - public: - Plan( const SwapAxisExp &e ) - : src_(MakePlan(e.src_)), - shape1_( e.shape_.ProdShape( 1, a1 ) ), - shape2_( e.shape_[a1] ), - shape3_( e.shape_.ProdShape( a1+1, a2 ) ), - shape4_( e.shape_[a2] ){ - } - MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ - const index_t y = i % shape1_; - i /= shape1_; - const index_t z = i % shape2_; - i /= shape2_; - const index_t c = i % shape3_; - i /= shape3_; - const index_t n = i % shape4_; - // swap z and n - return src_.Eval( ((((i/shape4_)*shape2_ + z) * shape3_+c) * shape4_ + n ) * shape1_ + y, j ); - } - private: - Plan src_; - const index_t shape1_, shape2_, shape3_, shape4_; - }; - - template - struct Plan< SwapAxisExp >{ - public: - Plan( const SwapAxisExp &e ) - : src_(MakePlan(e.src_)), - shape0_( e.shape_[0] ), - shape1_( e.shape_.ProdShape(1,a2) ), - shape2_( e.shape_[a2] ){ - } - MSHADOW_XINLINE real_t Eval( index_t i, index_t x ) const{ - // swap x and z - const index_t y = i % shape1_; - i /= shape1_; - const index_t z = i % shape2_; - const index_t n = i / shape2_; - return src_.Eval( ( n*shape0_ + x ) * shape1_ + y , z ); - } - private: - Plan src_; - const index_t shape0_, shape1_, shape2_; - }; - }; - - namespace expr{ - template - struct Plan< PoolingExp< Reducer, SrcExp, srcdim> > { - public: - Plan( const PoolingExp &e ) - : src_( MakePlan( e.src_ ) ), ksize_(e.ksize_), kstride_(e.kstride_), - src_height_(e.src_height_),src_width_(e.src_width_), new_height_(e.shape_[1]) { - } - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { - using namespace std; - const index_t py = i % new_height_; - const index_t y_start = py * kstride_; - const index_t y_end = min( y_start + ksize_, src_height_ ); - const index_t px = j; - const index_t x_start = px * kstride_; - const index_t x_end = min( x_start + ksize_, src_width_ ); - const index_t c = i / new_height_; - - real_t res = Reducer::kInitV; - for (index_t y = y_start; y < y_end; ++y) { - for (index_t x = x_start; x < x_end; ++x) { - Reducer::Reduce( res, src_.Eval( c*src_height_+y, x ) ); - } - } - return res; - } - private: - Plan src_; - const index_t ksize_, kstride_; - const index_t src_height_, src_width_; - const index_t 
new_height_; - }; - - template - struct Plan > { - public: - Plan(const UnPoolingExp &e) - : data_src_(e.data_src_), data_pooled_(e.data_pooled_), grad_pooled_(e.grad_pooled_), - ksize_(e.ksize_), kstride_(e.kstride_) {} - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { - using namespace std; - const index_t x = j; - const index_t y = i % data_src_.shape[1]; - const index_t c = i / data_src_.shape[1]; - const real_t vsrc = data_src_[0][c][y][x]; - - const index_t py_min = y < ksize_ ? 0 : (y-ksize_+kstride_)/kstride_; - const index_t px_min = x < ksize_ ? 0 : (x-ksize_+kstride_)/kstride_; - const index_t py_max = min( (y+kstride_)/kstride_, data_pooled_.shape[1]); - const index_t px_max = min( (x+kstride_)/kstride_, data_pooled_.shape[0]); - - real_t val = 0; - for( index_t py = py_min; py < py_max; ++py ){ - for( index_t px = px_min; px < px_max; ++px ){ - val += Reducer::PartialGrad(vsrc, data_pooled_[0][c][py][px]) * grad_pooled_[0][c][py][px]; - } - } - return val; - } - private: - Tensor data_src_, data_pooled_, grad_pooled_; - const index_t ksize_; - const index_t kstride_; - }; - }; // namespace expr - - namespace expr{ - template - struct Plan< PaddingExp > { - public: - Plan(const PaddingExp &e) - : src_(MakePlan(e.src_)), pad_(e.pad_), new_height_(e.shape_[1]), - src_height_(e.src_height_), src_width_(e.src_width_) {} - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { - const index_t x = j; - const index_t y = i % new_height_; - const index_t c = i / new_height_; - if (y < pad_ || x < pad_) return 0.0f; - const index_t h = y - pad_; - const index_t w = x - pad_; - if (h < src_height_ && w < src_width_) { - return src_.Eval(c * src_height_ + h, w); - } else { - return 0.0f; - } - } - private: - Plan src_; - const index_t pad_; - const index_t new_height_; - const index_t src_height_; - const index_t src_width_; - }; - - template - struct Plan > { - public: - Plan(const CroppingExp &e) - : src_(MakePlan(e.src_)), pad_height_(e.pad_height_),pad_width_(e.pad_width_), - new_height_(e.shape_[1]), src_height_(e.src_height_) {} - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { - const index_t x = j; - const index_t y = i % new_height_; - const index_t c = i / new_height_; - const index_t h = y + pad_height_; - const index_t w = x + pad_width_; - return src_.Eval(c * src_height_ + h, w); - } - private: - Plan src_; - const index_t pad_height_, pad_width_; - const index_t new_height_; - const index_t src_height_; - }; - - template - struct Plan< MirroringExp > { - public: - Plan(const MirroringExp &e) - : src_(MakePlan(e.src_)), width_(e.shape_[0]){} - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { - return src_.Eval( i, width_ - j - 1 ); - } - private: - Plan src_; - const index_t width_; - }; - }; // namespace expr - - namespace expr{ - template - struct Plan< ChannelPoolingExp< Reducer, SrcExp, srcdim> > { - public: - Plan( const ChannelPoolingExp &e ) - : src_( MakePlan( e.src_ ) ), channel_(e.shape_[2]), - height_(e.shape_[1]),width_(e.shape_[0]), hnsize_(e.nsize_/2){ - } - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { - using namespace std; - const index_t y = i % height_; - i /= height_; - const index_t c = i % channel_; - const index_t n = i / channel_; - const index_t x = j; - const index_t cstart = c < hnsize_ ? 
0 : c - hnsize_; - const index_t cend = min( c + hnsize_ + 1, channel_ ); - real_t res = Reducer::kInitV; - for( index_t cc = cstart; cc < cend; ++ cc ){ - Reducer::Reduce( res, src_.Eval( (n*channel_+cc)*height_ + y, x ) ); - } - return res; - } - private: - Plan src_; - const index_t channel_, height_, width_, hnsize_; - }; - }; -}; // namespace mshadow - -#if MSHADOW_USE_SSE -// implementations of SSE support, if possible -#include "tensor_sse-inl.hpp" -namespace mshadow{ - namespace expr{ - template - struct SSECheck< Broadcast1DExp >{ - const static bool kPass = true; - }; - template - struct SSEAlignCheck<2, Broadcast1DExp >{ - inline static bool Check( const Broadcast1DExp &exp ){ - return sse2::CheckAlign( exp.src_.dptr ); - } - }; - template - class SSEPlan< Broadcast1DExp >{ - public: - SSEPlan( const Broadcast1DExp &t ) - :dptr_(t.src_.dptr){} - MSHADOW_CINLINE sse2::FVec EvalSSE( index_t y, index_t x ) const{ - return sse2::FVec( &dptr_[ x ] ); - } - MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{ - return dptr_[ x ]; - } - private: - const real_t *dptr_; - }; - }; -}; -#endif - -#endif - diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h new file mode 100644 index 000000000000..ffd203d33a1a --- /dev/null +++ b/mshadow/tensor_gpu-inl.h @@ -0,0 +1,174 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file tensor_gpu-inl.h + * \brief implementation of GPU host code + * \author Bing Xu, Tianqi Chen + */ +#ifndef MSHADOW_TENSOR_GPU_INL_H_ +#define MSHADOW_TENSOR_GPU_INL_H_ +#include "./base.h" +#include "./tensor.h" + +namespace mshadow { +#if MSHADOW_USE_CUDA +template<> +inline void InitTensorEngine(int dev_id) { + cudaDeviceProp prop; + int device_id = 0; + int device_count = 0; + cudaGetDeviceCount(&device_count); + utils::Check(device_count > 0, + "Cannot find CUDA device. 
Please check CUDA-Configuration"); + if (dev_id < 0) { + device_id = 0; + } else { + device_id = dev_id; + } + utils::Check(device_id < device_count, "Incorrect Device ID"); + utils::Check(cudaSetDevice(device_id) == cudaSuccess, "cannot set device"); + cudaGetDeviceProperties(&prop, device_id); + printf("Use CUDA Device %d: %s\n", device_id, prop.name); + cublasInit(); +} +template<> +inline void ShutdownTensorEngine(void) { + cublasShutdown(); +} +template<> +inline void SetDevice(int devid) { + utils::Check(cudaSetDevice(devid) == cudaSuccess, "cannot set device"); +} +template +inline void AllocSpace(Tensor *obj, bool pad) { + size_t pitch; + // common choice for cuda mem align unit is 32 + if (pad && obj->size(dim - 1) >= MSHADOW_MIN_PAD_RATIO * 32) { + cudaError_t err = + cudaMallocPitch(reinterpret_cast(&(obj->dptr_)), &pitch, + obj->size(dim - 1) * sizeof(DType), + obj->shape_.FlatTo2D()[0]); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + obj->stride_ = static_cast(pitch / sizeof(DType)); + } else { + obj->stride_ = obj->size(dim - 1); + cudaError_t err = + cudaMallocPitch(reinterpret_cast(&(obj->dptr_)), &pitch, + obj->shape_.Size() * sizeof(DType), 1); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + } +} +template +inline void FreeSpace(Tensor *obj) { + cudaFree(obj->dptr_); obj->dptr_ = NULL; +} +template +inline void Copy(Tensor _dst, + Tensor _src, + cudaMemcpyKind kind, + Stream *stream) { + utils::Check(_dst.shape_ == _src.shape_, "Copy:shape mismatch"); + Tensor dst = _dst.FlatTo2D(); + Tensor src = _src.FlatTo2D(); + cudaError_t err = cudaMemcpy2DAsync(dst.dptr_, dst.stride_ * sizeof(DType), + src.dptr_, src.stride_ * sizeof(DType), + dst.size(1) * sizeof(DType), + dst.size(0), kind, + Stream::GetStream(stream)); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + // use synchronize call behavior for zero stream + if (stream == NULL) { + err = cudaStreamSynchronize(0); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + } +} +template +inline void Copy(Tensor dst, + const Tensor &src, + Stream *stream) { + Copy(dst, src, cudaMemcpyDeviceToHost, stream); +} +template +inline void Copy(Tensor dst, + const Tensor &src, + Stream *stream) { + Copy(dst, src, cudaMemcpyDeviceToDevice, stream); +} +template +inline void Copy(Tensor dst, + const Tensor &src, + Stream *stream) { + Copy(dst, src, cudaMemcpyHostToDevice, stream); +} +#endif // MSHADOW_USE_CUDA +} // namespace mshadow + +// the following part is included only if compiler is nvcc +#ifdef __CUDACC__ +#include "./cuda/tensor_gpu-inl.cuh" + +namespace mshadow { +template +inline void MapExp(TRValue *dst, + const expr::Exp &exp) { + expr::TypeCheckPass::kMapPass> + ::Error_All_Tensor_in_Exp_Must_Have_Same_Type(); + Shape eshape = expr::ShapeCheck::Check(exp.self()); + Shape dshape = expr::ShapeCheck::Check(dst->self()); + utils::Check(eshape[0] == 0 || eshape == dshape, + "Assignment: Shape of Tensors are not consistent with target"); + cuda::MapPlan(MakePlan(dst->self()), + MakePlan(exp.self()), + dshape.FlatTo2D(), + Stream::GetStream(expr::StreamInfo::Get(dst->self()))); +} + +template +inline void MapReduceKeepLowest(TRValue *dst, + const expr::Exp &exp, + DType scale) { + expr::TypeCheckPass::kRedPass> + ::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); + Shape<2> eshape = expr::ShapeCheck::kDim, E> + ::Check(exp.self()).FlatTo2D(); + Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self()); + utils::Check(eshape[1] == dshape[0], + "MapReduceKeepLowest::reduction 
dimension do not match"); + utils::Check(eshape[0] != 0, "can not reduce over empty tensor"); + cuda::MapReduceKeepLowest + (MakePlan(dst->self()), MakePlan(exp.self()), scale, eshape, + Stream::GetStream(expr::StreamInfo::Get(dst->self()))); +} + +template +inline void MapReduceKeepHighDim(TRValue *dst, + const expr::Exp &exp, + DType scale) { + expr::TypeCheckPass::kRedPass> + ::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); + typedef Shape::kDim> EShape; + EShape eshape = expr::ShapeCheck::kDim, E> + ::Check(exp.self()); + Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self()); + utils::Check(eshape[dimkeep] == dshape[0], + "MapReduceKeepHighDim::reduction dimension do not match"); + // use equvalent form + Shape<4> pshape = Shape4(eshape.ProdShape(0, dimkeep), + eshape[dimkeep], + eshape.ProdShape(dimkeep + 1, EShape::kSubdim), + eshape[EShape::kSubdim]); + // call equavalent map red dim 2 + cuda::MapReduceKeepDim1 + (MakePlan(dst->self()), MakePlan(exp.self()), scale, pshape, + Stream::GetStream(expr::StreamInfo::Get(dst->self()))); +} +template +inline void Softmax(Tensor dst, + const Tensor& src) { + cuda::Softmax(dst, src); +} +} // namespace mshadow +#endif // __CUDACC__ +#endif // MSHADOW_TENSOR_GPU_INL_H_ diff --git a/mshadow/tensor_gpu-inl.hpp b/mshadow/tensor_gpu-inl.hpp deleted file mode 100644 index a2c1fc4a138f..000000000000 --- a/mshadow/tensor_gpu-inl.hpp +++ /dev/null @@ -1,148 +0,0 @@ -#ifndef MSHADOW_TENSOR_GPU_INL_HPP -#define MSHADOW_TENSOR_GPU_INL_HPP -/*! - * \file tensor_gpu-inl.hpp - * \brief implementation of GPU host code - * \author Bing Xu, Tianqi Chen - */ -#include "tensor.h" - -#if !(MSHADOW_USE_CUDA) -namespace mshadow { - // do nothing if no GPU operation is involved - inline void InitTensorEngine( int dev_id ){ - } - inline void ShutdownTensorEngine( void ){ - } -}; -#else -namespace mshadow { - #if (MSHADOW_USE_NVML) - inline int AutoSelectDevice(int device_count) { - // TODO nvml device id and cuda device id are not consistent - return 0; - } - #endif - inline void InitTensorEngine(int dev_id){ - cudaDeviceProp prop; - int device_id = 0; - int device_count = 0; - cudaGetDeviceCount(&device_count); - utils::Assert(device_count > 0, "Cannot find CUDA device. 
Please check CUDA-Configuration"); - if (dev_id < 0) { - #if (MSHADOW_USE_NVML) - device_id = AutoSelectDevice(device_count); - #endif - } else { - device_id = dev_id; - } - utils::Assert( device_id < device_count, "Incorrect Device ID" ); - utils::Assert( cudaSetDevice(device_id) == cudaSuccess, "cannot set device" ); - cudaGetDeviceProperties(&prop, device_id); - printf("Use CUDA Device %d: %s\n", device_id, prop.name); - cublasInit(); - } - inline void ShutdownTensorEngine( void ){ - cublasShutdown(); - } - - template - inline void AllocSpace(Tensor &obj, bool pad){ - size_t pitch; - // common choice for cuda mem align unit is 32 - if( pad && obj.shape[0] >= MSHADOW_MIN_PAD_RATIO * 32 ){ - cudaError_t err = cudaMallocPitch( (void**)&obj.dptr, &pitch, \ - obj.shape[0] * sizeof(real_t), obj.FlatTo2D().shape[1] ); - utils::Assert( err == cudaSuccess, cudaGetErrorString(err) ); - obj.shape.stride_ = static_cast( pitch / sizeof(real_t) ); - }else{ - obj.shape.stride_ = obj.shape[0]; - cudaError_t err = cudaMallocPitch( (void**)&obj.dptr, &pitch, \ - obj.shape.Size() * sizeof(real_t), 1 ); - utils::Assert( err == cudaSuccess, cudaGetErrorString(err) ); - } - } - - template - inline void FreeSpace(Tensor &obj){ - cudaFree( obj.dptr ); obj.dptr = NULL; - } - - template - inline void Copy(Tensor _dst, Tensor _src, cudaMemcpyKind kind){ - utils::Assert( _dst.shape == _src.shape, "Copy:shape mismatch" ); - Tensor dst = _dst.FlatTo2D(); - Tensor src = _src.FlatTo2D(); - cudaError_t err = cudaMemcpy2D( dst.dptr, dst.shape.stride_ * sizeof(real_t), - src.dptr, src.shape.stride_ * sizeof(real_t), - dst.shape[0] * sizeof(real_t), - dst.shape[1], kind ); - utils::Assert( err == cudaSuccess, cudaGetErrorString(err) ); - } - template - inline void Copy(Tensor dst, const Tensor &src){ - Copy( dst, src, cudaMemcpyDeviceToHost ); - } - template - inline void Copy(Tensor dst, const Tensor &src){ - Copy( dst, src, cudaMemcpyDeviceToDevice ); - } - template - inline void Copy(Tensor dst, const Tensor &src){ - Copy( dst, src, cudaMemcpyHostToDevice ); - } -}; - -#ifdef __CUDACC__ -// the following part is included only if compiler is nvcc -#include "cuda/tensor_gpu-inl.cuh" - -namespace mshadow{ - template - inline void MapPlan(Tensor _dst, const expr::Plan &plan){ - cuda::MapPlan( _dst.FlatTo2D(), plan ); - } - - template - inline void MapExp(Tensor dst, const expr::Exp &exp ){ - using namespace expr; - TypeCheckPass< TypeCheck::kMapPass >::Error_All_Tensor_in_Exp_Must_Have_Same_Type(); - Shape eshape = ShapeCheck::Check( exp.self() ); - utils::Assert( eshape[0] == 0 || eshape == dst.shape, "Assignment: Shape of Tensors in expression is not consistent with target" ); - MapPlan( dst, MakePlan( exp.self() ) ); - } - - template - inline void MapReduceKeepLowest( Tensor dst, const expr::Exp &exp, real_t scale ){ - using namespace expr; - TypeCheckPass< TypeCheck::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); - Shape<2> eshape = ShapeCheck< ExpInfo::kDim, E >::Check( exp.self() ).FlatTo2D(); - - utils::Assert( eshape[0] == dst.shape[0], "reduction dimension do not match" ); - utils::Assert( eshape[1] != 0, "can not reduce over empty tensor" ); - cuda::MapReduceKeepLowest( dst, MakePlan( exp.self() ), scale, eshape ); - } - - template - inline void MapReduceKeepHighDim( Tensor dst, const expr::Exp &exp, real_t scale ){ - using namespace expr; - TypeCheckPass< TypeCheck::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); - typedef Shape< ExpInfo::kDim > EShape; - EShape eshape = ShapeCheck< ExpInfo::kDim, 
E >::Check( exp.self() ); - utils::Assert( eshape[dimkeep] == dst.shape[0], "reduction dimension do not match" ); - // use equvalent form - Shape<4> pshape = Shape4( eshape.ProdShape(dimkeep+1,EShape::kMaxShape), eshape[dimkeep], - eshape.ProdShape(1,dimkeep), eshape[0] ); - // call equavalent map red dim 2 - cuda::MapReduceKeepDim2( dst, MakePlan( exp.self() ), scale, pshape ); - } - - inline void Softmax( Tensor dst, const Tensor& src ){ - cuda::Softmax( dst, src ); - } -}; // namespace mshadow - -#endif // __CUDACC__ - -#endif // MSHADOW_USE_CUDA -#endif // TENSOR_GPU_INL_HPP diff --git a/mshadow/tensor_io.h b/mshadow/tensor_io.h deleted file mode 100644 index 2ce28b3a75e6..000000000000 --- a/mshadow/tensor_io.h +++ /dev/null @@ -1,137 +0,0 @@ -#ifndef MSHADOW_TENSOR_IO_H -#define MSHADOW_TENSOR_IO_H -/*! - * \file tensor_io.h - * \brief definitions of I/O functions for mshadow tensor - * \author Tianqi Chen - */ -#include -#include "tensor.h" - -namespace mshadow{ - namespace utils{ - /*! - * \brief interface of stream I/O, used to serialize data, - * it is not restricted to only this interface in SaveBinary/LoadBinary - * mshadow accept all class that implements Read and Write - */ - class IStream{ - public: - /*! - * \brief read data from stream - * \param ptr pointer to memory buffer - * \param size size of block - * \return usually is the size of data readed - */ - virtual size_t Read( void *ptr, size_t size ) = 0; - /*! - * \brief write data to stream - * \param ptr pointer to memory buffer - * \param size size of block - */ - virtual void Write( const void *ptr, size_t size ) = 0; - /*! \brief virtual destructor */ - virtual ~IStream( void ){} - }; - }; - - /*! - * \brief CPU/GPU: save a tensor by binary format, for GPU version, a temp Tensor storage will be allocated - * \param fo output binary stream - * \param src source data file - * \tparam dim dimension of tensor - * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. - */ - template - inline void SaveBinary( TStream &fo, const Tensor &src ); - /*! \brief refer to comment of cpu ver \sa SaveBinary */ - template - inline void SaveBinary( TStream &fo, const Tensor &src ); - - /*! - * \brief CPU/GPU: load a tensor by binary format, for GPU version, a temp Tensor storage will be allocated - * if pre_alloc is true , then space in dst is preallocated, and must have same shape of the tensor loaded - * if pre_alloc is false, then dst originally does not have space allocated, LoadBinary will allocate space for dst - * \param fi output binary stream - * \param dst destination file - * \param pre_alloc whether space is pre-allocated, if false, space allocation will happen - * \tparam dim dimension of tensor - * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. - */ - template - inline void LoadBinary( TStream &fi, Tensor &dst, bool pre_alloc ); - /*! \brief refer to comment of cpu ver \sa LoadBinary */ - template - inline void LoadBinary( TStream &fi, Tensor &dst, bool pre_alloc ); - - namespace utils{ - /*! \brief implementation of file i/o stream */ - class FileStream: public IStream{ - public: - /*! \brief constructor */ - FileStream( FILE *fp ):fp_(fp){} - virtual size_t Read( void *ptr, size_t size ){ - return fread( ptr, size, 1, fp_ ); - } - virtual void Write( const void *ptr, size_t size ){ - fwrite( ptr, size, 1, fp_ ); - } - /*! 
\brief close file */ - inline void Close( void ){ - fclose( fp_ ); - } - private: - FILE *fp_; - }; - }; -}; - -namespace mshadow{ - // implementations - template - inline void SaveBinary( TStream &fo, const Tensor &src_ ){ - fo.Write( src_.shape.shape_, sizeof(index_t) * dim ); - Tensor src = src_.FlatTo2D(); - for( index_t i = 0; i < src.shape[1]; ++ i ){ - fo.Write( src[i].dptr, sizeof(real_t)*src.shape[0] ); - } - } - template - inline void SaveBinary( TStream &fo, const Tensor &src ){ - // copy to CPU, then save - Tensor tmp( src.shape ); - AllocSpace( tmp ); - Copy( tmp, src ); - SaveBinary( fo, tmp ); - FreeSpace( tmp ); - } - - template - inline void LoadBinary( TStream &fi, Tensor &dst_, bool pre_alloc ){ - Shape shape; - utils::Assert( fi.Read( shape.shape_, sizeof(index_t) * dim ) != 0, "mshadow::LoadBinary" ); - if( pre_alloc ){ - utils::Assert( shape == dst_.shape ); - }else{ - dst_.shape = shape; AllocSpace( dst_ ); - } - Tensor dst = dst_.FlatTo2D(); - if( dst.shape[0] == 0 ) return; - for( index_t i = 0; i < dst.shape[1]; ++ i ){ - utils::Assert( fi.Read( dst[i].dptr, sizeof(real_t)*dst.shape[0] ) != 0, "mshadow::LoadBinary" ); - } - } - template - inline void LoadBinary( TStream &fi, Tensor &dst, bool pre_alloc ){ - Tensor tmp; - LoadBinary( fi, tmp, false ); - if( pre_alloc ){ - utils::Assert( tmp.shape == dst.shape ); - }else{ - dst.shape = tmp.shape; AllocSpace( dst ); - } - Copy( dst, tmp ); - FreeSpace( tmp ); - } -}; -#endif // TENSOR_IO_H diff --git a/mshadow/tensor_random.h b/mshadow/tensor_random.h deleted file mode 100644 index b3f0b8498e0c..000000000000 --- a/mshadow/tensor_random.h +++ /dev/null @@ -1,299 +0,0 @@ -#ifndef MSHADOW_TENSOR_RANDOM_H -#define MSHADOW_TENSOR_RANDOM_H -/*! - * \file tensor_random.h - * \brief Random inline functions for tensor. - * \author Bing Xu, Tianqi Chen - * Based on curand|MKL|stdlib - */ -#include -#include "tensor.h" -#include "tensor_container.h" - -namespace mshadow { - /*! - * \brief random number generator - * \tparam Device the device of random number generator - */ - template - class Random {}; - - /*! \brief CPU random number generator */ - template<> - class Random { - public: - /*! - * \brief constructor of random engine - * \param seed random number seed - */ - Random( int seed ){ - #if MSHADOW_USE_MKL - int status = vslNewStream(&vStream_, VSL_BRNG_MT19937, seed); - utils::Assert( status == VSL_STATUS_OK, "MKL VSL Random engine failed to be initialized.\n" ); - #else - srand(seed); - #endif - buffer_.Resize( Shape1( kRandBufferSize ) ); - } - ~Random() { - #if MSHADOW_USE_MKL - vslDeleteStream(&vStream_); - #endif - } - /*! - * \brief seed random number generator using this seed - * \param seed seed of prng - */ - inline void Seed( int seed ){ - #if MSHADOW_USE_MKL - int status = vslDeleteStream(&vStream_); - utils::Assert(status == VSL_STATUS_OK); - status = vslNewStream(&vStream_, VSL_BRNG_MT19937, seed); - utils::Assert(status == VSL_STATUS_OK); - #else - srand( seed ); - #endif - } - /*! 
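// Illustrative sketch (legacy mshadow-1.x API assumed) of the tensor_io.h
// interface removed above: wrap a FILE* in utils::FileStream and hand it to
// SaveBinary / LoadBinary.
#include <cstdio>
inline void SaveWeight(const Tensor<cpu, 2> &weight, const char *fname) {
  FILE *fp = fopen(fname, "wb");
  utils::FileStream fs(fp);
  SaveBinary(fs, weight);   // writes the shape header, then row by row
  fs.Close();
}
inline void LoadWeight(Tensor<cpu, 2> &weight, const char *fname) {
  FILE *fp = fopen(fname, "rb");
  utils::FileStream fs(fp);
  LoadBinary(fs, weight, false);  // false: LoadBinary allocates the space
  fs.Close();
}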
- * \brief generate data from uniform [a,b) - * \param dst destination - * \param a lower bound of uniform - * \param b upper bound of uniform - * \tparam dim dimension of tensor - */ - template - inline void SampleUniform( Tensor &dst, real_t a=0.0f, real_t b=1.0f ) { - Tensor mat = dst.FlatTo2D(); - for ( index_t i = 0; i < mat.shape[1]; ++i ) { - #if MSHADOW_USE_MKL - #if MSHADOW_SINGLE_PRECISION - int status = vsRngUniform( 0, vStream_, mat.shape[0], mat[i].dptr, a, b ); - #else - int status = vdRngUniform( 0, vStream_, mat.shape[0], mat[i].dptr, a, b ); - #endif - utils::Assert(status == VSL_STATUS_OK, "Failed to generate random number by MKL.\n" ); - #else - // use stdlib - for ( index_t j = 0; j < mat.shape[0]; ++j ) { - mat[i][j] = this->RandNext()*(b-a) + a; - } - #endif - } - } - /*! - * \brief generate data from standard gaussian - * \param dst destination - * \param mu mean variable - * \param sigma standard deviation - * \tparam dim dimension of tensor - */ - template - inline void SampleGaussian( Tensor &dst, real_t mu = 0.0f, real_t sigma = 1.0f ) { - if( sigma <= 0.0f ) { - dst = mu; return; - } - Tensor mat = dst.FlatTo2D(); - for (index_t i = 0; i < mat.shape[1]; ++i) { - #if MSHADOW_USE_MKL - #if MSHADOW_SINGLE_PRECISION - int status = vsRngGaussian( 0, vStream_, mat.shape[0], mat[i].dptr, mu, sigma ); - #else - int status = vdRngGaussian( 0, vStream_, mat.shape[0], mat[i].dptr, mu, sigma ); - #endif - utils::Assert(status == VSL_STATUS_OK, "Failed to generate random number by MKL.\n" ); - #else - real_t g1 = 0.0f, g2 = 0.0f; - for (index_t j = 0; j < mat.shape[0]; ++j) { - if( (j & 1) == 0 ){ - this->SampleNormal2D( g1, g2 ); - mat[i][j] = mu + g1 * sigma; - }else{ - mat[i][j] = mu + g2 * sigma; - } - } - #endif - } - } - /*! - * \brief return a temporal expression storing standard gaussian random variables - * the temporal tensor is only valid before next call of gaussian or uniform - * can be used as part of expression - * Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result, - * since second call of gaussian(s2) makes gaussian(s1) invalid - * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression - * \param shape shape of the tensor - * \tparam dim dimension of tensor - */ - template - inline expr::ReshapeExp,dim,1> gaussian( Shape shape ){ - buffer_.Resize( Shape1( shape.Size() ) ); - this->SampleGaussian( buffer_, 0.0f, 1.0f ); - return expr::reshape( buffer_, shape ); - } - /*! - * \brief return a temporal expression storing standard uniform [0,1) - * the temporal tensor is only valid before next call of gaussian or uniform - * can be used as part of expression - * Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result, - * since second call of gaussian(s2) makes gaussian(s1) invalid - * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression - * \param shape shape of the tensor - * \tparam dim dimension of tensor - */ - template - inline expr::ReshapeExp,dim,1> uniform( Shape shape ){ - buffer_.Resize( Shape1( shape.Size() ) ); - this->SampleUniform( buffer_, 0.0f, 1.0f ); - return expr::reshape( buffer_, shape ); - } - private: - /*! \brief get next random number from rand */ - inline real_t RandNext( void ){ - return static_cast(rand()) / (static_cast(RAND_MAX)+1.0f); - } - /*! 
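// Illustrative sketch (mshadow-1.x style) of the caveat documented above:
// gaussian() / uniform() return an expression backed by one internal buffer,
// so use at most one of them per assignment.
inline void InitWeight(Random<cpu> &rnd, Tensor<cpu, 2> weight) {
  weight = rnd.gaussian(weight.shape) * 0.01f;          // OK: one call
  // weight = rnd.gaussian(weight.shape) * rnd.gaussian(weight.shape);
  // ^ would be wrong: the second call invalidates the buffer backing the first.
}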
\brief return a real numer uniform in (0,1) */ - inline real_t RandNext2( void ){ - return (static_cast( rand() ) + 1.0 ) / (static_cast(RAND_MAX) + 2.0); - } - /*! - * \brief sample iid xx,yy ~N(0,1) - * \param xx first gaussian output - * \param yy second gaussian output - */ - inline void SampleNormal2D( real_t &xx, real_t &yy ){ - real_t x,y,s; - do{ - x = 2.0f * RandNext2() - 1.0f; - y = 2.0f * RandNext2() - 1.0f; - s = x*x + y*y; - }while( s >= 1.0f || s == 0.0f ); - real_t t = std::sqrt( -2.0f * std::log( s ) / s ) ; - xx = x * t; yy = y * t; - } - private: - #if MSHADOW_USE_MKL - /*! \brief stream used by MKL VSL */ - VSLStreamStatePtr vStream_; - #endif - /*! \brief temporal space used to store random numbers */ - TensorContainer buffer_; - }; // class Random - -#ifdef __CUDACC__ - - /*! \brief GPU random number generator */ - template<> - class Random { - public: - /*! - * \brief constructor of random engine - * \param seed random number seed - */ - Random(int seed) { - curandStatus_t status; - status = curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT); - utils::Assert(status == CURAND_STATUS_SUCCESS, "Can not create CURAND Generator"); - this->Seed( seed ); - buffer_.Resize( Shape1(kRandBufferSize) ); - } - - ~Random() { - curandStatus_t status; - status = curandDestroyGenerator(gen_); - utils::Assert(status == CURAND_STATUS_SUCCESS, "Destory CURAND Gen failed"); - } - /*! - * \brief seed random number generator using this seed - * \param seed seed of prng - */ - inline void Seed( int seed ){ - curandStatus_t status; - status = curandSetPseudoRandomGeneratorSeed(gen_, seed); - utils::Assert(status == CURAND_STATUS_SUCCESS, "Set CURAND seed failed."); - } - /*! - * \brief generate data from uniform [a,b) - * \param dst destination - * \param a lower bound of uniform - * \param b upper bound of uniform - * \tparam dim dimension of tensor - */ - template - inline void SampleUniform(Tensor &dst, real_t a=0.0f, real_t b=1.0f) { - if( a == 0.0f && b == 1.0f ){ - dst = this->uniform( dst.shape ); - }else{ - dst = this->uniform( dst.shape ) *(b-a) + a; - } - } - /*! - * \brief generate data from standard gaussian - * \param dst destination - * \param mu mean variable - * \param sigma standard deviation - * \tparam dim dimension of tensor - */ - template - inline void SampleGaussian(Tensor &dst, real_t mu = 0.0f, real_t sigma = 1.0f) { - dst = this->gaussian( dst.shape, mu, sigma ); - } - /*! 
- * \brief return a temporal expression storing standard gaussian random variables - * the temporal tensor is only valid before next call of gaussian or uniform - * can be used as part of expression - * Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result, - * since second call of gaussian(s2) makes gaussian(s1) invalid - * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression - * \param shape shape of the tensor - * \param mu mean - * \param sigma variance - * \tparam dim dimension of tensor - */ - template - inline expr::ReshapeExp,dim,1> gaussian( Shape shape, real_t mu=0.0f, real_t sigma=1.0f){ - size_t aligned_sz = ((shape.Size() + 1UL)>>1)<<1; - // allocate alligned size - buffer_.Resize( Shape1( aligned_sz ) ); - buffer_.Resize( Shape1( shape.Size() ) ); - curandStatus_t status; - #if MSHADOW_SINGLE_PRECISION - status = curandGenerateNormal(gen_, buffer_.dptr, aligned_sz , mu, sigma); - #else - status = curandGenerateNormalDouble(gen_, buffer_.dptr, buffer_.shape[0], mu, sigma); - #endif - utils::Assert(status == CURAND_STATUS_SUCCESS, "CURAND Gen Uniform failed\n"); - return expr::reshape( buffer_, shape ); - } - /*! - * \brief return a temporal expression storing standard uniform [0,1) - * the temporal tensor is only valid before next call of gaussian or uniform - * can be used as part of expression - * Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result, - * since second call of gaussian(s2) makes gaussian(s1) invalid - * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression - * \param shape shape of the tensor - * \tparam dim dimension of tensor - */ - template - inline expr::ReshapeExp,dim,1> uniform(Shape shape) { - buffer_.Resize( Shape1( shape.Size() ) ); - curandStatus_t status; - #if MSHADOW_SINGLE_PRECISION - status = curandGenerateUniform(gen_, buffer_.dptr, buffer_.shape[0] ); - #else - status = curandGenerateUniformDouble(gen_, buffer_.dptr, buffer_.shape[0] ); - #endif - utils::Assert(status == CURAND_STATUS_SUCCESS, "CURAND Gen Uniform failed\n"); - return expr::reshape( buffer_, shape ); - } - private: - /*! \brief random numbeer generator */ - curandGenerator_t gen_; - /*! \brief templ buffer */ - TensorContainer buffer_; - }; // class Random - #endif - -}; // namespace mshadow - -#endif // MSHADOW_TENSOR_RANDOM_H diff --git a/mshadow/tensor_sse-inl.hpp b/mshadow/tensor_sse-inl.hpp deleted file mode 100644 index b98383e83d6a..000000000000 --- a/mshadow/tensor_sse-inl.hpp +++ /dev/null @@ -1,431 +0,0 @@ -#ifndef MSHADOW_TENSOR_SSE_INL_HPP -#define MSHADOW_TENSOR_SSE_INL_HPP -/*! - * \file tensor_sse-inl.hpp - * \brief support of sse2 optimization of some operations - * \author Tianqi Chen - */ -#ifdef __APPLE__ -#include -#else -#include -#endif - -#include "tensor_expr.h" -#include "tensor.h" - -namespace mshadow { - /*! \brief namespace to support sse2 vectorization */ - namespace sse2{ - /*! 
- * \brief analog to cudaMallocPitch, allocate a aligned space with num_line * lspace cells - * \param pitch output parameter, the actuall space allocated for each line - * \param lspace number of cells required for each line - * \param num_line number of lines to be allocated - */ - inline void* AlignedMallocPitch( size_t &pitch, size_t lspace, size_t num_line ){ - pitch = ((lspace+15) >> 4) << 4; - #ifdef _MSC_VER - void * res = _aligned_malloc( pitch*num_line, 16 ); - #else - #ifdef __APPLE__ - void *res = malloc( pitch * num_line ); - #else - void * res = memalign( 16, pitch*num_line ); - #endif - #endif - utils::Assert( res != NULL, "AlignedMallocPitch failed" ); - return res; - } - /*! - * \brief free aligned space - * \param ptr pointer to space to be freed - */ - inline void AlignedFree( void *ptr ){ - #ifdef _MSC_VER - _aligned_free( ptr ); - #else - free( ptr ); - #endif - } - /*! \brief check if a pointer is aligned */ - inline bool CheckAlign( size_t pitch ){ - return !(pitch & ((1<<4)-1)); - } - /*! \brief check if a pointer is aligned */ - inline bool CheckAlign( void *ptr ){ - return CheckAlign( (size_t)ptr ); - } - /*! - * \brief get upper bound of aligned index of size - * \param size size of the array - * \param fsize size of float - */ - inline index_t UpperAlign( index_t size, size_t fsize ){ - return (( (size*fsize+15) >> 4 ) << 4) / fsize; - } - /*! - * \brief get lower bound of aligned index of size - * \param size size of the array - * \param fsize size of float - */ - inline index_t LowerAlign( index_t size, size_t fsize ){ - return (( (size*fsize) >> 4 ) << 4) / fsize; - } - }; // namespace sse2 -}; // namespace mshadow - -#if MSHADOW_USE_SSE -// sse types are not compatible with nvcc, only use them in cpu mode -#include - -namespace mshadow{ - namespace sse2{ - /*! - * \brief float vector real type, used for vectorization - * \tparam FloatType double or float - */ - template struct FVec{}; - - /*! \brief vector real type for float */ - template<> - struct FVec { - public: - typedef __m128 DType; - /*! \brief number of float in vector */ - const static index_t kSize = 4; - /*! \brief data content */ - DType data_; - public: - /* constructors */ - FVec( void ){} - FVec( DType data ):data_(data){} - /* set the float */ - FVec( const float &s ){ - data_ = _mm_set1_ps( s ); - } - /*!\brief load from pointer src */ - FVec( const float *src ){ - data_ = _mm_load_ps( src ); - } - public: - /*! \brief store data into dst space */ - inline void Store( float *dst ) const{ - return _mm_store_ps( dst, data_ ); - } - /*! \brief sum of all content */ - inline float Sum( void ) const{ - DType ans = _mm_add_ps( data_, _mm_movehl_ps( data_, data_ ) ); - DType rst = _mm_add_ss( ans, _mm_shuffle_ps( ans, ans, 1 ) ); - #if defined(_MSC_VER) && ( _MSC_VER <= 1500 ) && defined(_WIN64) - return rst.m128_f32[ 0 ]; - #else - float rr = _mm_cvtss_f32( rst ) ; - return rr; - #endif - } - }; - - /*! \brief vector real type for float */ - template<> - struct FVec { - public: - typedef __m128d DType; - /*! \brief number of float in vector */ - const static index_t kSize = 2; - /*! \brief data content */ - DType data_; - public: - /* constructors */ - FVec( void ){} - FVec( DType data ):data_(data){} - /* set the float */ - FVec( const double &s ){ - data_ = _mm_set1_pd( s ); - } - /*!\brief load from pointer src */ - FVec( const double *src ){ - data_ = _mm_load_pd( src ); - } - public: - /*! 
\brief store data into dst space */ - inline void Store( double *dst ) const{ - return _mm_store_pd( dst, data_ ); - } - /*! \brief sum of all content */ - inline double Sum( void ) const{ - DType tmp = _mm_add_sd( data_, _mm_unpackhi_pd( data_,data_ ) ) ; - #if defined(_MSC_VER) && ( _MSC_VER <= 1500 ) && defined(_WIN64) - return tmp.m128d_f64[0]; - #else - double ans = _mm_cvtsd_f64( tmp ); - return ans; - #endif - } - }; - }; - - namespace sse2{ - /*! \brief sse2 operator type of certain operator */ - template - struct SSEOp{ - const static bool kEnabled = false; - }; - template<> - struct SSEOp{ - const static bool kEnabled = true; - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_add_ps( lhs.data_, rhs.data_ ) ); - } - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_add_pd( lhs.data_, rhs.data_ ) ); - } - }; - template<> - struct SSEOp{ - const static bool kEnabled = true; - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_sub_ps( lhs.data_, rhs.data_ ) ); - } - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_sub_pd( lhs.data_, rhs.data_ ) ); - } - }; - template<> - struct SSEOp{ - const static bool kEnabled = true; - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_mul_ps( lhs.data_, rhs.data_ ) ); - } - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_mul_pd( lhs.data_, rhs.data_ ) ); - } - }; - template<> - struct SSEOp{ - const static bool kEnabled = true; - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_div_ps( lhs.data_, rhs.data_ ) ); - } - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_div_pd( lhs.data_, rhs.data_ ) ); - } - }; - - template<> - struct SSEOp{ - const static bool kEnabled = true; - MSHADOW_CINLINE static FVec Map( const FVec &src ){ - return src; - } - MSHADOW_CINLINE static FVec Map( const FVec &src ){ - return src; - } - }; - }; // namespace sse2 - - namespace sse2{ - // savers to do storage - template - struct Saver{ - MSHADOW_CINLINE static void Save( TFloat *dst, const FVec &src ){ - FVec lhs( dst ); - FVec ans = SSEOp::Map( lhs, src ); - ans.Store( dst ); - } - }; - template - struct Saver{ - MSHADOW_CINLINE static void Save( TFloat *dst, const FVec &src ){ - src.Store( dst ); - } - }; - }; // namespace sse2 -}; // namespace mshadow - -namespace mshadow{ - namespace expr{ - // same as plan, but use sse2 - template - class SSEPlan { - public: - /*! 
- * \brief evaluate the expression at index [y][x], x will be aligned to 4 - * to be implemented by SubType - */ - MSHADOW_CINLINE sse2::FVec EvalSSE( index_t y, index_t x ) const; - MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const; - }; - - template - class SSEPlan< Tensor >{ - public: - SSEPlan( const Tensor &t ) - :dptr_(t.dptr),stride_(t.shape.stride_){} - MSHADOW_CINLINE sse2::FVec EvalSSE( index_t y, index_t x ) const{ - return sse2::FVec( &dptr_[ y*stride_+x ] ); - } - MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{ - return dptr_[ y * stride_ + x ]; - } - private: - const real_t *dptr_; - index_t stride_; - }; - - template<> - class SSEPlan{ - public: - SSEPlan( real_t scalar ):scalar_(scalar){} - MSHADOW_CINLINE sse2::FVec EvalSSE( index_t y, index_t x ) const{ - return sse2::FVec( scalar_ ); - } - MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{ - return scalar_; - } - private: - real_t scalar_; - }; - - template - class SSEPlan< BinaryMapExp >{ - public: - SSEPlan( const SSEPlan &lhs, const SSEPlan &rhs ) - :lhs_(lhs), rhs_(rhs){} - MSHADOW_CINLINE sse2::FVec EvalSSE( index_t y, index_t x ) const{ - return sse2::SSEOp::Map( lhs_.EvalSSE( y, x ), rhs_.EvalSSE( y, x ) ); - } - MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{ - return OP::Map( lhs_.Eval( y, x ), rhs_.Eval( y, x ) ); - } - private: - SSEPlan lhs_; - SSEPlan rhs_; - }; - - template - class SSEPlan< UnaryMapExp >{ - public: - SSEPlan( const SSEPlan &src ):src_(src){} - MSHADOW_CINLINE sse2::FVec EvalSSE( index_t y, index_t x ) const{ - return sse2::SSEOp::Map( src_.EvalSSE( y, x ) ); - } - MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{ - return OP::Map( src_.Eval( y, x ) ); - } - private: - SSEPlan src_; - }; - - template - inline SSEPlan< BinaryMapExp > MakeSSEPlan( const BinaryMapExp &e ); - - inline SSEPlan MakeSSEPlan( const ScalarExp &e ){ - return SSEPlan( e.scalar_ ); - } - - template - inline SSEPlan MakeSSEPlan( const ContainerExp &e ){ - return SSEPlan( e.self() ); - } - - template - inline SSEPlan MakeSSEPlan( const MakeTensorExp &e ){ - return SSEPlan( e.real_self() ); - } - - template - inline SSEPlan< UnaryMapExp > MakeSSEPlan( const UnaryMapExp &e ){ - return SSEPlan< UnaryMapExp >( MakeSSEPlan(e.src_) ); - } - - template - inline SSEPlan< BinaryMapExp > MakeSSEPlan( const BinaryMapExp &e ){ - return SSEPlan< BinaryMapExp >( MakeSSEPlan(e.lhs_), MakeSSEPlan(e.rhs_) ); - } - }; - - namespace expr{ - /*! 
- * \brief static check sse enable - * if a expression E can not be evaluated using sse, then kPass = false - * \tparam Device the type of Device - * \tparam dim dimension of the tensor - * \tparam E expression - */ - template - struct SSECheck{ - const static bool kPass = false; - }; - template<> - struct SSECheck{ - const static bool kPass = true; - }; - template - struct SSECheck >{ - const static bool kPass = true; - }; - - template - struct SSECheck >{ - const static bool kPass = SSECheck::kPass && sse2::SSEOp::kEnabled; - }; - template - struct SSECheck< BinaryMapExp >{ - const static bool kPass = SSECheck::kPass && SSECheck::kPass && sse2::SSEOp::kEnabled; - }; - }; // namespace expr - namespace expr{ - // check if data is aligned and allow sse operation - template - struct SSEAlignCheck{ - inline static bool Check( const E &exp ){ - return false; - } - }; - template - struct SSEAlignCheck< dim, ScalarExp >{ - inline static bool Check( const ScalarExp &exp ){ - return true; - } - }; - template - struct SSEAlignCheck< dim,Tensor >{ - inline static bool Check( const Tensor &t ){ - return sse2::CheckAlign( t.dptr ) && sse2::CheckAlign( t.shape.stride_ * sizeof( real_t ) ); - } - }; - template - struct SSEAlignCheck< dim, UnaryMapExp >{ - inline static bool Check( const UnaryMapExp &t ){ - return SSEAlignCheck::Check( t.src_); - } - }; - template - struct SSEAlignCheck< dim, BinaryMapExp >{ - inline static bool Check( const BinaryMapExp &t ){ - return SSEAlignCheck::Check( t.lhs_ ) && - SSEAlignCheck::Check( t.rhs_ ); - } - }; - }; // namespace expr - - /*! - * \brief use SSEPlan to compute result - */ - template - inline void MapSSEPlan(Tensor _dst, const expr::SSEPlan &plan){ - Tensor dst = _dst.FlatTo2D(); - const index_t xlen = sse2::LowerAlign( dst.shape[0], sizeof(real_t) ); - for ( index_t y = 0; y < dst.shape[1]; y ++ ) { - for( index_t x = 0; x < xlen; x += sse2::FVec::kSize ){ - sse2::Saver::Save( &dst[y][x], plan.EvalSSE( y,x ) ); - } - for( index_t x = xlen; x < dst.shape[0]; x ++ ){ - SV::Save( dst[y][x], plan.Eval(y,x) ); - } - } - } -}; // namespace mshadow -#endif // MSHADOW_USE_SSE -#endif // MSHADOW_TENSOR_SSE_INL_HPP diff --git a/mshadow/utils.h b/mshadow/utils.h new file mode 100644 index 000000000000..6003f5562814 --- /dev/null +++ b/mshadow/utils.h @@ -0,0 +1,81 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file utils.h + * \brief simple utils for error and checkings + * \author Tianqi Chen + */ +#ifndef MSHADOW_UTILS_H_ +#define MSHADOW_UTILS_H_ +#define _CRT_SECURE_NO_WARNINGS +#include +#include +#include +#include +namespace mshadow { +/*! \brief namespace for helper utils of the project */ +namespace utils { +/*! \brief error message buffer length */ +const int kPrintBuffer = 1 << 12; + +#ifndef MSHADOW_CUSTOMIZE_ASSERT_ +/*! + * \brief handling of Assert error, caused by in-apropriate input + * \param msg error message + */ +inline void HandleAssertError(const char *msg) { + fprintf(stderr, "AssertError:%s\n", msg); + exit(-1); +} +/*! + * \brief handling of Check error, caused by in-apropriate input + * \param msg error message + */ +inline void HandleCheckError(const char *msg) { + fprintf(stderr, "%s\n", msg); + exit(-1); +} +#else +// include declarations, some one must implement this +void HandleAssertError(const char *msg); +void HandleCheckError(const char *msg); +void HandlePrint(const char *msg); +#endif + +/*! \brief assert an condition is true, use this to handle debug information */ +inline void Assert(bool exp, const char *fmt, ...) 
{ + if (!exp) { + std::string msg(kPrintBuffer, '\0'); + va_list args; + va_start(args, fmt); + vsnprintf(&msg[0], kPrintBuffer, fmt, args); + va_end(args); + HandleAssertError(msg.c_str()); + } +} + +/*!\brief same as assert, but this is intended to be used as message for user*/ +inline void Check(bool exp, const char *fmt, ...) { + if (!exp) { + std::string msg(kPrintBuffer, '\0'); + va_list args; + va_start(args, fmt); + vsnprintf(&msg[0], kPrintBuffer, fmt, args); + va_end(args); + HandleCheckError(msg.c_str()); + } +} + +/*! \brief report error message, same as check */ +inline void Error(const char *fmt, ...) { + { + std::string msg(kPrintBuffer, '\0'); + va_list args; + va_start(args, fmt); + vsnprintf(&msg[0], kPrintBuffer, fmt, args); + va_end(args); + HandleCheckError(msg.c_str()); + } +} +} // namespace utils +} // namespace mshadow +#endif // MSHADOW_UTILS_H_ diff --git a/example/neuralnet/Makefile.openblas b/test/Makefile similarity index 56% rename from example/neuralnet/Makefile.openblas rename to test/Makefile index ef82c1115df7..061b99b2e119 100644 --- a/example/neuralnet/Makefile.openblas +++ b/test/Makefile @@ -1,24 +1,21 @@ # set LD_LIBRARY_PATH -# echo "Link mshadow with precomplied Openblas" -export OPENBLAS_ROOT=../../../OpenBLAS-v0.2.13-Win64-int32 export CC = gcc export CXX = g++ export NVCC =nvcc -export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../../ -I$(OPENBLAS_ROOT)/include -DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_MKL=0 -DMSHADOW_USE_CBLAS=1 -D__APPLE__ -export LDFLAGS= -static -lpthread -lopenblas -L$(OPENBLAS_ROOT)/lib +export CFLAGS = -Wall -O3 -g -msse3 -Wno-unknown-pragmas -funroll-loops -I../ +export LDFLAGS= -g -lm -lcublas -lcudart export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) # specify tensor path -BIN = nnet convnet +BIN = OBJ = CUOBJ = -CUBIN = +CUBIN = test .PHONY: clean all -all: $(BIN) $(OBJ) $(CUBIN) $(CUOBJ) +all: $(CUBIN) $(OBJ) -nnet: nnet.cpp -convnet: convnet.cpp +test: test.cu $(BIN) : $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) diff --git a/test/pairtest.cu b/test/pairtest.cu new file mode 100644 index 000000000000..56b0380747a7 --- /dev/null +++ b/test/pairtest.cu @@ -0,0 +1,105 @@ +#include "mshadow/tensor.h" +#include "old/tensor.h" +#include "assert.h" +#include + +using mshadow::index_t; +template +void Print(T const & ist, int I, int J) { + for (int i = 0; i < I; ++i) { + for (int j = 0; j < J; ++j) { + printf("%.2f ", ist[i][j]); + } + printf("\n"); + } +} + +bool Check(mshadow::TensorContainer &mct, \ + Xmshadow::TensorContainer &xct) { + for (index_t i = 0; i < mct.size(0); ++i) { + for (index_t j = 0; j < mct.size(1); ++j) { + assert(mct[i][j] == xct[i][j]); + } + } + return true; +} + +template +void RunTask() { + const int X = 6; + const int K = 2; + const int O = (X - K) / 2 + 1; + mshadow::TensorContainer srcm(mshadow::Shape4(1,1,X, X)); + Xmshadow::TensorContainer srcx(Xmshadow::Shape4(1,1,X, X)); + for (int i = 0; i < X; ++i) { + for (int j = 0; j < X; ++j) { + srcm[0][0][i][j] = i * 0.1f + j * 0.1f; + srcx[0][0][i][j] = i * 0.1f + j * 0.1f; + } + } + printf("Source:\n"); + Print(srcm[0][0], X, X); + printf("\n"); + mshadow::TensorContainer mct(mshadow::Shape4(1,1,X, X)); + Xmshadow::TensorContainer xct(Xmshadow::Shape4(1,1,X, X)); + mshadow::Copy(mct, srcm); + Xmshadow::Copy(xct, srcx); + + + mshadow::TensorContainer pool_ct(mshadow::Shape4(1,1, O, O)); + Xmshadow::TensorContainer pool_xct(Xmshadow::Shape4(1,1,O,O)); + + pool_ct = mshadow::expr::pool(mct, K, K, K); + pool_xct = 
Xmshadow::expr::pool(xct, K, K); + + printf("New pool:\n"); + Print(pool_ct[0][0], O, O); + printf("\nOld pool:\n"); + Print(pool_xct[0][0], O, O); + printf("\n"); + mshadow::TensorContainer gpool_src(mshadow::Shape4(1,1, O, O)); + Xmshadow::TensorContainer gpool_xsrc(Xmshadow::Shape4(1,1,O,O)); + for (int i = 0; i < O; ++i) { + for (int j = 0; j < O; ++j) { + gpool_src[0][0][i][j] = 0.1f; + gpool_xsrc[0][0][i][j] = 0.1f; + } + } + mshadow::TensorContainer gpool_ct(mshadow::Shape4(1,1, O, O)); + Xmshadow::TensorContainer gpool_xct(Xmshadow::Shape4(1,1,O,O)); + mshadow::Copy(gpool_ct, gpool_src); + Xmshadow::Copy(gpool_xct, gpool_xsrc); + + mshadow::TensorContainer mout(mshadow::Shape4(1,1,X, X)); + Xmshadow::TensorContainer xout(Xmshadow::Shape4(1,1,X, X)); + + mout = mshadow::expr::unpool(mct, pool_ct, gpool_ct, K, K, K); + xout = Xmshadow::expr::unpool(xct, pool_xct, gpool_xct, K, K); + + mshadow::Copy(srcm, mout); + Xmshadow::Copy(srcx, xout); + + mshadow::TensorContainer l1(mshadow::Shape2(X,X)); + Xmshadow::TensorContainer l2(Xmshadow::Shape2(X, X)); + l1 = mshadow::expr::reshape(srcm, l1.shape_); + l2 = Xmshadow::expr::reshape(srcx, l2.shape); + printf("New unpool\n"); + Print(l1, l1.size(0), l1.size(1)); + printf("\nOld unpool\n"); + Print(l2, X, X); + if (Check(l1, l2)) { + printf("Pass\n"); + } +} + +int main(int argc, char** argv) { + if (argc < 1) { + printf("Usage: dev\n"); + exit(-1); + } + if (!strcmp(argv[1], "cpu")) { + RunTask(); + } else { + RunTask(); + } +} \ No newline at end of file diff --git a/test/pool.cu b/test/pool.cu new file mode 100644 index 000000000000..9641d53c9c45 --- /dev/null +++ b/test/pool.cu @@ -0,0 +1,69 @@ +#include "mshadow/tensor.h" +#include "old/tensor.h" +#include "assert.h" +#include + +using mshadow::index_t; +template +void Print(T const & ist) { + for (int i = 0; i < ist.size(0); ++i) { + for (int j = 0; j < ist.size(1); ++j) { + printf("%.2f ", ist[i][j]); + } + printf("\n"); + } +} + +bool Check(mshadow::TensorContainer &mct, \ + Xmshadow::TensorContainer &xct) { + for (index_t i = 0; i < mct.size(0); ++i) { + for (index_t j = 0; j < mct.size(1); ++j) { + assert(mct[i][j] == xct[i][j]); + } + } + return true; +} + +template +void RunTask() { + const int X = 6; + const int K = 2; + mshadow::TensorContainer srcm(mshadow::Shape2(X, X)); + Xmshadow::TensorContainer srcx(Xmshadow::Shape2(X, X)); + + mshadow::TensorContainer mct(mshadow::Shape2(X, X)); + Xmshadow::TensorContainer xct(Xmshadow::Shape2(X, X)); + for (int i = 0; i < X; ++i) { + for (int j = 0; j < X; ++j) { + srcm[i][j] = i * 0.1f + j * 0.1f; + srcx[i][j] = i * 0.1f + j * 0.1f; + } + } + mshadow::Copy(mct, srcm); + Xmshadow::Copy(xct, srcx); + mshadow::TensorContainer pool_ct(mshadow::Shape2((X-K)/2+1, (X-K)/2+1)); + Xmshadow::TensorContainer pool_xct(Xmshadow::Shape2((X-K)/2+1, (X-K)/2+1)); + + pool_ct = mshadow::expr::pool(mct, K, K, K); + pool_xct = Xmshadow::expr::pool(xct, K, K); + + mshadow::TensorContainer cpool_ct(mshadow::Shape2((X-K)/2+1, (X-K)/2+1)); + Xmshadow::TensorContainer cpool_xct(Xmshadow::Shape2((X-K)/2+1, (X-K)/2+1)); + mshadow::Copy(cpool_ct, pool_ct); + Xmshadow::Copy(cpool_xct, pool_xct); + if (Check(cpool_ct, cpool_xct)) { + printf("Pass\n"); + } +} + +int main(int argc, char** argv) { + if (argc < 2) { + printf("Usage: dev\n"); + exit(-1); + } + if (!strcmp(argv[1], "cpu")) { + RunTask(); + } else { + RunTask(); + } +} \ No newline at end of file diff --git a/test/reshape.cu b/test/reshape.cu new file mode 100644 index 000000000000..c1ad52e07c40 --- 
/dev/null +++ b/test/reshape.cu @@ -0,0 +1,74 @@ +#include "mshadow/tensor.h" +#include "old/tensor.h" +#include "assert.h" +#include + +using mshadow::index_t; +template +void Print(T const & ist) { + for (int i = 0; i < ist.size(0); ++i) { + for (int j = 0; j < ist.size(1); ++j) { + printf("%.2f ", ist[i][j]); + } + printf("\n"); + } +} + +bool Check(mshadow::TensorContainer &mct, \ + Xmshadow::TensorContainer &xct) { + for (index_t i = 0; i < mct.size(0); ++i) { + for (index_t j = 0; j < mct.size(1); ++j) { + assert(mct[i][j] == xct[i][j]); + } + } + return true; +} + +template +void RunTask() { + const int X = 6; + const int K = 2; + mshadow::TensorContainer srcm(mshadow::Shape2(X, X)); + Xmshadow::TensorContainer srcx(Xmshadow::Shape2(X, X)); + + mshadow::TensorContainer mct(mshadow::Shape2(X, X)); + Xmshadow::TensorContainer xct(Xmshadow::Shape2(X, X)); + for (int i = 0; i < X; ++i) { + for (int j = 0; j < X; ++j) { + srcm[i][j] = i * 0.1f + j * 0.1f; + srcx[i][j] = i * 0.1f + j * 0.1f; + } + } + mshadow::Copy(mct, srcm); + Xmshadow::Copy(xct, srcx); + + mshadow::TensorContainer mct4d(mshadow::Shape4(1, 1, X / K, X * K)); + Xmshadow::TensorContainer xct4d(Xmshadow::Shape4(X / K, X * K, 1, 1)); + + mct4d = mshadow::expr::reshape(mct, mct4d.shape_); + xct4d = Xmshadow::expr::reshape(xct, xct4d.shape); + + mct = mshadow::expr::reshape(mct4d, mct.shape_); + xct = Xmshadow::expr::reshape(xct4d, xct.shape); + + mshadow::TensorContainer m_ct(mshadow::Shape2(X, X)); + Xmshadow::TensorContainer x_ct(Xmshadow::Shape2(X, X)); + + mshadow::Copy(m_ct, mct); + Xmshadow::Copy(x_ct, xct); + if (Check(m_ct, x_ct)) { + printf("Pass\n"); + } +} + +int main(int argc, char** argv) { + if (argc < 2) { + printf("Usage: dev\n"); + exit(-1); + } + if (!strcmp(argv[1], "cpu")) { + RunTask(); + } else { + RunTask(); + } +} \ No newline at end of file diff --git a/test/test.cu b/test/test.cu new file mode 100644 index 000000000000..37fe7e76cbd0 --- /dev/null +++ b/test/test.cu @@ -0,0 +1,77 @@ +#include "test.h" + +using namespace mshadow; + + +int main() { + InitTensorEngine(); + Tensor tc = NewTensor(Shape3(3, 2, 4), 0.0f); + Tensor tg = NewTensor(tc.shape_, 0.0f); + // init + for (index_t i = 0; i < tc.size(0); ++i) { + for (index_t j = 0; j < tc.size(1); ++j) { + for (index_t k = 0; k < tc.size(2); ++k) { + tc[i][j][k] = i * 0.1f + j * 0.2f + k * 0.1f; + } + } + } + Copy(tg, tc); + // print + printf("\n#print batch 0 of cpu tensor:\n"); + Print2DTensor(tc[0]); + printf("\n"); + Print2DTensor(tc[1]); + printf("\n"); + Print2DTensor(tc[2]); + // check + if (Check2DTensor(tg[1], tc[1])) { + printf("batch 1 of gpu & cpu tensor are same.\n"); + } + // sum of row + Tensor tmp_tc = NewTensor(Shape1(tc[0].size(1)), 0.0f); + Tensor tmp_tg = NewTensor(Shape1(tg[0].size(1)), 0.0f); + printf("\n#sum_rows of batch 0:\n"); + tmp_tc = sum_rows(tc[0]); + tmp_tg = sum_rows(tg[0]); + Print1DTensor(tmp_tc); + if (Check1DTensor(tmp_tg, tmp_tc)) { + printf("cpu & gpu result consists\n"); + } + FreeSpace(&tmp_tc); + FreeSpace(&tmp_tg); + // sumall_except_dim + printf("\n#sumall_except_dim<0> of batch 0:\n"); + Tensor red_tc = NewTensor(Shape1(tc.size(0)), 0.0f); + Tensor red_tg = NewTensor(Shape1(tg.size(0)), 0.0f); + red_tc = sumall_except_dim<0>(tc); + red_tg = sumall_except_dim<0>(tg); + Print1DTensor(red_tc); + if (Check1DTensor(red_tg, red_tc)) { + printf("cpu & gpu result consists\n"); + } + FreeSpace(&red_tc); + FreeSpace(&red_tg); + // softmax + printf("\n#Softmax\n"); + Tensor sm_tc = NewTensor(tc[0].shape_, 0.0f); + 
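+  // sm_tg is the GPU-side output buffer matching sm_tc; Softmax fills both
+  // below and Check2DTensor compares the results element-wise within EPS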
Tensor sm_tg = NewTensor(tg[0].shape_, 0.0f); + Softmax(sm_tc, tc[0]); + Softmax(sm_tg, tg[0]); + if (Check2DTensor(sm_tg, sm_tc)) { + printf("cpu & gpu result consists\n"); + } + // mirror + printf("\n#mirror\n"); + sm_tc = mirror(tc[0]); + sm_tg = mirror(tg[0]); + if (Check2DTensor(sm_tg, sm_tc)) { + printf("cpu & gpu result consists\n"); + } + FreeSpace(&sm_tc); + FreeSpace(&sm_tg); + // reshape + + FreeSpace(&tc); + FreeSpace(&tg); + ShutdownTensorEngine(); +} diff --git a/test/test.h b/test/test.h new file mode 100644 index 000000000000..2cfc515957ca --- /dev/null +++ b/test/test.h @@ -0,0 +1,67 @@ +#ifndef TEST_H +#define TEST_H + +#include "mshadow/tensor.h" +#include "assert.h" + +#define EPS 0.0001 +using namespace mshadow; +using namespace mshadow::expr; + + +template +void Print2DTensor(Tensor const &ts); + +template +void Print1DTensor(Tensor const &ts); + +template<> +void Print1DTensor(Tensor const &ts) { + for (index_t i = 0; i < ts.size(0); ++i) { + printf("%.2f ", ts[i]); + } + printf("\n"); +} + + +template<> +void Print2DTensor(Tensor const &ts) { + for (index_t i = 0; i < ts.size(0); ++i) { + Print1DTensor(ts[i]); + } +} + +template<> +void Print2DTensor(Tensor const &tg) { + Tensor tc = NewTensor(tg.shape_, 0.0f); + Copy(tc, tg); + Print2DTensor(tc); + FreeSpace(&tc); +} + + + +bool Check2DTensor(Tensor const &tg, Tensor const &tc) { + Tensor tcc = NewTensor(tg.shape_, 0.0f); + Copy(tcc, tg); + for (index_t i = 0; i < tc.size(0); ++i) { + for (index_t j = 0; j < tc.size(1); ++j) { + assert(abs(tcc[i][j] - tc[i][j]) < EPS); + } + } + FreeSpace(&tcc); + return true; +} + +bool Check1DTensor(Tensor const &tg, Tensor const &tc) { + Tensor tcc = NewTensor(tc.shape_, 0.0f); + Copy(tcc, tg); + printf("gpu result:\n"); + Print1DTensor(tcc); + for (index_t i = 0; i < tc.size(0); ++i) { + assert(abs(tcc[i] - tc[i]) < EPS); + } + FreeSpace(&tcc); + return true; +} +#endif diff --git a/test/unpack.cu b/test/unpack.cu new file mode 100644 index 000000000000..dd0c2b9c5821 --- /dev/null +++ b/test/unpack.cu @@ -0,0 +1,85 @@ +#include "mshadow/tensor.h" +#include "old/tensor.h" +#include "assert.h" +#include + +using mshadow::index_t; +template +void Print(T const & ist) { + for (int i = 0; i < ist.size(0); ++i) { + for (int j = 0; j < ist.size(1); ++j) { + printf("%.2f ", ist[i][j]); + } + printf("\n"); + } +} + +bool Check(mshadow::TensorContainer &mct, \ + Xmshadow::TensorContainer &xct) { + for (index_t i = 0; i < mct.size(0); ++i) { + for (index_t j = 0; j < mct.size(1); ++j) { + assert(mct[i][j] == xct[i][j]); + } + } + return true; +} + +template +void RunTask() { + const int ksize = 3; + const int kstride = 2; + const int X = 6; + Xmshadow::TensorContainer xsrc(Xmshadow::Shape4(1, 1, X, X)); + mshadow::TensorContainer src(mshadow::Shape4(1, 1, X, X)); + + for (int i = 0; i < X; ++i) { + for (int j = 0; j < X; ++j) { + xsrc[0][0][i][j] = i * 0.1f + j * 0.2f; + src[0][0][i][j] = i * 0.1f + j * 0.2f; + } + } + Xmshadow::TensorContainer xin(Xmshadow::Shape4(1, 1, X, X)); + mshadow::TensorContainer in(mshadow::Shape4(1, 1, X, X)); + + mshadow::Copy(in, src); + Xmshadow::Copy(xin, xsrc); + + Xmshadow::TensorContainer xtmp_col; + mshadow::TensorContainer tmp_col; + + + index_t oheight = (in.size(2) - ksize)/kstride + 1; + index_t owidth = (in.size(3) - ksize)/kstride + 1; + index_t nbatch = in.size(0); + + + xtmp_col.Resize( Xmshadow::Shape2( xin.shape[2]*ksize*ksize, nbatch*oheight*owidth ) ); + tmp_col.Resize(mshadow::Shape2(in.size(1)*ksize*ksize, nbatch*oheight*owidth)); + 
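+  // note: the legacy Xmshadow unpack_patch2col takes a square (ksize, kstride)
+  // pair, while the new mshadow expression takes (ksize_y, ksize_x, kstride)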
xtmp_col = Xmshadow::expr::unpack_patch2col( xin, ksize, kstride );
+  tmp_col = mshadow::expr::unpack_patch2col(in, ksize, ksize, kstride);
+
+  Xmshadow::TensorContainer xtc;
+  mshadow::TensorContainer tc;
+
+  xtc.Resize( Xmshadow::Shape2( xin.shape[2]*ksize*ksize, nbatch*oheight*owidth ) );
+  tc.Resize(mshadow::Shape2(in.size(1)*ksize*ksize, nbatch*oheight*owidth));
+
+  mshadow::Copy(tc, tmp_col);
+  Xmshadow::Copy(xtc, xtmp_col);
+  if (Check(tc, xtc)) {
+    printf("Pass\n");
+  }
+
+}
+
+int main(int argc, char** argv) {
+  if (argc < 2) {
+    printf("Usage: dev\n");
+    exit(-1);
+  }
+  if (!strcmp(argv[1], "cpu")) {
+    RunTask();
+  } else {
+    RunTask();
+  }
+}
\ No newline at end of file
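The `mshadow/utils.h` header added above exposes printf-style `Assert`, `Check`, and `Error` helpers in `namespace mshadow::utils`. A minimal usage sketch, not part of the patch (the `batch_size` variable and its conditions are illustrative only):

```cpp
#include "mshadow/utils.h"

int main() {
  int batch_size = 16;
  // Check is meant for validating user-facing input: on failure it formats the
  // message into a bounded buffer, hands it to HandleCheckError and exits.
  mshadow::utils::Check(batch_size > 0,
                        "batch_size must be positive, got %d", batch_size);
  // Assert has the same signature but targets internal invariants; failures go
  // through HandleAssertError, which callers can replace by defining
  // MSHADOW_CUSTOMIZE_ASSERT_ and supplying their own handlers.
  mshadow::utils::Assert(batch_size % 2 == 0,
                         "expected an even batch_size, got %d", batch_size);
  return 0;
}
```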
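test/pairtest.cu and test/pool.cu above run the new and legacy pooling expressions side by side. The sketch below shows only the new-style call they exercise; the reducer template argument (`red::maximum`) and the `TensorContainer<device, dim, DType>` form are assumptions for illustration, not taken from the patch:

```cpp
#include "mshadow/tensor.h"

int main() {
  using namespace mshadow;
  const int X = 6, K = 2;
  const int O = (X - K) / K + 1;  // output extent for kernel K and stride K
  TensorContainer<cpu, 4, float> src(Shape4(1, 1, X, X));
  TensorContainer<cpu, 4, float> pooled(Shape4(1, 1, O, O));
  for (index_t i = 0; i < X; ++i) {
    for (index_t j = 0; j < X; ++j) {
      src[0][0][i][j] = 0.1f * (i + j);
    }
  }
  // new-style pooling: kernel extent in both directions plus an explicit stride,
  // mirroring the pool(mct, K, K, K) call in pairtest.cu
  pooled = expr::pool<red::maximum>(src, K, K, K);
  return 0;
}
```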