diff --git a/CMakeLists.txt b/CMakeLists.txt index ab29b6a7aaaf..dc9ca5f7bb0c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -353,8 +353,10 @@ if(USE_CUDA) list(APPEND mxnet_LINKER_LIBS ${CUDA_cuda_LIBRARY}) FIND_LIBRARY(CUDA_cufft_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") list(APPEND mxnet_LINKER_LIBS "${CUDA_cufft_LIBRARY}/../cufft.lib") # For fft operator + FIND_LIBRARY(CUDA_cusolver_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") + list(APPEND mxnet_LINKER_LIBS "${CUDA_cusolver_LIBRARY}/../cusolver.lib") # For cusolver else(MSVC) - list(APPEND mxnet_LINKER_LIBS nvrtc cuda cufft) + list(APPEND mxnet_LINKER_LIBS nvrtc cuda cufft cusolver) link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64") endif() list(APPEND SOURCE ${cuda_objs} ${CUDA}) diff --git a/Makefile b/Makefile index 560b77a3e81e..33151e574ea7 100644 --- a/Makefile +++ b/Makefile @@ -23,6 +23,10 @@ ifndef DLPACK_PATH DLPACK_PATH = $(ROOTDIR)/dlpack endif +ifndef AMALGAMATION_PATH + AMALGAMATION_PATH = $(ROOTDIR)/amalgamation +endif + ifneq ($(USE_OPENMP), 1) export NO_OPENMP = 1 endif @@ -439,6 +443,7 @@ clean: cyclean $(EXTRA_PACKAGES_CLEAN) cd $(DMLC_CORE); $(MAKE) clean; cd - cd $(PS_PATH); $(MAKE) clean; cd - cd $(NNVM_PATH); $(MAKE) clean; cd - + cd $(AMALGAMATION_PATH); $(MAKE) clean; cd - $(RM) -r $(patsubst %, %/*.d, $(EXTRA_OPERATORS)) $(patsubst %, %/*/*.d, $(EXTRA_OPERATORS)) $(RM) -r $(patsubst %, %/*.o, $(EXTRA_OPERATORS)) $(patsubst %, %/*/*.o, $(EXTRA_OPERATORS)) else @@ -448,6 +453,7 @@ clean: cyclean testclean $(EXTRA_PACKAGES_CLEAN) cd $(DMLC_CORE); $(MAKE) clean; cd - cd $(PS_PATH); $(MAKE) clean; cd - cd $(NNVM_PATH); $(MAKE) clean; cd - + cd $(AMALGAMATION_PATH); $(MAKE) clean; cd - endif clean_all: clean diff --git a/NEWS.md b/NEWS.md index 2557aadfed27..4fdd31430002 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,37 @@ MXNet Change Log ================ +## 0.11.0-rc0 +### - Major Features + - 
Apple Core ML model converter + - Support for Keras v1.2.2 + - For more information see [full release notes](https://cwiki.apache.org/confluence/display/MXNET/v0.11.0+Release+Notes) +### - API Changes + - Added `CachedOp`. You can now cache the operators that’s called frequently with the same set of arguments to reduce overhead. + - Added sample_multinomial for sampling from multinomial distributions. + - Added `trunc` operator for rounding towards zero. + - Added linalg_gemm, linalg_potrf, ... operators for lapack support. + - Added verbose option to Initializer for printing out initialization details. + - Added DeformableConvolution to contrib from the Deformable Convolutional Networks paper. + - Added float64 support for dot and batch_dot operator. + - `allow_extra` is added to Module.set_params to ignore extra parameters. + - Added `mod` operator for modulo. + - Added `multi_precision` option to SGD optimizer to improve training with float16. Resnet50 now achieves the same accuracy when trained with float16 and gives 50% speedup on Titan XP. +### - Performance Improvements + - ImageRecordIter now stores data in pinned memory to improve GPU memcopy speed. +### - Bugfixes + - Cython interface is fixed. `make cython` and `python setup.py install --with-cython` should install the cython interface and reduce overhead in applications that use imperative/bucketing. + - Fixed various bugs in Faster-RCNN example: https://github.com/dmlc/mxnet/pull/6486 + - Fixed various bugs in SSD example. + - Fixed `out` argument not working for `zeros`, `ones`, `full`, etc. + - `expand_dims` now supports backward shape inference. + - Fixed a bug in rnn. BucketingSentenceIter that causes incorrect layout handling on multi-GPU. + - Fixed context mismatch when loading optimizer states. + - Fixed a bug in ReLU activation when using MKL. + - Fixed a few race conditions that causes crashes on shutdown. 
+### - Refactors + - Refactored TShape/TBlob to use int64 dimensions and DLTensor as internal storage. Getting ready for migration to DLPack. As a result TBlob::dev_mask_ and TBlob::stride_ are removed. + + ## 0.10.0 - Overhauled documentation for commonly used Python APIs, Installation instructions, Tutorials, HowTos and MXNet Architecture. - Updated mxnet.io for improved readability. diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index fb57b4b7e31f..e0b435513718 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -1,7 +1,7 @@ Package: mxnet Type: Package Title: MXNet: A Flexible and Efficient Machine Learning Library for Heterogeneous Distributed Systems -Version: 0.10.1 +Version: 0.11.0 Date: 2017-06-27 Author: Tianqi Chen, Qiang Kou, Tong He Maintainer: Qiang Kou diff --git a/README.md b/README.md index 5027f6d3fdb6..4a354d9bef75 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ deep learning systems, and interesting insights of DL systems for hackers. What's New ---------- +* [Version 0.11.0-rc0 Release](https://github.com/dmlc/mxnet/releases/tag/v0.11.0-rc0) - MXNet 0.11.0-rc0 Release. * [Apache Incubator](http://incubator.apache.org/projects/mxnet.html) - We are now an Apache Incubator project. * [Version 0.10.0 Release](https://github.com/dmlc/mxnet/releases/tag/v0.10.0) - MXNet 0.10.0 Release. * [Version 0.9.3 Release](./docs/architecture/release_note_0_9.md) - First 0.9 official release. diff --git a/amalgamation/jni/org/dmlc/mxnet/MxnetException.java b/amalgamation/jni/org/dmlc/mxnet/MxnetException.java index c342cfaeee1d..08d80d683a4a 100644 --- a/amalgamation/jni/org/dmlc/mxnet/MxnetException.java +++ b/amalgamation/jni/org/dmlc/mxnet/MxnetException.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.dmlc.mxnet; public class MxnetException extends Exception { diff --git a/amalgamation/jni/org/dmlc/mxnet/Predictor.java b/amalgamation/jni/org/dmlc/mxnet/Predictor.java index a91312a4121d..53152dcf7436 100644 --- a/amalgamation/jni/org/dmlc/mxnet/Predictor.java +++ b/amalgamation/jni/org/dmlc/mxnet/Predictor.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + package org.dmlc.mxnet; import android.graphics.Bitmap; @@ -37,7 +56,7 @@ int ctype() { private long handle = 0; public Predictor(byte[] symbol, byte[] params, Device dev, InputNode[] input) { - String[] keys = new String[input.length]; + String[] keys = new String[input.length]; int[][] shapes = new int[input.length][]; for (int i=0; i #include diff --git a/cpp-package/example/charRNN.cpp b/cpp-package/example/charRNN.cpp index d95c97d8e734..f5fff853cbad 100644 --- a/cpp-package/example/charRNN.cpp +++ b/cpp-package/example/charRNN.cpp @@ -1,5 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! - * Copyright (c) 2016 by Contributors * Hua Zhang mz24cn@hotmail.com * The code implements C++ version charRNN for mxnet\example\rnn\char-rnn.ipynb with MXNet.cpp API. * The generated params file is compatiable with python version. 
diff --git a/cpp-package/example/feature_extract/feature_extract.cpp b/cpp-package/example/feature_extract/feature_extract.cpp index 21853a3912e7..1886c576400d 100644 --- a/cpp-package/example/feature_extract/feature_extract.cpp +++ b/cpp-package/example/feature_extract/feature_extract.cpp @@ -1,5 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! - * Copyright (c) 2015 by Contributors */ #include #include diff --git a/cpp-package/example/feature_extract/prepare_data_with_opencv.cpp b/cpp-package/example/feature_extract/prepare_data_with_opencv.cpp index 20cbe140fc09..a7b4cba0a64a 100644 --- a/cpp-package/example/feature_extract/prepare_data_with_opencv.cpp +++ b/cpp-package/example/feature_extract/prepare_data_with_opencv.cpp @@ -1,5 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! - * Copyright (c) 2015 by Contributors */ #include #include diff --git a/cpp-package/example/googlenet.cpp b/cpp-package/example/googlenet.cpp index 2e59fbfe45cd..ac0585e81a70 100644 --- a/cpp-package/example/googlenet.cpp +++ b/cpp-package/example/googlenet.cpp @@ -1,5 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! - * Copyright (c) 2016 by Contributors */ #include #include diff --git a/cpp-package/example/inception_bn.cpp b/cpp-package/example/inception_bn.cpp index 4442e006b5a5..de21aadea9b5 100644 --- a/cpp-package/example/inception_bn.cpp +++ b/cpp-package/example/inception_bn.cpp @@ -1,5 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! - * Copyright (c) 2016 by Contributors */ #include #include diff --git a/cpp-package/example/lenet.cpp b/cpp-package/example/lenet.cpp index 56f8d2c8743a..05cc4517fe1e 100644 --- a/cpp-package/example/lenet.cpp +++ b/cpp-package/example/lenet.cpp @@ -1,5 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! 
- * Copyright (c) 2015 by Contributors */ #include #include diff --git a/cpp-package/example/lenet_with_mxdataiter.cpp b/cpp-package/example/lenet_with_mxdataiter.cpp index f6301b52a61f..077f55622561 100644 --- a/cpp-package/example/lenet_with_mxdataiter.cpp +++ b/cpp-package/example/lenet_with_mxdataiter.cpp @@ -1,5 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! - * Copyright (c) 2016 by Contributors */ #include #include diff --git a/cpp-package/example/mlp.cpp b/cpp-package/example/mlp.cpp index 6152eddc726a..c9c4ff245180 100644 --- a/cpp-package/example/mlp.cpp +++ b/cpp-package/example/mlp.cpp @@ -1,5 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! - * Copyright (c) 2015 by Contributors */ #include diff --git a/cpp-package/example/mlp_cpu.cpp b/cpp-package/example/mlp_cpu.cpp index 358e8348ac5e..748c32e8c274 100644 --- a/cpp-package/example/mlp_cpu.cpp +++ b/cpp-package/example/mlp_cpu.cpp @@ -1,5 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! - * Copyright (c) 2017 by Contributors * Xin Li yakumolx@gmail.com */ #include diff --git a/cpp-package/example/mlp_gpu.cpp b/cpp-package/example/mlp_gpu.cpp index a6281c385dfb..531afbb29db6 100644 --- a/cpp-package/example/mlp_gpu.cpp +++ b/cpp-package/example/mlp_gpu.cpp @@ -1,5 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! - * Copyright (c) 2016 by Contributors * Xin Li yakumolx@gmail.com */ #include diff --git a/cpp-package/example/resnet.cpp b/cpp-package/example/resnet.cpp index b9766c7a64d0..ca5643de9d81 100644 --- a/cpp-package/example/resnet.cpp +++ b/cpp-package/example/resnet.cpp @@ -1,5 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! 
- * Copyright (c) 2016 by Contributors */ #include #include diff --git a/cpp-package/example/test_score.cpp b/cpp-package/example/test_score.cpp index 35342699558f..254a6d242fd6 100644 --- a/cpp-package/example/test_score.cpp +++ b/cpp-package/example/test_score.cpp @@ -1,5 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! - * Copyright (c) 2016 by Contributors * Xin Li yakumolx@gmail.com */ #include diff --git a/cpp-package/include/mxnet-cpp/executor.hpp b/cpp-package/include/mxnet-cpp/executor.hpp index 6887956290c2..0aa698174005 100644 --- a/cpp-package/include/mxnet-cpp/executor.hpp +++ b/cpp-package/include/mxnet-cpp/executor.hpp @@ -1,5 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! - * Copyright (c) 2016 by Contributors * \file executor.hpp * \brief implementation of the executor * \author Zhang Chen, Chuntao Hong diff --git a/cpp-package/include/mxnet-cpp/io.hpp b/cpp-package/include/mxnet-cpp/io.hpp index 1be7993fbe4f..677c0f6ee1f0 100644 --- a/cpp-package/include/mxnet-cpp/io.hpp +++ b/cpp-package/include/mxnet-cpp/io.hpp @@ -1,5 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! 
-* Copyright (c) 2016 by Contributors * \file operator.hpp * \brief implementation of data iter * \author Zhang Chen diff --git a/cpp-package/include/mxnet-cpp/kvstore.hpp b/cpp-package/include/mxnet-cpp/kvstore.hpp index 4f66c1d637a5..f2b5e74990ce 100644 --- a/cpp-package/include/mxnet-cpp/kvstore.hpp +++ b/cpp-package/include/mxnet-cpp/kvstore.hpp @@ -1,5 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! - * Copyright (c) 2016 by Contributors * \file kvstore.hpp * \brief implementation of kvstore * \author Xin Li diff --git a/cpp-package/include/mxnet-cpp/monitor.hpp b/cpp-package/include/mxnet-cpp/monitor.hpp index eef218bff41d..f3584e2e8092 100644 --- a/cpp-package/include/mxnet-cpp/monitor.hpp +++ b/cpp-package/include/mxnet-cpp/monitor.hpp @@ -1,5 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! -* Copyright (c) 2017 by Contributors * \file monitor.hpp * \brief monitor implementation * \author Xin Li diff --git a/cpp-package/include/mxnet-cpp/ndarray.hpp b/cpp-package/include/mxnet-cpp/ndarray.hpp index ba0954b3f815..5ed04a547b85 100644 --- a/cpp-package/include/mxnet-cpp/ndarray.hpp +++ b/cpp-package/include/mxnet-cpp/ndarray.hpp @@ -1,5 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! 
- * Copyright (c) 2016 by Contributors * \file ndarray.hpp * \brief implementation of the ndarray * \author Zhang Chen, Chuntao Hong diff --git a/cpp-package/include/mxnet-cpp/operator.hpp b/cpp-package/include/mxnet-cpp/operator.hpp index 17f4885133fc..a0100cd601be 100644 --- a/cpp-package/include/mxnet-cpp/operator.hpp +++ b/cpp-package/include/mxnet-cpp/operator.hpp @@ -1,5 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! -* Copyright (c) 2016 by Contributors * \file operator.hpp * \brief implementation of operator * \author Chuntao Hong, Zhang Chen diff --git a/cpp-package/include/mxnet-cpp/optimizer.hpp b/cpp-package/include/mxnet-cpp/optimizer.hpp index 0d6a7be9dd6b..f9c885fc1fdd 100644 --- a/cpp-package/include/mxnet-cpp/optimizer.hpp +++ b/cpp-package/include/mxnet-cpp/optimizer.hpp @@ -1,5 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! -* Copyright (c) 2016 by Contributors * \file optimizer.hpp * \brief implementation of optimizer * \author Chuntao Hong, Zhang Chen diff --git a/cpp-package/include/mxnet-cpp/symbol.hpp b/cpp-package/include/mxnet-cpp/symbol.hpp index 7f88e485830f..ee1a11e26a40 100644 --- a/cpp-package/include/mxnet-cpp/symbol.hpp +++ b/cpp-package/include/mxnet-cpp/symbol.hpp @@ -1,5 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! 
- * Copyright (c) 2016 by Contributors * \file symbol.hpp * \brief implementation of the symbol * \author Zhang Chen, Chuntao Hong diff --git a/docs/_static/mxnet-theme/footer.html b/docs/_static/mxnet-theme/footer.html index f7eec1321724..45ba457a0722 100644 --- a/docs/_static/mxnet-theme/footer.html +++ b/docs/_static/mxnet-theme/footer.html @@ -1,5 +1,5 @@
diff --git a/docs/_static/mxnet-theme/index.html b/docs/_static/mxnet-theme/index.html index b39e710d6155..e381428758c0 100644 --- a/docs/_static/mxnet-theme/index.html +++ b/docs/_static/mxnet-theme/index.html @@ -46,7 +46,7 @@

MXNet Joining Apache

Examples

Explore projects from simple demos to state-of-the-art research

@@ -54,7 +54,7 @@

Examples

Model Zoo

Off the shelf pre-trained models

diff --git a/docs/api/python/autograd.md b/docs/api/python/autograd.md index 444e01fc9688..de8188446b7c 100644 --- a/docs/api/python/autograd.md +++ b/docs/api/python/autograd.md @@ -86,6 +86,7 @@ Detailed tutorials are available in Part 1 of set_recording is_recording mark_variables + Function ``` ## API Reference diff --git a/docs/architecture/overview.md b/docs/architecture/overview.md index 361e0c91de63..a7632d4a61e8 100644 --- a/docs/architecture/overview.md +++ b/docs/architecture/overview.md @@ -48,7 +48,7 @@ The following API is the core interface for the execution engine: This API allows you to push a function (`exec_fun`), along with its context information and dependencies, to the engine. `exec_ctx` is the context information in which the `exec_fun` should be executed, -`const_vars` denotes the variables that the function reads from, +`const_vars` denotes the variables that the function reads from, and `mutate_vars` are the variables to be modified. The engine provides the following guarantee: diff --git a/docs/architecture/program_model.md b/docs/architecture/program_model.md index 380990e7019f..519a9a9024d8 100644 --- a/docs/architecture/program_model.md +++ b/docs/architecture/program_model.md @@ -92,7 +92,7 @@ are powerful DSLs that generate callable computation graphs for neural networks. Intuitively, you might say that imperative programs -are more *native* than symbolic programs. +are more *native* than symbolic programs. It's easier to use native language features. For example, it's straightforward to print out the values in the middle of computation or to use native control flow and loops @@ -269,7 +269,7 @@ Recall the *be prepared to encounter all possible demands* requirement of impera If you are creating an array library that supports automatic differentiation, you have to keep the grad closure along with the computation. 
This means that none of the history variables can be -garbage-collected because they are referenced by variable `d` by way of function closure. +garbage-collected because they are referenced by variable `d` by way of function closure. What if you want to compute only the value of `d`, and don't want the gradient value? @@ -305,7 +305,6 @@ For example, one solution to the preceding problem is to introduce a context variable. You can introduce a no-gradient context variable to turn gradient calculation off. - ```python with context.NoGradient(): @@ -315,6 +314,8 @@ to turn gradient calculation off. d = c + 1 ``` + + However, this example still must be prepared to encounter all possible demands, which means that you can't perform the in-place calculation to reuse memory in the forward pass (a trick commonly used to reduce GPU memory usage). @@ -380,7 +381,7 @@ It's usually easier to write parameter updates in an imperative style, especially when you need multiple updates that relate to each other. For symbolic programs, the update statement is also executed as you call it. So in that sense, most symbolic deep learning libraries -fall back on the imperative approach to perform updates, +fall back on the imperative approach to perform updates, while using the symbolic approach to perform gradient calculation. ### There Is No Strict Boundary @@ -388,7 +389,7 @@ while using the symbolic approach to perform gradient calculation. In comparing the two programming styles, some of our arguments might not be strictly true, i.e., it's possible to make an imperative program -more like a traditional symbolic program or vice versa. +more like a traditional symbolic program or vice versa. However, the two archetypes are useful abstractions, especially for understanding the differences between deep learning libraries. We might reasonably conclude that there is no clear boundary between programming styles. @@ -400,7 +401,7 @@ information held in symbolic programs. ## Big vs. 
Small Operations -When designing a deep learning library, another important programming model decision +When designing a deep learning library, another important programming model decision is precisely what operations to support. In general, there are two families of operations supported by most deep learning libraries: @@ -418,7 +419,7 @@ For example, the sigmoid unit can simply be composed of division, addition and a sigmoid(x) = 1.0 / (1.0 + exp(-x)) ``` Using smaller operations as building blocks, you can express nearly anything you want. -If you're more familiar with CXXNet- or Caffe-style layers, +If you're more familiar with CXXNet- or Caffe-style layers, note that these operations don't differ from a layer, except that they are smaller. ```python @@ -433,7 +434,7 @@ because you only need to compose the components. Directly composing sigmoid layers requires three layers of operation, instead of one. ```python - SigmoidLayer(x) = EWiseDivisionLayer(1.0, AddScalarLayer(ExpLayer(-x), 1.0)) + SigmoidLayer(x) = EWiseDivisionLayer(1.0, AddScalarLayer(ExpLayer(-x), 1.0)) ``` This code creates overhead for computation and memory (which could be optimized, with cost). @@ -467,7 +468,7 @@ these optimizations are crucial to performance. Because the operations are small, there are many sub-graph patterns that can be matched. Also, because the final, generated operations -might not enumerable, +might not be enumerable, an explicit recompilation of the kernels is required, as opposed to the fixed amount of precompiled kernels in the big operation libraries. @@ -476,7 +477,7 @@ that support small operations. Requiring compilation optimization also creates engineering overhead for the libraries that solely support smaller operations. -As in the case of symbolic vs imperative, +As in the case of symbolic vs. imperative, the bigger operation libraries "cheat" by asking you to provide restrictions (to the common layer), so that you actually perform the sub-graph matching. 
@@ -522,7 +523,7 @@ The more suitable programming style depends on the problem you are trying to sol For example, imperative programs are better for parameter updates, and symbolic programs for gradient calculation. -We advocate *mixing* the approaches. +We advocate *mixing* the approaches. Sometimes the part that we want to be flexible isn't crucial to performance. In these cases, it's okay to leave some efficiency on the table @@ -562,7 +563,7 @@ This is exactly like writing C++ programs and exposing them to Python, which we Because parameter memory resides on the GPU, you might not want to use NumPy as an imperative component. Supporting a GPU-compatible imperative library -that interacts with symbolic compiled functions +that interacts with symbolic compiled functions or provides a limited amount of updating syntax in the update statement in symbolic program execution might be a better choice. diff --git a/docs/build_version_doc/AddVersion.py b/docs/build_version_doc/AddVersion.py old mode 100644 new mode 100755 index 38ce48f63c2f..34ba40e0f3a4 --- a/docs/build_version_doc/AddVersion.py +++ b/docs/build_version_doc/AddVersion.py @@ -25,7 +25,7 @@ help='file to be modified') parser.add_argument('--current_version', type=str, default='master', help='Current version') -parser.add_argument('--root_url', type=str, default='https://mxnet.io', +parser.add_argument('--root_url', type=str, default='https://mxnet.incubator.apache.org/', help='Root URL') if __name__ == '__main__': @@ -59,6 +59,9 @@ continue with open(os.path.join(path, name), 'r') as html_file: content = bs(html_file, 'html.parser') + if os.path.join(path, name) == args.file_path + 'index.html': + content.find(id='example-link')['href'] = \ + 'https://github.com/apache/incubator-mxnet/tree/%s/example' % (args.current_version) navbar = content.find(id="main-nav") navbar_mobile = content.find(id="burgerMenu") if navbar and navbar_mobile: diff --git a/docs/build_version_doc/build_doc.sh 
b/docs/build_version_doc/build_doc.sh index 5a4f15d33c9c..f98e1e0683dc 100755 --- a/docs/build_version_doc/build_doc.sh +++ b/docs/build_version_doc/build_doc.sh @@ -55,8 +55,7 @@ then make docs || exit 1 echo -e "$latest_tag\n$(cat $tag_list_file)" > "$tag_list_file" cat $tag_list_file - tests/ci_build/ci_build.sh doc python docs/build_version_doc/AddVersion.py --file_path "docs/_build/html/" \ - --current_version "$latest_tag" --root_url "http://mxnet.incubator.apache.org/" + tests/ci_build/ci_build.sh doc python docs/build_version_doc/AddVersion.py --file_path "docs/_build/html/" --current_version "$latest_tag" tests/ci_build/ci_build.sh doc python docs/build_version_doc/AddPackageLink.py \ --file_path "docs/_build/html/get_started/install.html" --current_version "$latest_tag" cp -a "docs/_build/html/." "$local_build" @@ -79,8 +78,7 @@ make docs || exit 1 rm -rfv "$web_folder/versions/master/*" cp -a "docs/_build/html/." "$web_folder/versions/master" -tests/ci_build/ci_build.sh doc python docs/build_version_doc/AddVersion.py --file_path "$web_folder/versions/master" \ - --root_url "http://mxnet.incubator.apache.org/" +tests/ci_build/ci_build.sh doc python docs/build_version_doc/AddVersion.py --file_path "$web_folder/versions/master" # Update version list for all previous version website if [ $latest_tag != ${tag_list[0]} ] @@ -89,6 +87,6 @@ then for (( i=0; i<=$(( $total -1 )); i++ )) do tests/ci_build/ci_build.sh doc python docs/build_version_doc/AddVersion.py --file_path "$web_folder/versions/${tag_list[$i]}" \ - --current_version "${tag_list[$i]}" --root_url "http://mxnet.incubator.apache.org/" + --current_version "${tag_list[$i]}" done fi diff --git a/docs/how_to/index.md b/docs/how_to/index.md index cc21aa0b8ae8..4920e1cd3f78 100644 --- a/docs/how_to/index.md +++ b/docs/how_to/index.md @@ -38,6 +38,8 @@ and full working examples, visit the [tutorials section](../tutorials/index.md). 
* [How do I run Keras 1.2.2 with mxnet backend?](https://github.com/dmlc/keras/wiki/Installation) +* [How to convert MXNet models to Apple CoreML format?](https://github.com/apache/incubator-mxnet/tree/master/tools/coreml) + ## Extend and Contribute to MXNet * [How do I join the MXNet development discussion?](http://mxnet.io/community/mxnet_channels.html) diff --git a/docs/tutorials/basic/data.md b/docs/tutorials/basic/data.md index 93a1db066a8c..d4db7d0de1b6 100644 --- a/docs/tutorials/basic/data.md +++ b/docs/tutorials/basic/data.md @@ -30,7 +30,7 @@ Iterators provide an abstract interface for traversing various types of iterable without needing to expose details about the underlying data source. In MXNet, data iterators return a batch of data as `DataBatch` on each call to `next`. -A `DataBatch` often contains *n* training examples and their corresponding labels. Here *n* is the `batch_size` of the iterator. At the end of the data stream when there is no more data to read, the iterator raises ``StopIteration`` exception like Python `iter`. +A `DataBatch` often contains *n* training examples and their corresponding labels. Here *n* is the `batch_size` of the iterator. At the end of the data stream when there is no more data to read, the iterator raises ``StopIteration`` exception like Python `iter`. The structure of `DataBatch` is defined [here](http://mxnet.io/api/python/io.html#mxnet.io.DataBatch). Information such as name, shape, type and layout on each training example and their corresponding label can be provided as `DataDesc` data descriptor objects via the `provide_data` and `provide_label` properties in `DataBatch`. 
diff --git a/example/gluon/tree_lstm/lib/CollapseUnaryTransformer.java b/example/gluon/tree_lstm/lib/CollapseUnaryTransformer.java index 590dcb3dfa05..a0ff1936cb88 100644 --- a/example/gluon/tree_lstm/lib/CollapseUnaryTransformer.java +++ b/example/gluon/tree_lstm/lib/CollapseUnaryTransformer.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + import java.util.List; import edu.stanford.nlp.ling.Label; diff --git a/example/gluon/tree_lstm/lib/ConstituencyParse.java b/example/gluon/tree_lstm/lib/ConstituencyParse.java index 7100eccde7f0..346138c6a06d 100644 --- a/example/gluon/tree_lstm/lib/ConstituencyParse.java +++ b/example/gluon/tree_lstm/lib/ConstituencyParse.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + import edu.stanford.nlp.process.WordTokenFactory; import edu.stanford.nlp.ling.HasWord; import edu.stanford.nlp.ling.Word; @@ -212,7 +231,7 @@ public static void main(String[] args) throws Exception { // produce parent pointer representation int[] parents = deps ? processor.depTreeParents(parse, tokens) : processor.constTreeParents(parse); - + // print if (tokPath != null) { processor.printTokens(tokens); diff --git a/example/gluon/tree_lstm/lib/DependencyParse.java b/example/gluon/tree_lstm/lib/DependencyParse.java index e94de7764e3c..445cab805cc9 100644 --- a/example/gluon/tree_lstm/lib/DependencyParse.java +++ b/example/gluon/tree_lstm/lib/DependencyParse.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + import edu.stanford.nlp.process.WordTokenFactory; import edu.stanford.nlp.ling.HasWord; import edu.stanford.nlp.ling.Word; diff --git a/example/rcnn/rcnn/cython/gpu_nms.hpp b/example/rcnn/rcnn/cython/gpu_nms.hpp index 68b6d42cd88b..93d1f90183bb 100644 --- a/example/rcnn/rcnn/cython/gpu_nms.hpp +++ b/example/rcnn/rcnn/cython/gpu_nms.hpp @@ -1,2 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, int boxes_dim, float nms_overlap_thresh, int device_id); diff --git a/example/rcnn/rcnn/pycocotools/maskApi.c b/example/rcnn/rcnn/pycocotools/maskApi.c index 85e397918278..9dd660de1252 100644 --- a/example/rcnn/rcnn/pycocotools/maskApi.c +++ b/example/rcnn/rcnn/pycocotools/maskApi.c @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /************************************************************************** * Microsoft COCO Toolbox. version 2.0 * Data, paper, and tutorials available at: http://mscoco.org/ diff --git a/example/ssd/tools/caffe_converter/make_win32.bat b/example/ssd/tools/caffe_converter/make_win32.bat index 7d354dcaeb6c..1ee8e89f018f 100644 --- a/example/ssd/tools/caffe_converter/make_win32.bat +++ b/example/ssd/tools/caffe_converter/make_win32.bat @@ -1,3 +1,20 @@ +rem Licensed to the Apache Software Foundation (ASF) under one +rem or more contributor license agreements. See the NOTICE file +rem distributed with this work for additional information +rem regarding copyright ownership. The ASF licenses this file +rem to you under the Apache License, Version 2.0 (the +rem "License"); you may not use this file except in compliance +rem with the License. You may obtain a copy of the License at +rem +rem http://www.apache.org/licenses/LICENSE-2.0 +rem +rem Unless required by applicable law or agreed to in writing, +rem software distributed under the License is distributed on an +rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +rem KIND, either express or implied. See the License for the +rem specific language governing permissions and limitations +rem under the License. + @protoc --python_out=./ ./caffe_parse/caffe.proto @echo done. 
@pause diff --git a/include/mxnet/base.h b/include/mxnet/base.h index 50642049b8f8..695408380ec9 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -56,6 +56,13 @@ #define MXNET_USE_CUDNN MSHADOW_USE_CUDNN #endif +/*! + *\brief whether to use cusolver library + */ +#ifndef MXNET_USE_CUSOLVER +#define MXNET_USE_CUSOLVER MSHADOW_USE_CUSOLVER +#endif + /*! \brief Error message for using gpu when MXNET_USE_CUDA==0 */ #define MXNET_GPU_NOT_ENABLED_ERROR "GPU is not enabled" @@ -103,9 +110,9 @@ /*! \brief major version */ #define MXNET_MAJOR 0 /*! \brief minor version */ -#define MXNET_MINOR 10 +#define MXNET_MINOR 11 /*! \brief patch version */ -#define MXNET_PATCH 1 +#define MXNET_PATCH 0 /*! \brief mxnet version */ #define MXNET_VERSION (MXNET_MAJOR*10000 + MXNET_MINOR*100 + MXNET_PATCH) /*! \brief helper for making version number */ diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 499c4d205e13..a43f73fe45ab 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -145,6 +145,7 @@ enum CustomOpPropCallbacks { kCustomOpPropInferType }; + typedef int (*CustomOpFBFunc)(int /*size*/, void** /*ptrs*/, int* /*tags*/, const int* /*reqs*/, const int /*is_train*/, void* /*state*/); @@ -164,6 +165,17 @@ typedef int (*CustomOpPropCreator)(const char* /*op_type*/, const int /*num_kwar const char** /*keys*/, const char** /*values*/, struct MXCallbackList* /*ret*/); + +enum CustomFunctionCallbacks { + kCustomFunctionBackward, + kCustomFunctionDelete +}; + +typedef int (*CustomFunctionBwdFunc)(int /*num_ograds*/, int /*num_igrads*/, void** /*ptrs*/, + const int* /*reqs*/, const int /*is_train*/, + void* /*state*/); +typedef int (*CustomFunctionDelFunc)(void* /*state*/); + /*! 
* \brief return str message of the last error * all function in this file will return 0 when success @@ -740,6 +752,12 @@ MXNET_DLL int MXAutogradBackwardEx(mx_uint num_output, NDArrayHandle* ograd_handles, int retain_graph, int is_train); +/* + * \brief get the graph constructed by autograd. + * \param handle ndarray handle + * \param out output symbol handle + */ +MXNET_DLL int MXAutogradGetSymbol(NDArrayHandle handle, SymbolHandle *out); /*! * \brief create cached operator */ @@ -1838,8 +1856,23 @@ MXNET_DLL int MXRtcPush(RtcHandle handle, mx_uint num_input, mx_uint num_output, * \brief Delete a MXRtc object */ MXNET_DLL int MXRtcFree(RtcHandle handle); - +/* + * \brief register custom operators from frontend. + * \param op_type name of custom op + * \param creator + */ MXNET_DLL int MXCustomOpRegister(const char* op_type, CustomOpPropCreator creator); +/* + * \brief record custom function for backward later. + * \param num_inputs number of input NDArrays. + * \param inputs handle to input NDArrays. + * \param num_outputs number of output NDArrays. + * \param outputs handle to output NDArrays. + * \param callbacks callbacks for backward function. + */ +MXNET_DLL int MXCustomFunctionRecord(int num_inputs, NDArrayHandle *inputs, + int num_outputs, NDArrayHandle *outputs, + MXCallbackList *callbacks); #ifdef __cplusplus } diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 204c718451e7..1bafd8b272bd 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -521,6 +521,14 @@ class NDArray { ret.entry_ = autograd::AGNodeEntry{nullptr, 0, 0}; return ret; } + + nnvm::Symbol get_autograd_symbol() { + CHECK(!entry_.is_none()) + << "NDArray is not part of a computation graph. Did you forget to turn on recording?"; + nnvm::Symbol ret; + ret.outputs.emplace_back(entry_.nn_entry()); + return ret; + } /*! * \brief Allocate the space if it is delayed allocated. 
* This is an internal function used by system that normal user should not use diff --git a/mshadow b/mshadow index 5a11d7544841..497eb9180b24 160000 --- a/mshadow +++ b/mshadow @@ -1 +1 @@ -Subproject commit 5a11d7544841b55a8ac1a65081759dc2289c335d +Subproject commit 497eb9180b24592b7332e7e08f2c053ec5346524 diff --git a/perl-package/AI-MXNet/examples/calculator.pl b/perl-package/AI-MXNet/examples/calculator.pl index f41895508450..aadc7cd2641e 100755 --- a/perl-package/AI-MXNet/examples/calculator.pl +++ b/perl-package/AI-MXNet/examples/calculator.pl @@ -1,4 +1,22 @@ #!/usr/bin/perl + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + use strict; use warnings; use AI::MXNet ('mx'); diff --git a/perl-package/AI-MXNet/examples/char_lstm.pl b/perl-package/AI-MXNet/examples/char_lstm.pl index 1b69ee1e93c6..54a9e3672f63 100755 --- a/perl-package/AI-MXNet/examples/char_lstm.pl +++ b/perl-package/AI-MXNet/examples/char_lstm.pl @@ -1,4 +1,22 @@ #!/usr/bin/perl + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + use strict; use warnings; use PDL; diff --git a/perl-package/AI-MXNet/examples/cudnn_lstm_bucketing.pl b/perl-package/AI-MXNet/examples/cudnn_lstm_bucketing.pl index 4cfe51bfd94a..8976e6465003 100755 --- a/perl-package/AI-MXNet/examples/cudnn_lstm_bucketing.pl +++ b/perl-package/AI-MXNet/examples/cudnn_lstm_bucketing.pl @@ -1,4 +1,22 @@ #!/usr/bin/perl + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ use strict; use warnings; use AI::MXNet qw(mx); @@ -280,4 +298,4 @@ =head1 SYNOPSIS else { $train->(); -} \ No newline at end of file +} diff --git a/perl-package/AI-MXNet/examples/lstm_bucketing.pl b/perl-package/AI-MXNet/examples/lstm_bucketing.pl index ffc176dccb79..e6699d79f0b1 100755 --- a/perl-package/AI-MXNet/examples/lstm_bucketing.pl +++ b/perl-package/AI-MXNet/examples/lstm_bucketing.pl @@ -1,4 +1,22 @@ #!/usr/bin/perl + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + use strict; use warnings; use PDL; diff --git a/perl-package/AI-MXNet/examples/mnist.pl b/perl-package/AI-MXNet/examples/mnist.pl index 891b5348039c..ca452cd95444 100755 --- a/perl-package/AI-MXNet/examples/mnist.pl +++ b/perl-package/AI-MXNet/examples/mnist.pl @@ -1,4 +1,22 @@ #!/usr/bin/perl + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + use strict; use warnings; # derived from http://mxnet.io/tutorials/python/mnist.html @@ -115,7 +133,7 @@ sub nn_fc { # Epoch[9] Validation-accuracy=0.964600 my($data) = @_; - # Flatten the data from 4-D shape (batch_size, num_channel, width, height) + # Flatten the data from 4-D shape (batch_size, num_channel, width, height) # into 2-D (batch_size, num_channel*width*height) $data = mx->sym->Flatten(data => $data); @@ -175,7 +193,7 @@ sub nn_conv { ); $model->fit( $train_iter, # training data - num_epoch => 10, # number of data passes for training + num_epoch => 10, # number of data passes for training eval_data => $val_iter, # validation data batch_end_callback => mx->callback->Speedometer($batch_size, 200), # output progress for each 200 data batches optimizer => 'adam', diff --git a/perl-package/AI-MXNet/examples/plot_network.pl b/perl-package/AI-MXNet/examples/plot_network.pl index a0bcf847af1b..fc38ef2baaab 100755 --- a/perl-package/AI-MXNet/examples/plot_network.pl +++ b/perl-package/AI-MXNet/examples/plot_network.pl @@ -1,4 +1,22 @@ #!/usr/bin/perl + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + use strict; use warnings; use AI::MXNet qw(mx); diff --git a/python/mxnet/autograd.py b/python/mxnet/autograd.py index 7340851cdef6..292bcc2308fc 100644 --- a/python/mxnet/autograd.py +++ b/python/mxnet/autograd.py @@ -20,11 +20,14 @@ from __future__ import absolute_import from __future__ import division +from threading import Lock +import traceback import ctypes +from ctypes import c_int, c_void_p, CFUNCTYPE, POINTER, cast from .base import _LIB, check_call, string_types -from .base import mx_uint, NDArrayHandle, c_array +from .base import mx_uint, NDArrayHandle, c_array, MXCallbackList, SymbolHandle from .ndarray import NDArray -from .symbol import _GRAD_REQ_MAP +from .symbol import _GRAD_REQ_MAP, Symbol def set_recording(is_recording): #pylint: disable=redefined-outer-name @@ -265,3 +268,163 @@ def backward(heads, head_grads=None, retain_graph=False, train_mode=True): #pyli c_array(NDArrayHandle, ograd_handles), ctypes.c_int(retain_graph), ctypes.c_int(train_mode))) + + +def get_symbol(x): + """Retrieve recorded computation history as `Symbol`. + + Parameters + ---------- + x : NDArray + Array representing the head of computation graph. + + Returns + ------- + Symbol + The retrieved Symbol. + """ + hdl = SymbolHandle() + check_call(_LIB.MXAutogradGetSymbol(x.handle, ctypes.byref(hdl))) + return Symbol(hdl) + + +class Function(object): + """User-defined differentiable function. + + Function allows defining both forward and backward computation for + custom operators. 
During gradient computation, the used-defined + backward function will be used instead of the default chain-rule. + You can also cast to numpy array and back for some operations in + forward and backward. + + For example, a stable sigmoid function can be defined as:: + + class sigmoid(Function): + def forward(self, x): + y = 1 / (1 + mx.nd.exp(-x)) + self.save_for_backward(y) + return y + + def backward(self, dy): + # backward takes as many inputs as forward's return value, + # and returns as many NDArrays as forward's arguments. + y, = self.saved_tensors + return y * (1-y) + """ + _bwd_functype = CFUNCTYPE(c_int, c_int, c_int, POINTER(c_void_p), + POINTER(c_int), c_int, c_void_p) + _del_functype = CFUNCTYPE(c_int, c_void_p) + class _Registry(object): + """CustomOp registry.""" + def __init__(self): + self.ref_holder = {} + self.counter = 0 + self.lock = Lock() + + def inc(self): + """Get index for new entry.""" + self.lock.acquire() + cur = self.counter + self.counter += 1 + self.lock.release() + return cur + + _registry = _Registry() + + def __init__(self): + self._used = False + self.saved_tensors = () + + def save_for_backward(self, *args): + self.saved_tensors = args + + def __call__(self, *inputs): + assert not self._used, \ + "Each Function instance can only be called once. "\ + "Please create another instance." 
+ self._used = True + + prev_recording = set_recording(False) + outputs = self.forward(*inputs) + set_recording(prev_recording) + + if not prev_recording: + return outputs + + ret_outputs = outputs + if isinstance(outputs, NDArray): + outputs = (outputs,) + + key = Function._registry.inc() + + def backward_entry(num_ograds, num_igrads, ptrs, reqs, is_train, _): + """entry point for backward.""" + # pylint: disable=W0613 + try: + output_grads = [NDArray(ctypes.cast(i, NDArrayHandle), writable=False) \ + for i in ptrs[:num_ograds]] + input_grads = [NDArray(ctypes.cast(i, NDArrayHandle), writable=True) \ + for i in ptrs[num_ograds:num_ograds+num_igrads]] + reqs = [reqs[i] for i in range(num_igrads)] + rets = self.backward(*output_grads) + if isinstance(rets, NDArray): + rets = (rets,) + assert len(rets) == len(input_grads), \ + "%s.backward must return exactly the same number " \ + "of NDArrays as the number of NDArrays arguments to forward." \ + "Expecting %d got %d"%(self.__class__.name, len(input_grads), len(rets)) + for igrad, ret, req in zip(input_grads, rets, reqs): + assert isinstance(ret, NDArray), \ + "autograd.Function.backward must return NDArrays, not %s"%type(ret) + if req == 0: # null + return + elif req == 1 or req == 2: # write or inplace + igrad[:] = ret + elif req == 'add': + igrad[:] += ret + except Exception: # pylint: disable=broad-except + print('Error in Function.backward: %s' % traceback.format_exc()) + return False + return True + + def delete_entry(_): + """C Callback for CustomFunction::delete""" + try: + del Function._registry.ref_holder[key] + except Exception: # pylint: disable=broad-except + print('Error in autograd.Function.delete: %s' % traceback.format_exc()) + return False + return True + + input_handles = [x.handle for x in inputs] + output_handles = [x.handle for x in outputs] + callbacks = [Function._bwd_functype(backward_entry), + Function._del_functype(delete_entry)] + callbacks = [cast(i, CFUNCTYPE(c_int)) for i in callbacks] + 
context = MXCallbackList(c_int(len(callbacks)), + cast(c_array(CFUNCTYPE(c_int), callbacks), + POINTER(CFUNCTYPE(c_int))), + cast(c_array(c_void_p, [None]*len(callbacks)), + POINTER(c_void_p))) + check_call(_LIB.MXCustomFunctionRecord( + c_int(len(inputs)), + c_array(NDArrayHandle, input_handles), + c_int(len(outputs)), + c_array(NDArrayHandle, output_handles), + ctypes.byref(context))) + + Function._registry.ref_holder[key] = context + + return ret_outputs + + def forward(self, *inputs): + """Forward computation.""" + raise NotImplementedError + + def backward(self, *output_grads): + """Backward computation. + + Takes as many inputs as forward's outputs, + and returns as many NDArrays as forward's inputs. + """ + raise NotImplementedError diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 06c6f2d73ecf..d446355da0b5 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -87,6 +87,15 @@ def __str__(self): msg += ' is not supported for SparseNDArray and only available in NDArray.' return msg +class MXCallbackList(ctypes.Structure): + """Structure that holds Callback information. Passed to CustomOpProp.""" + _fields_ = [ + ('num_callbacks', ctypes.c_int), + ('callbacks', ctypes.POINTER(ctypes.CFUNCTYPE(ctypes.c_int))), + ('contexts', ctypes.POINTER(ctypes.c_void_p)) + ] + + def _load_lib(): """Load library by searching possible path.""" lib_path = libinfo.find_lib_path() diff --git a/python/mxnet/gluon/data/dataset.py b/python/mxnet/gluon/data/dataset.py index f3dd691962bc..37d103266d8f 100644 --- a/python/mxnet/gluon/data/dataset.py +++ b/python/mxnet/gluon/data/dataset.py @@ -20,7 +20,7 @@ """Dataset container.""" import os -from ... import recordio, image +from ... import recordio class Dataset(object): """Abstract dataset class. All datasets should have this interface. 
@@ -80,27 +80,3 @@ def __getitem__(self, idx): def __len__(self): return len(self._record.keys) - - -class ImageRecordDataset(RecordFileDataset): - """A dataset wrapping over a RecordIO file containing images. - - Each sample is an image and its corresponding label. - - Parameters - ---------- - filename : str - Path to rec file. - flag : {0, 1}, default 1 - If 0, always convert images to greyscale. - - If 1, always convert images to colored (RGB). - """ - def __init__(self, filename, flag=1): - super(ImageRecordDataset, self).__init__(filename) - self._flag = flag - - def __getitem__(self, idx): - record = super(ImageRecordDataset, self).__getitem__(idx) - header, img = recordio.unpack(record) - return image.imdecode(img, self._flag), header.label diff --git a/python/mxnet/gluon/data/vision.py b/python/mxnet/gluon/data/vision.py index a16e736b027d..b63624508124 100644 --- a/python/mxnet/gluon/data/vision.py +++ b/python/mxnet/gluon/data/vision.py @@ -23,11 +23,12 @@ import gzip import tarfile import struct +import warnings import numpy as np from . import dataset from ..utils import download, check_sha1 -from ... import nd +from ... 
import nd, image, recordio class _DownloadedDataset(dataset.Dataset): @@ -42,7 +43,9 @@ def __init__(self, root, train, transform): self._get_data() def __getitem__(self, idx): - return self._transform(self._data[idx], self._label[idx]) + if self._transform is not None: + return self._transform(self._data[idx], self._label[idx]) + return self._data[idx], self._label[idx] def __len__(self): return len(self._label) @@ -68,7 +71,7 @@ class MNIST(_DownloadedDataset): transform=lambda data, label: (data.astype(np.float32)/255, label) """ def __init__(self, root='~/.mxnet/datasets/', train=True, - transform=lambda data, label: (data, label)): + transform=None): super(MNIST, self).__init__(root, train, transform) def _get_data(self): @@ -116,7 +119,7 @@ class CIFAR10(_DownloadedDataset): transform=lambda data, label: (data.astype(np.float32)/255, label) """ def __init__(self, root='~/.mxnet/datasets/', train=True, - transform=lambda data, label: (data, label)): + transform=None): self._file_hashes = {'data_batch_1.bin': 'aadd24acce27caa71bf4b10992e9e7b2d74c2540', 'data_batch_2.bin': 'c0ba65cce70568cd57b4e03e9ac8d2a5367c1795', 'data_batch_3.bin': '1dd00a74ab1d17a6e7d73e185b69dbf31242f295', @@ -158,3 +161,101 @@ def _get_data(self): self._data = [nd.array(x, dtype=x.dtype) for x in data] self._label = label + + +class ImageRecordDataset(dataset.RecordFileDataset): + """A dataset wrapping over a RecordIO file containing images. + + Each sample is an image and its corresponding label. + + Parameters + ---------- + filename : str + Path to rec file. + flag : {0, 1}, default 1 + If 0, always convert images to greyscale. + + If 1, always convert images to colored (RGB). + transform : function + A user defined callback that transforms each instance. 
For example:: + + transform=lambda data, label: (data.astype(np.float32)/255, label) + """ + def __init__(self, filename, flag=1, transform=None): + super(ImageRecordDataset, self).__init__(filename) + self._flag = flag + self._transform = transform + + def __getitem__(self, idx): + record = super(ImageRecordDataset, self).__getitem__(idx) + header, img = recordio.unpack(record) + if self._transform is not None: + return self._transform(image.imdecode(img, self._flag), header.label) + return image.imdecode(img, self._flag), header.label + + +class ImageFolderDataset(dataset.Dataset): + """A dataset for loading image files stored in a folder structure like:: + + root/car/0001.jpg + root/car/xxxa.jpg + root/car/yyyb.jpg + root/bus/123.jpg + root/bus/023.jpg + root/bus/wwww.jpg + + Parameters + ---------- + root : str + Path to root directory. + flag : {0, 1}, default 1 + If 0, always convert loaded images to greyscale (1 channel). + If 1, always convert loaded images to colored (3 channels). + transform : callable + A function that takes data and label and transforms them:: + + transform = lambda data, label: (data.astype(np.float32)/255, label) + + Attributes + ---------- + synsets : list + List of class names. `synsets[i]` is the name for the integer label `i` + items : list of tuples + List of all images in (filename, label) pairs. 
+ """ + def __init__(self, root, flag=1, transform=None): + self._root = os.path.expanduser(root) + self._flag = flag + self._transform = transform + self._exts = ['.jpg', '.jpeg', '.png'] + self._list_iamges(self._root) + + def _list_iamges(self, root): + self.synsets = [] + self.items = [] + + for folder in sorted(os.listdir(root)): + path = os.path.join(root, folder) + if not os.path.isdir(path): + warnings.warn('Ignoring %s, which is not a directory.'%path, stacklevel=3) + continue + label = len(self.synsets) + self.synsets.append(folder) + for filename in sorted(os.listdir(path)): + filename = os.path.join(path, filename) + ext = os.path.splitext(filename)[1] + if ext.lower() not in self._exts: + warnings.warn('Ignoring %s of type %s. Only support %s'%( + filename, ext, ', '.join(self._exts))) + continue + self.items.append((filename, label)) + + def __getitem__(self, idx): + img = image.imread(self.items[idx][0], self._flag) + label = self.items[idx][1] + if self._transform is not None: + return self._transform(img, label) + return img, label + + def __len__(self): + return len(self.items) diff --git a/python/mxnet/image/detection.py b/python/mxnet/image/detection.py index 8c73f1e5549e..f67b05de5de3 100644 --- a/python/mxnet/image/detection.py +++ b/python/mxnet/image/detection.py @@ -81,7 +81,7 @@ def dumps(self): def __call__(self, src, label): """Augmenter implementation body""" - src = self.augmenter(src)[0] + src = self.augmenter(src) return (src, label) @@ -275,7 +275,7 @@ def _random_crop_proposal(self, label, height, width): from math import sqrt if not self.enabled or height <= 0 or width <= 0: - return None + return () min_area = self.area_range[0] * height * width max_area = self.area_range[1] * height * width for _ in range(self.max_attempts): @@ -317,7 +317,7 @@ def _random_crop_proposal(self, label, height, width): new_label = self._update_labels(label, (x, y, w, h), height, width) if new_label is not None: return (x, y, w, h, new_label) - 
return None + return () class DetRandomPadAug(DetAugmenter): @@ -386,7 +386,7 @@ def _random_pad_proposal(self, label, height, width): """Generate random padding region""" from math import sqrt if not self.enabled or height <= 0 or width <= 0: - return None + return () min_area = self.area_range[0] * height * width max_area = self.area_range[1] * height * width for _ in range(self.max_attempts): @@ -411,7 +411,7 @@ def _random_pad_proposal(self, label, height, width): x = random.randint(0, max(0, w - width)) new_label = self._update_labels(label, (x, y, w, h), height, width) return (x, y, w, h, new_label) - return None + return () def CreateMultiRandCropAugmenter(min_object_covered=0.1, aspect_ratio_range=(0.75, 1.33), @@ -771,7 +771,7 @@ def next(self): continue for datum in [data]: assert i < batch_size, 'Batch size must be multiples of augmenter output length' - batch_data[i][:] = self.postprocess_data(datum) + batch_data[i] = self.postprocess_data(datum) num_object = label.shape[0] batch_label[i][0:num_object] = nd.array(label) if num_object < batch_label[i].shape[0]: diff --git a/python/mxnet/image/image.py b/python/mxnet/image/image.py index 24ad55f636ad..d99db214222c 100644 --- a/python/mxnet/image/image.py +++ b/python/mxnet/image/image.py @@ -41,11 +41,52 @@ from .. import recordio +def imread(filename, *args, **kwargs): + """Read and decode an image to an NDArray. + + Note: `imread` uses OpenCV (not the CV2 Python library). + MXNet must have been built with USE_OPENCV=1 for `imdecode` to work. + + Parameters + ---------- + filename : str + Name of the image file to be loaded. + flag : {0, 1}, default 1 + 1 for three channel color output. 0 for grayscale output. + to_rgb : bool, default True + True for RGB formatted output (MXNet default). + False for BGR formatted output (OpenCV default). + out : NDArray, optional + Output buffer. Use `None` for automatic allocation. + + Returns + ------- + NDArray + An `NDArray` containing the image. 
+ + Example + ------- + >>> mx.img.imread("flower.jpg") + + + Set `flag` parameter to 0 to get grayscale output + + >>> mx.img.imdecode("flower.jpg", flag=0) + + + Set `to_rgb` parameter to 0 to get output in OpenCV format (BGR) + + >>> mx.img.imdecode(str_image, to_rgb=0) + + """ + return _internal._cvimread(filename, *args, **kwargs) + + def imdecode(buf, *args, **kwargs): """Decode an image to an NDArray. Note: `imdecode` uses OpenCV (not the CV2 Python library). - MXNet must have been built with OpenCV for `imdecode` to work. + MXNet must have been built with USE_OPENCV=1 for `imdecode` to work. Parameters ---------- @@ -130,7 +171,7 @@ def scale_down(src_size, size): return int(w), int(h) -def _get_interp_method(interp, sizes=None): +def _get_interp_method(interp, sizes=()): """Get the interpolation method for resize functions. The major purpose of this function is to wrap a random interp method selection and a auto-estimation method. @@ -481,7 +522,7 @@ def __init__(self, size, interp=2): def __call__(self, src): """Augmenter body""" - return [resize_short(src, self.size, self.interp)] + return resize_short(src, self.size, self.interp) class ForceResizeAug(Augmenter): @@ -502,7 +543,7 @@ def __init__(self, size, interp=2): def __call__(self, src): """Augmenter body""" sizes = (src.shape[0], src.shape[1], self.size[1], self.size[0]) - return [imresize(src, *self.size, interp=_get_interp_method(self.interp, sizes))] + return imresize(src, *self.size, interp=_get_interp_method(self.interp, sizes)) class RandomCropAug(Augmenter): @@ -522,7 +563,7 @@ def __init__(self, size, interp=2): def __call__(self, src): """Augmenter body""" - return [random_crop(src, self.size, self.interp)[0]] + return random_crop(src, self.size, self.interp)[0] class RandomSizedCropAug(Augmenter): @@ -549,7 +590,7 @@ def __init__(self, size, min_area, ratio, interp=2): def __call__(self, src): """Augmenter body""" - return [random_size_crop(src, self.size, self.min_area, self.ratio, 
self.interp)[0]] + return random_size_crop(src, self.size, self.min_area, self.ratio, self.interp)[0] class CenterCropAug(Augmenter): @@ -569,7 +610,7 @@ def __init__(self, size, interp=2): def __call__(self, src): """Augmenter body""" - return [center_crop(src, self.size, self.interp)[0]] + return center_crop(src, self.size, self.interp)[0] class RandomOrderAug(Augmenter): @@ -590,10 +631,9 @@ def dumps(self): def __call__(self, src): """Augmenter body""" - src = [src] random.shuffle(self.ts) for t in self.ts: - src = [j for i in src for j in t(i)] + src = t(src) return src @@ -613,7 +653,7 @@ def __call__(self, src): """Augmenter body""" alpha = 1.0 + random.uniform(-self.brightness, self.brightness) src *= alpha - return [src] + return src class ContrastJitterAug(Augmenter): @@ -636,7 +676,7 @@ def __call__(self, src): gray = (3.0 * (1.0 - alpha) / gray.size) * nd.sum(gray) src *= alpha src += gray - return [src] + return src class SaturationJitterAug(Augmenter): @@ -660,7 +700,7 @@ def __call__(self, src): gray *= (1.0 - alpha) src *= alpha src += gray - return [src] + return src class HueJitterAug(Augmenter): @@ -694,7 +734,7 @@ def __call__(self, src): [0.0, vsw, vsu]]) t = np.dot(np.dot(self.tyiq, bt), self.ityiq).T src = nd.dot(src, nd.array(t)) - return [src] + return src class ColorJitterAug(RandomOrderAug): @@ -743,7 +783,7 @@ def __call__(self, src): alpha = np.random.normal(0, self.alphastd, size=(3,)) rgb = np.dot(self.eigvec * alpha, self.eigval) src += nd.array(rgb) - return [src] + return src class ColorNormalizeAug(Augmenter): @@ -763,7 +803,7 @@ def __init__(self, mean, std): def __call__(self, src): """Augmenter body""" - return [color_normalize(src, self.mean, self.std)] + return color_normalize(src, self.mean, self.std) class RandomGrayAug(Augmenter): @@ -785,7 +825,7 @@ def __call__(self, src): """Augmenter body""" if random.random() < self.p: src = nd.dot(src, self.mat) - return [src] + return src class HorizontalFlipAug(Augmenter): @@ 
-804,7 +844,7 @@ def __call__(self, src): """Augmenter body""" if random.random() < self.p: src = nd.flip(src, axis=1) - return [src] + return src class CastAug(Augmenter): @@ -815,7 +855,7 @@ def __init__(self): def __call__(self, src): """Augmenter body""" src = src.astype(np.float32) - return [src] + return src def CreateAugmenter(data_shape, resize=0, rand_crop=False, rand_resize=False, rand_mirror=False, @@ -1108,18 +1148,17 @@ def next(self): try: while i < batch_size: label, s = self.next_sample() - data = [self.imdecode(s)] + data = self.imdecode(s) try: self.check_valid_image(data) except RuntimeError as e: logging.debug('Invalid image, skipping: %s', str(e)) continue data = self.augmentation_transform(data) - for datum in data: - assert i < batch_size, 'Batch size must be multiples of augmenter output length' - batch_data[i][:] = self.postprocess_data(datum) - batch_label[i][:] = label - i += 1 + assert i < batch_size, 'Batch size must be multiples of augmenter output length' + batch_data[i] = self.postprocess_data(data) + batch_label[i] = label + i += 1 except StopIteration: if not i: raise StopIteration @@ -1157,7 +1196,7 @@ def read_image(self, fname): def augmentation_transform(self, data): """Transforms input data with specified augmentation.""" for aug in self.auglist: - data = [ret for src in data for ret in aug(src)] + data = aug(data) return data def postprocess_data(self, datum): diff --git a/python/mxnet/libinfo.py b/python/mxnet/libinfo.py index d6521c558ac4..7da0dcfc8d2d 100644 --- a/python/mxnet/libinfo.py +++ b/python/mxnet/libinfo.py @@ -61,4 +61,4 @@ def find_lib_path(): # current version -__version__ = "0.10.1" +__version__ = "0.11.0" diff --git a/python/mxnet/operator.py b/python/mxnet/operator.py index 692c7fe827ee..1337bbccc3c8 100644 --- a/python/mxnet/operator.py +++ b/python/mxnet/operator.py @@ -26,7 +26,7 @@ from ctypes import CFUNCTYPE, POINTER, Structure, pointer from ctypes import c_void_p, c_int, c_char, c_char_p, cast, 
c_bool -from .base import _LIB, check_call +from .base import _LIB, check_call, MXCallbackList from .base import c_array, c_str, mx_uint, mx_float, ctypes2numpy_shared, NDArrayHandle, py_str from . import symbol, context from .ndarray import NDArray, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP @@ -594,15 +594,6 @@ def register(reg_name): """Register a subclass of CustomOpProp to the registry with name reg_name.""" def do_register(prop_cls): """Register a subclass of CustomOpProp to the registry.""" - - class MXCallbackList(Structure): - """Structure that holds Callback information. Passed to CustomOpProp.""" - _fields_ = [ - ('num_callbacks', c_int), - ('callbacks', POINTER(CFUNCTYPE(c_int))), - ('contexts', POINTER(c_void_p)) - ] - fb_functype = CFUNCTYPE(c_int, c_int, POINTER(c_void_p), POINTER(c_int), POINTER(c_int), c_int, c_void_p) del_functype = CFUNCTYPE(c_int, c_void_p) diff --git a/scala-package/assembly/linux-x86_64-cpu/pom.xml b/scala-package/assembly/linux-x86_64-cpu/pom.xml index 138c5c84304f..2c25e6856fd3 100644 --- a/scala-package/assembly/linux-x86_64-cpu/pom.xml +++ b/scala-package/assembly/linux-x86_64-cpu/pom.xml @@ -6,13 +6,11 @@ ml.dmlc.mxnet mxnet-full-parent_2.11 - 0.10.1-SNAPSHOT + ${project.version} ../pom.xml - ml.dmlc.mxnet mxnet-full_2.11-linux-x86_64-cpu - 0.10.1-SNAPSHOT MXNet Scala Package - Full Linux-x86_64 CPU-only jar diff --git a/scala-package/assembly/linux-x86_64-gpu/pom.xml b/scala-package/assembly/linux-x86_64-gpu/pom.xml index 7e818cb28123..892851281655 100644 --- a/scala-package/assembly/linux-x86_64-gpu/pom.xml +++ b/scala-package/assembly/linux-x86_64-gpu/pom.xml @@ -6,13 +6,11 @@ ml.dmlc.mxnet mxnet-full-parent_2.11 - 0.10.1-SNAPSHOT + ${project.version} ../pom.xml - ml.dmlc.mxnet mxnet-full_2.11-linux-x86_64-gpu - 0.10.1-SNAPSHOT MXNet Scala Package - Full Linux-x86_64 GPU jar diff --git a/scala-package/assembly/osx-x86_64-cpu/pom.xml b/scala-package/assembly/osx-x86_64-cpu/pom.xml index ead035668892..e3f433f673e4 100644 --- 
a/scala-package/assembly/osx-x86_64-cpu/pom.xml +++ b/scala-package/assembly/osx-x86_64-cpu/pom.xml @@ -6,13 +6,11 @@ ml.dmlc.mxnet mxnet-full-parent_2.11 - 0.10.1-SNAPSHOT + ${project.version} ../pom.xml - ml.dmlc.mxnet mxnet-full_2.11-osx-x86_64-cpu - 0.10.1-SNAPSHOT MXNet Scala Package - Full OSX-x86_64 CPU-only jar diff --git a/scala-package/assembly/pom.xml b/scala-package/assembly/pom.xml index a1009ae6b08c..52a2cc42228f 100644 --- a/scala-package/assembly/pom.xml +++ b/scala-package/assembly/pom.xml @@ -6,13 +6,11 @@ ml.dmlc.mxnet mxnet-parent_2.11 - 0.10.1-SNAPSHOT + ${project.version} ../pom.xml - ml.dmlc.mxnet mxnet-full-parent_2.11 - 0.10.1-SNAPSHOT MXNet Scala Package - Full Parent pom diff --git a/scala-package/core/pom.xml b/scala-package/core/pom.xml index 7f639b9a8d39..51e8a3596b1a 100644 --- a/scala-package/core/pom.xml +++ b/scala-package/core/pom.xml @@ -6,13 +6,11 @@ ml.dmlc.mxnet mxnet-parent_2.11 - 0.10.1-SNAPSHOT + ${project.version} ../pom.xml - ml.dmlc.mxnet mxnet-core_2.11 - 0.10.1-SNAPSHOT MXNet Scala Package - Core diff --git a/scala-package/examples/pom.xml b/scala-package/examples/pom.xml index bda4fcdab5c4..356690cf0176 100644 --- a/scala-package/examples/pom.xml +++ b/scala-package/examples/pom.xml @@ -6,12 +6,11 @@ ml.dmlc.mxnet mxnet-parent_2.11 - 0.10.1-SNAPSHOT + ${project.version} ../pom.xml mxnet-examples_2.11 - 0.10.1-SNAPSHOT MXNet Scala Package - Examples diff --git a/scala-package/init-native/linux-x86_64/pom.xml b/scala-package/init-native/linux-x86_64/pom.xml index 7e6c02aefd83..9d784c471cb9 100644 --- a/scala-package/init-native/linux-x86_64/pom.xml +++ b/scala-package/init-native/linux-x86_64/pom.xml @@ -6,12 +6,11 @@ ml.dmlc.mxnet mxnet-scala-init-native-parent - 0.10.1-SNAPSHOT + ${project.version} ../pom.xml libmxnet-init-scala-linux-x86_64 - 0.10.1-SNAPSHOT MXNet Scala Package - Initializer Native Linux-x86_64 http://maven.apache.org diff --git a/scala-package/init-native/osx-x86_64/pom.xml 
b/scala-package/init-native/osx-x86_64/pom.xml index 4f5125c06f15..fb3748e5698f 100644 --- a/scala-package/init-native/osx-x86_64/pom.xml +++ b/scala-package/init-native/osx-x86_64/pom.xml @@ -6,12 +6,11 @@ ml.dmlc.mxnet mxnet-scala-init-native-parent - 0.10.1-SNAPSHOT + ${project.version} ../pom.xml libmxnet-init-scala-osx-x86_64 - 0.10.1-SNAPSHOT MXNet Scala Package - Initializer Native OSX-x86_64 http://maven.apache.org diff --git a/scala-package/init-native/pom.xml b/scala-package/init-native/pom.xml index 3ce227a9b587..2b633169501d 100644 --- a/scala-package/init-native/pom.xml +++ b/scala-package/init-native/pom.xml @@ -6,12 +6,11 @@ ml.dmlc.mxnet mxnet-parent_2.11 - 0.10.1-SNAPSHOT + ${project.version} ../pom.xml mxnet-scala-init-native-parent - 0.10.1-SNAPSHOT MXNet Scala Package - Initializer Native Parent pom diff --git a/scala-package/init/pom.xml b/scala-package/init/pom.xml index 9f079565874e..04413e219429 100644 --- a/scala-package/init/pom.xml +++ b/scala-package/init/pom.xml @@ -6,12 +6,11 @@ ml.dmlc.mxnet mxnet-parent_2.11 - 0.10.1-SNAPSHOT + ${project.version} ../pom.xml mxnet-init_2.11 - 0.10.1-SNAPSHOT MXNet Scala Package - Initializer diff --git a/scala-package/macros/pom.xml b/scala-package/macros/pom.xml index fd7fe3e4ab7b..2a1498cb2639 100644 --- a/scala-package/macros/pom.xml +++ b/scala-package/macros/pom.xml @@ -6,12 +6,11 @@ ml.dmlc.mxnet mxnet-parent_2.11 - 0.10.1-SNAPSHOT + ${project.version} ../pom.xml mxnet-macros_2.11 - 0.10.1-SNAPSHOT MXNet Scala Package - Macros diff --git a/scala-package/native/linux-x86_64-cpu/pom.xml b/scala-package/native/linux-x86_64-cpu/pom.xml index b2cfa4263cda..df45cd9e6c2e 100644 --- a/scala-package/native/linux-x86_64-cpu/pom.xml +++ b/scala-package/native/linux-x86_64-cpu/pom.xml @@ -6,13 +6,11 @@ ml.dmlc.mxnet mxnet-scala-native-parent - 0.10.1-SNAPSHOT + ${project.version} ../pom.xml - ml.dmlc.mxnet libmxnet-scala-linux-x86_64-cpu - 0.10.1-SNAPSHOT MXNet Scala Package - Native Linux-x86_64 CPU-only 
http://maven.apache.org diff --git a/scala-package/native/linux-x86_64-gpu/pom.xml b/scala-package/native/linux-x86_64-gpu/pom.xml index 27f9221c3bad..edc70e923ff7 100644 --- a/scala-package/native/linux-x86_64-gpu/pom.xml +++ b/scala-package/native/linux-x86_64-gpu/pom.xml @@ -6,13 +6,11 @@ ml.dmlc.mxnet mxnet-scala-native-parent - 0.10.1-SNAPSHOT + ${project.version} ../pom.xml - ml.dmlc.mxnet libmxnet-scala-linux-x86_64-gpu - 0.10.1-SNAPSHOT MXNet Scala Package - Native Linux-x86_64 GPU http://maven.apache.org diff --git a/scala-package/native/osx-x86_64-cpu/pom.xml b/scala-package/native/osx-x86_64-cpu/pom.xml index f924106a605c..b6fb83f26a71 100644 --- a/scala-package/native/osx-x86_64-cpu/pom.xml +++ b/scala-package/native/osx-x86_64-cpu/pom.xml @@ -6,12 +6,11 @@ ml.dmlc.mxnet mxnet-scala-native-parent - 0.10.1-SNAPSHOT + ${project.version} ../pom.xml libmxnet-scala-osx-x86_64-cpu - 0.10.1-SNAPSHOT MXNet Scala Package - Native OSX-x86_64 CPU-only http://maven.apache.org diff --git a/scala-package/native/pom.xml b/scala-package/native/pom.xml index 0af9e087f906..e68ebb96666e 100644 --- a/scala-package/native/pom.xml +++ b/scala-package/native/pom.xml @@ -6,12 +6,11 @@ ml.dmlc.mxnet mxnet-parent_2.11 - 0.10.1-SNAPSHOT + ${project.version} ../pom.xml mxnet-scala-native-parent - 0.10.1-SNAPSHOT MXNet Scala Package - Native Parent pom diff --git a/scala-package/pom.xml b/scala-package/pom.xml index 86d8cfc16a43..7bfd8774de6b 100644 --- a/scala-package/pom.xml +++ b/scala-package/pom.xml @@ -5,7 +5,7 @@ 4.0.0 ml.dmlc.mxnet mxnet-parent_2.11 - 0.10.1-SNAPSHOT + ${project.version} MXNet Scala Package - Parent https://github.com/dmlc/mxnet/tree/master/scala-package MXNet Scala Package @@ -48,6 +48,7 @@ + 0.11.0-SNAPSHOT 2.11.8 2.11 diff --git a/scala-package/spark/pom.xml b/scala-package/spark/pom.xml index f35cbe45d9de..18170b95579b 100644 --- a/scala-package/spark/pom.xml +++ b/scala-package/spark/pom.xml @@ -6,12 +6,11 @@ ml.dmlc.mxnet mxnet-parent_2.11 - 
0.10.1-SNAPSHOT + ${project.version} ../pom.xml mxnet-spark_2.11 - 0.10.1-SNAPSHOT MXNet Scala Package - Spark ML diff --git a/setup-utils/install-mxnet-windows-python.bat b/setup-utils/install-mxnet-windows-python.bat index 206c66c4c008..021baaeff331 100644 --- a/setup-utils/install-mxnet-windows-python.bat +++ b/setup-utils/install-mxnet-windows-python.bat @@ -1,3 +1,20 @@ +rem Licensed to the Apache Software Foundation (ASF) under one +rem or more contributor license agreements. See the NOTICE file +rem distributed with this work for additional information +rem regarding copyright ownership. The ASF licenses this file +rem to you under the Apache License, Version 2.0 (the +rem "License"); you may not use this file except in compliance +rem with the License. You may obtain a copy of the License at +rem +rem http://www.apache.org/licenses/LICENSE-2.0 +rem +rem Unless required by applicable law or agreed to in writing, +rem software distributed under the License is distributed on an +rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +rem KIND, either express or implied. See the License for the +rem specific language governing permissions and limitations +rem under the License. + @echo off setlocal :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: diff --git a/snapcraft.yaml b/snapcraft.yaml index b9329a0ccd41..27356c332a29 100644 --- a/snapcraft.yaml +++ b/snapcraft.yaml @@ -1,5 +1,5 @@ name: mxnet -version: '0.10.1' +version: '0.11.0' summary: MXNet is a deep learning framework designed for efficiency and flexibility. description: | MXNet is a deep learning framework designed for both efficiency and diff --git a/src/c_api/c_api_function.cc b/src/c_api/c_api_function.cc new file mode 100644 index 000000000000..3d8b5328c1a0 --- /dev/null +++ b/src/c_api/c_api_function.cc @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file custom.cc + * \brief + * \author Junyuan Xie +*/ +#include +#include +#include + +#include "./c_api_common.h" +#include "../ndarray/autograd.h" + +namespace mxnet { +namespace custom_function { + +struct CustomFunctionParam { + size_t num_args, num_outs; + std::shared_ptr info; + std::vector out_shapes; + std::vector out_dtypes; +}; + +std::vector Gradient( + const nnvm::NodePtr& n, + const std::vector& out_grads) { + const CustomFunctionParam& params = nnvm::get(n->attrs.parsed); + + nnvm::NodePtr g = nnvm::Node::Create(); + g->attrs.op = nnvm::Op::Get("_backward_CustomFunction"); + g->attrs.name = n->attrs.name + "_backward"; + g->attrs.parsed = params; + g->control_deps.emplace_back(n); + + g->inputs = out_grads; + + std::vector ret; + for (index_t i = 0; i < g->num_outputs(); ++i) { + ret.emplace_back(nnvm::NodeEntry{g, i, 0}); + } + + return ret; +} + +OpStatePtr CreateState(const nnvm::NodeAttrs& attrs, + Context ctx, + const std::vector& ishape, + const std::vector& itype) { + LOG(FATAL) << "Not reached"; + return OpStatePtr::Create(nullptr); +} + +void Forward(const OpStatePtr& state, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + LOG(FATAL) << "Not reached"; +} + +void 
Backward(const OpStatePtr& state, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const CustomFunctionParam& params = state.get_state(); + + std::vector ptrs; + + for (const auto& i : inputs) { + NDArray* nd = new NDArray(i.Detach()); + ptrs.push_back(reinterpret_cast(nd)); + } + for (const auto& i : outputs) { + NDArray* nd = new NDArray(i.Detach()); + ptrs.push_back(reinterpret_cast(nd)); + } + + bool prev_recording = autograd::AutogradRuntime::Get()->SetIsRecording(false); + bool prev_training = autograd::AutogradRuntime::Get()->SetIsTraining(ctx.is_train); + + CHECK(reinterpret_cast( + params.info->callbacks[kCustomFunctionBackward])( + inputs.size(), outputs.size(), ptrs.data(), + reinterpret_cast(req.data()), ctx.is_train, + params.info->contexts[kCustomFunctionBackward])); + + autograd::AutogradRuntime::Get()->SetIsTraining(prev_training); + autograd::AutogradRuntime::Get()->SetIsRecording(prev_recording); +} + + +NNVM_REGISTER_OP(_CustomFunction) +.set_num_inputs([](const NodeAttrs& attrs) { + const CustomFunctionParam& params = nnvm::get(attrs.parsed); + return params.num_args; + }) +.set_num_outputs([](const NodeAttrs& attrs) { + const CustomFunctionParam& params = nnvm::get(attrs.parsed); + return params.num_outs; + }) +.set_attr("FInferShape", + [](const NodeAttrs& attrs, std::vector *in_shape, + std::vector *out_shape) { + const CustomFunctionParam& params = nnvm::get(attrs.parsed); + *out_shape = params.out_shapes; + return true; + }) +.set_attr("FInferType", + [](const NodeAttrs& attrs, std::vector *in_type, + std::vector *out_type) { + const CustomFunctionParam& params = nnvm::get(attrs.parsed); + *out_type = params.out_dtypes; + return true; + }) +.set_attr("FCreateOpState", CreateState) +.set_attr("FGradient", Gradient) +.set_attr("FStatefulComputeEx", Forward) +.set_attr("FStatefulComputeEx", Forward); + + +NNVM_REGISTER_OP(_backward_CustomFunction) +.set_num_inputs([](const 
NodeAttrs& attrs) { + const CustomFunctionParam& params = nnvm::get(attrs.parsed); + return params.num_outs; + }) +.set_num_outputs([](const NodeAttrs& attrs) { + const CustomFunctionParam& params = nnvm::get(attrs.parsed); + return params.num_args; + }) +.set_attr("TIsBackward", true) +.set_attr("TIsLayerOpBackward", true) +.set_attr("FExecType", [](const NodeAttrs& attrs) { + return ExecType::kLocal; + }) +.set_attr("FStatefulComputeEx", Backward) +.set_attr("FStatefulComputeEx", Backward); + +} // namespace custom_function +} // namespace mxnet + +int MXCustomFunctionRecord(int num_inputs, NDArrayHandle *inputs, + int num_outputs, NDArrayHandle *outputs, + MXCallbackList *callbacks) { + using namespace mxnet; + using namespace mxnet::custom_function; + using mxnet::autograd::AutogradRuntime; + API_BEGIN(); + CHECK(AutogradRuntime::Get()->IsRecording()); + std::vector ndinputs, ndoutputs; + for (int i = 0; i < num_inputs; ++i) { + ndinputs.emplace_back(*reinterpret_cast(inputs[i])); + } + for (int i = 0; i < num_outputs; ++i) { + ndoutputs.emplace_back(*reinterpret_cast(outputs[i])); + } + CustomFunctionParam params; + params.num_args = num_inputs; + params.num_outs = num_outputs; + params.info.reset(callbacks, [](MXCallbackList* ptr){ + reinterpret_cast(ptr->callbacks[kCustomFunctionDelete])( + ptr->contexts[kCustomFunctionDelete]); + }); + for (const auto& i : ndoutputs) { + params.out_shapes.emplace_back(i.shape()); + params.out_dtypes.emplace_back(i.dtype()); + } + nnvm::NodeAttrs attrs; + attrs.op = nnvm::Op::Get("_CustomFunction"); + attrs.parsed = params; + // TODO(piiswrong): remove state by using FComputeEx + auto state = OpStatePtr::Create(params); + AutogradRuntime::Get()->RecordImperativeOperator( + state, attrs.op, attrs, &ndinputs, &ndoutputs); + + for (size_t i = 0; i < ndoutputs.size(); ++i) { + *reinterpret_cast(outputs[i]) = ndoutputs[i]; + } + + API_END(); +} diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index 
d5296f6a12db..d392baf45d3e 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -786,3 +786,11 @@ int MXAutogradBackwardEx(mx_uint num_output, AutogradRuntime::Get()->ComputeGradient(outputs, ograds, retain_graph, is_train); API_END(); } + +int MXAutogradGetSymbol(NDArrayHandle handle, SymbolHandle *out) { + API_BEGIN(); + NDArray *head = reinterpret_cast(handle); + auto sym = new nnvm::Symbol(head->get_autograd_symbol()); + *out = reinterpret_cast(sym); + API_END(); +} diff --git a/src/common/cuda_utils.h b/src/common/cuda_utils.h index 2879ab3cbec2..8897007207fb 100644 --- a/src/common/cuda_utils.h +++ b/src/common/cuda_utils.h @@ -87,6 +87,35 @@ inline const char* CublasGetErrorString(cublasStatus_t error) { return "Unknown cuBLAS status"; } +/*! + * \brief Get string representation of cuSOLVER errors. + * \param error The error. + * \return String representation. + */ +inline const char* CusolverGetErrorString(cusolverStatus_t error) { + switch (error) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + default: + break; + } + return "Unknown cuSOLVER status"; +} + /*! * \brief Get string representation of cuRAND errors. * \param status The status. @@ -164,6 +193,19 @@ inline const char* CurandGetErrorString(curandStatus_t status) { << "cuBLAS: " << common::cuda::CublasGetErrorString(e); \ } +/*! + * \brief Protected cuSolver call. 
+ * \param func Expression to call. + * + * It checks for cuSolver errors after invocation of the expression. + */ +#define CUSOLVER_CALL(func) \ + { \ + cusolverStatus_t e = (func); \ + CHECK_EQ(e, CUSOLVER_STATUS_SUCCESS) \ + << "cuSolver: " << common::cuda::CusolverGetErrorString(e); \ + } + /*! * \brief Protected cuRAND call. * \param func Expression to call. diff --git a/src/io/image_io.cc b/src/io/image_io.cc index f9d7f33a5a44..e6b5a624448e 100644 --- a/src/io/image_io.cc +++ b/src/io/image_io.cc @@ -33,6 +33,8 @@ #include #include +#include + #include "../operator/elemwise_op_common.h" #if MXNET_USE_OPENCV @@ -108,8 +110,66 @@ struct ImdecodeParam : public dmlc::Parameter { "(instead of opencv's default BGR)."); } }; + DMLC_REGISTER_PARAMETER(ImdecodeParam); +struct ImreadParam : public dmlc::Parameter { + std::string filename; + int flag; + bool to_rgb; + DMLC_DECLARE_PARAMETER(ImreadParam) { + DMLC_DECLARE_FIELD(filename) + .describe("Name of the image file to be loaded."); + DMLC_DECLARE_FIELD(flag) + .set_lower_bound(0) + .set_default(1) + .describe("Convert decoded image to grayscale (0) or color (1)."); + DMLC_DECLARE_FIELD(to_rgb) + .set_default(true) + .describe("Whether to convert decoded image to mxnet's default RGB format " + "(instead of opencv's default BGR)."); + } +}; + +DMLC_REGISTER_PARAMETER(ImreadParam); + + +#if MXNET_USE_OPENCV +void ImdecodeImpl(int flag, bool to_rgb, void* data, size_t size, + NDArray* out) { + cv::Mat buf(1, size, CV_8U, data); + cv::Mat dst; + if (out->is_none()) { + cv::Mat res = cv::imdecode(buf, flag); + if (res.empty()) { + LOG(INFO) << "Invalid image file. Only supports png and jpg."; + *out = NDArray(); + return; + } + *out = NDArray(mshadow::Shape3(res.rows, res.cols, flag == 0 ? 1 : 3), + Context::CPU(), false, mshadow::kUint8); + dst = cv::Mat(out->shape()[0], out->shape()[1], flag == 0 ? 
CV_8U : CV_8UC3, + out->data().dptr_); + res.copyTo(dst); + } else { + dst = cv::Mat(out->shape()[0], out->shape()[1], flag == 0 ? CV_8U : CV_8UC3, + out->data().dptr_); +#if (CV_MAJOR_VERSION > 2 || (CV_MAJOR_VERSION == 2 && CV_MINOR_VERSION >=4)) + cv::imdecode(buf, flag, &dst); +#else + cv::Mat tmp = cv::imdecode(buf, flag); + CHECK(!tmp.empty()); + tmp.copyTo(dst); +#endif + } + CHECK(!dst.empty()); + CHECK_EQ(static_cast(dst.ptr()), out->data().dptr_); + if (to_rgb && flag != 0) { + cv::cvtColor(dst, dst, CV_BGR2RGB); + } +} +#endif // MXNET_USE_OPENCV + void Imdecode(const nnvm::NodeAttrs& attrs, const std::vector& inputs, std::vector* outputs) { @@ -118,63 +178,71 @@ void Imdecode(const nnvm::NodeAttrs& attrs, CHECK_EQ(inputs[0].ctx().dev_mask(), cpu::kDevMask) << "Only supports cpu input"; CHECK_EQ(inputs[0].dtype(), mshadow::kUint8) << "Input needs to be uint8 buffer"; - const uint8_t* str_img = reinterpret_cast(inputs[0].data().dptr_); - uint32_t len = inputs[0].shape().Size(); + inputs[0].WaitToRead(); - NDArray ndin = inputs[0]; - ndin.WaitToRead(); + uint8_t* str_img = inputs[0].data().dptr(); + size_t len = inputs[0].shape().Size(); TShape oshape(3); oshape[2] = param.flag == 0 ? 1 : 3; if (get_jpeg_size(str_img, len, &oshape[1], &oshape[0])) { } else if (get_png_size(str_img, len, &oshape[1], &oshape[0])) { } else { - cv::Mat buf(1, ndin.shape().Size(), CV_8U, ndin.data().dptr_); - cv::Mat res = cv::imdecode(buf, param.flag); - if (res.empty()) { - LOG(INFO) << "Invalid image file. Only supports png and jpg."; - (*outputs)[0] = NDArray(); - return; - } - oshape[0] = res.rows; - oshape[1] = res.cols; - NDArray ndout(oshape, Context::CPU(), false, mshadow::kUint8); - cv::Mat dst(ndout.shape()[0], ndout.shape()[1], - param.flag == 0 ? 
CV_8U : CV_8UC3, - ndout.data().dptr_); - res.copyTo(dst); - if (param.to_rgb && param.flag != 0) { - cv::cvtColor(dst, dst, CV_BGR2RGB); - } - (*outputs)[0] = ndout; + (*outputs)[0] = NDArray(); + ImdecodeImpl(param.flag, param.to_rgb, str_img, len, &((*outputs)[0])); return; } - NDArray ndout(oshape, Context::CPU(), true, mshadow::kUint8); - Engine::Get()->PushSync([ndin, ndout, param](RunContext ctx){ - cv::Mat buf(1, ndin.shape().Size(), CV_8U, ndin.data().dptr_); - cv::Mat dst(ndout.shape()[0], ndout.shape()[1], - param.flag == 0 ? CV_8U : CV_8UC3, - ndout.data().dptr_); -#if (CV_MAJOR_VERSION > 2 || (CV_MAJOR_VERSION == 2 && CV_MINOR_VERSION >=4)) - cv::imdecode(buf, param.flag, &dst); -#else - cv::Mat tmp = cv::imdecode(buf, param.flag); - CHECK(!tmp.empty()); - tmp.copyTo(dst); -#endif - CHECK(!dst.empty()); - CHECK_EQ(static_cast(dst.ptr()), ndout.data().dptr_); - if (param.to_rgb && param.flag != 0) { - cv::cvtColor(dst, dst, CV_BGR2RGB); - } + const NDArray& ndin = inputs[0]; + NDArray& ndout = (*outputs)[0]; + ndout = NDArray(oshape, Context::CPU(), true, mshadow::kUint8); + Engine::Get()->PushSync([ndin, ndout, str_img, len, param](RunContext ctx){ + ImdecodeImpl(param.flag, param.to_rgb, str_img, len, + const_cast(&ndout)); }, ndout.ctx(), {ndin.var()}, {ndout.var()}, FnProperty::kNormal, 0, PROFILER_MESSAGE("Imdecode")); - (*outputs)[0] = ndout; #else LOG(FATAL) << "Build with USE_OPENCV=1 for image io."; #endif // MXNET_USE_OPENCV } +void Imread(const nnvm::NodeAttrs& attrs, + const std::vector& inputs, + std::vector* outputs) { +#if MXNET_USE_OPENCV + const auto& param = nnvm::get(attrs.parsed); + + std::ifstream file(param.filename, std::ios::binary | std::ios::ate); + size_t fsize = file.tellg(); + file.seekg(0, std::ios::beg); + auto buff = new uint8_t[fsize]; + file.read(reinterpret_cast(buff), fsize); + CHECK(file.good()) << "Failed reading image file " << param.filename; + + TShape oshape(3); + oshape[2] = param.flag == 0 ? 
1 : 3; + if (get_jpeg_size(buff, fsize, &oshape[1], &oshape[0])) { + } else if (get_png_size(buff, fsize, &oshape[1], &oshape[0])) { + } else { + (*outputs)[0] = NDArray(); + ImdecodeImpl(param.flag, param.to_rgb, buff, fsize, &((*outputs)[0])); + delete buff; + return; + } + + NDArray& ndout = (*outputs)[0]; + ndout = NDArray(oshape, Context::CPU(), true, mshadow::kUint8); + Engine::Get()->PushSync([ndout, buff, fsize, param](RunContext ctx){ + ImdecodeImpl(param.flag, param.to_rgb, buff, fsize, + const_cast(&ndout)); + delete buff; + }, ndout.ctx(), {}, {ndout.var()}, + FnProperty::kNormal, 0, PROFILER_MESSAGE("Imread")); +#else + LOG(FATAL) << "Build with USE_OPENCV=1 for image io."; +#endif // MXNET_USE_OPENCV +} + + struct ResizeParam : public dmlc::Parameter { int w; int h; @@ -301,6 +369,16 @@ NNVM_REGISTER_OP(_cvimdecode) .add_argument("buf", "NDArray", "Buffer containing binary encoded image") .add_arguments(ImdecodeParam::__FIELDS__()); +NNVM_REGISTER_OP(_cvimread) +.describe("Read and decode image with OpenCV. \n" + "Note: return image in RGB by default, " + "instead of OpenCV's default BGR.") +.set_num_inputs(0) +.set_num_outputs(1) +.set_attr_parser(op::ParamParser) +.set_attr("FNDArrayFunction", Imread) +.add_arguments(ImreadParam::__FIELDS__()); + NNVM_REGISTER_OP(_cvimresize) .describe("Resize image with OpenCV. 
\n") .set_num_inputs(1) diff --git a/src/ndarray/autograd.cc b/src/ndarray/autograd.cc index 33d0d5d307ed..78b98dabc661 100644 --- a/src/ndarray/autograd.cc +++ b/src/ndarray/autograd.cc @@ -82,19 +82,21 @@ void AutogradRuntime::MarkVariables( for (uint32_t i = 0; i < variables.size(); ++i) { std::string str_c(std::to_string(variable_count_++)); - AGNodeEntry e{AGNode::Create(Node::Create()), 0, 0}; + AGNodeEntry e{ + AGNode::Create( + nnvm::Symbol::CreateVariable("var" + str_c).outputs[0].node), 0, 0}; variables[i]->entry_.clear(); e.ag_node->outputs.emplace_back(*variables[i]); - AGNodeEntry ge{AGNode::Create(Node::Create()), 0, 0}; + AGNodeEntry ge{ + AGNode::Create( + nnvm::Symbol::CreateVariable("grad" + str_c).outputs[0].node), 0, 0}; gradients[i]->entry_.clear(); ge.ag_node->outputs.emplace_back(*gradients[i]); - ge.ag_node->nn_node->attrs.name = "grad" + str_c; gradients[i]->entry_ = std::move(ge); e.ag_node->out_grads.emplace_back(*gradients[i]); e.ag_node->grad_req = static_cast(grad_reqs[i]); - e.ag_node->nn_node->attrs.name = "var" + str_c; variables[i]->entry_ = std::move(e); // assign last to prevent cyclic reference } } @@ -141,10 +143,12 @@ AGNodePtr AutogradRuntime::RecordOp(const nnvm::Op* op, for (size_t i = 0; i < inputs.size(); ++i) { if (inputs[i].entry_.is_none()) { - AGNodeEntry e{AGNode::Create(Node::Create()), 0, 0}; + AGNodeEntry e{ + AGNode::Create( + nnvm::Symbol::CreateVariable( + "null" + std::to_string(variable_count_++)).outputs[0].node), 0, 0}; e.ag_node->outputs.emplace_back(inputs[i]); e.ag_node->out_grads.emplace_back(); - e.ag_node->nn_node->attrs.name = "var_" + std::to_string(variable_count_++); inputs[i].entry_ = std::move(e); // assign last to prevent cyclic reference } nn_node->inputs.push_back(inputs[i].entry_.nn_entry()); @@ -177,7 +181,7 @@ void AutogradRuntime::ComputeGradient(const std::vector& outputs, for (const auto& i : outputs) { CHECK(!i.entry_.is_none()) << "Cannot differentiate node because it is not in a 
computational graph. " - << "You need to set is_training to true or use autograd.record() to save " + << "You need to set is_recording to true or use autograd.record() to save " << "computational graphs for backward. If you want to differentiate the same " << "graph twice, you need to pass retain_graph=True to backward."; heads.emplace_back(i.entry_); diff --git a/include/mxnet/c_lapack_api.h b/src/operator/c_lapack_api.h similarity index 74% rename from include/mxnet/c_lapack_api.h rename to src/operator/c_lapack_api.h index 1ae90a9396d5..96a9b3a23709 100644 --- a/include/mxnet/c_lapack_api.h +++ b/src/operator/c_lapack_api.h @@ -19,14 +19,24 @@ /*! * \file c_lapack_api.h - * \brief Unified interface for LAPACK calls from within mxnet. + * \brief Unified interface for CPU-based LAPACK calls. * Purpose is to hide the platform specific differences. */ -#ifndef MXNET_C_LAPACK_API_H_ -#define MXNET_C_LAPACK_API_H_ +#ifndef MXNET_OPERATOR_C_LAPACK_API_H_ +#define MXNET_OPERATOR_C_LAPACK_API_H_ // Manually maintained list of LAPACK interfaces that can be used // within MXNET. Conventions: +// - We should only import LAPACK-functions that are useful and +// ensure that we support them most efficiently on CPU/GPU. As an +// example take "potrs": It can be emulated by two calls to +// "trsm" (from BLAS3) so not really needed from functionality point +// of view. In addition, trsm on GPU supports batch-mode processing +// which is much more efficient for a bunch of smaller matrices while +// there is no such batch support for potrs. As a result, we may +// not support "potrs" internally and if we want to expose it to the user as +// a convenience operator at some time, then we may implement it internally +// as a sequence of trsm. 
// - Interfaces must be compliant with lapacke.h in terms of signature and // naming conventions so wrapping a function "foo" which has the // signature @@ -36,14 +46,21 @@ // Note that function signatures in lapacke.h will always have as first // argument the storage order (row/col-major). All wrappers have to support // that argument. The underlying fortran functions will always assume a -// column-major layout. It is the responsibility of the wrapper function -// to handle the (usual) case that it is called with data in row-major -// format, either by doing appropriate transpositions explicitly or using -// transposition options of the underlying fortran function. -// - It is ok to assume that matrices are stored in contiguous memory -// (which removes the need to do special handling for lda/ldb parameters -// and enables us to save additional matrix transpositions around -// the fortran calls). +// column-major layout. +// - In the (usual) case that a wrapper is called specifying row-major storage +// order of input/output data, there are two ways to handle this: +// 1) The wrapper may support this without allocating any additional memory +// for example by exploiting the fact that a matrix is symmetric and switching +// certain flags (upper/lower triangular) when calling the fortran code. +// 2) The wrapper may cause a runtime error. In that case it should be clearly +// documented that these functions do only support col-major layout. +// Rationale: This is a low level interface that is not expected to be called +// directly from many upstream functions. Usually all calls should go through +// the tensor-based interfaces in linalg.h which simplify calls to lapack further +// and are better suited to handle additional transpositions that may be necessary. +// Also we want to push allocation of temporary storage higher up in order to +// allow more efficient re-use of temporal storage. 
And don't want to plaster +// these interfaces here with additional requirements of providing buffers. // - It is desired to add some basic checking in the C++-wrappers in order // to catch simple mistakes when calling these wrappers. // - Must support compilation without lapack-package but issue runtime error in this case. @@ -54,9 +71,10 @@ using namespace mshadow; extern "C" { + // Fortran signatures #define MXNET_LAPACK_FSIGNATURE1(func, dtype) \ - void func##_(char* uplo, int* n, dtype* a, int* lda, int *info); + void func##_(char *uplo, int *n, dtype *a, int *lda, int *info); MXNET_LAPACK_FSIGNATURE1(spotrf, float) MXNET_LAPACK_FSIGNATURE1(dpotrf, double) @@ -73,9 +91,6 @@ extern "C" { #define MXNET_LAPACK_ROW_MAJOR 101 #define MXNET_LAPACK_COL_MAJOR 102 -#define CHECK_LAPACK_CONTIGUOUS(a, b) \ - CHECK_EQ(a, b) << "non contiguous memory for array in lapack call"; - #define CHECK_LAPACK_UPLO(a) \ CHECK(a == 'U' || a == 'L') << "neither L nor U specified as triangle in lapack call"; @@ -117,9 +132,9 @@ inline void flip(int m, int n, #if MXNET_USE_LAPACK + // These functions can be called with either row- or col-major format. #define MXNET_LAPACK_CWRAPPER1(func, dtype) \ - inline int MXNET_LAPACK_##func(int matrix_layout, char uplo, int n, dtype* a, int lda ) { \ - CHECK_LAPACK_CONTIGUOUS(n, lda); \ + inline int MXNET_LAPACK_##func(int matrix_layout, char uplo, int n, dtype *a, int lda) { \ CHECK_LAPACK_UPLO(uplo); \ char o(loup(uplo, (matrix_layout == MXNET_LAPACK_ROW_MAJOR))); \ int ret(0); \ @@ -172,7 +187,7 @@ inline void flip(int m, int n, // Define compilable stubs. #define MXNET_LAPACK_CWRAPPER1(func, dtype) \ - inline int MXNET_LAPACK_##func(int matrix_layout, char uplo, int n, dtype* a, int lda ) { \ + inline int MXNET_LAPACK_##func(int matrix_layout, char uplo, int n, dtype* a, int lda) { \ LOG(FATAL) << "MXNet build without lapack. 
Function " << #func << " is not available."; \ return 1; \ } @@ -209,4 +224,4 @@ inline int MXNET_LAPACK_posv(int matrix_layout, char uplo, int n, return mxnet_lapack_dposv(matrix_layout, uplo, n, nrhs, a, lda, b, ldb); } -#endif // MXNET_C_LAPACK_API_H_ +#endif // MXNET_OPERATOR_C_LAPACK_API_H_ diff --git a/src/operator/contrib/krprod.h b/src/operator/contrib/krprod.h index 6ce94c648d79..a54ece79e9d7 100644 --- a/src/operator/contrib/krprod.h +++ b/src/operator/contrib/krprod.h @@ -26,7 +26,7 @@ #define MXNET_OPERATOR_CONTRIB_KRPROD_H_ #include #include "mshadow/tensor.h" -#include "mxnet/c_lapack_api.h" +#include "../c_lapack_api.h" namespace mxnet { namespace op { diff --git a/src/operator/custom/custom.cc b/src/operator/custom/custom.cc index 7b257ba843c3..59414d30ddc3 100644 --- a/src/operator/custom/custom.cc +++ b/src/operator/custom/custom.cc @@ -286,13 +286,15 @@ void Forward(const OpStatePtr& state, tags.push_back(4); } - bool old = autograd::AutogradRuntime::Get()->SetIsRecording(false); + bool prev_recording = autograd::AutogradRuntime::Get()->SetIsRecording(false); + bool prev_training = autograd::AutogradRuntime::Get()->SetIsTraining(ctx.is_train); CHECK(reinterpret_cast(params.info->callbacks[kCustomOpForward])( ptrs.size(), ptrs.data(), tags.data(), reinterpret_cast(req.data()), static_cast(ctx.is_train), params.info->contexts[kCustomOpForward])); - autograd::AutogradRuntime::Get()->SetIsRecording(old); + autograd::AutogradRuntime::Get()->SetIsTraining(prev_training); + autograd::AutogradRuntime::Get()->SetIsRecording(prev_recording); } @@ -330,13 +332,15 @@ void Backward(const OpStatePtr& state, tags.push_back(4); } - bool old = autograd::AutogradRuntime::Get()->SetIsRecording(false); + bool prev_recording = autograd::AutogradRuntime::Get()->SetIsRecording(false); + bool prev_training = autograd::AutogradRuntime::Get()->SetIsTraining(ctx.is_train); CHECK(reinterpret_cast(params.info->callbacks[kCustomOpBackward])( ptrs.size(), ptrs.data(), 
tags.data(), reinterpret_cast(req.data()), static_cast(ctx.is_train), params.info->contexts[kCustomOpBackward])); - autograd::AutogradRuntime::Get()->SetIsRecording(old); + autograd::AutogradRuntime::Get()->SetIsTraining(prev_training); + autograd::AutogradRuntime::Get()->SetIsRecording(prev_recording); } diff --git a/src/operator/linalg.h b/src/operator/linalg.h new file mode 100644 index 000000000000..9284a5825d2c --- /dev/null +++ b/src/operator/linalg.h @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file linalg.h + * \brief Unified tensor interface for advanced linear algebra functions + * (specifically BLAS3/LAPACK) from within mxnet. + */ +#ifndef MXNET_OPERATOR_LINALG_H_ +#define MXNET_OPERATOR_LINALG_H_ + +#include +#include "./c_lapack_api.h" +using namespace mshadow; + +// The purpose of this header is to expose the interfaces of the advanced +// linear algebra functions without clutter by the implementations. In contrast +// to the implementations in linalg_inline.h, no macros are used to generate +// similar functions that just differ by name/type in order to improve readability. 
+// +// Guidelines for extensions: +// For any type of computation the following should be provided at minimum: +// - 1 templated function supporting cpu/gpu float/double in non-batch mode +// - 1 templated function supporting cpu/gpu float/double in batch mode +// Naming conventions: +// - linalg_() +// - linalg_batch_() +// Signatures of CPU/GPU versions should be equivalent whenever possible including +// that a stream is supplied to the cpu-versions as (optional) last argument. +// The batched versions all work on tensors with one more dimension as the +// non-batched ones and the first/highest dimension iterates over the elements +// within the batch. + +//////////////////////////////// GEMM //////////////////////////////////////////// + +// CPU/GPU-versions of BLAS3 function "gemm". Please refer to the BLAS3-documentation +// for further information about the function and its parameters. +// Note that this is C = gemm(A,B,C), so C is input and output parameter. +template +void linalg_gemm(const Tensor& A, const Tensor& B, + const Tensor& C, DType alpha, DType beta, + bool tA, bool tB, Stream *s = 0); + +template +void linalg_batch_gemm(const Tensor& A, const Tensor& B, + const Tensor& C, DType alpha, DType beta, + bool tA, bool tB, Stream *s = 0); + +//////////////////////////////// TRSM //////////////////////////////////////////// + +// CPU/GPU-versions of BLAS3 function "trsm". Please refer to the BLAS3-documentation +// for further information about the function and its parameters. +// Note that this is B = trsm(A,B), so B is input and output parameter. 
+template +void linalg_trsm(const Tensor& A, const Tensor& B, + DType alpha, bool rightside, bool lower, bool transpose, Stream *s = 0); + +template +inline void linalg_batch_trsm(const Tensor& A, const Tensor& B, + DType alpha, bool rightside, bool lower, bool transpose, Stream *s = 0); + +//////////////////////////////// TRMM //////////////////////////////////////////// + +// CPU/GPU-versions of BLAS3 function "trmm". Please refer to the BLAS3-documentation +// for further information about the function and its parameters. +// Note that this is B = trmm(A,B), so B is input and output parameter. + +template +void linalg_trmm(const Tensor& A, const Tensor& B, + DType alpha, bool rightside, bool lower, bool transpose, Stream *s = 0); + +template +void linalg_batch_trmm(const Tensor& A, const Tensor& B, + DType alpha, bool rightside, bool lower, bool transpose, Stream *s = 0); + +//////////////////////////////// POTRF //////////////////////////////////////////// + +// CPU/GPU-versions of LAPACK function "potrf". Please refer to the LAPACK-documentation +// for further information about the function and its parameters. +// Note that this is A = potrf(A), so A is input and output parameter. + +template +void linalg_potrf(const Tensor& A, bool lower, Stream *s = 0); + +template +void linalg_batch_potrf(const Tensor& A, bool lower, Stream *s = 0); + +//////////////////////////////// POTRI //////////////////////////////////////////// + +// CPU/GPU-versions of LAPACK function "potri". Please refer to the LAPACK-documentation +// for further information about the function and its parameters. +// Note that this is A = potri(A), so A is input and output parameter. 
+ +template +void linalg_potri(const Tensor& A, bool lower, Stream *s = 0); + +template +void linalg_batch_potri(const Tensor& A, bool lower, Stream *s = 0); + +#include "linalg_impl.h" + +#endif // MXNET_OPERATOR_LINALG_H_ diff --git a/src/operator/linalg_impl.h b/src/operator/linalg_impl.h new file mode 100644 index 000000000000..affa7941640b --- /dev/null +++ b/src/operator/linalg_impl.h @@ -0,0 +1,508 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file linalg.h + * \brief Implementation of unified tensor interface for advanced linear algebra functions + * (specifically BLAS3/LAPACK) from within mxnet. + */ +#ifndef MXNET_OPERATOR_LINALG_IMPL_H_ +#define MXNET_OPERATOR_LINALG_IMPL_H_ + +#include + +// Convenience functions. +inline void linalg_check_batch_size(int A, int B, int C) { + CHECK_EQ(A, B) << "Inconsistent batch size between arguments to linear algebra operator"; + CHECK_EQ(A, C) << "Inconsistent batch size between arguments to linear algebra operator"; + CHECK_GT(A, 0) << "Zero batch size for arguments to linear algebra operator"; +} + +//////////////////////////////// GEMM //////////////////////////////////////////// + +// CPU/GPU-versions of BLAS3 function "gemm". 
Please refer to the BLAS3-documentation +// for further information about the function and its parameters. +// Note that this is C = gemm(A,B,C), so C is input and output parameter. + +template +inline void check_gemm(const Tensor& A, const Tensor& B, + const Tensor& C, DType alpha, DType beta, bool tA, bool tB) { + // Any checking that helps user debug potential problems. + CHECK_EQ((tA ? A.size(1) : A.size(0)), C.size(0)) + << "Non compatible matrix dimensions between inputs A and C for gemm"; + CHECK_EQ((tB ? B.size(0) : B.size(1)), C.size(1)) + << "Non compatible matrix dimensions between inputs B and C for gemm"; + CHECK_EQ((tA ? A.size(0) : A.size(1)), (tB ? B.size(1) : B.size(0))) + << "Non compatible matrix dimensions between inputs A and B for gemm"; +} + +#define LINALG_CPU_GEMM(fname, DType) \ +template<> inline \ +void linalg_gemm(const Tensor& A, const Tensor& B, \ + const Tensor& C, DType alpha, DType beta, \ + bool tA, bool tB, Stream *s) { \ + check_gemm(A, B, C, alpha, beta, tA, tB); \ + cblas_##fname(CblasRowMajor, (tA ? CblasTrans : CblasNoTrans), (tB ? CblasTrans : CblasNoTrans), \ + C.size(0), C.size(1), (tA ? 
A.size(0) : A.size(1)), alpha, \ + A.dptr_, A.stride_, B.dptr_, B.stride_, beta, C.dptr_, C.stride_); \ +} +LINALG_CPU_GEMM(sgemm, float) +LINALG_CPU_GEMM(dgemm, double) + +#define LINALG_CPU_BATCH_GEMM(DType) \ +template<> inline \ +void linalg_batch_gemm(const Tensor& A, const Tensor& B, \ + const Tensor& C, DType alpha, DType beta, \ + bool tA, bool tB, Stream *s) { \ + linalg_check_batch_size(A.size(0), B.size(0), C.size(0)); \ + for (index_t i = 0; i < A.size(0); ++i) { \ + linalg_gemm(A[i], B[i], C[i], alpha, beta, tA, tB); \ + } \ +} +LINALG_CPU_BATCH_GEMM(float) +LINALG_CPU_BATCH_GEMM(double) + +#ifdef __CUDACC__ + +template +__global__ void linalgCollectBatchOffsetsGPU(DType *a[], DType* b, int stride, int N) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { + a[i] = b + i * stride; + } +} + +// cublas col-major processing accounted for by switching first two operands + +#define LINALG_GPU_GEMM(fname, DType) \ +template<> inline \ +void linalg_gemm(const Tensor& A, const Tensor& B, \ + const Tensor& C, DType alpha, DType beta, \ + bool tA, bool tB, Stream *s) { \ + using namespace mxnet; \ + using mshadow::gpu; \ + CHECK_NOTNULL(s); \ + check_gemm(A, B, C, alpha, beta, tA, tB); \ + CUBLAS_CALL(cublas##fname(Stream::GetBlasHandle(s), \ + (tB ? CUBLAS_OP_T : CUBLAS_OP_N), \ + (tA ? CUBLAS_OP_T : CUBLAS_OP_N), \ + C.size(1), C.size(0), (tB ? 
B.size(1) : B.size(0)), \ + &alpha, B.dptr_, B.stride_, A.dptr_, A.stride_, \ + &beta, C.dptr_, C.stride_)) \ +} +LINALG_GPU_GEMM(Sgemm, float) +LINALG_GPU_GEMM(Dgemm, double) + +#define LINALG_GPU_BATCH_GEMM(fname, DType) \ +template<> inline \ +void linalg_batch_gemm(const Tensor& A, const Tensor& B, \ + const Tensor& C, DType alpha, DType beta, \ + bool tA, bool tB, Stream *s) { \ + using namespace mxnet; \ + using mshadow::gpu; \ + CHECK_NOTNULL(s); \ + linalg_check_batch_size(A.size(0), B.size(0), C.size(0)); \ + check_gemm(A[0], B[0], C[0], alpha, beta, tA, tB); \ + Storage::Handle offsetsA, offsetsB, offsetsC; \ + offsetsA = Storage::Get()->Alloc(sizeof(DType*)*A.size(0), Context::GPU()); \ + offsetsB = Storage::Get()->Alloc(sizeof(DType*)*B.size(0), Context::GPU()); \ + offsetsC = Storage::Get()->Alloc(sizeof(DType*)*C.size(0), Context::GPU()); \ + using namespace mshadow::cuda; \ + int ngrid = std::min(kMaxGridNum, \ + static_cast((A.size(0) + kBaseThreadNum - 1) / kBaseThreadNum)); \ + linalgCollectBatchOffsetsGPU<<::GetStream(s)>>> \ + (static_cast(offsetsA.dptr), A.dptr_, A.size(1)*A.stride_, A.size(0)); \ + linalgCollectBatchOffsetsGPU<<::GetStream(s)>>> \ + (static_cast(offsetsB.dptr), B.dptr_, B.size(1)*B.stride_, B.size(0)); \ + linalgCollectBatchOffsetsGPU<<::GetStream(s)>>> \ + (static_cast(offsetsC.dptr), C.dptr_, C.size(1)*C.stride_, C.size(0)); \ + CUBLAS_CALL(cublas##fname(Stream::GetBlasHandle(s), \ + (tB ? CUBLAS_OP_T : CUBLAS_OP_N), \ + (tA ? CUBLAS_OP_T : CUBLAS_OP_N), \ + C.size(2), C.size(1), (tB ? 
B.size(2) : B.size(1)), \ + &alpha, static_cast(offsetsB.dptr), B.stride_, \ + static_cast(offsetsA.dptr), A.stride_, \ + &beta, static_cast(offsetsC.dptr), C.stride_, A.size(0))) \ + Storage::Get()->Free(offsetsA); \ + Storage::Get()->Free(offsetsB); \ + Storage::Get()->Free(offsetsC); \ +} +LINALG_GPU_BATCH_GEMM(SgemmBatched, float) +LINALG_GPU_BATCH_GEMM(DgemmBatched, double) + +#endif + +//////////////////////////////// TRSM //////////////////////////////////////////// + +// CPU/GPU-versions of BLAS3 function "trsm". Please refer to the BLAS3-documentation +// for further information about the function and its parameters. +// Note that this is B = trsm(A,B), so B is input and output parameter. + +template +inline void check_trsm(const Tensor& A, const Tensor& B, + DType alpha, bool rightside, bool lower, bool transpose) { + // Any checking that helps user debug potential problems. + CHECK_EQ(A.size(0), A.size(1)) + << "First input of trsm is not a square matrix."; + CHECK(!rightside || (B.size(1) == A.size(0))) + << "Non compatible matrix dimensions between inputs A and B for trsm"; + CHECK(rightside || (B.size(0) == A.size(1))) + << "Non compatible matrix dimensions between inputs A and B for trsm"; +} + +#define LINALG_CPU_TRSM(fname, DType) \ +template<> inline \ +void linalg_trsm(const Tensor& A, const Tensor& B, \ + DType alpha, bool rightside, bool lower, bool transpose, Stream *s) { \ + check_trsm(A, B, alpha, rightside, lower, transpose); \ + cblas_##fname(CblasRowMajor, (rightside ? CblasRight : CblasLeft), \ + (lower ? CblasLower : CblasUpper), (transpose ? 
CblasTrans : CblasNoTrans), \ + CblasNonUnit, B.size(0), B.size(1), alpha, A.dptr_, \ + A.stride_, B.dptr_, B.stride_); \ +} +LINALG_CPU_TRSM(strsm, float) +LINALG_CPU_TRSM(dtrsm, double) + +#define LINALG_CPU_BATCH_TRSM(DType) \ +template<> inline \ +void linalg_batch_trsm(const Tensor& A, const Tensor& B, \ + DType alpha, bool rightside, bool lower, bool transpose, Stream *s) { \ + linalg_check_batch_size(A.size(0), B.size(0), B.size(0)); \ + for (index_t i = 0; i < A.size(0); ++i) { \ + linalg_trsm(A[i], B[i], alpha, rightside, lower, transpose); \ + } \ +} +LINALG_CPU_BATCH_TRSM(float) +LINALG_CPU_BATCH_TRSM(double) + +#ifdef __CUDACC__ + +// cublas col-major processing accounted for by switching sides and fill mode + +#define LINALG_GPU_TRSM(fname, DType) \ +template<> inline \ +void linalg_trsm(const Tensor& A, const Tensor& B, \ + DType alpha, bool rightside, bool lower, bool transpose, Stream *s) { \ + using namespace mxnet; \ + using mshadow::gpu; \ + CHECK_NOTNULL(s); \ + check_trsm(A, B, alpha, rightside, lower, transpose); \ + CUBLAS_CALL(cublas##fname(Stream::GetBlasHandle(s), \ + (rightside ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT), \ + (lower ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER), \ + (transpose ? 
CUBLAS_OP_T : CUBLAS_OP_N), \ + CUBLAS_DIAG_NON_UNIT, B.size(1), B.size(0), &alpha, \ + A.dptr_, A.stride_, B.dptr_, B.stride_)); \ +} +LINALG_GPU_TRSM(Strsm, float) +LINALG_GPU_TRSM(Dtrsm, double) + +#define LINALG_GPU_BATCH_TRSM(fname, DType) \ +template<> inline \ +void linalg_batch_trsm(const Tensor& A, const Tensor& B, \ + DType alpha, bool rightside, bool lower, bool transpose, Stream *s) { \ + using namespace mxnet; \ + using mshadow::gpu; \ + CHECK_NOTNULL(s); \ + linalg_check_batch_size(A.size(0), B.size(0), B.size(0)); \ + check_trsm(A[0], B[0], alpha, rightside, lower, transpose); \ + Storage::Handle offsetsA, offsetsB; \ + offsetsA = Storage::Get()->Alloc(sizeof(DType*)*A.size(0), Context::GPU()); \ + offsetsB = Storage::Get()->Alloc(sizeof(DType*)*B.size(0), Context::GPU()); \ + using namespace mshadow::cuda; \ + int ngrid = std::min(kMaxGridNum, \ + static_cast((A.size(0) + kBaseThreadNum - 1) / kBaseThreadNum)); \ + linalgCollectBatchOffsetsGPU<<::GetStream(s)>>> \ + (static_cast(offsetsA.dptr), A.dptr_, A.size(1)*A.stride_, A.size(0)); \ + linalgCollectBatchOffsetsGPU<<::GetStream(s)>>> \ + (static_cast(offsetsB.dptr), B.dptr_, B.size(1)*B.stride_, A.size(0)); \ + CUBLAS_CALL(cublas##fname(Stream::GetBlasHandle(s), \ + (rightside ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT), \ + (lower ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER), \ + (transpose ? CUBLAS_OP_T : CUBLAS_OP_N), \ + CUBLAS_DIAG_NON_UNIT, B.size(2), B.size(1), &alpha, \ + static_cast(offsetsA.dptr), A.stride_, \ + static_cast(offsetsB.dptr), B.stride_, A.size(0))); \ + Storage::Get()->Free(offsetsA); \ + Storage::Get()->Free(offsetsB); \ +} +LINALG_GPU_BATCH_TRSM(StrsmBatched, float) +LINALG_GPU_BATCH_TRSM(DtrsmBatched, double) + +#endif + +//////////////////////////////// TRMM //////////////////////////////////////////// + +// CPU/GPU-versions of BLAS3 function "trmm". Please refer to the BLAS3-documentation +// for further information about the function and its parameters. 
+// Note that this is B = trmm(A,B), so B is input and output parameter. + +template +inline void check_trmm(const Tensor& A, const Tensor& B, + DType alpha, bool rightside, bool lower, bool transpose) { + // Any checking that helps user debug potential problems. + CHECK_EQ(A.size(0), A.size(1)) + << "First input of trmm is not a square matrix."; + CHECK(!rightside || (B.size(1) == A.size(0))) + << "Non compatible matrix dimensions between inputs A and B for trmm"; + CHECK(rightside || (B.size(0) == A.size(1))) + << "Non compatible matrix dimensions between inputs A and B for trmm"; +} + +#define LINALG_CPU_TRMM(fname, DType) \ +template<> inline \ +void linalg_trmm(const Tensor& A, const Tensor& B, \ + DType alpha, bool rightside, bool lower, bool transpose, Stream *s) { \ + check_trmm(A, B, alpha, rightside, lower, transpose); \ + cblas_##fname(CblasRowMajor, (rightside ? CblasRight : CblasLeft), \ + (lower ? CblasLower : CblasUpper), (transpose ? CblasTrans : CblasNoTrans), \ + CblasNonUnit, B.size(0), B.size(1), alpha, A.dptr_, \ + A.stride_, B.dptr_, B.stride_); \ +} +LINALG_CPU_TRMM(strmm, float) +LINALG_CPU_TRMM(dtrmm, double) + +#define LINALG_XPU_BATCH_TRMM(xpu, DType) \ +template<> inline \ +void linalg_batch_trmm(const Tensor& A, const Tensor& B, \ + DType alpha, bool rightside, bool lower, bool transpose, Stream *s) { \ + linalg_check_batch_size(A.size(0), B.size(0), B.size(0)); \ + for (index_t i = 0; i < A.size(0); ++i) { \ + linalg_trmm(A[i], B[i], alpha, rightside, lower, transpose, s); \ + } \ +} +LINALG_XPU_BATCH_TRMM(cpu, float) +LINALG_XPU_BATCH_TRMM(cpu, double) + +#ifdef __CUDACC__ + +// cublas col-major processing accounted for by switching sides and fill mode +// doing in-place computation by supplying B as second and third matrix +#define LINALG_GPU_TRMM(fname, DType) \ +template<> inline \ +void linalg_trmm(const Tensor& A, const Tensor& B, \ + DType alpha, bool rightside, bool lower, bool transpose, Stream *s) { \ + using namespace mxnet; 
\ + using mshadow::gpu; \ + CHECK_NOTNULL(s); \ + check_trmm(A, B, alpha, rightside, lower, transpose); \ + CUBLAS_CALL(cublas##fname(Stream::GetBlasHandle(s), \ + (rightside ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT), \ + (lower ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER), \ + (transpose ? CUBLAS_OP_T : CUBLAS_OP_N), \ + CUBLAS_DIAG_NON_UNIT, B.size(0), B.size(1), &alpha, \ + A.dptr_, A.stride_, B.dptr_, B.stride_, \ + B.dptr_, B.stride_)); \ +} +LINALG_GPU_TRMM(Strmm, float) +LINALG_GPU_TRMM(Dtrmm, double) + +LINALG_XPU_BATCH_TRMM(gpu, float) +LINALG_XPU_BATCH_TRMM(gpu, double) + +#endif + +//////////////////////////////// POTRF //////////////////////////////////////////// + +// CPU/GPU-versions of LAPACK function "potrf". Please refer to the LAPACK-documentation +// for further information about the function and its parameters. +// Note that this is A = potrf(A), so A is input and output parameter. + +template +inline void check_potrf(const Tensor& A, bool lower) { + // Any checking that helps user debug potential problems. + CHECK_EQ(A.size(0), A.size(1)) + << "No square matrix as input to potrf."; +} + +#define LINALG_CPU_POTRF(fname, DType) \ +template<> inline \ +void linalg_potrf(const Tensor& A, bool lower, Stream *s) { \ + check_potrf(A, lower); \ + int ret(MXNET_LAPACK_##fname(MXNET_LAPACK_ROW_MAJOR, (lower ? 
'L' : 'U'), A.size(0), \ + A.dptr_ , A.stride_)); \ + CHECK_EQ(ret, 0) << #fname << " failed in lapack on cpu."; \ +} +LINALG_CPU_POTRF(spotrf, float) +LINALG_CPU_POTRF(dpotrf, double) + +#define LINALG_CPU_BATCH_POTRF(DType) \ +template<> inline \ +void linalg_batch_potrf(const Tensor& A, bool lower, Stream *s) { \ + for (index_t i = 0; i < A.size(0); ++i) { \ + linalg_potrf(A[i], lower); \ + } \ +} +LINALG_CPU_BATCH_POTRF(float) +LINALG_CPU_BATCH_POTRF(double) + +#if MXNET_USE_CUSOLVER == 1 + +#define LINALG_GPU_BUFFSIZE_POTRF(fname, DType) \ +inline int linalg_potrf_buffsize(const Tensor& A, bool lower, Stream *s) { \ + using namespace mxnet; \ + using mshadow::gpu; \ + CHECK_NOTNULL(s); \ + int buffsize(0); \ + CUSOLVER_CALL(cusolver##fname(Stream::GetSolverHandle(s), \ + (lower ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER), \ + A.size(0), A.dptr_, A.stride_, &buffsize)); \ + return buffsize; \ +} +LINALG_GPU_BUFFSIZE_POTRF(DnSpotrf_bufferSize, float) +LINALG_GPU_BUFFSIZE_POTRF(DnDpotrf_bufferSize, double) + +#define LINALG_GPU_POTRF(fname, DType) \ +template<> inline \ +void linalg_potrf(const Tensor& A, bool lower, Stream *s) { \ + using namespace mxnet; \ + using mshadow::gpu; \ + CHECK_NOTNULL(s); \ + check_potrf(A, lower); \ + int buffsize(linalg_potrf_buffsize(A, lower, s)); \ + Storage::Handle buffer = Storage::Get()->Alloc(sizeof(DType)*buffsize, Context::GPU()); \ + Storage::Handle info = Storage::Get()->Alloc(sizeof(int), Context::GPU()); \ + CUSOLVER_CALL(cusolver##fname(Stream::GetSolverHandle(s), \ + (lower ? 
CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER), \ + A.size(0), A.dptr_, A.stride_, static_cast(buffer.dptr), buffsize, \ + static_cast(info.dptr))); \ + Storage::Get()->Free(buffer); \ + Storage::Get()->Free(info); \ +} +LINALG_GPU_POTRF(DnSpotrf, float) +LINALG_GPU_POTRF(DnDpotrf, double) + +#define LINALG_GPU_BATCH_POTRF(fname, DType) \ +template<> inline \ +void linalg_batch_potrf(const Tensor& A, bool lower, Stream *s) { \ + using namespace mxnet; \ + using mshadow::gpu; \ + CHECK_NOTNULL(s); \ + CHECK_GT(A.size(0), 0); \ + check_potrf(A[0], lower); \ + int buffsize(linalg_potrf_buffsize(A[0], lower, s)); \ + Storage::Handle buffer = Storage::Get()->Alloc(sizeof(DType)*buffsize, Context::GPU()); \ + Storage::Handle info = Storage::Get()->Alloc(sizeof(int), Context::GPU()); \ + for (mshadow::index_t i = 0; i < A.size(0); ++i) { \ + CUSOLVER_CALL(cusolver##fname(Stream::GetSolverHandle(s), \ + (lower ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER), \ + A[i].size(0), A[i].dptr_, A[i].stride_, \ + static_cast(buffer.dptr), buffsize, static_cast(info.dptr))); \ + } \ + Storage::Get()->Free(buffer); \ + Storage::Get()->Free(info); \ +} +LINALG_GPU_BATCH_POTRF(DnSpotrf, float) +LINALG_GPU_BATCH_POTRF(DnDpotrf, double) + +#endif + +//////////////////////////////// POTRI //////////////////////////////////////////// + +// CPU/GPU-versions of LAPACK function "potri". Please refer to the LAPACK-documentation +// for further information about the function and its parameters. +// Note that this is A = potri(A), so A is input and output parameter. + +template +inline void check_potri(const Tensor& A, bool lower) { + // Any checking that helps user debug potential problems. + CHECK_EQ(A.size(0), A.size(1)) << "No square matrix as input to potri."; +} + +#define LINALG_CPU_POTRI(fname, DType) \ +template<> inline \ +void linalg_potri(const Tensor& A, bool lower, Stream *s) { \ + check_potri(A, lower); \ + int ret(MXNET_LAPACK_##fname(MXNET_LAPACK_ROW_MAJOR, (lower ? 
'L' : 'U'), A.size(0), \ + A.dptr_ , A.stride_)); \ + CHECK_EQ(ret, 0) << #fname << " failed in lapack on cpu."; \ +} +LINALG_CPU_POTRI(spotri, float) +LINALG_CPU_POTRI(dpotri, double) + +#define LINALG_CPU_BATCH_POTRI(DType) \ +template<> inline \ +void linalg_batch_potri(const Tensor& A, bool lower, Stream *s) { \ + for (index_t i = 0; i < A.size(0); ++i) { \ + linalg_potri(A[i], lower); \ + } \ +} +LINALG_CPU_BATCH_POTRI(float) +LINALG_CPU_BATCH_POTRI(double) + +#ifdef __CUDACC__ + +// Initializes multiple identity matrices on the same vector. +template +__global__ void linalgInitIdentityGPU(DType *a, int stride, int lda, int N) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { + // index relative to the matrix. + int index(i % stride); + a[i] = (index / lda == index % lda ? DType(1.0) : DType(0)); + } +} + +// There is no direct support for potri in cuda. We emulate the function by two calls to trsm. +#define LINALG_GPU_POTRI(DType) \ +template<> inline \ +void linalg_potri(const Tensor& A, bool lower, Stream *s) { \ + using namespace mxnet; \ + CHECK_NOTNULL(s); \ + check_potri(A, lower); \ + Storage::Handle buffer = Storage::Get()->Alloc(sizeof(DType)*A.MSize(), Context::GPU()); \ + using namespace mshadow::cuda; \ + int ngrid = std::min(kMaxGridNum, \ + static_cast((A.MSize() + kBaseThreadNum - 1) / kBaseThreadNum)); \ + linalgInitIdentityGPU<<::GetStream(s)>>> \ + (static_cast(buffer.dptr), A.MSize(), A.stride_, A.MSize()); \ + Tensor B((DType *)buffer.dptr, A.shape_, A.stride_, s); \ + linalg_trsm(A, B, DType(1.0), false, lower, !lower, s); \ + linalg_trsm(A, B, DType(1.0), false, lower, lower, s); \ + Copy(A, B, s); \ + B.dptr_ = 0; \ + Storage::Get()->Free(buffer); \ +} +LINALG_GPU_POTRI(float) +LINALG_GPU_POTRI(double) + +#define LINALG_GPU_BATCH_POTRI(DType) \ +template<> inline \ +void linalg_batch_potri(const Tensor& A, bool lower, Stream *s) { \ + using namespace mxnet; \ + CHECK_NOTNULL(s); \ + 
CHECK_GT(A.size(0), 0); \ + check_potri(A[0], lower); \ + Storage::Handle buffer = Storage::Get()->Alloc(sizeof(DType)*A.MSize(), Context::GPU()); \ + using namespace mshadow::cuda; \ + int ngrid = std::min(kMaxGridNum, \ + static_cast((A.MSize() + kBaseThreadNum - 1) / kBaseThreadNum)); \ + linalgInitIdentityGPU<<::GetStream(s)>>> \ + (static_cast(buffer.dptr), A.size(1)*A.stride_, A.stride_, A.MSize()); \ + Tensor B((DType *)buffer.dptr, A.shape_, A.stride_, s); \ + linalg_batch_trsm(A, B, DType(1.0), false, lower, !lower, s); \ + linalg_batch_trsm(A, B, DType(1.0), false, lower, lower, s); \ + Copy(A, B, s); \ + B.dptr_ = 0; \ + Storage::Get()->Free(buffer); \ +} +LINALG_GPU_BATCH_POTRI(float) +LINALG_GPU_BATCH_POTRI(double) + +#endif + +#endif // MXNET_OPERATOR_LINALG_IMPL_H_ diff --git a/src/operator/spatial_transformer.cc b/src/operator/spatial_transformer.cc index 0d8ee2917637..51b0ebfde1f0 100644 --- a/src/operator/spatial_transformer.cc +++ b/src/operator/spatial_transformer.cc @@ -27,6 +27,10 @@ namespace mshadow { template +bool between(DType value, int lowerBound, int upperBound) { + return (value >= lowerBound && value <= upperBound); +} +template inline void BilinearSamplingForward(const Tensor &output, const Tensor &input, const Tensor grid_src) { @@ -43,19 +47,28 @@ inline void BilinearSamplingForward(const Tensor &output, index_t grid_index = n * o_h * o_w * 2 + h * o_w + w; DType y_real = (*(grid + grid_index + o_h * o_w) + 1) * (i_h - 1) / 2; DType x_real = (*(grid + grid_index) + 1) * (i_w - 1) / 2; - index_t top_left_y = std::min(i_h, std::max(0, static_cast(floor(y_real)))); - index_t top_left_x = std::min(i_w, std::max(0, static_cast(floor(x_real)))); + int top_left_y = static_cast(floor(y_real)); + int top_left_x = static_cast(floor(x_real)); DType top_left_y_w = 1.0 - (y_real - top_left_y); DType top_left_x_w = 1.0 - (x_real - top_left_x); - index_t data_index = n * i_c * i_h * i_w + c * i_h * i_w + top_left_y * i_w + top_left_x; - DType 
top_left_v = *(data + data_index); - DType top_right_v = *(data + data_index + 1); - DType bottom_left_v = *(data + data_index + i_w); - DType bottom_right_v = *(data + data_index + i_w + 1); + int data_index = n * i_c * i_h * i_w + c * i_h * i_w + + top_left_y * i_w + top_left_x; + DType top_left_v = 0; + DType top_right_v = 0; + DType bottom_left_v = 0; + DType bottom_right_v = 0; + if (between(top_left_x, 0, i_w-1) && between(top_left_y, 0, i_h-1)) + top_left_v = *(data + data_index); + if (between(top_left_x + 1, 0, i_w-1) && between(top_left_y, 0, i_h-1)) + top_right_v = *(data + data_index + 1); + if (between(top_left_x, 0, i_w-1) && between(top_left_y + 1, 0, i_h-1)) + bottom_left_v = *(data + data_index + i_w); + if (between(top_left_x+1, 0, i_w-1) && between(top_left_y + 1, 0, i_h-1)) + bottom_right_v = *(data + data_index + i_w + 1); *(out+out_index) = top_left_v * top_left_y_w * top_left_x_w + - top_right_v * top_left_y_w * (1.0 - top_left_x_w) + - bottom_left_v * (1.0 - top_left_y_w) * top_left_x_w + - bottom_right_v * (1.0 - top_left_y_w) * (1.0 - top_left_x_w); + top_right_v * top_left_y_w * (1.0 - top_left_x_w) + + bottom_left_v * (1.0 - top_left_y_w) * top_left_x_w + + bottom_right_v * (1.0 - top_left_y_w) * (1.0 - top_left_x_w); } } } @@ -82,8 +95,8 @@ inline void BilinearSamplingBackward(const Tensor &input_grad, index_t grid_src_index = n * o_h * o_w * 2 + h * o_w + w; DType y_real = (*(grid_src + grid_src_index + o_h * o_w) + 1) * (i_h - 1) / 2; DType x_real = (*(grid_src + grid_src_index) + 1) * (i_w - 1) / 2; - index_t top_left_y = std::min(i_h, std::max(0, static_cast(floor(y_real)))); - index_t top_left_x = std::min(i_w, std::max(0, static_cast(floor(x_real)))); + index_t top_left_y = static_cast(floor(y_real)); + index_t top_left_x = static_cast(floor(x_real)); DType top_left_y_w = 1.0 - (y_real - top_left_y); DType top_left_x_w = 1.0 - (x_real - top_left_x); for (index_t c = 0; c < static_cast(o_c); ++c) { @@ -91,18 +104,29 @@ inline void 
BilinearSamplingBackward(const Tensor &input_grad, index_t data_index = n * i_c * i_h * i_w + c * i_h * i_w + top_left_y * i_w + top_left_x; // calc 4 vertex value in input data - DType top_left_v = *(data + data_index); - DType top_right_v = *(data + data_index + 1); - DType bottom_left_v = *(data + data_index + i_w); - DType bottom_right_v = *(data + data_index + i_w + 1); - // calc input grad - *(g_input + data_index) += *(grad + grad_index) * top_left_y_w * top_left_x_w; - *(g_input + data_index + 1) += *(grad + grad_index) * top_left_y_w - * (1.0 - top_left_x_w); - *(g_input + data_index+ i_w) += *(grad + grad_index) * (1.0 - top_left_y_w) - * top_left_x_w; - *(g_input + data_index+ i_w + 1) += *(grad + grad_index) * (1.0 - top_left_y_w) - * (1.0 - top_left_x_w); + DType top_left_v = 0; + DType top_right_v = 0; + DType bottom_left_v = 0; + DType bottom_right_v = 0; + if (between(top_left_x, 0, i_w-1) && between(top_left_y, 0, i_h-1)) { + *(g_input + data_index) += *(grad + grad_index) * top_left_y_w * top_left_x_w; + top_left_v = *(data + data_index); + } + if (between(top_left_x+1, 0, i_w-1) && between(top_left_y, 0, i_h-1)) { + *(g_input + data_index + 1) += *(grad + grad_index) * top_left_y_w + * (1.0 - top_left_x_w); + top_right_v = *(data + data_index + 1); + } + if (between(top_left_x, 0, i_w-1) && between(top_left_y+1, 0, i_h-1)) { + *(g_input + data_index+ i_w) += *(grad + grad_index) * (1.0 - top_left_y_w) + * top_left_x_w; + bottom_left_v = *(data + data_index + i_w); + } + if (between(top_left_x+1, 0, i_w-1) && between(top_left_y+1, 0, i_h-1)) { + *(g_input + data_index+ i_w + 1) += *(grad + grad_index) * (1.0 - top_left_y_w) + * (1.0 - top_left_x_w); + bottom_right_v = *(data + data_index + i_w + 1); + } // calc weight grad of top_left_w, then multiple -1 is the grad of grid_src top_left_y_gw -= *(grad + grad_index) * (top_right_v - bottom_right_v + (top_left_v - top_right_v - bottom_left_v + bottom_right_v) diff --git 
a/src/operator/spatial_transformer.cu b/src/operator/spatial_transformer.cu index b3d635c5e8ab..d5e4480dc187 100644 --- a/src/operator/spatial_transformer.cu +++ b/src/operator/spatial_transformer.cu @@ -31,6 +31,10 @@ namespace mshadow { template +__device__ bool between(DType value, int lowerBound, int upperBound) { + return (value >= lowerBound && value <= upperBound); +} +template __global__ void BilinearSamplingForwardKernel(const int i_c, const int i_h, const int i_w, const DType* data, const DType* grid, const int o_n, @@ -48,19 +52,27 @@ __global__ void BilinearSamplingForwardKernel(const int i_c, const int i_h, index_t grid_index = n * o_h * o_w * 2 + h * o_w + w; DType y_real = (*(grid + grid_index + o_h * o_w) + 1) * (i_h - 1) / 2; DType x_real = (*(grid + grid_index) + 1) * (i_w - 1) / 2; - index_t top_left_y = min(i_h, max(0, static_cast(floor(y_real)))); - index_t top_left_x = min(i_w, max(0, static_cast(floor(x_real)))); + int top_left_y = static_cast(floor(y_real)); + int top_left_x = static_cast(floor(x_real)); DType top_left_y_w = 1.0 - (y_real - top_left_y); DType top_left_x_w = 1.0 - (x_real - top_left_x); - index_t data_index = n * i_c * i_h * i_w + c * i_h * i_w + top_left_y * i_w + top_left_x; - DType top_left_v = *(data + data_index); - DType top_right_v = *(data + data_index + 1); - DType bottom_left_v = *(data + data_index + i_w); - DType bottom_right_v = *(data + data_index + i_w + 1); + int data_index = n * i_c * i_h * i_w + c * i_h * i_w + top_left_y * i_w + top_left_x; + DType top_left_v = 0; + DType top_right_v = 0; + DType bottom_left_v = 0; + DType bottom_right_v = 0; + if (between(top_left_x, 0, i_w-1) && between(top_left_y, 0, i_h-1)) + top_left_v = *(data + data_index); + if (between(top_left_x + 1, 0, i_w-1) && between(top_left_y, 0, i_h-1)) + top_right_v = *(data + data_index + 1); + if (between(top_left_x, 0, i_w-1) && between(top_left_y + 1, 0, i_h-1)) + bottom_left_v = *(data + data_index + i_w); + if (between(top_left_x+1, 
0, i_w-1) && between(top_left_y + 1, 0, i_h-1)) + bottom_right_v = *(data + data_index + i_w + 1); *(out+out_index) = top_left_v * top_left_y_w * top_left_x_w + - top_right_v * top_left_y_w * (1.0 - top_left_x_w) + - bottom_left_v * (1.0 - top_left_y_w) * top_left_x_w + - bottom_right_v * (1.0 - top_left_y_w) * (1.0 - top_left_x_w); + top_right_v * top_left_y_w * (1.0 - top_left_x_w) + + bottom_left_v * (1.0 - top_left_y_w) * top_left_x_w + + bottom_right_v * (1.0 - top_left_y_w) * (1.0 - top_left_x_w); } } @@ -83,29 +95,43 @@ __global__ void BilinearSamplingBackwardKernel(const int i_c, const int i_h, index_t grid_src_index = n * o_h * o_w * 2 + h * o_w + w; DType y_real = (*(grid_src + grid_src_index + o_h * o_w) + 1) * (i_h - 1) / 2; DType x_real = (*(grid_src + grid_src_index) + 1) * (i_w - 1) / 2; - index_t top_left_y = min(i_h, max(0, static_cast(floor(y_real)))); - index_t top_left_x = min(i_w, max(0, static_cast(floor(x_real)))); + int top_left_y = static_cast(floor(y_real)); + int top_left_x = static_cast(floor(x_real)); DType top_left_y_w = 1.0 - (y_real - top_left_y); DType top_left_x_w = 1.0 - (x_real - top_left_x); for (index_t c = 0; c < o_c; ++c) { index_t grad_index = n * o_c * o_h * o_w + c * o_h * o_w + h * o_w + w; index_t data_index = n * i_c * i_h * i_w + c * i_h * i_w + top_left_y * i_w + top_left_x; // calc 4 vertex value in input data - DType top_left_v = *(data + data_index); - DType top_right_v = *(data + data_index + 1); - DType bottom_left_v = *(data + data_index + i_w); - DType bottom_right_v = *(data + data_index + i_w + 1); + DType top_left_v = 0; + DType top_right_v = 0; + DType bottom_left_v = 0; + DType bottom_right_v = 0; // calc input grad - *(g_input + data_index) += *(grad + grad_index) * top_left_y_w * top_left_x_w; - *(g_input + data_index + 1) += *(grad + grad_index) * top_left_y_w * (1.0 - top_left_x_w); - *(g_input + data_index+ i_w) += *(grad + grad_index) * (1.0 - top_left_y_w) * top_left_x_w; - *(g_input + data_index+ 
i_w + 1) += *(grad + grad_index) * (1.0 - top_left_y_w) * - (1.0 - top_left_x_w); + if (between(top_left_x, 0, i_w-1) && between(top_left_y, 0, i_h-1)) { + *(g_input + data_index) += *(grad + grad_index) * top_left_y_w * top_left_x_w; + top_left_v = *(data + data_index); + } + if (between(top_left_x+1, 0, i_w-1) && between(top_left_y, 0, i_h-1)) { + *(g_input + data_index + 1) += *(grad + grad_index) * top_left_y_w * (1.0 - top_left_x_w); + top_right_v = *(data + data_index + 1); + } + if (between(top_left_x, 0, i_w-1) && between(top_left_y+1, 0, i_h-1)) { + *(g_input + data_index+ i_w) += *(grad + grad_index) * (1.0 - top_left_y_w) * top_left_x_w; + bottom_left_v = *(data + data_index + i_w); + } + if (between(top_left_x+1, 0, i_w-1) && between(top_left_y+1, 0, i_h-1)) { + *(g_input + data_index+ i_w + 1) += *(grad + grad_index) * (1.0 - top_left_y_w) * + (1.0 - top_left_x_w); + bottom_right_v = *(data + data_index + i_w + 1); + } // calc weight grad of top_left_w, then multiple -1 is the grad of grid_src top_left_y_gw -= *(grad + grad_index) * (top_right_v - bottom_right_v + - (top_left_v - top_right_v - bottom_left_v + bottom_right_v) * top_left_x_w); - top_left_x_gw -= *(grad + grad_index) * (bottom_left_v - bottom_right_v + (top_left_v - - top_right_v - bottom_left_v + bottom_right_v) * top_left_y_w); + (top_left_v - top_right_v - bottom_left_v + bottom_right_v) + * top_left_x_w); + top_left_x_gw -= *(grad + grad_index) * (bottom_left_v - bottom_right_v + + (top_left_v - top_right_v - bottom_left_v + bottom_right_v) + * top_left_y_w); } // calc grid_src grad *(grid_src + grid_src_index + o_h * o_w) = top_left_y_gw * (i_h - 1) / 2; diff --git a/src/operator/tensor/la_op.cc b/src/operator/tensor/la_op.cc index 1b726ced906b..70d4f9b766ad 100644 --- a/src/operator/tensor/la_op.cc +++ b/src/operator/tensor/la_op.cc @@ -401,7 +401,7 @@ Examples:: { return std::vector{"A"}; } ) .set_attr("FInferShape", LaReduceShape<2>) .set_attr("FInferType", ElemwiseType<1, 1>) 
-.set_attr("FCompute", LaReduceForward) +.set_attr("FCompute", LaOpForward) .set_attr("FGradient", ElemwiseGradUseIn{"_backward_linalg_sumlogdiag"}) .add_argument("A", "NDArray-or-Symbol", "Tensor of square matrices"); @@ -411,7 +411,7 @@ NNVM_REGISTER_OP(_backward_linalg_sumlogdiag) .set_attr("FResourceRequest", [](const NodeAttrs& attrs) { return std::vector{ResourceRequest::kTempSpace}; }) .set_attr("TIsBackward", true) -.set_attr("FCompute", LaReduceBackward); +.set_attr("FCompute", LaOpBackward); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/la_op.cu b/src/operator/tensor/la_op.cu new file mode 100644 index 000000000000..a89d98fd7f82 --- /dev/null +++ b/src/operator/tensor/la_op.cu @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file la_op.cu + * \brief GPU-Operators for advanced linear algebra. 
+ */ +#include "./la_op.h" +#include "./la_op_inline.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(linalg_gemm) +.set_attr("FCompute", LaOpForward); + +NNVM_REGISTER_OP(_backward_linalg_gemm) +.set_attr("FCompute", LaOpBackward); + +NNVM_REGISTER_OP(linalg_gemm2) +.set_attr("FCompute", LaOpForward); + +NNVM_REGISTER_OP(_backward_linalg_gemm2) +.set_attr("FCompute", LaOpBackward); + +NNVM_REGISTER_OP(linalg_trmm) +.set_attr("FCompute", LaOpForward); + +NNVM_REGISTER_OP(_backward_linalg_trmm) +.set_attr("FCompute", LaOpBackward); + +NNVM_REGISTER_OP(linalg_trsm) +.set_attr("FCompute", LaOpForward); + +NNVM_REGISTER_OP(_backward_linalg_trsm) +.set_attr("FCompute", LaOpBackward); + +NNVM_REGISTER_OP(linalg_sumlogdiag) +.set_attr("FCompute", LaOpForward); + +NNVM_REGISTER_OP(_backward_linalg_sumlogdiag) +.set_attr("FCompute", LaOpBackward); + +NNVM_REGISTER_OP(linalg_potri) +.set_attr("FCompute", LaOpForward); + +NNVM_REGISTER_OP(_backward_linalg_potri) +.set_attr("FCompute", LaOpBackward); + +#if MXNET_USE_CUSOLVER == 1 + +NNVM_REGISTER_OP(linalg_potrf) +.set_attr("FCompute", LaOpForward); + +NNVM_REGISTER_OP(_backward_linalg_potrf) +.set_attr("FCompute", LaOpBackward); + +#endif + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/la_op.h b/src/operator/tensor/la_op.h index 977998855263..dd5fab985e3c 100644 --- a/src/operator/tensor/la_op.h +++ b/src/operator/tensor/la_op.h @@ -91,9 +91,9 @@ struct LaTriangMatrixMultParam : public dmlc::Parameter }; // Common function for shape inference for matrix mult and matrix mac. 
-bool LaMatrixMultMacOpShape(const nnvm::NodeAttrs& attrs, - std::vector* in_attrs, - std::vector* out_attrs) { +inline bool LaMatrixMultMacOpShape(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { CHECK_GE(in_attrs->size(), 2); CHECK_EQ(out_attrs->size(), 1); bool transpose_a(false), transpose_b(false); @@ -132,9 +132,9 @@ bool LaMatrixMultMacOpShape(const nnvm::NodeAttrs& attrs, return false; } -bool LaTriangMatrixMultOpShape(const nnvm::NodeAttrs& attrs, - std::vector* in_attrs, - std::vector* out_attrs) { +inline bool LaTriangMatrixMultOpShape(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { const LaTriangMatrixMultParam& param = nnvm::get(attrs.parsed); CHECK_EQ(in_attrs->size(), 2); CHECK_EQ(out_attrs->size(), 1); @@ -192,9 +192,9 @@ bool LaTriangMatrixMultOpShape(const nnvm::NodeAttrs& attrs, } template -bool LaReduceShape(const nnvm::NodeAttrs& attrs, - std::vector* in_attrs, - std::vector* out_attrs) { +inline bool LaReduceShape(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { // Shape for reduction of the dim lowest dimensions to a scalar. // Can only deduct in forward direction. 
CHECK_EQ(in_attrs->size(), 1); @@ -203,7 +203,8 @@ bool LaReduceShape(const nnvm::NodeAttrs& attrs, if ( ndim < dim ) { return false; } - std::vector oshape(std::max(1, ndim-dim), 1); + std::vector oshape(std::max(1, ndim-dim)); + oshape[0] = 1; for ( int i = 0; i < ndim - dim; ++i ) { oshape[i] = (*in_attrs)[0][i]; } @@ -218,7 +219,6 @@ template& inputs, const std::vector& outputs, - const int index, const nnvm::NodeAttrs& attrs, mshadow::Stream *s) { CHECK(false) << "no specialized LaOpCaller defined for template parameters"; @@ -228,86 +228,75 @@ template struct LaOpCaller { static void op(const std::vector& inputs, const std::vector& outputs, - const int index, const nnvm::NodeAttrs& attrs, mshadow::Stream *s) { - laop::op(inputs[0].FlatToKD(s)[index], - outputs[0].FlatToKD(s)[index], attrs); + laop::op(inputs[0].FlatToKD(s), + outputs[0].FlatToKD(s), s, attrs); } }; template struct LaOpCaller { static void op(const std::vector& inputs, const std::vector& outputs, - const int index, const nnvm::NodeAttrs& attrs, mshadow::Stream *s) { - laop::op(inputs[0].FlatToKD(s)[index], - inputs[1].FlatToKD(s)[index], - outputs[0].FlatToKD(s)[index], - attrs); + laop::op(inputs[0].FlatToKD(s), + inputs[1].FlatToKD(s), + outputs[0].FlatToKD(s), s, attrs); } }; template struct LaOpCaller { static void op(const std::vector& inputs, const std::vector& outputs, - const int index, const nnvm::NodeAttrs& attrs, mshadow::Stream *s) { - laop::op(inputs[0].FlatToKD(s)[index], - inputs[1].FlatToKD(s)[index], - inputs[2].FlatToKD(s)[index], - outputs[0].FlatToKD(s)[index], - attrs); + laop::op(inputs[0].FlatToKD(s), + inputs[1].FlatToKD(s), + inputs[2].FlatToKD(s), + outputs[0].FlatToKD(s), s, attrs); } }; template struct LaOpCaller { static void op(const std::vector& inputs, const std::vector& outputs, - const int index, const nnvm::NodeAttrs& attrs, mshadow::Stream *s) { - laop::op(inputs[0].FlatToKD(s)[index], - inputs[1].FlatToKD(s)[index], - inputs[2].FlatToKD(s)[index], - 
outputs[0].FlatToKD(s)[index], - outputs[1].FlatToKD(s)[index], - attrs); + laop::op(inputs[0].FlatToKD(s), + inputs[1].FlatToKD(s), + inputs[2].FlatToKD(s), + outputs[0].FlatToKD(s), + outputs[1].FlatToKD(s), s, attrs); } }; template struct LaOpCaller { static void op(const std::vector& inputs, const std::vector& outputs, - const int index, const nnvm::NodeAttrs& attrs, mshadow::Stream *s) { - laop::op(inputs[0].FlatToKD(s)[index], - inputs[1].FlatToKD(s)[index], - inputs[2].FlatToKD(s)[index], - inputs[3].FlatToKD(s)[index], - outputs[0].FlatToKD(s)[index], - outputs[1].FlatToKD(s)[index], - attrs); + laop::op(inputs[0].FlatToKD(s), + inputs[1].FlatToKD(s), + inputs[2].FlatToKD(s), + inputs[3].FlatToKD(s), + outputs[0].FlatToKD(s), + outputs[1].FlatToKD(s), s, attrs); } }; template struct LaOpCaller { static void op(const std::vector& inputs, const std::vector& outputs, - const int index, const nnvm::NodeAttrs& attrs, mshadow::Stream *s) { - laop::op(inputs[0].FlatToKD(s)[index], - inputs[1].FlatToKD(s)[index], - inputs[2].FlatToKD(s)[index], - inputs[3].FlatToKD(s)[index], - outputs[0].FlatToKD(s)[index], - outputs[1].FlatToKD(s)[index], - outputs[2].FlatToKD(s)[index], - attrs); + laop::op(inputs[0].FlatToKD(s), + inputs[1].FlatToKD(s), + inputs[2].FlatToKD(s), + inputs[3].FlatToKD(s), + outputs[0].FlatToKD(s), + outputs[1].FlatToKD(s), + outputs[2].FlatToKD(s), s, attrs); } }; @@ -322,24 +311,8 @@ void LaOpForward(const nnvm::NodeAttrs& attrs, Stream *s = ctx.get_stream(); CHECK_EQ(inputs.size(), inum); CHECK_EQ(outputs.size(), onum); - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, { - int N(-1); - for ( int i = 0; i < inum; ++i ) { - CHECK_EQ(inputs[i].CheckContiguous(), true); - const int M(inputs[i].FlatToKD(s).size(0)); - CHECK_EQ((N == -1 || N == M), true); - N = M; - } - for ( int i = 0; i < onum; ++i ) { - CHECK_EQ(outputs[i].CheckContiguous(), true); - CHECK_EQ((req[i] == kWriteTo || req[i] == kWriteInplace), true); - const int 
M(outputs[i].FlatToKD(s).size(0)); - CHECK_EQ((N == -1 || N == M), true); - N = M; - } - for ( int i = 0; i < N; ++i ) { - LaOpCaller::op(inputs, outputs, i, attrs, s); - } + MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, OType, { + LaOpCaller::op(inputs, outputs, attrs, s); }); } @@ -354,28 +327,15 @@ void LaOpBackward(const nnvm::NodeAttrs& attrs, Stream *s = ctx.get_stream(); CHECK_EQ(inputs.size(), inum); CHECK_EQ(outputs.size(), onum); - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, { - int N(-1); - for ( int i = 0; i < inum; ++i ) { - CHECK_EQ(inputs[i].CheckContiguous(), true); - const int M(inputs[i].FlatToKD(s).size(0)); - CHECK_EQ((N == -1 || N == M), true); - N = M; - } + MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, OType, { std::vector tspace(outputs); for ( int i = 0; i < onum; ++i ) { - CHECK_EQ(outputs[i].CheckContiguous(), true); - const int M(outputs[i].FlatToKD(s).size(0)); - CHECK_EQ((N == -1 || N == M), true); - N = M; if ( req[i] == kAddTo ) { tspace[i].dptr_ = ctx.requested[ResourceRequest::kTempSpace] .get_space_typed(Shape1(outputs[i].Size()), s).dptr_; } } - for ( int i = 0; i < N; ++i ) { - LaOpCaller::op(inputs, tspace, i, attrs, s); - } + LaOpCaller::op(inputs, tspace, attrs, s); for ( int i = 0; i < onum; ++i ) { if ( req[i] == kAddTo ) { Tensor out = outputs[i].FlatTo1D(s); @@ -385,53 +345,6 @@ void LaOpBackward(const nnvm::NodeAttrs& attrs, }); } -template -void LaReduceForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mshadow; - Stream *s = ctx.get_stream(); - CHECK_EQ(inputs.size(), 1); - CHECK_EQ(outputs.size(), 1); - CHECK_EQ(inputs[0].CheckContiguous(), true); - CHECK_EQ(outputs[0].CheckContiguous(), true); - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, { - Tensor in(inputs[0].FlatToKD(s)); - Tensor out(outputs[0].FlatTo1D(s)); - const int N(outputs[0].Size()); - CHECK_EQ(in.size(0), 
N); - for ( int i = 0; i < N; ++i ) { - laop::op(in[i], out[i], attrs); - } - }); -} - -template -void LaReduceBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mshadow; - Stream *s = ctx.get_stream(); - CHECK_EQ(inputs.size(), 2); - CHECK_EQ(outputs.size(), 1); - CHECK_EQ(inputs[0].CheckContiguous(), true); - CHECK_EQ(inputs[1].CheckContiguous(), true); - CHECK_EQ(outputs[0].CheckContiguous(), true); - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, { - const int N(inputs[0].Size()); - Tensor in0(inputs[0].FlatTo1D(s)); - Tensor in1(inputs[1].FlatToKD(s)); - Tensor out(outputs[0].FlatToKD(s)); - for ( int i = 0; i < N; ++i ) { - laop::op(in0[i], in1[i], out[i], attrs, (req[i] == kAddTo)); - } - }); -} - } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/la_op_inline.h b/src/operator/tensor/la_op_inline.h index a032988edb75..34fb441f53f7 100644 --- a/src/operator/tensor/la_op_inline.h +++ b/src/operator/tensor/la_op_inline.h @@ -24,244 +24,186 @@ #ifndef MXNET_OPERATOR_TENSOR_LA_OP_INLINE_H_ #define MXNET_OPERATOR_TENSOR_LA_OP_INLINE_H_ -#include +#include "../linalg.h" namespace mxnet { namespace op { using namespace mshadow; -#define LA_OP_NOT_AVAIL " operator can only be called with float/double data type." - -// Signature for single matrix operations (decomposition/inversion). -#define FUNC_SIGNATURE_1(fname, arg1) {CHECK_EQ(MXNET_LAPACK_##fname(MXNET_LAPACK_ROW_MAJOR, 'L', \ - arg1.size(0), arg1.dptr_, arg1.size(0)), 0) << "fname failed in lapack";} - -// Signature for matrix-matrix multiplications involving one diagonal matrix. -#define FUNC_SIGNATURE_2(fname, arg1, arg2) \ - { cblas_##fname(CblasRowMajor, (rightside ? CblasRight : CblasLeft), \ - CblasLower, (transpose ? CblasTrans : CblasNoTrans), \ - CblasNonUnit, arg2.size(0), arg2.size(1), alpha, arg1.dptr_, \ - (rightside ? 
arg2.size(1) : arg2.size(0)), arg2.dptr_, arg2.size(1)); } - - // Helper functions. -template -void CopyLowerToUpper(DType *dptr, int N) - { for (int i = 1; i < N; ++i ) for ( int j = 0; j < i; ++j ) dptr[j*N+i] = dptr[i*N+j]; } -template -void ZeroUpper(DType *dptr, int N) - { for (int i = 0; i < N; ++i ) for ( int j = i+1; j < N; ++j ) dptr[i*N+j] = 0; } +struct CopyLowerToUpper { + template + MSHADOW_XINLINE static void Map(int i, int matrix_size, int stride, DType* data) { + // Below computation works even when we are dealing with a batch of matrices. + const int row((i % matrix_size) / stride), col(i % stride); + if ( row > col ) data[i + (col - row) * (stride - 1)] = data[i]; + } +}; +struct ZeroUpper { + template + MSHADOW_XINLINE static void Map(int i, int matrix_size, int stride, DType* data) { + const int row((i % matrix_size) / stride), col(i % stride); + if ( row < col ) data[i] = 0; + } +}; +struct Scale { + template + MSHADOW_XINLINE static void Map(int i, DType scale, DType* data) { + data[i] *= scale; + } +}; -// Forward operators +// Forward computations (always using batched processing) // D = gemm(A,B,C) struct gemm { template - static void op(const Tensor& A, const Tensor& B, - const Tensor& C, DType alpha, DType beta, bool tA, bool tB) - { CHECK(false) << "gemm" << LA_OP_NOT_AVAIL; } + static void op(const Tensor& A, const Tensor& B, + const Tensor& C, DType alpha, DType beta, bool tA, bool tB, Stream *s) { + linalg_batch_gemm(A, B, C, alpha, beta, tA, tB, s); + } template - static void op(const Tensor& A, const Tensor& B, - const Tensor& C, const Tensor& D, - const nnvm::NodeAttrs& attrs) { - if ( C.dptr_ != D.dptr_ ) Copy(D, C); + static void op(const Tensor& A, const Tensor& B, + const Tensor& C, const Tensor& D, + Stream *s, const nnvm::NodeAttrs& attrs) { + if ( C.dptr_ != D.dptr_ ) Copy(D, C, s); const LaMatrixMacParam& param = nnvm::get(attrs.parsed); - gemm::op(A, B, D, DType(param.alpha), DType(param.beta), param.transpose_a, 
param.transpose_b); + gemm::op(A, B, D, DType(param.alpha), DType(param.beta), + param.transpose_a, param.transpose_b, s); } }; -template<> -void gemm::op(const Tensor& A, const Tensor& B, - const Tensor& C, - float alpha, float beta, bool tA, bool tB ) { - CHECK_EQ((tA ? A.size(1) : A.size(0)), C.size(0)) - << "Non compatible matrix dimensions between inputs A and C for gemm operator"; - CHECK_EQ((tB ? B.size(0) : B.size(1)), C.size(1)) - << "Non compatible matrix dimensions between inputs B and C for gemm operator"; - CHECK_EQ((tA ? A.size(0) : A.size(1)), (tB ? B.size(1) : B.size(0))) - << "Non compatible matrix dimensions between inputs A and B for gemm operator"; - cblas_sgemm(CblasRowMajor, (tA ? CblasTrans : CblasNoTrans), (tB ? CblasTrans : CblasNoTrans), - (tA ? A.size(1):A.size(0)), (tB ? B.size(0): B.size(1)), - (tA ? A.size(0):A.size(1)), alpha, A.dptr_, A.size(1), B.dptr_, B.size(1), - beta, C.dptr_, (tB ? B.size(0): B.size(1))); -} -template<> -void gemm::op(const Tensor& A, const Tensor& B, - const Tensor& C, - double alpha, double beta, bool tA, bool tB) { - CHECK_EQ((tA ? A.size(1) : A.size(0)), C.size(0)) - << "Non compatible matrix dimensions between inputs A and C for gemm operator"; - CHECK_EQ((tB ? B.size(0) : B.size(1)), C.size(1)) - << "Non compatible matrix dimensions between inputs B and C for gemm operator"; - CHECK_EQ((tA ? A.size(0) : A.size(1)), (tB ? B.size(1) : B.size(0))) - << "Non compatible matrix dimensions between inputs A and B for gemm operator"; - cblas_dgemm(CblasRowMajor, (tA ? CblasTrans : CblasNoTrans), (tB ? CblasTrans : CblasNoTrans), - (tA ? A.size(1):A.size(0)), (tB ? B.size(0): B.size(1)), - (tA ? A.size(0):A.size(1)), alpha, A.dptr_, A.size(1), B.dptr_, B.size(1), - beta, C.dptr_, (tB ? 
B.size(0): B.size(1))); -} // C = gemm2(A,B) struct gemm2 { template - static void op(const Tensor& A, const Tensor& B, - const Tensor& C, const nnvm::NodeAttrs& attrs) { + static void op(const Tensor& A, const Tensor& B, + const Tensor& C, Stream *s, const nnvm::NodeAttrs& attrs) { const LaMatrixMultParam& param = nnvm::get(attrs.parsed); - gemm::op(A, B, C, DType(param.alpha), DType(0), param.transpose_a, param.transpose_b); + gemm::op(A, B, C, DType(param.alpha), DType(0), param.transpose_a, param.transpose_b, s); } }; // L = potrf(A). struct potrf { template - static void op(const Tensor& A, const Tensor& L, - const nnvm::NodeAttrs& attrs) - { CHECK(false) << "potrf" << LA_OP_NOT_AVAIL; } + static void op(const Tensor& A, const Tensor& L, + Stream *s, const nnvm::NodeAttrs& attrs) { + if ( A.dptr_ != L.dptr_ ) Copy(L, A, s); + linalg_batch_potrf(L, true, s); + using namespace mxnet_op; + Kernel::Launch(s, L.MSize(), L.size(1)*L.stride_, L.stride_, L.dptr_); + } }; -template<> -void potrf::op(const Tensor& A, const Tensor& L, - const nnvm::NodeAttrs& attrs) { - if ( A.dptr_ != L.dptr_ ) Copy(L, A); - FUNC_SIGNATURE_1(spotrf, L); - ZeroUpper(L.dptr_, L.size(0)); -} -template<> -void potrf::op(const Tensor& A, const Tensor& L, - const nnvm::NodeAttrs& attrs) { - if ( A.dptr_ != L.dptr_ ) Copy(L, A); - FUNC_SIGNATURE_1(dpotrf, L); - ZeroUpper(L.dptr_, L.size(0)); -} // A = potri(L). 
struct potri { template - static void op(const Tensor& L, const Tensor& A, - const nnvm::NodeAttrs& attrs) - { CHECK(false) << "potri" << LA_OP_NOT_AVAIL; } + static void op(const Tensor& L, const Tensor& A, + Stream *s, const nnvm::NodeAttrs& attrs) { + if ( A.dptr_ != L.dptr_ ) Copy(A, L, s); + linalg_batch_potri(A, true, s); + using namespace mxnet_op; + Kernel::Launch(s, A.MSize(), A.size(1)*A.stride_, A.stride_, A.dptr_); + } }; -template<> -void potri::op(const Tensor& L, const Tensor& A, - const nnvm::NodeAttrs& attrs) { - if ( A.dptr_ != L.dptr_ ) Copy(A, L); - FUNC_SIGNATURE_1(spotri, A); - CopyLowerToUpper(A.dptr_, A.size(0)); -} -template<> -void potri::op(const Tensor& A, const Tensor& L, - const nnvm::NodeAttrs& attrs) { - if ( A.dptr_ != L.dptr_ ) Copy(A, L); - FUNC_SIGNATURE_1(dpotri, A); - CopyLowerToUpper(A.dptr_, A.size(0)); -} // B = trsm(L,A) struct trsm { template - static void op(const Tensor& L, const Tensor& B, - DType alpha, bool rightside, bool transpose) - { CHECK(false) << "trsm" << LA_OP_NOT_AVAIL; } + static void op(const Tensor& L, const Tensor& B, + DType alpha, bool rightside, bool transpose, Stream *s) { + linalg_batch_trsm(L, B, alpha, rightside, true, transpose, s); + } template - static void op(const Tensor& L, const Tensor& A, - const Tensor& B, const nnvm::NodeAttrs& attrs) { - if ( A.dptr_ != B.dptr_ ) Copy(B, A); + static void op(const Tensor& L, const Tensor& A, + const Tensor& B, + Stream *s, const nnvm::NodeAttrs& attrs) { + if ( A.dptr_ != B.dptr_ ) Copy(B, A, s); const LaTriangMatrixMultParam& param = nnvm::get(attrs.parsed); - op(L, B, DType(param.alpha), param.rightside, param.transpose); + op(L, B, DType(param.alpha), param.rightside, param.transpose, s); } }; -template<> -void trsm::op(const Tensor& L, const Tensor& B, - float alpha, bool rightside, bool transpose) { - FUNC_SIGNATURE_2(strsm, L, B); -} -template<> -void trsm::op(const Tensor& L, const Tensor& B, - double alpha, bool rightside, bool transpose) { - 
FUNC_SIGNATURE_2(dtrsm, L, B); -} // B = trmm(L,A) struct trmm { template - static void op(const Tensor& L, const Tensor& B, - DType alpha, bool rightside, bool transpose) - { CHECK(false) << "trmm" << LA_OP_NOT_AVAIL; } + static void op(const Tensor& L, const Tensor& B, + DType alpha, bool rightside, bool transpose, Stream *s) { + linalg_batch_trmm(L, B, alpha, rightside, true, transpose, s); + } template - static void op(const Tensor& L, const Tensor& A, - const Tensor& B, const nnvm::NodeAttrs& attrs) { - if ( A.dptr_ != B.dptr_ ) Copy(B, A); + static void op(const Tensor& L, const Tensor& A, + const Tensor& B, Stream *s, const nnvm::NodeAttrs& attrs) { + if ( A.dptr_ != B.dptr_ ) Copy(B, A, s); const LaTriangMatrixMultParam& param = nnvm::get(attrs.parsed); - op(L, B, DType(param.alpha), param.rightside, param.transpose); + op(L, B, DType(param.alpha), param.rightside, param.transpose, s); } }; -template<> -void trmm::op(const Tensor& L, const Tensor& B, - float alpha, bool rightside, bool transpose) { - FUNC_SIGNATURE_2(strmm, L, B); -} -template<> -void trmm::op(const Tensor& L, const Tensor& B, - double alpha, bool rightside, bool transpose) { - FUNC_SIGNATURE_2(dtrmm, L, B); -} // Useful operator that is not part of BLAS/LAPACK. 
-struct sumlogdiag { - template::value, int>::type = 0> - static void op(const Tensor& A, DType& L, const nnvm::NodeAttrs& attrs) - { CHECK(false) << "sumlogdiag operator can only be called with float/double data type."; } - template::value, int>::type = 0> - static void op(const Tensor& A, DType& B, const nnvm::NodeAttrs& attrs) { - CHECK_EQ(A.size(0), A.size(1)) << "sumlogdiag operator requires a NxN matrix as input."; - const int N(A.size(0)); +struct ForwardSumLogDiag { + template + MSHADOW_XINLINE static void Map(int i, int N, int stride, DType* A, DType* B) { DType sum(0); - DType *p(A.dptr_); - for ( int i = 0; i < N; ++i, p += N+1 ) { - sum += log(*p); + const int offset(i * N * stride); + for ( int j = 0; j < N; ++j ) { + sum += log(A[offset+j*(stride+1)]); } - B = sum; + B[i] = sum; + } +}; +struct sumlogdiag { + template + static void op(const Tensor& A, const Tensor& B, + Stream *s, const nnvm::NodeAttrs& attrs) { + CHECK_EQ(A.size(1), A.size(2)) << "sumlogdiag operator requires square matrices as input."; + using namespace mxnet_op; + Kernel::Launch(s, A.size(0), A.size(1), A.stride_, A.dptr_, B.dptr_); } }; -// Backward operators +// Backward operators (always using batch processing) struct gemm_backward { template - static void op(const Tensor& dD, const Tensor& A, - const Tensor& B, const Tensor& C, - const Tensor& dA, const Tensor& dB, - const Tensor& dC, const nnvm::NodeAttrs& attrs) { + static void op(const Tensor& dD, const Tensor& A, + const Tensor& B, const Tensor& C, + const Tensor& dA, const Tensor& dB, + const Tensor& dC, + Stream* s, const nnvm::NodeAttrs& attrs) { const LaMatrixMacParam& param = nnvm::get(attrs.parsed); - (param.transpose_a ? gemm::op(B, dD, dA, DType(param.alpha), DType(0), param.transpose_b, true) - : gemm::op(dD, B, dA, DType(param.alpha), DType(0), false, !param.transpose_b)); - (param.transpose_b ? 
gemm::op(dD, A, dB, DType(param.alpha), DType(0), true, param.transpose_a) - : gemm::op(A, dD, dB, DType(param.alpha), DType(0), !param.transpose_a, false)); - const int N(dC.size(0)*dC.size(1)); - for ( int i = 0; i < N; ++i ) { - dC.dptr_[i] = param.beta * dD.dptr_[i]; - } + bool tA(param.transpose_a), tB(param.transpose_b); + (tA ? gemm::op(B, dD, dA, DType(param.alpha), DType(0), tB, true, s) + : gemm::op(dD, B, dA, DType(param.alpha), DType(0), false, !tB, s)); + (tB ? gemm::op(dD, A, dB, DType(param.alpha), DType(0), true, tA, s) + : gemm::op(A, dD, dB, DType(param.alpha), DType(0), !tA, false, s)); + Copy(dC, dD, s); + using namespace mxnet_op; + Kernel::Launch(s, dC.MSize(), DType(param.beta), dC.dptr_); } }; struct gemm2_backward { template - static void op(const Tensor& dC, const Tensor& A, - const Tensor& B, const Tensor& dA, - const Tensor& dB, const nnvm::NodeAttrs& attrs) { + static void op(const Tensor& dC, const Tensor& A, + const Tensor& B, const Tensor& dA, + const Tensor& dB, + Stream* s, const nnvm::NodeAttrs& attrs) { const LaMatrixMultParam& param = nnvm::get(attrs.parsed); - (param.transpose_a ? gemm::op(B, dC, dA, DType(param.alpha), DType(0), param.transpose_b, true) - : gemm::op(dC, B, dA, DType(param.alpha), DType(0), false, !param.transpose_b)); - (param.transpose_b ? gemm::op(dC, A, dB, DType(param.alpha), DType(0), true, param.transpose_a) - : gemm::op(A, dC, dB, DType(param.alpha), DType(0), !param.transpose_a, false)); + bool tA(param.transpose_a), tB(param.transpose_b); + (tA ? gemm::op(B, dC, dA, DType(param.alpha), DType(0), tB, true, s) + : gemm::op(dC, B, dA, DType(param.alpha), DType(0), false, !tB, s)); + (tB ? 
gemm::op(dC, A, dB, DType(param.alpha), DType(0), true, tA, s) + : gemm::op(A, dC, dB, DType(param.alpha), DType(0), !tA, false, s)); } }; struct potrf_backward { template - static void op(const Tensor& dL, const Tensor& L, - const Tensor& dA, const nnvm::NodeAttrs& attrs) { + static void op(const Tensor& dL, const Tensor& L, + const Tensor& dA, + Stream* s, const nnvm::NodeAttrs& attrs) { // Backward of L = potrf(A). // dA = 0.5 * L**T * symm(L**T * dL # E) * L**(-1) where // '#' denotes Hadamard product @@ -269,81 +211,96 @@ struct potrf_backward { // symm(X) = 0.5 * (X + X**T) // Hadamard product and symm can be realized by a single copy from lower to upper triangle. if ( dL.dptr_ != dA.dptr_ ) { - Copy(dA, dL); + Copy(dA, dL, s); } - trmm::op(L, dA, DType(1.0), false, true); - CopyLowerToUpper(dA.dptr_, dA.size(0)); - trsm::op(L, dA, DType(1.0), false, true); - trsm::op(L, dA, DType(0.5), true, false); + trmm::op(L, dA, DType(1.0), false, true, s); + using namespace mxnet_op; + Kernel::Launch + (s, dA.MSize(), dA.size(1)*dA.stride_, dA.stride_, dA.dptr_); + trsm::op(L, dA, DType(1.0), false, true, s); + trsm::op(L, dA, DType(0.5), true, false, s); } }; struct potri_backward { template - static void op(const Tensor& dA, const Tensor& L, - const Tensor& A, const Tensor& dL, - const nnvm::NodeAttrs& attrs) { + static void op(const Tensor& dA, const Tensor& L, + const Tensor& A, const Tensor& dL, + Stream* s, const nnvm::NodeAttrs& attrs) { // Backward of A = potri(L). // dL = -2 * tril(A * dA * L**(-T)), where tril() extracts lower triangle and diagonal. 
- gemm::op(A, dA, dL, DType(1.0), DType(0), false, false); - trsm::op(L, dL, DType(-2.0), true, true); - ZeroUpper(dL.dptr_, dL.size(0)); + gemm::op(A, dA, dL, DType(1.0), DType(0), false, false, s); + trsm::op(L, dL, DType(-2.0), true, true, s); + using namespace mxnet_op; + Kernel::Launch(s, dL.MSize(), dL.size(1)*dL.stride_, dL.stride_, dL.dptr_); } }; struct trsm_backward { template - static void op(const Tensor& dB, const Tensor& L, - const Tensor& A, const Tensor& B, - const Tensor& dL, const Tensor& dA, - const nnvm::NodeAttrs& attrs) { + static void op(const Tensor& dB, const Tensor& L, + const Tensor& A, const Tensor& B, + const Tensor& dL, const Tensor& dA, + Stream* s, const nnvm::NodeAttrs& attrs) { // Backward of B = trsm(L,A). const LaTriangMatrixMultParam& param = nnvm::get(attrs.parsed); // Compute dA - if ( dA.dptr_ != dB.dptr_ ) Copy(dA, dB); - trsm::op(L, dA, DType(param.alpha), param.rightside, !param.transpose); + if ( dA.dptr_ != dB.dptr_ ) Copy(dA, dB, s); + trsm::op(L, dA, DType(param.alpha), param.rightside, !param.transpose, s); // Compute dL const bool da_left(param.rightside == param.transpose); - (da_left ? - gemm::op(dA, B, dL, DType(-1.0/param.alpha), DType(0), param.transpose, !param.transpose) - : gemm::op(B, dA, dL, DType(-1.0/param.alpha), DType(0), !param.transpose, param.transpose)); - ZeroUpper(dL.dptr_, dL.size(0)); + DType scale(-1.0/param.alpha); + (da_left ? 
gemm::op(dA, B, dL, scale, DType(0), param.transpose, !param.transpose, s) + : gemm::op(B, dA, dL, scale, DType(0), !param.transpose, param.transpose, s)); + using namespace mxnet_op; + Kernel::Launch(s, dL.MSize(), dL.size(1)*dL.stride_, dL.stride_, dL.dptr_); } }; struct trmm_backward { template - static void op(const Tensor& dB, const Tensor& L, - const Tensor& A, const Tensor& B, - const Tensor& dL, const Tensor& dA, - const nnvm::NodeAttrs& attrs) { + static void op(const Tensor& dB, const Tensor& L, + const Tensor& A, const Tensor& B, + const Tensor& dL, const Tensor& dA, + Stream* s, const nnvm::NodeAttrs& attrs) { // Backward of B = trmm(L,A). const LaTriangMatrixMultParam& param = nnvm::get(attrs.parsed); // Compute dL const bool db_left(param.rightside == param.transpose); - (db_left ? gemm::op(dB, A, dL, DType(param.alpha), DType(0), param.transpose, !param.transpose) - : gemm::op(A, dB, dL, DType(param.alpha), DType(0), !param.transpose, param.transpose)); - ZeroUpper(dL.dptr_, dL.size(0)); + DType scale(param.alpha); + (db_left ? 
gemm::op(dB, A, dL, scale, DType(0), param.transpose, !param.transpose, s) + : gemm::op(A, dB, dL, scale, DType(0), !param.transpose, param.transpose, s)); + using namespace mxnet_op; + Kernel::Launch(s, dL.MSize(), dL.size(1)*dL.stride_, dL.stride_, dL.dptr_); // Compute dA - if ( dA.dptr_ != dB.dptr_ ) Copy(dA, dB); - trmm::op(L, dA, DType(param.alpha), param.rightside, !param.transpose); + if ( dA.dptr_ != dB.dptr_ ) Copy(dA, dB, s); + trmm::op(L, dA, scale, param.rightside, !param.transpose, s); } }; +struct BackwardSumLogDiag { + template + MSHADOW_XINLINE static void Map(int i, int N, int stride, DType* dB, DType* A, DType* dA) { + const int offset(i * N * stride); + for ( int j = 0; j < N; ++j ) { + dA[offset+j*(stride+1)] = dB[i]/A[offset+j*(stride+1)]; + } + } +}; struct sumlogdiag_backward { template - static void op(const DType& dB, const Tensor& A, const Tensor& dA, - const nnvm::NodeAttrs& attrs, bool add) { + static void op(const Tensor& dB, const Tensor& A, + const Tensor& dA, + Stream* s, const nnvm::NodeAttrs& attrs) { // Backward of B = sumlogdiag(A). - const int N(A.size(0)); - if ( !add ) { - for ( int i = 0; i < N*N; ++i ) { - dA.dptr_[i] = 0; - } - } - for ( int i = 0; i < N; ++i ) { - dA.dptr_[i*(N+1)] += dB / A.dptr_[i*N+i]; - } + // dB is actually a 1-d tensor but we convert it to a 3-D one before calling + // this function as the LaOpCaller-adapters can only deal with a uniform + // dimension for all tensor inputs. This doesn't matter as we will interpret + // it correctly internally in this function. 
+ using namespace mxnet_op; + Kernel::Launch(s, dA.MSize(), DType(0), dA.dptr_); + Kernel::Launch + (s, A.size(0), A.size(1), A.stride_, dB.dptr_, A.dptr_, dA.dptr_); } }; diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 6af9a0f33d48..a2a1fe8e06b7 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -626,7 +626,6 @@ def test_bilinear_sampler_with_type(): check_consistency(sym, ctx_list) check_consistency(sym, ctx_list, grad_req="add") - def test_grid_generator_with_type(): data = mx.sym.Variable('data') sym = mx.sym.GridGenerator(data=data, transform_type='affine', target_shape=(20, 20)) @@ -640,6 +639,19 @@ def test_grid_generator_with_type(): check_consistency(sym, ctx_list) check_consistency(sym, ctx_list, grad_req="add") +def test_spatial_transformer_with_type(): + np.random.seed(1234) + data = mx.sym.Variable('data') + loc = mx.sym.Flatten(data) + loc = mx.sym.FullyConnected(data=loc, num_hidden=10) + loc = mx.sym.Activation(data=loc, act_type='relu') + loc = mx.sym.FullyConnected(data=loc, num_hidden=6) + sym = mx.sym.SpatialTransformer(data=data, loc=loc, target_shape=(10, 10), + transform_type="affine", sampler_type="bilinear") + ctx_list = [{'ctx': mx.gpu(0), 'data': (1, 5, 10, 10), 'type_dict': {'data': np.float32}}, + {'ctx': mx.cpu(0), 'data': (1, 5, 10, 10), 'type_dict': {'data': np.float32}}] + check_consistency(sym, ctx_list) + check_consistency(sym, ctx_list, grad_req="add") # Checking max pooling consistency over the data sets of different float types is problematic # as one max value in a float32 data set may not be the max value in a float16 data set. 
diff --git a/tests/python/unittest/test_autograd.py b/tests/python/unittest/test_autograd.py index 82905cc42f19..37bb5626f765 100644 --- a/tests/python/unittest/test_autograd.py +++ b/tests/python/unittest/test_autograd.py @@ -329,6 +329,53 @@ def test_is_train(): assert y.asnumpy().max() == 2 and y.asnumpy().min() == 0 +def test_function(): + class func(Function): + def forward(self, x, y): + m = x / y + n = x * y + self.save_for_backward(x, y) + return m, n + + def backward(self, dm, dn): + x, y = self.saved_tensors + dx = dm/y + dn*y + dy = dn*x - dm * x / y / y + return dx, dy + + f = func() + x = mx.nd.random_uniform(shape=(10,)) + x.attach_grad() + y = mx.nd.random_uniform(shape=(10,)) + y.attach_grad() + with record(): + m, n = f(x, y) + backward([m, n]) + + dx1 = x.grad.asnumpy() + dy1 = y.grad.asnumpy() + + with record(): + backward([x/y, x*y]) + + assert_almost_equal(x.grad.asnumpy(), dx1) + assert_almost_equal(y.grad.asnumpy(), dy1) + + +def test_get_symbol(): + x = mx.nd.ones((1,)) + x.attach_grad() + with record(): + y = x*x + 2*x - 1 + assert len(get_symbol(y).list_arguments()) == 1 + + z = mx.nd.ones((1,)) + z.attach_grad() + with record(): + y = x*x + 2*z - 1 + assert len(get_symbol(y).list_arguments()) == 2 + + if __name__ == "__main__": import nose nose.runmodule() diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py index e9a430124499..32298fcd57d5 100644 --- a/tests/python/unittest/test_gluon_data.py +++ b/tests/python/unittest/test_gluon_data.py @@ -16,6 +16,7 @@ # under the License. 
import os +import tarfile import mxnet as mx import numpy as np from mxnet import gluon @@ -32,23 +33,24 @@ def test_array_dataset(): def prepare_record(): - if not os.path.isdir("data"): - os.makedirs('data') if not os.path.isdir("data/test_images"): - os.system("wget http://data.mxnet.io/data/test_images.tar.gz -O data/test_images.tar.gz") - os.system("tar -xf data/test_images.tar.gz -C data") - imgs = os.listdir('data/test_images') - record = mx.recordio.MXIndexedRecordIO('data/test.idx', 'data/test.rec', 'w') - for i, img in enumerate(imgs): - str_img = open('data/test_images/'+img, 'rb').read() - s = mx.recordio.pack((0, i, i, 0), str_img) - record.write_idx(i, s) + os.makedirs('data/test_images') + if not os.path.isdir("data/test_images/test_images"): + gluon.utils.download("http://data.mxnet.io/data/test_images.tar.gz", "data/test_images.tar.gz") + tarfile.open('data/test_images.tar.gz').extractall('data/test_images/') + if not os.path.exists('data/test.rec'): + imgs = os.listdir('data/test_images/test_images') + record = mx.recordio.MXIndexedRecordIO('data/test.idx', 'data/test.rec', 'w') + for i, img in enumerate(imgs): + str_img = open('data/test_images/test_images/'+img, 'rb').read() + s = mx.recordio.pack((0, i, i, 0), str_img) + record.write_idx(i, s) return 'data/test.rec' def test_recordimage_dataset(): recfile = prepare_record() - dataset = gluon.data.ImageRecordDataset(recfile) + dataset = gluon.data.vision.ImageRecordDataset(recfile) loader = gluon.data.DataLoader(dataset, 1) for i, (x, y) in enumerate(loader): @@ -71,6 +73,13 @@ def test_datasets(): assert len(gluon.data.vision.MNIST(root='data')) == 60000 assert len(gluon.data.vision.CIFAR10(root='data', train=False)) == 10000 +def test_image_folder_dataset(): + prepare_record() + dataset = gluon.data.vision.ImageFolderDataset('data/test_images') + assert dataset.synsets == ['test_images'] + assert len(dataset.items) == 16 + + if __name__ == '__main__': import nose nose.runmodule() diff --git 
a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index fe4841bc0979..f27204b119bd 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3425,15 +3425,8 @@ def test_deformable_psroipooling(): def test_laop(): - return - - # Currently no support for GPU. Will be added soon - # so keep these tests here in this file and activate - # gpu-testing when it is ready. - dev = default_context() - if dev.device_type == 'gpu': - return + # enable numerical checking of gradients grad_check = 1 data1 = mx.symbol.Variable('data1') diff --git a/tools/caffe_converter/make_win32.bat b/tools/caffe_converter/make_win32.bat index 2f3367d000d4..e5bc9143e05c 100644 --- a/tools/caffe_converter/make_win32.bat +++ b/tools/caffe_converter/make_win32.bat @@ -1,3 +1,20 @@ +rem Licensed to the Apache Software Foundation (ASF) under one +rem or more contributor license agreements. See the NOTICE file +rem distributed with this work for additional information +rem regarding copyright ownership. The ASF licenses this file +rem to you under the Apache License, Version 2.0 (the +rem "License"); you may not use this file except in compliance +rem with the License. You may obtain a copy of the License at +rem +rem http://www.apache.org/licenses/LICENSE-2.0 +rem +rem Unless required by applicable law or agreed to in writing, +rem software distributed under the License is distributed on an +rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +rem KIND, either express or implied. See the License for the +rem specific language governing permissions and limitations +rem under the License. + @protoc --python_out=./ ./caffe.proto @echo done. @pause diff --git a/tools/coreml/README.md b/tools/coreml/README.md new file mode 100644 index 000000000000..32cde339d3a9 --- /dev/null +++ b/tools/coreml/README.md @@ -0,0 +1,95 @@ +# Convert MXNet models into Apple CoreML format. 
+ +This tool helps convert MXNet models into [Apple CoreML](https://developer.apple.com/documentation/coreml) format which can then be run on Apple devices. + +## Installation +In order to use this tool you need to have these installed: +* MacOS - High Sierra 10.13 +* Xcode 9 +* coremltools 0.5.0 or greater (pip install coremltools) +* mxnet 0.10.0 or greater. [Installation instructions](http://mxnet.io/get_started/install.html). +* yaml (pip install pyyaml) +* python 2.7 + +## How to use +Let's say you want to use your MXNet model in an iPhone App. For the purpose of this example, let's say you want to use squeezenet-v1.1. + +1. Download the model into the directory where this converter resides. Squeezenet can be downloaded from [here](http://data.mxnet.io/models/imagenet/squeezenet/). +2. Run this command: + + ```bash +python mxnet_coreml_converter.py --model-prefix='squeezenet_v1.1' --epoch=0 --input-shape='{"data":"3,227,227"}' --mode=classifier --pre-processing-arguments='{"image_input_names":"data"}' --class-labels classLabels.txt --output-file="squeezenetv11.mlmodel" +``` + + The above command will save the converted model into squeezenet-v11.mlmodel in CoreML format. Internally MXNet first loads the model and then we walk through the entire symbolic graph converting each operator into its CoreML equivalent. Some of the parameters are used by MXNet in order to load and generate the symbolic graph in memory while others are used by CoreML either to pre-process the input before the going through the neural network or to process the output in a particular way. + + In the command above: + + * _model-prefix_: refers to the MXNet model prefix (may include the directory path). + * _epoch_: refers to the suffix of the MXNet model file. + * _input-shape_: refers to the input shape information in a JSON string format where the key is the name of the input variable (="data") and the value is the shape of that variable. 
If the model takes multiple inputs, input-shape for all of them need to be provided. + * _mode_: refers to the coreml model mode. Can either be 'classifier', 'regressor' or None. In this case, we use 'classifier' since we want the resulting CoreML model to classify images into various categories. + * _pre-processing-arguments_: In the Apple world images have to be of type Image. By providing image_input_names as "data", we are saying that the input variable "data" is of type Image. + * _class-labels_: refers to the name of the file which contains the classification labels (a.k.a. synset file). +output-file: the file where the CoreML model will be dumped. + +3. The generated ".mlmodel" file can directly be integrated into your app. For more instructions on how to do this, please see [Apple CoreML's tutorial](https://developer.apple.com/documentation/coreml/integrating_a_core_ml_model_into_your_app). + + +### Providing class labels +You could provide a file containing class labels (as above) so that CoreML will return the predicted category the image belongs to. The file should have a label per line and labels can have any special characters. The line number of the label in the file should correspond with the index of softmax output. E.g. + +```bash +python mxnet_coreml_converter.py --model-prefix='squeezenet_v1.1' --epoch=0 --input-shape='{"data":"3,227,227"}' --mode=classifier --class-labels classLabels.txt --output-file="squeezenetv11.mlmodel" +``` + +### Providing label names +You may have to provide the label names of the MXNet model's outputs. For example, if you try to convert [vgg16](http://data.mxnet.io/models/imagenet/vgg/), you may have to provide label-name as "prob_label". By default "softmax_label" is assumed. 
+ +```bash +python mxnet_coreml_converter.py --model-prefix='vgg16' --epoch=0 --input-shape='{"data":"3,224,224"}' --mode=classifier --pre-processing-arguments='{"image_input_names":"data"}' --class-labels classLabels.txt --output-file="vgg16.mlmodel" --label-names="prob_label" +``` + +### Adding a pre-processing to CoreML model. +You could ask CoreML to pre-process the images before passing them through the model. + +```bash +python mxnet_coreml_converter.py --model-prefix='squeezenet_v1.1' --epoch=0 --input-shape='{"data":"3,224,224"}' --pre-processing-arguments='{"red_bias":127,"blue_bias":117,"green_bias":103}' --output-file="squeezenet_v11.mlmodel" +``` + +If you are building an app for a model that takes image as an input, you will have to provide image_input_names as pre-processing arguments. This tells CoreML that a particular input variable is of type Image. E.g.: + +```bash +python mxnet_coreml_converter.py --model-prefix='squeezenet_v1.1' --epoch=0 --input-shape='{"data":"3,224,224"}' --pre-processing-arguments='{"red_bias":127,"blue_bias":117,"green_bias":103,"image_input_names":"data"}' --output-file="squeezenet_v11.mlmodel" +``` + +## Currently supported +### Models +This is a (growing) list of standard MXNet models that can be successfully converted using the converter. This means that any other model that uses similar operators as these models can also be successfully converted. + +1. Inception: [Inception-BN](http://data.mxnet.io/models/imagenet/inception-bn/), [Inception-V3](http://data.mxnet.io/models/imagenet/inception-v3.tar.gz) +2. [NiN](http://data.dmlc.ml/models/imagenet/nin/) +2. [Resnet](http://data.mxnet.io/models/imagenet/resnet/) +3. [Squeezenet](http://data.mxnet.io/models/imagenet/squeezenet/) +4. [Vgg](http://data.mxnet.io/models/imagenet/vgg/) + +### Layers +1. Activation +2. Batchnorm +3. Concat +4. Convolution +5. Deconvolution +6. Dense +7. Elementwise +8. Flatten +9. Pooling +10. Reshape +11. Softmax +12. 
Transpose + +## Known issues +Currently there are no known issues. + +## This tool has been tested on environment with: +* MacOS - High Sierra 10.13 Beta. +* Xcode 9 beta 5. diff --git a/tools/coreml/converter/__init__.py b/tools/coreml/converter/__init__.py new file mode 100644 index 000000000000..245692337bc3 --- /dev/null +++ b/tools/coreml/converter/__init__.py @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + diff --git a/tools/coreml/converter/_add_pooling.py b/tools/coreml/converter/_add_pooling.py new file mode 100644 index 000000000000..51934f22190b --- /dev/null +++ b/tools/coreml/converter/_add_pooling.py @@ -0,0 +1,118 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from coremltools.proto import NeuralNetwork_pb2 as _NeuralNetwork_pb2 + + +def add_pooling_with_padding_types(builder, name, height, width, stride_height, stride_width, + layer_type, padding_type, input_name, output_name, + padding_top = 0, padding_bottom = 0, padding_left = 0, padding_right = 0, + same_padding_asymmetry_mode = 'BOTTOM_RIGHT_HEAVY', + exclude_pad_area = True, is_global = False): + """ + Add a pooling layer to the model. + + This is our own implementation of add_pooling since current CoreML's version (0.5.0) of builder + doesn't provide support for padding types apart from valid. This support will be added in the + next release of coremltools. When that happens, this can be removed. + + Parameters + + ---------- + builder: NeuralNetworkBuilder + A neural network builder object. + name: str + The name of this layer. + height: int + Height of pooling region. + width: int + Number of elements to be padded on the right side of the input blob. + stride_height: int + Stride along the height direction. + stride_width: int + Stride along the height direction. + layer_type: str + Type of pooling performed. Can either be 'MAX', 'AVERAGE' or 'L2'. + padding_type: str + Option for the output blob shape. Can be either 'VALID' , 'SAME' or 'INCLUDE_LAST_PIXEL'. Kindly look at NeuralNetwork.proto for details. + input_name: str + The input blob name of this layer. + output_name: str + The output blob name of this layer. 
+ + padding_top, padding_bottom, padding_left, padding_right: int + values of height (top, bottom) and width (left, right) padding to be used if padding type is "VALID" or "INCLUDE_LAST_PIXEL" + + same_padding_asymmetry_mode : str. + Type of asymmetric padding to be used when padding_type = 'SAME'. Kindly look at NeuralNetwork.proto for details. Can be either 'BOTTOM_RIGHT_HEAVY' or 'TOP_LEFT_HEAVY'. + + exclude_pad_area: boolean + Whether to exclude padded area in the pooling operation. Defaults to True. + + - If True, the value of the padded area will be excluded. + - If False, the padded area will be included. + This flag is only used with average pooling. + is_global: boolean + Whether the pooling operation is global. Defaults to False. + + - If True, the pooling operation is global -- the pooling region is of the same size of the input blob. + Parameters height, width, stride_height, stride_width will be ignored. + + - If False, the pooling operation is not global. + + See Also + -------- + add_convolution, add_pooling, add_activation + """ + + spec = builder.spec + nn_spec = builder.nn_spec + + # Add a new layer + spec_layer = nn_spec.layers.add() + spec_layer.name = name + spec_layer.input.append(input_name) + spec_layer.output.append(output_name) + spec_layer_params = spec_layer.pooling + + # Set the parameters + spec_layer_params.type = \ + _NeuralNetwork_pb2.PoolingLayerParams.PoolingType.Value(layer_type) + + if padding_type == 'VALID': + height_border = spec_layer_params.valid.paddingAmounts.borderAmounts.add() + height_border.startEdgeSize = padding_top + height_border.endEdgeSize = padding_bottom + width_border = spec_layer_params.valid.paddingAmounts.borderAmounts.add() + width_border.startEdgeSize = padding_left + width_border.endEdgeSize = padding_right + elif padding_type == 'SAME': + if not (same_padding_asymmetry_mode == 'BOTTOM_RIGHT_HEAVY' or same_padding_asymmetry_mode == 'TOP_LEFT_HEAVY'): + raise ValueError("Invalid value %d of 
same_padding_asymmetry_mode parameter" % same_padding_asymmetry_mode) + spec_layer_params.same.asymmetryMode = _NeuralNetwork_pb2.SamePadding.SamePaddingMode.Value(same_padding_asymmetry_mode) + elif padding_type == 'INCLUDE_LAST_PIXEL': + if padding_top != padding_bottom or padding_left != padding_right: + raise ValueError("Only symmetric padding is supported with the INCLUDE_LAST_PIXEL padding type") + spec_layer_params.includeLastPixel.paddingAmounts.append(padding_top) + spec_layer_params.includeLastPixel.paddingAmounts.append(padding_left) + + spec_layer_params.kernelSize.append(height) + spec_layer_params.kernelSize.append(width) + spec_layer_params.stride.append(stride_height) + spec_layer_params.stride.append(stride_width) + spec_layer_params.avgPoolExcludePadding = exclude_pad_area + spec_layer_params.globalPooling = is_global diff --git a/tools/coreml/converter/_layers.py b/tools/coreml/converter/_layers.py new file mode 100644 index 000000000000..0a089949a1a6 --- /dev/null +++ b/tools/coreml/converter/_layers.py @@ -0,0 +1,569 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import _add_pooling +from ast import literal_eval + +def _get_input_output_name(net, node, index=0): + name = node['name'] + inputs = node['inputs'] + + if index == 'all': + input_name = [_get_node_name(net, inputs[idx][0]) for idx in range(len(inputs))] + elif type(index) == int: + input_name = _get_node_name(net, inputs[0][0]) + else: + input_name = [_get_node_name(net, inputs[idx][0]) for idx in index] + return input_name, name + + +def _get_node_name(net, node_id): + return net['nodes'][node_id]['name'] + + +def _get_node_shape(net, node_id): + return net['nodes'][node_id]['shape'] + + +# TODO These operators still need to be converted (listing in order of priority): +# High priority: +# mxnet.symbol.repeat -> builder.add_repeat to flatten and repeat the NDArray sequence +# mxnet.symbol.Crop -> builder.add_crop to crop image along spacial dimensions +# mxnet.symbol.Pad -> builder.add_padding putting 0's on height and width for tensor +# Low Priority: +# depthwise seperable convolution support through groups in builder.add_convolution +# add_optional -> for all RNNs defining what goes in and out (to define beam search or if input is streaming) +# mx.symbol.Embedding -> add_embedding takes indicies, word ids from dict that is outside coreml or +# in pipeline only if we have text mapping to indicies +# FusedRNNCell -> add_bidirlstm +# add_unilstm -> reverse_input param true as second and concat on outputs +# Do vanilla (0.9 mxnet) lstm, gru, vanilla_rnn + + +def convert_reshape(net, node, module, builder): + """Converts a reshape layer from mxnet to coreml. + + This doesn't currently handle the deprecated parameters for the reshape layer. + + Parameters + ---------- + network: net + An mxnet network object. + + layer: node + Node to convert. + + module: module + A module for MXNet + + builder: NeuralNetworkBuilder + A neural network builder object. 
+ """ + input_name, output_name = _get_input_output_name(net, node) + name = node['name'] + target_shape = node['shape'] + + if any(item <= 0 for item in target_shape): + raise NotImplementedError('Special dimensional values less than or equal to 0 are not supported yet.' + 'Feel free to file an issue here: https://github.com/dmlc/mxnet/issues.') + + if 'reverse' in node and node['reverse'] == 'True': + raise NotImplementedError('"reverse" parameter is not supported by yet.' + 'Feel free to file an issue here: https://github.com/dmlc/mxnet/issues.') + + mode = 0 # CHANNEL_FIRST + builder.add_reshape(name, input_name, output_name, target_shape, mode) + + +def convert_transpose(net, node, module, builder): + """Convert a transpose layer from mxnet to coreml. + + Parameters + ---------- + network: net + A mxnet network object. + + layer: node + Node to convert. + + module: module + An module for MXNet + + builder: NeuralNetworkBuilder + A neural network builder object. + """ + input_name, output_name = _get_input_output_name(net, node) + name = node['name'] + param = node['attr'] + + axes = literal_eval(param['axes']) + builder.add_permute(name, axes, input_name, output_name) + + +def convert_flatten(net, node, module, builder): + """Convert a flatten layer from mxnet to coreml. + + Parameters + ---------- + network: net + A mxnet network object. + + layer: node + Node to convert. + + module: module + An module for MXNet + + builder: NeuralNetworkBuilder + A neural network builder object. + """ + input_name, output_name = _get_input_output_name(net, node) + name = node['name'] + mode = 0 # CHANNEL_FIRST + builder.add_flatten(name, mode, input_name, output_name) + + +def convert_softmax(net, node, module, builder): + """Convert a softmax layer from mxnet to coreml. + + Parameters + ---------- + network: net + A mxnet network object. + + layer: node + Node to convert. 
+ + module: module + An module for MXNet + + builder: NeuralNetworkBuilder + A neural network builder object. + """ + input_name, output_name = _get_input_output_name(net, node) + name = node['name'] + builder.add_softmax(name=name, + input_name=input_name, + output_name=output_name) + + +def convert_activation(net, node, module, builder): + """Convert an activation layer from mxnet to coreml. + + Parameters + ---------- + network: net + A mxnet network object. + + layer: node + Node to convert. + + module: module + An module for MXNet + + builder: NeuralNetworkBuilder + A neural network builder object. + """ + input_name, output_name = _get_input_output_name(net, node) + name = node['name'] + mx_non_linearity = node['attr']['act_type'] + #TODO add SCALED_TANH, SOFTPLUS, SOFTSIGN, SIGMOID_HARD, LEAKYRELU, PRELU, ELU, PARAMETRICSOFTPLUS, THRESHOLDEDRELU, LINEAR + if mx_non_linearity == 'relu': + non_linearity = 'RELU' + elif mx_non_linearity == 'tanh': + non_linearity = 'TANH' + elif mx_non_linearity == 'sigmoid': + non_linearity = 'SIGMOID' + else: + raise TypeError('Unknown activation type %s' % mx_non_linearity) + builder.add_activation(name = name, + non_linearity = non_linearity, + input_name = input_name, + output_name = output_name) + + +def convert_elementwise_add(net, node, module, builder): + """Convert an elementwise add layer from mxnet to coreml. + + Parameters + ---------- + network: net + A mxnet network object. + + layer: node + Node to convert. + + module: module + An module for MXNet + + builder: NeuralNetworkBuilder + A neural network builder object. + """ + + input_names, output_name = _get_input_output_name(net, node, [0, 1]) + name = node['name'] + + builder.add_elementwise(name, input_names, output_name, 'ADD') + + +def convert_dense(net, node, module, builder): + """Convert a dense layer from mxnet to coreml. + + Parameters + ---------- + network: net + A mxnet network object. + + layer: node + Node to convert. 
+ + module: module + An module for MXNet + + builder: NeuralNetworkBuilder + A neural network builder object. + """ + input_name, output_name = _get_input_output_name(net, node) + has_bias = True + name = node['name'] + + inputs = node['inputs'] + args, _ = module.get_params() + W = args[_get_node_name(net, inputs[1][0])].asnumpy() + if has_bias: + Wb = args[_get_node_name(net, inputs[2][0])].asnumpy() + else: + Wb = None + nC, nB = W.shape + + builder.add_inner_product( + name=name, + W=W, + b=Wb, + input_channels=nB, + output_channels=nC, + has_bias=has_bias, + input_name=input_name, + output_name=output_name + ) + + +def convert_convolution(net, node, module, builder): + """Convert a convolution layer from mxnet to coreml. + + Parameters + ---------- + network: net + A mxnet network object. + + layer: node + Node to convert. + + module: module + An module for MXNet + + builder: NeuralNetworkBuilder + A neural network builder object. + """ + input_name, output_name = _get_input_output_name(net, node) + name = node['name'] + param = node['attr'] + inputs = node['inputs'] + args, _ = module.get_params() + + if 'no_bias' in param.keys(): + has_bias = not literal_eval(param['no_bias']) + else: + has_bias = True + + if literal_eval(param['pad']) != (0, 0): + pad = literal_eval(param['pad']) + builder.add_padding( + name=name+"_pad", + left=pad[1], + right=pad[1], + top=pad[0], + bottom=pad[0], + value=0, + input_name=input_name, + output_name=name+"_pad_output") + input_name = name+"_pad_output" + + border_mode = "valid" + + n_filters = int(param['num_filter']) + + W = args[_get_node_name(net, inputs[1][0])].asnumpy() + if has_bias: + Wb = args[_get_node_name(net, inputs[2][0])].asnumpy() + else: + Wb = None + + channels = W.shape[1] + stride_height, stride_width = literal_eval(param['stride']) + kernel_height, kernel_width = literal_eval(param['kernel']) + + W = W.transpose((2, 3, 1, 0)) + builder.add_convolution( + name=name, + kernel_channels=channels, + 
output_channels=n_filters, + height=kernel_height, + width=kernel_width, + stride_height=stride_height, + stride_width=stride_width, + border_mode=border_mode, + groups=1, + W=W, + b=Wb, + has_bias=has_bias, + is_deconv=False, + output_shape=None, + input_name=input_name, + output_name=output_name) + + +def convert_pooling(net, node, module, builder): + """Convert a pooling layer from mxnet to coreml. + + Parameters + ---------- + network: net + A mxnet network object. + + layer: node + Node to convert. + + module: module + An module for MXNet + + builder: NeuralNetworkBuilder + A neural network builder object. + """ + input_name, output_name = _get_input_output_name(net, node) + name = node['name'] + param = node['attr'] + + layer_type_mx = param['pool_type'] + if layer_type_mx == 'max': + layer_type = 'MAX' + elif layer_type_mx == 'avg': + layer_type = 'AVERAGE' + else: + raise TypeError("Pooling type %s not supported" % layer_type_mx) + + # Add padding if there is any + if literal_eval(param['pad']) != (0, 0): + pad = literal_eval(param['pad']) + builder.add_padding( + name=name+"_pad", + left=pad[1], + right=pad[1], + top=pad[0], + bottom=pad[0], + value=0, + input_name=input_name, + output_name=name+"_pad_output") + input_name = name+"_pad_output" + + stride_height, stride_width = literal_eval(param['stride']) + kernel_width, kernel_height = literal_eval(param['kernel']) + + type_map = {'valid': 'VALID', 'full': 'INCLUDE_LAST_PIXEL'} + padding_type = param['pooling_convention'] if 'pooling_convention' in param else 'valid' + if padding_type not in type_map: + raise KeyError("%s type is not supported in this converter. It is a Github issue.") + padding_type = type_map[padding_type] + + if 'global_pool' in param.keys(): + is_global = literal_eval(param['global_pool']) + else: + is_global = False + + # For reasons why we are not using the standard builder but having our own implementation, + # see the function documentation. 
+ _add_pooling.add_pooling_with_padding_types( + builder=builder, + name=name, + height=kernel_height, + width=kernel_width, + stride_height=stride_height, + stride_width=stride_width, + layer_type=layer_type, + padding_type=padding_type, + exclude_pad_area=False, + is_global=is_global, + input_name=input_name, + output_name=output_name + ) + + +def convert_batchnorm(net, node, module, builder): + """Convert a transpose layer from mxnet to coreml. + + Parameters + ---------- + network: net + A mxnet network object. + + layer: node + Node to convert. + + module: module + An module for MXNet + + builder: NeuralNetworkBuilder + A neural network builder object. + """ + input_name, output_name = _get_input_output_name(net, node) + name = node['name'] + inputs = node['inputs'] + + + eps = 1e-3 # Default value of eps for MXNet. + use_global_stats = False # Default value of use_global_stats for MXNet. + if 'attr' in node: + if 'eps' in node['attr']: + eps = literal_eval(node['attr']['eps']) + + args, aux = module.get_params() + gamma = args[_get_node_name(net, inputs[1][0])].asnumpy() + beta = args[_get_node_name(net, inputs[2][0])].asnumpy() + mean = aux[_get_node_name(net, inputs[3][0])].asnumpy() + variance = aux[_get_node_name(net, inputs[4][0])].asnumpy() + nb_channels = gamma.shape[0] + builder.add_batchnorm( + name=name, + channels=nb_channels, + gamma=gamma, + beta=beta, + mean=mean, + variance=variance, + input_name=input_name, + output_name=output_name, + epsilon=eps) + + +def convert_concat(net, node, module, builder): + """Convert concat layer from mxnet to coreml. + + Parameters + ---------- + network: net + A mxnet network object. + + layer: node + Node to convert. + + module: module + An module for MXNet + + builder: NeuralNetworkBuilder + A neural network builder object. 
+ """ + # Get input and output names + input_names, output_name = _get_input_output_name(net, node, 'all') + name = node['name'] + mode = 'CONCAT' + builder.add_elementwise(name = name, input_names = input_names, + output_name = output_name, mode = mode) + + +def convert_deconvolution(net, node, module, builder): + """Convert a deconvolution layer from mxnet to coreml. + + Parameters + ---------- + network: net + A mxnet network object. + + layer: node + Node to convert. + + module: module + An module for MXNet + + builder: NeuralNetworkBuilder + A neural network builder object. + """ + input_name, output_name = _get_input_output_name(net, node) + name = node['name'] + param = node['attr'] + inputs = node['inputs'] + args, _ = module.get_params() + + if 'no_bias' in param.keys(): + has_bias = not literal_eval(param['no_bias']) + else: + has_bias = False + + border_mode = "valid" + + n_filters = int(param['num_filter']) + + output_shape = None + if 'target_shape' in param: + target_shape = literal_eval(param['target_shape']) + output_shape = (int(target_shape[0]), int(target_shape[1])) + + W = args[_get_node_name(net, inputs[1][0])].asnumpy() + + if has_bias: + Wb = args[_get_node_name(net, inputs[2][0])].asnumpy() + else: + Wb = None + + channels = W.shape[0] + stride_height, stride_width = literal_eval(param['stride']) + kernel_height, kernel_width = literal_eval(param['kernel']) + W = W.transpose((2, 3, 0, 1)) + + use_crop = False + if literal_eval(param['pad']) != (0, 0) and output_shape is None: + use_crop = True + + builder.add_convolution( + name=name, + kernel_channels=channels, + output_channels=n_filters, + height=kernel_height, + width=kernel_width, + stride_height=stride_height, + stride_width=stride_width, + border_mode=border_mode, + groups=1, + W=W, + b=Wb, + has_bias=has_bias, + is_deconv=True, + output_shape=output_shape, + input_name=input_name, + output_name=output_name+'before_pad' if use_crop else output_name + ) + + if use_crop: + pad = 
literal_eval(param['pad']) + builder.add_crop( + name=name+"_pad", + left=pad[1], + right=pad[1], + top=pad[0], + bottom=pad[0], + offset=0, + input_names=[output_name+'before_pad'], + output_name=output_name + ) diff --git a/tools/coreml/converter/_mxnet_converter.py b/tools/coreml/converter/_mxnet_converter.py new file mode 100644 index 000000000000..a9ea0f4d7ad6 --- /dev/null +++ b/tools/coreml/converter/_mxnet_converter.py @@ -0,0 +1,231 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import _layers +import coremltools as _coremltools +import coremltools.models.datatypes as _datatypes +from coremltools.models import neural_network as _neural_network + +import json as _json +import mxnet as _mxnet +import numpy as _np + +_MXNET_LAYER_REGISTRY = { + 'FullyConnected' : _layers.convert_dense, + 'Activation' : _layers.convert_activation, + 'SoftmaxOutput' : _layers.convert_softmax, + 'Convolution' : _layers.convert_convolution, + 'Pooling' : _layers.convert_pooling, + 'Flatten' : _layers.convert_flatten, + 'transpose' : _layers.convert_transpose, + 'Concat' : _layers.convert_concat, + 'BatchNorm' : _layers.convert_batchnorm, + 'elemwise_add' : _layers.convert_elementwise_add, + 'Reshape' : _layers.convert_reshape, + 'Deconvolution' : _layers.convert_deconvolution, +} + +_MXNET_SKIP_LAYERS = [ + '_MulScalar', + 'Dropout', +] + +def _mxnet_remove_batch(input_data): + for blob in input_data: + input_data[blob] = _np.reshape(input_data[blob], input_data[blob].shape[1:]) + return input_data + +def check_error(model, path, shapes, output = 'softmax_output', verbose = True): + """ + Check the difference between predictions from MXNet and CoreML. 
+ """ + coreml_model = _coremltools.models.MLModel(path) + input_data = {} + input_data_copy = {} + for ip in shapes: + input_data[ip] = _np.random.rand(*shapes[ip]).astype('f') + input_data_copy[ip] = _np.copy(input_data[ip]) + + dataIter = _mxnet.io.NDArrayIter(input_data_copy) + mx_out = model.predict(dataIter).flatten() + + e_out_dict = coreml_model.predict(_mxnet_remove_batch(input_data)) + e_out = e_out_dict[output].flatten() + error = _np.linalg.norm(e_out - mx_out) + + if verbose: + print "First few predictions from CoreML : %s" % e_out[0:10] + print "First few predictions from MXNet : %s" % e_out[0:10] + print "L2 Error on random data %s" % error + return error + +def _set_input_output_layers(builder, input_names, output_names): + input_layers_indices = [] + output_layers_indices = [] + layers = builder.spec.neuralNetwork.layers + for idx, l in enumerate(layers): + if set(input_names).intersection(l.input): + input_layers_indices.append(idx) + if set(output_names).intersection(l.output): + output_layers_indices.append(idx) + + builder.input_layers_indices = input_layers_indices + builder.output_layers_indices = output_layers_indices + builder.input_layers_is1d = [False for _ in input_names] + builder.output_layers_is1d = [False for _ in output_names] + +def _get_layer_converter_fn(layer): + """Get the right converter function for MXNet + """ + if layer in _MXNET_LAYER_REGISTRY: + return _MXNET_LAYER_REGISTRY[layer] + else: + raise TypeError("MXNet layer of type %s is not supported." % layer) + + +def convert(model, input_shape, order = None, class_labels = None, mode = None, preprocessor_args = None): + """Convert an MXNet model to the protobuf spec. + + Parameters + ---------- + model: MXNet model + A trained MXNet neural network model. + + order: Order of inputs + + class_labels: A string or list of strings. + As a string it represents the name of the file which contains the classification labels (one per line). 
+ As a list of strings it represents a list of categories that map the index of the output of a neural network to labels in a classifier. + + mode: str ('classifier', 'regressor' or None) + Mode of the converted coreml model. + When mode = 'classifier', a NeuralNetworkClassifier spec will be constructed. + When mode = 'regressor', a NeuralNetworkRegressor spec will be constructed. + + **kwargs : + Provide keyword arguments for: + - input shapes. Supplied as a dictionary object with keyword "input_shape". + - pre-processing arguments: Supplied as a dictionary object with keyword "preprocessor_args". The parameters in the dictionary + tell the converted coreml model how to pre-process any input before an inference is run on it. + For the list of pre-processing arguments see + http://pythonhosted.org/coremltools/generated/coremltools.models.neural_network.html#coremltools.models.neural_network.NeuralNetworkBuilder.set_pre_processing_parameters + + Returns + ------- + model: A coreml model. + """ + if not isinstance(input_shape, dict): + raise TypeError("Must provide a dictionary for input shape. 
e.g input_shape={'data':(3,224,224)}") + + def remove_batch(dim): + return dim[1:] + + if order is None: + input_names = input_shape.keys() + input_dims = map(remove_batch, input_shape.values()) + else: + names = input_shape.keys() + shapes = map(remove_batch, input_shape.values()) + input_names = [names[i] for i in order] + input_dims = [shapes[i] for i in order] + + net = model.symbol + + # Infer shapes and store in a dictionary + shapes = net.infer_shape(**input_shape) + arg_names = net.list_arguments() + output_names = net.list_outputs() + aux_names = net.list_auxiliary_states() + shape_dict = {} + for idx, op in enumerate(arg_names): + shape_dict[op] = shapes[0][idx] + for idx, op in enumerate(output_names): + shape_dict[op] = shapes[1][idx] + for idx, op in enumerate(aux_names): + shape_dict[op] = shapes[2][idx] + + # Get the inputs and outputs + output_dims = shapes[1] + input_types = [_datatypes.Array(*dim) for dim in input_dims] + output_types = [_datatypes.Array(*dim) for dim in output_dims] + + # Make the builder + input_features = zip(input_names, input_types) + output_features = zip(output_names, output_types) + builder = _neural_network.NeuralNetworkBuilder(input_features, output_features, mode) + # Get out the layers + net = _json.loads(net.tojson()) + nodes = net['nodes'] + + for i, node in enumerate(nodes): + node['id'] = i + + if node['name'] in shape_dict: + node['shape'] = shape_dict[node['name']] + + node['outputs'] = [] + if 'inputs' in node: + for ip in node['inputs']: + nodes[ip[0]]['outputs'].append([i, 0]) + else: + node['inputs'] = [] + + # Mark the head nodes + for head in net['heads']: + head_id = head[0] + head_node = nodes[head_id] + head_node['outputs'] = [head] + head_node['name'] += "_output" + head_node['shape'] = shape_dict[head_node['name']] + + # For skipped layers, make sure nodes are modified + for node in nodes: + op = node['op'] + inputs = node['inputs'] + outputs = node['outputs'] + if op in _MXNET_SKIP_LAYERS: + 
nodes[inputs[0][0]]['outputs'][0] = outputs[0] + nodes[outputs[0][0]]['inputs'][0] = inputs[0] + + # Find the input and output names for this node + for idx, node in enumerate(nodes): + op = node['op'] + if op == 'null' or op in _MXNET_SKIP_LAYERS: + continue + name = node['name'] + print("%d : %s, %s" % (idx, name, op)) + converter_func = _get_layer_converter_fn(op) + converter_func(net, node, model, builder) + + # Set the right inputs and outputs + _set_input_output_layers(builder, input_names, output_names) + builder.set_input(input_names, input_dims) + builder.set_output(output_names, output_dims) + if preprocessor_args is not None: + builder.set_pre_processing_parameters(**preprocessor_args) + + if class_labels is not None: + if type(class_labels) is str: + labels = [l.strip() for l in open(class_labels).readlines()] + elif type(class_labels) is list: + labels = class_labels + else: + raise TypeError("synset variable of unknown type. Type found: %s. Expected either string or list of strings." % type(class_labels)) + builder.set_class_labels(class_labels = labels) + + # Return the model + return _coremltools.models.MLModel(builder.spec) \ No newline at end of file diff --git a/tools/coreml/mxnet_coreml_converter.py b/tools/coreml/mxnet_coreml_converter.py new file mode 100644 index 000000000000..502377eca864 --- /dev/null +++ b/tools/coreml/mxnet_coreml_converter.py @@ -0,0 +1,114 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import print_function +import argparse +from converter._mxnet_converter import convert +from utils import load_model +import yaml +from ast import literal_eval + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Converts an MXNet model to a CoreML model') + + parser.add_argument( + '--model-prefix', required=True, type=str, + help="Prefix of the existing model. The model is expected to be stored in the same directory from where " + "this tool is being run. E.g. --model-prefix=squeezenet_v1.1. Note that this can include entire " + "directory name too. E.g. --model-prefix=~/Downloads/squeezenet_v1.1." + ) + parser.add_argument( + '--epoch', required=True, type=int, + help="The suffix of the MXNet model name which usually indicate the number of epochs. E.g. --epoch=0" + ) + parser.add_argument( + '--output-file', required=True, type=str, + help="File where the resulting CoreML model will be saved. E.g. --output-file=\"squeezenet-v11.mlmodel\"" + ) + parser.add_argument( + '--input-shape', required=True, type=str, + help="Input shape information in a JSON string format. E.g. --input-shape='{\"data\":\"3,224,224\"}' where" + " 'data' is the name of the input variable of the MXNet model and '3,244,244' is its shape " + "(channel, height and weight) of the input image data." + ) + parser.add_argument( + '--label-names', required=False, type=str, default='softmax_label', + help="label-names of the MXNet model's output variables. E.g. --label-names=softmax_label. 
" + "(Usually this is the name of the last layer followed by suffix _label.)" + ) + parser.add_argument( + '--mode', required=False, type=str, default=None, + help="When mode='classifier', a CoreML NeuralNetworkClassifier will be constructed. " + "When mode='regressor', a CoreML NeuralNetworkRegressor will be constructed. " + "When mode=None (default), a CoreML NeuralNetwork will be constructed." + ) + parser.add_argument( + '--class-labels', required=False, type=str, default=None, + help="As a string it represents the name of the file which contains the classification labels (synset file)." + ) + parser.add_argument( + '--pre-processing-arguments', required=False, type=str, default=None, + help="The parameters in the dictionary tell the converted coreml model how to pre-process any input " + "before an inference is run on it. For the list of pre-processing arguments see https://goo.gl/GzFe86" + "e.g. --pre-processing-arguments='{\"red_bias\": 127, \"blue_bias\":117, \"green_bias\": 103}'" + ) + + # TODO + # We need to test how to use the order + # parser.add_argument( + # '--order', required=True, type=str, default=None, + # help="" + # ) + + args, unknown = parser.parse_known_args() + + model_name = args.model_prefix + epoch_num = args.epoch + output_file = args.output_file + mode = args.mode + class_labels=args.class_labels + + # parse the input data name/shape and label name/shape + input_shape = yaml.safe_load(args.input_shape) + data_shapes = [] + for key in input_shape: + # We prepend 1 because the coreml model only accept 1 input data at a time. 
+ shape = (1,)+literal_eval(input_shape[key]) + input_shape[key] = shape + data_shapes.append((key, shape)) + + # if label name is not in input then do not use the label + label_names = [args.label_names,] if args.label_names in input_shape else None + + pre_processing_arguments = args.pre_processing_arguments + + mod = load_model( + model_name=model_name, + epoch_num=epoch_num, + data_shapes=data_shapes, + label_shapes=None, + label_names=label_names + ) + + kwargs = {'input_shape': input_shape} + if pre_processing_arguments is not None: + kwargs['preprocessor_args'] = yaml.safe_load(pre_processing_arguments) + + coreml_model = convert(model=mod, mode=mode, class_labels=class_labels, **kwargs) + coreml_model.save(output_file) + print("\nSUCCESS\nModel %s has been converted and saved at %s\n" % (model_name, output_file)) diff --git a/tools/coreml/test/test_mxnet_converter.py b/tools/coreml/test/test_mxnet_converter.py new file mode 100644 index 000000000000..6692b44ec370 --- /dev/null +++ b/tools/coreml/test/test_mxnet_converter.py @@ -0,0 +1,949 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import unittest +import mxnet as mx +import numpy as np +import sys +import os +current_working_directory = os.getcwd() +sys.path.append(current_working_directory + "/..") +sys.path.append(current_working_directory + "/../converter/") +import _mxnet_converter as mxnet_converter +from collections import namedtuple + + +def _mxnet_remove_batch(input_data): + for blob in input_data: + input_data[blob] = np.reshape(input_data[blob], input_data[blob].shape[1:]) + return input_data + + +def _get_mxnet_module(net, input_shape, mode, label_names, input_names=None): + """ Given a symbolic graph, input shape and the initialization mode, + returns an MXNet module. + """ + mx.random.seed(1993) + + mod = mx.mod.Module( + symbol=net, + context=mx.cpu(), + label_names=label_names + ) + mod.bind( + for_training=False, + data_shapes=[('data', input_shape)], + label_shapes=input_names + ) + if mode == 'random': + mod.init_params( + initializer=mx.init.Uniform(scale=.1) + ) + elif mode == 'zeros': + mod.init_params( + initializer=mx.init.Zero() + ) + elif mode == 'ones': + mod.init_params( + initializer=mx.init.One() + ) + else: + Exception(KeyError("%s is not a valid initialization mode" % mode)) + + return mod + + +class SingleLayerTest(unittest.TestCase): + """ + Unit test class for testing where converter is able to convert individual layers or not. + In order to do so, it converts model and generates preds on both CoreML and MXNet and check they are the same. + """ + def _test_mxnet_model(self, net, input_shape, mode, class_labels=None, coreml_mode=None, label_names=None, delta=1e-3, + pre_processing_args=None): + """ Helper method that convert the CoreML model into CoreML and compares the predictions over random data. + + Parameters + ---------- + net: MXNet Symbol Graph + The graph that we'll be converting into CoreML. + + input_shape: tuple of ints + The shape of input data. 
Generally of the format (batch-size, channels, height, width) + + mode: (random|zeros|ones) + The mode to use in order to set the parameters (weights and biases). + + label_names: list of strings + The names of the output labels. Default: None + + delta: float + The maximum difference b/w predictions of MXNet and CoreML that is tolerable. + """ + mod = _get_mxnet_module(net, input_shape, mode, label_names) + + # Generate some dummy data + input_data = {'data': np.random.uniform(-10., 10., input_shape)} + Batch = namedtuple('Batch', ['data']) + mod.forward(Batch([mx.nd.array(input_data['data'])])) + mxnet_preds = mod.get_outputs()[0].asnumpy().flatten() + + # Get predictions from coreml + coreml_model = mxnet_converter.convert( + model=mod, + class_labels=class_labels, + mode=coreml_mode, + input_shape={'data': input_shape}, + preprocessor_args=pre_processing_args + ) + coreml_preds = coreml_model.predict(_mxnet_remove_batch(input_data)).values()[0].flatten() + + # Check prediction accuracy + self.assertEquals(len(mxnet_preds), len(coreml_preds)) + for i in range(len(mxnet_preds)): + self.assertAlmostEquals(mxnet_preds[i], coreml_preds[i], delta = delta) + + def test_tiny_inner_product_zero_input(self): + np.random.seed(1988) + input_shape = (1, 10) + net = mx.sym.Variable('data') + net = mx.sym.FullyConnected(data=net, name='fc1', num_hidden=5) + self._test_mxnet_model(net, input_shape=input_shape, mode='zeros') + + def test_really_tiny_inner_product_ones_input(self): + np.random.seed(1988) + input_shape = (1, 1) + net = mx.sym.Variable('data') + net = mx.sym.FullyConnected(data=net, name='fc1', num_hidden=1) + self._test_mxnet_model(net, input_shape=input_shape, mode='ones') + + def test_really_tiny_2_inner_product_ones_input(self): + np.random.seed(1988) + input_shape = (1, 1) + net = mx.sym.Variable('data') + net = mx.sym.FullyConnected(data=net, name='fc1', num_hidden=5) + self._test_mxnet_model(net, input_shape=input_shape, mode='ones') + + def 
test_tiny_inner_product_ones_input(self): + np.random.seed(1988) + input_shape = (1, 10) + net = mx.sym.Variable('data') + net = mx.sym.FullyConnected(data=net, name='fc1', num_hidden=5) + self._test_mxnet_model(net, input_shape=input_shape, mode='ones') + + def test_tiny_inner_product_random_input(self): + np.random.seed(1988) + input_shape = (1, 10) + net = mx.sym.Variable('data') + net = mx.sym.FullyConnected(data=net, name='fc1', num_hidden=5) + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_tiny_softmax_random_input(self): + np.random.seed(1988) + input_shape = (1, 10) + net = mx.sym.Variable('data') + net = mx.sym.FullyConnected(data=net, name='fc1', num_hidden=5) + net = mx.sym.SoftmaxOutput(net, name='softmax') + self._test_mxnet_model(net, input_shape=input_shape, mode='random', label_names=['softmax_label']) + + def test_tiny_relu_activation_random_input(self): + np.random.seed(1988) + input_shape = (1, 10) + net = mx.sym.Variable('data') + net = mx.sym.FullyConnected(data=net, name='fc1', num_hidden=5) + net = mx.sym.Activation(net, name='relu1', act_type="relu") + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_tiny_sigmoid_activation_random_input(self): + np.random.seed(1988) + input_shape = (1, 10) + net = mx.sym.Variable('data') + net = mx.sym.FullyConnected(data=net, name='fc1', num_hidden=5) + net = mx.sym.Activation(net, name='sigmoid1', act_type="sigmoid") + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_tiny_tanh_activation_random_input(self): + np.random.seed(1988) + input_shape = (1, 10) + + # Define a model + net = mx.sym.Variable('data') + net = mx.sym.FullyConnected(data=net, name='fc1', num_hidden=5) + net = mx.sym.Activation(net, name='tanh1', act_type="tanh") + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_really_tiny_conv_random_input(self): + np.random.seed(1988) + input_shape = (1, 1, 10, 10) + 
num_filter = 1 + kernel = (1 ,1) + stride = (1, 1) + pad = (0, 0) + + # Define a model + net = mx.sym.Variable('data') + net = mx.symbol.Convolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='conv_1' + ) + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_tiny_conv_ones_input(self): + np.random.seed(1988) + input_shape = (1, 1, 10, 10) + num_filter = 1 + kernel = (5, 5) + stride = (1, 1) + pad = (0, 0) + + # Define a model + net = mx.sym.Variable('data') + net = mx.symbol.Convolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='conv_1' + ) + self._test_mxnet_model(net, input_shape=input_shape, mode='ones') + + def test_tiny_conv_random_input(self): + np.random.seed(1988) + input_shape = (1, 1, 10, 10) + num_filter = 1 + kernel = (5, 5) + stride = (1, 1) + pad = (0, 0) + + # define a model + net = mx.sym.Variable('data') + net = mx.symbol.Convolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='conv_1' + ) + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_tiny_asym_conv_random_input(self): + np.random.seed(1988) + input_shape = (1, 1, 10, 10) + num_filter = 1 + kernel = (5 ,3) + stride = (1, 1) + pad = (0, 0) + net = mx.sym.Variable('data') + net = mx.symbol.Convolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='conv_1' + ) + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_tiny_asym_conv_random_asym_input(self): + np.random.seed(1988) + input_shape = (1, 1, 28, 18) + num_filter = 16 + kernel = (5, 3) + stride = (1, 1) + pad = (0, 0) + dilate = (1, 1) + # define a model + net = mx.sym.Variable('data') + net = mx.symbol.Convolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='conv_1', + dilate=dilate) + net = mx.sym.Activation(net, 
name='tanh', act_type="tanh") + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_tiny_conv_valid_pooling_random_input(self): + np.random.seed(1988) + input_shape = (1, 1, 10, 10) + num_filter = 1 + kernel = (2, 2) + stride = (2, 2) + pad = (0, 0) + net = mx.sym.Variable('data') + net = mx.symbol.Convolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='conv_1' + ) + net = mx.symbol.Pooling( + data=net, + kernel=kernel, + stride=stride, + pad=pad, + name='pool_1', + pool_type='avg', + pooling_convention='valid' + ) + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_tiny_conv_pooling_full_random_input(self): + np.random.seed(1988) + input_shape = (1, 1, 10, 10) + num_filter = 1 + kernel = (2, 2) + stride = (2, 2) + pad = (0, 0) + net = mx.sym.Variable('data') + net = mx.symbol.Convolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='conv_1' + ) + net = mx.symbol.Pooling( + data=net, + kernel=kernel, + stride=stride, + pad=pad, + name='pool_1', + pool_type='avg', + pooling_convention='full' + ) + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_tiny_conv_pooling_full_random_input_with_padding(self): + np.random.seed(1988) + input_shape = (1, 3, 10, 10) + num_filter = 2 + kernel = (2, 2) + stride = (2, 2) + pad = (1, 1) + net = mx.sym.Variable('data') + net = mx.symbol.Convolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='conv_1' + ) + net = mx.symbol.Pooling( + data=net, + kernel=kernel, + stride=stride, + pad=pad, + name='pool_1', + pool_type='avg', + pooling_convention='full' + ) + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_really_tiny_conv_random_3d_input(self): + np.random.seed(1988) + input_shape = (1, 3, 10, 10) + num_filter = 1 + kernel = (1, 1) + stride = (1, 1) + pad = (0, 0) + net = 
mx.sym.Variable('data') + net = mx.symbol.Convolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='conv_1' + ) + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_really_tiny_conv_random_input_multi_filter(self): + np.random.seed(1988) + input_shape = (1, 1, 10, 10) + num_filter = 64 + kernel = (1, 1) + stride = (1, 1) + pad = (0, 0) + net = mx.sym.Variable('data') + net = mx.symbol.Convolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='conv_1' + ) + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_tiny_conv_random_3d_input(self): + np.random.seed(1988) + input_shape = (1, 3, 10, 10) + num_filter = 1 + kernel = (5 ,5) + stride = (1, 1) + pad = (0, 0) + net = mx.sym.Variable('data') + net = mx.symbol.Convolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='conv_1' + ) + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_tiny_conv_random_input_multi_filter(self): + np.random.seed(1988) + input_shape = (1, 1, 10, 10) + num_filter = 64 + kernel = (5, 5) + stride = (1, 1) + pad = (0, 0) + net = mx.sym.Variable('data') + net = mx.symbol.Convolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='conv_1' + ) + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_conv_random(self): + np.random.seed(1988) + input_shape = (1, 3, 10, 10) + num_filter = 64 + kernel = (5, 5) + stride = (1, 1) + pad = (0, 0) + net = mx.sym.Variable('data') + net = mx.symbol.Convolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='conv_1' + ) + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_flatten(self): + np.random.seed(1988) + input_shape = (1, 3, 10, 10) + num_filter = 64 + kernel = (5, 5) + stride = (1, 1) + pad = 
(0, 0) + + # define a model + net = mx.sym.Variable('data') + net = mx.symbol.Convolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='conv_1' + ) + net = mx.sym.Flatten(data=net, name='flatten1') + net = mx.sym.FullyConnected(data=net, name='fc1', num_hidden=5) + net = mx.sym.SoftmaxOutput(net, name='softmax') + self._test_mxnet_model(net, input_shape=input_shape, mode='random', label_names=['softmax_label']) + + def test_transpose(self): + np.random.seed(1988) + input_shape = (1, 3, 10, 10) + num_filter = 64 + kernel = (5, 5) + stride = (1, 1) + pad = (0, 0) + + net = mx.sym.Variable('data') + net = mx.sym.transpose(data=net, name='transpose', axes=(0, 1, 2, 3)) + net = mx.symbol.Convolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='conv_1' + ) + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_reshape(self): + np.random.seed(1988) + input_shape = (1, 8) + net = mx.sym.Variable('data') + net = mx.sym.reshape(data=net, shape=(1, 2, 2, 2)) + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_tiny_synset_random_input(self): + np.random.seed(1989) + input_shape = (1, 10) + net = mx.sym.Variable('data') + net = mx.sym.FullyConnected(data=net, name='fc1', num_hidden=5) + net = mx.sym.SoftmaxOutput(net, name='softmax') + mod = _get_mxnet_module(net, + input_shape=input_shape, + mode='random', + label_names=['softmax_label']) + + # Generate some dummy data + input_data = np.random.uniform(-0.1, 0.1, input_shape) + + Batch = namedtuple('Batch', ['data']) + mod.forward(Batch([mx.nd.array(input_data)])) + + kwargs = {'input_shape': {'data': input_shape}} + # Get predictions from coreml + coreml_model = mxnet_converter.convert( + model=mod, + class_labels=['Category1','Category2','Category3','Category4','Category5'], + mode='classifier', + **kwargs + ) + + prediction = coreml_model.predict(_mxnet_remove_batch({'data': 
input_data})) + self.assertEqual(prediction['classLabel'], 'Category3') + + def test_really_tiny_deconv_random_input(self): + np.random.seed(1988) + input_shape = (1, 1, 10, 10) + num_filter = 1 + kernel = (1, 1) + stride = (1, 1) + pad = (0, 0) + + # Define a model + net = mx.sym.Variable('data') + net = mx.symbol.Deconvolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='deconv_1' + ) + # Test the mxnet model + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_tiny_deconv_ones_input(self): + np.random.seed(1988) + input_shape = (1, 1, 10, 10) + num_filter = 1 + kernel = (5, 5) + stride = (1, 1) + pad = (0, 0) + + # Define a model + net = mx.sym.Variable('data') + net = mx.symbol.Deconvolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='deconv_1' + ) + # Test the mxnet model + self._test_mxnet_model(net, input_shape=input_shape, mode='ones') + + def test_tiny_deconv_random_input(self): + np.random.seed(1988) + input_shape = (1, 1, 10, 10) + num_filter = 1 + kernel = (5, 5) + stride = (1, 1) + pad = (0, 0) + + # Define a model + net = mx.sym.Variable('data') + net = mx.symbol.Deconvolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='deconv_1' + ) + # Test the mxnet model + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_tiny_asym_deconv_random_input(self): + np.random.seed(1988) + input_shape = (1, 1, 10, 10) + num_filter = 1 + kernel = (5, 3) + stride = (1, 1) + pad = (0, 0) + + # Define a model + net = mx.sym.Variable('data') + net = mx.symbol.Deconvolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='deconv_1' + ) + # Test the mxnet model + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_tiny_asym_deconv_random_asym_input(self): + np.random.seed(1988) + input_shape = (1, 1, 28, 18) + 
num_filter = 16 + kernel = (5, 3) + stride = (1, 1) + pad = (0, 0) + dilate = (1, 1) + + # define a model + net = mx.sym.Variable('data') + net = mx.symbol.Deconvolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + dilate=dilate, + name='deconv_1' + ) + net = mx.sym.Activation(net, name = 'tanh', act_type = "tanh") + # Test the mxnet model + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_tiny_deconv_pooling_random_input(self): + np.random.seed(1988) + input_shape = (1, 1, 10, 10) + num_filter = 1 + kernel = (5, 5) + stride = (1, 1) + pad = (0, 0) + + # define a model + net = mx.sym.Variable('data') + net = mx.symbol.Deconvolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='deconv_1' + ) + net = mx.symbol.Pooling( + data=net, + kernel=kernel, + stride=stride, + pad=pad, + name='pool_1', + pool_type='max' + ) + # Test the mxnet model + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_really_tiny_deconv_random_3d_input(self): + np.random.seed(1988) + input_shape = (1, 3, 10, 10) + num_filter = 1 + kernel = (1, 1) + stride = (1, 1) + pad = (0, 0) + + # define a model + net = mx.sym.Variable('data') + net = mx.symbol.Deconvolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='deconv_1' + ) + # Test the mxnet model + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_really_tiny_deconv_random_input_multi_filter(self): + np.random.seed(1988) + input_shape = (1, 1, 10, 10) + num_filter = 64 + kernel = (1, 1) + stride = (1, 1) + pad = (0, 0) + + # define a model + net = mx.sym.Variable('data') + net = mx.symbol.Deconvolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='deconv_1' + ) + # Test the mxnet model + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def 
test_tiny_deconv_random_3d_input(self): + np.random.seed(1988) + input_shape = (1, 3, 10, 10) + num_filter = 1 + kernel = (5, 5) + stride = (1, 1) + pad = (0, 0) + + # define a model + net = mx.sym.Variable('data') + net = mx.symbol.Deconvolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='deconv_1' + ) + # Test the mxnet model + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_tiny_deconv_random_input_multi_filter(self): + np.random.seed(1988) + input_shape = (1, 1, 10, 10) + num_filter = 64 + kernel = (5 ,5) + stride = (1, 1) + pad = (0, 0) + + # define a model + net = mx.sym.Variable('data') + net = mx.symbol.Deconvolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + name='deconv_1' + ) + # Test the mxnet model + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_deconv_random(self): + np.random.seed(1988) + input_shape = (1, 10, 4, 4) + num_filter = 3 + kernel = (2, 2) + stride = (1, 1) + pad = (0, 0) + + # define a model + net = mx.sym.Variable('data') + net = mx.symbol.Deconvolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + no_bias=False, + name='deconv_1' + ) + # test the mxnet model + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_deconv_random_output_shape(self): + np.random.seed(1988) + input_shape = (1, 10, 4, 4) + num_filter = 3 + kernel = (2, 2) + stride = (1, 1) + pad = (0, 0) + target_shape = (5, 5) + + # define a model + net = mx.sym.Variable('data') + net = mx.symbol.Deconvolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + no_bias=False, + target_shape=target_shape, + name='deconv_1' + ) + # test the mxnet model + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_deconv_random_padding(self): + np.random.seed(1988) + input_shape = (1, 10, 9, 9) + 
num_filter = 3 + kernel = (3, 3) + stride = (3, 3) + pad = (2, 2) + + # define a model + net = mx.sym.Variable('data') + net = mx.symbol.Deconvolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + no_bias=False, + name='deconv_1') + # test the mxnet model + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_conv_random_padding_odd(self): + np.random.seed(1988) + input_shape = (1, 10, 6, 6) + num_filter = 3 + kernel = (5, 5) + stride = (1, 1) + pad = (3, 3) + + # define a model + net = mx.sym.Variable('data') + net = mx.symbol.Convolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + no_bias=False, + name='conv_1' + ) + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_conv_random_padding_even(self): + np.random.seed(1988) + input_shape = (1, 10, 6, 6) + num_filter = 3 + kernel = (5, 5) + stride = (1, 1) + pad = (2, 2) + + # define a model + net = mx.sym.Variable('data') + net = mx.symbol.Convolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + no_bias=False, + name='conv_1' + ) + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_deconv_random_all_inputs(self): + np.random.seed(1988) + input_shape = (1, 10, 5, 5) + num_filter = 3 + kernel = (3, 3) + stride = (2, 2) + pad = (1, 1) + dilate = (1, 1) + target_shape = (11, 11) + + # define a model + net = mx.sym.Variable('data') + net = mx.symbol.Deconvolution( + data=net, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + no_bias=False, + target_shape=target_shape, + dilate=dilate, + name='deconv_1' + ) + self._test_mxnet_model(net, input_shape=input_shape, mode='random') + + def test_batch_norm(self): + np.random.seed(1988) + input_shape = (1, 1, 2, 3) + + net = mx.sym.Variable('data') + gamma = mx.sym.Variable('gamma') + beta = mx.sym.Variable('beta') + moving_mean = 
mx.sym.Variable('moving_mean') + moving_var = mx.sym.Variable('moving_var') + net = mx.symbol.BatchNorm( + data=net, + gamma=gamma, + beta=beta, + moving_mean=moving_mean, + moving_var=moving_var, + use_global_stats=True, + name='batch_norm_1') + self._test_mxnet_model(net, input_shape=input_shape, mode='random', delta=1e-2) + + def test_batch_norm_no_global_stats(self): + """ This test should throw an exception since converter doesn't support + conversion of MXNet models that use local batch stats (i.e. + use_global_stats=False). The reason for this is CoreML doesn't support + local batch stats. + """ + np.random.seed(1988) + input_shape = (1, 1, 2, 3) + + net = mx.sym.Variable('data') + gamma = mx.sym.Variable('gamma') + beta = mx.sym.Variable('beta') + moving_mean = mx.sym.Variable('moving_mean') + moving_var = mx.sym.Variable('moving_var') + net = mx.symbol.BatchNorm( + data=net, + gamma=gamma, + beta=beta, + moving_mean=moving_mean, + moving_var=moving_var, + use_global_stats=False, + name='batch_norm_1') + self._test_mxnet_model(net, input_shape=input_shape, mode='random', delta=1e-2) + + def test_pre_processing_args(self): + np.random.seed(1988) + input_shape = (1, 10) + net = mx.sym.Variable('data') + net = mx.sym.FullyConnected(data=net, name='fc1', num_hidden=5) + net = mx.sym.SoftmaxOutput(net, name='softmax') + self._test_mxnet_model(net, input_shape=input_shape, mode='random', label_names=['softmax_label'], + pre_processing_args={'red_bias':0, 'blue_bias':0, 'green_bias':0, 'image_scale':1}) + + # TODO test_concat + + +if __name__ == '__main__': + suite = unittest.TestLoader().loadTestsFromTestCase(SingleLayerTest) + unittest.TextTestRunner(verbosity=2).run(suite) diff --git a/tools/coreml/test/test_mxnet_image.py b/tools/coreml/test/test_mxnet_image.py new file mode 100644 index 000000000000..ac30ac7f5ad9 --- /dev/null +++ b/tools/coreml/test/test_mxnet_image.py @@ -0,0 +1,136 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more 
contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +import numpy as np +import unittest +import sys +import os +current_working_directory = os.getcwd() +sys.path.append(current_working_directory + "/..") +sys.path.append(current_working_directory + "/../converter/") +import _mxnet_converter as mxnet_converter +from utils import load_model + + +VAL_DATA = 'data/val-5k-256.rec' +URL = 'http://data.mxnet.io/data/val-5k-256.rec' + + +def download_data(): + return mx.test_utils.download(URL, VAL_DATA) + + +def read_image(data_val, label_name): + data = mx.io.ImageRecordIter( + path_imgrec=data_val, + label_width=1, + preprocess_threads=4, + batch_size=32, + data_shape=(3,224,224), + label_name=label_name, + rand_corp=False, + rand_mirror=False, + shuffle=True + ) + return data + + +def is_correct_top_one(predict, label): + assert isinstance(predict, np.ndarray) + assert isinstance(label, np.float32) + predicted_label = np.argmax(predict) + return predicted_label == label + + +def is_correct_top_five(predict, label): + assert isinstance(predict, np.ndarray) + assert isinstance(label, np.float32) + top_five_preds = set(predict.argsort()[-5:]) + return label in top_five_preds + + +class ImageNetTest(unittest.TestCase): + def _test_image_prediction(self, model_name, epoch, label_name): + try: + 
data = read_image(VAL_DATA, label_name=label_name) + except: + download_data() + data = read_image(VAL_DATA, label_name=label_name) + + mod = load_model( + model_name=model_name, + epoch_num=epoch, + data_shapes=data.provide_data, + label_shapes=data.provide_label, + label_names=[label_name,] + ) + + input_shape = (1, 3, 224, 224) + coreml_model = mxnet_converter.convert(mod, input_shape={'data': input_shape}) + + mxnet_acc = [] + mxnet_top_5_acc = [] + coreml_acc = [] + coreml_top_5_acc = [] + + num_batch = 0 + + for batch in data: + mod.forward(batch, is_train=False) + mxnet_preds = mod.get_outputs()[0].asnumpy() + data_numpy = batch.data[0].asnumpy() + label_numpy = batch.label[0].asnumpy() + for i in xrange(32): + input_data = {'data': data_numpy[i]} + coreml_predict = coreml_model.predict(input_data).values()[0].flatten() + mxnet_predict = mxnet_preds[i] + label = label_numpy[i] + mxnet_acc.append(is_correct_top_one(mxnet_predict, label)) + mxnet_top_5_acc.append(is_correct_top_five(mxnet_predict, label)) + coreml_acc.append(is_correct_top_one(coreml_predict, label)) + coreml_top_5_acc.append(is_correct_top_five(coreml_predict, label)) + num_batch += 1 + if (num_batch == 5): break # we only use a subset of the batches. 
+ + print "MXNet acc %s" % np.mean(mxnet_acc) + print "Coreml acc %s" % np.mean(coreml_acc) + print "MXNet top 5 acc %s" % np.mean(mxnet_top_5_acc) + print "Coreml top 5 acc %s" % np.mean(coreml_top_5_acc) + self.assertAlmostEqual(np.mean(mxnet_acc), np.mean(coreml_acc), delta=1e-4) + self.assertAlmostEqual(np.mean(mxnet_top_5_acc), np.mean(coreml_top_5_acc), delta=1e-4) + + def test_squeezenet(self): + print "Testing Image Classification with Squeezenet" + self._test_image_prediction(model_name='squeezenet_v1.1', epoch=0, label_name='prob_label') + + def test_inception_with_batch_normalization(self): + print "Testing Image Classification with Inception/BatchNorm" + self._test_image_prediction(model_name='Inception-BN', epoch=126, label_name='softmax_label') + + def test_resnet18(self): + print "Testing Image Classification with ResNet18" + self._test_image_prediction(model_name='resnet-18', epoch=0, label_name='softmax_label') + + def test_vgg16(self): + print "Testing Image Classification with vgg16" + self._test_image_prediction(model_name='vgg16', epoch=0, label_name='prob_label') + + +if __name__ == '__main__': + suite = unittest.TestLoader().loadTestsFromTestCase(ImageNetTest) + unittest.TextTestRunner(verbosity=2).run(suite) \ No newline at end of file diff --git a/tools/coreml/test/test_mxnet_models.py b/tools/coreml/test/test_mxnet_models.py new file mode 100644 index 000000000000..1732fb833c5f --- /dev/null +++ b/tools/coreml/test/test_mxnet_models.py @@ -0,0 +1,155 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import unittest +import mxnet as mx +import numpy as np +import sys +import os +current_working_directory = os.getcwd() +sys.path.append(current_working_directory + "/..") +sys.path.append(current_working_directory + "/../converter/") +import _mxnet_converter as mxnet_converter +from collections import namedtuple + + +def _mxnet_remove_batch(input_data): + for blob in input_data: + input_data[blob] = np.reshape(input_data[blob], input_data[blob].shape[1:]) + return input_data + + +def _kl_divergence(distribution1, distribution2): + """ Calculates Kullback-Leibler Divergence b/w two distributions. + + Parameters + ---------- + distribution1: list of floats + distribution2: list of floats + """ + assert len(distribution1) == len(distribution2) + n = len(distribution1) + result = 1./n * sum(distribution1 * (np.log(distribution1) - np.log(distribution2))) + return result + + +class ModelsTest(unittest.TestCase): + """ + Unit test class that tests converter on entire MXNet models . + In order to test each unit test converts MXNet model into CoreML model using the converter, generate predictions + on both MXNet and CoreML and verifies that predictions are same (or similar). 
+ """ + def _load_model(self, model_name, epoch_num, input_shape): + sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, epoch_num) + mod = mx.mod.Module( + symbol=sym, + context=mx.cpu(), + label_names=None + ) + mod.bind( + for_training=False, + data_shapes=[('data', input_shape)], + label_shapes=mod._label_shapes + ) + mod.set_params( + arg_params=arg_params, + aux_params=aux_params, + allow_missing=True + ) + return mod + + def _test_model(self, model_name, epoch_num, input_shape=(1, 3, 224, 224), files=None): + """ Tests whether the converted CoreML model's preds are equal to MXNet preds for a given model or not. + + Parameters + ---------- + model_name: str + Prefix of the MXNet model name as stored on the local directory. + + epoch_num : int + Epoch number of model we would like to load. + + input_shape: tuple + The shape of the input data in the form of (batch_size, channels, height, width) + + files: list of strings + List of URLs pertaining to files that need to be downloaded in order to use the model. + """ + + if files is not None: + print("Downloading files from urls: %s" % (files)) + for url in files: + mx.test_utils.download(url) + print("Downloaded %s" % (url)) + + module = self._load_model( + model_name=model_name, + epoch_num=epoch_num, + input_shape=input_shape + ) + + coreml_model = mxnet_converter.convert(module, input_shape={'data': input_shape}) + + # Get predictions from MXNet and coreml + div=[] # For storing KL divergence for each input. 
+ for _ in xrange(1): + np.random.seed(1993) + input_data = {'data': np.random.uniform(0, 1, input_shape).astype(np.float32)} + Batch = namedtuple('Batch', ['data']) + module.forward(Batch([mx.nd.array(input_data['data'])]), is_train=False) + mxnet_pred = module.get_outputs()[0].asnumpy().flatten() + coreml_pred = coreml_model.predict(_mxnet_remove_batch(input_data)).values()[0].flatten() + self.assertEqual(len(mxnet_pred), len(coreml_pred)) + div.append(_kl_divergence(mxnet_pred, coreml_pred)) + + print "Average KL divergence is % s" % np.mean(div) + self.assertTrue(np.mean(div) < 1e-4) + + def test_pred_inception_bn(self): + self._test_model(model_name='Inception-BN', epoch_num=126, + files=["http://data.mxnet.io/models/imagenet/inception-bn/Inception-BN-0126.params", + "http://data.mxnet.io/models/imagenet/inception-bn/Inception-BN-symbol.json"]) + + def test_pred_squeezenet_v11(self): + self._test_model(model_name='squeezenet_v1.1', epoch_num=0, + files=["http://data.mxnet.io/models/imagenet/squeezenet/squeezenet_v1.1-symbol.json", + "http://data.mxnet.io/models/imagenet/squeezenet/squeezenet_v1.1-0000.params"]) + + def test_pred_resnet_50(self): + self._test_model(model_name='resnet-50', epoch_num=0, + files=["http://data.mxnet.io/models/imagenet/resnet/50-layers/resnet-50-symbol.json", + "http://data.mxnet.io/models/imagenet/resnet/50-layers/resnet-50-0000.params"]) + + def test_pred_vgg16(self): + self._test_model(model_name='vgg16', epoch_num=0, + files=["http://data.mxnet.io/models/imagenet/vgg/vgg16-symbol.json", + "http://data.mxnet.io/models/imagenet/vgg/vgg16-0000.params"]) + + def test_pred_nin(self): + self._test_model(model_name='nin', epoch_num=0, + files=["http://data.dmlc.ml/models/imagenet/nin/nin-symbol.json", + "http://data.dmlc.ml/models/imagenet/nin/nin-0000.params"]) + + @unittest.skip("You need to download and unzip file: " + "http://data.mxnet.io/models/imagenet/inception-v3.tar.gz in order to run this test.") + def 
test_pred_inception_v3(self):
+        self._test_model(model_name='Inception-7', epoch_num=1, input_shape=(1, 3, 299, 299))
+
+
+if __name__ == '__main__':
+    suite = unittest.TestLoader().loadTestsFromTestCase(ModelsTest)
+    unittest.TextTestRunner(verbosity=2).run(suite)
diff --git a/tools/coreml/utils.py b/tools/coreml/utils.py
new file mode 100644
index 000000000000..1e4ff7a4d975
--- /dev/null
+++ b/tools/coreml/utils.py
@@ -0,0 +1,71 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+
+
+def load_model(model_name, epoch_num, data_shapes, label_shapes, label_names, gpus=''):
+    """Loads and returns a given MXNet model.
+
+    Parameters
+    ----------
+    model_name: str
+        Prefix of the MXNet model name as stored on the local directory.
+
+    epoch_num : int
+        Epoch number of model we would like to load.
+
+    data_shapes: list of tuples
+        List of tuples where each tuple is a pair of input variable name and its shape.
+
+    label_shapes: list of (str, tuple)
+        Typically is ``data_iter.provide_label``.
+ + label_names: list of str + Name of the output labels in the MXNet symbolic graph. + + gpus: str + Comma separated string of gpu ids on which inferences are executed. E.g. 3,5,6 would refer to GPUs 3, 5 and 6. + If empty, we use CPU. + + Returns + ------- + MXNet module + """ + sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, epoch_num) + if gpus == '': + devices = mx.cpu() + else: + devices = [mx.gpu(int(i)) for i in gpus.split(',')] + mod = mx.mod.Module( + symbol=sym, + context=devices, + label_names=label_names + ) + mod.bind( + for_training=False, + data_shapes=data_shapes, + label_shapes=label_shapes + ) + mod.set_params( + arg_params=arg_params, + aux_params=aux_params, + allow_missing=True + ) + return mod + + diff --git a/tools/license_header.py b/tools/license_header.py index d0782b2b06fd..db67000837b0 100644 --- a/tools/license_header.py +++ b/tools/license_header.py @@ -67,7 +67,9 @@ # language extensions and the according commment mark _LANGS = {'.cc':'*', '.h':'*', '.cu':'*', '.cuh':'*', '.py':'#', - '.pm':'#', '.scala':'*', '.cc':'*', '.sh':'#', '.cmake':'#'} + '.pm':'#', '.scala':'*', '.cc':'*', '.sh':'#', '.cmake':'#', + '.java':'*', '.sh':'#', '.cpp':'*', '.hpp':'*', '.c':'*', + '.bat':'rem', '.pl':'#'} # Previous license header, which will be removed _OLD_LICENSE = re.compile('.*Copyright.*by Contributors') @@ -105,7 +107,7 @@ def _valid_file(fname, verbose=False): return False return True -def process_file(fname, action, verbose=False): +def process_file(fname, action, verbose=True): if not _valid_file(fname, verbose): return True with open(fname, 'rb') as f: @@ -118,7 +120,8 @@ def process_file(fname, action, verbose=False): return False _, ext = os.path.splitext(fname) # remove old license - if ext == '.h' or ext == '.cc' or ext == '.cu': + if ext == '.h' or ext == '.cc' or ext == '.cu' or ext == '.cpp' \ + or ext == '.hpp': for i, l in enumerate(lines): if _OLD_LICENSE.match(l.decode('utf-8')): del lines[i]