From fa3bc9f96e74e610bab611ad9af6eccda8f6f1aa Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Wed, 7 Aug 2019 19:54:02 -0700 Subject: [PATCH] Numpy-compatible Infra (#15581) * [Do not review] [Do not merge] New numpy-compatible sum (#14739) * Add numpy namespace and initial impl of np.sum (not complete) * Clean up * Fix import error * numpy sum * add test and backward data type support * add license to test_numpy_op.py * improve test to reduce flakiness * fix sanity build * extra numeric test and imperative test * add error message for initial argument * [numpy] Infra for supporting numpy ops in imperative mode and Gluon APIs (#14758) * Infra of new ndarray and symbol types for numpy operators * Rename * Fix import problem * Refactor * Remove redundant code * Add docstring * More on numpy ndarray and symbol * Override unimplemented methods for ndarray and _NumpySymbol * Fix built-in methods of ndarray and _NumpySymbol * Fix test and sanity check * Fix pylint * Address cr comments * Add unit tests for ndarray and _NumpySymbol * Add _true_divide * Fix gpu build * Add future import division * More correct way of checking if an output is from a np compat op * Fix gpu build * Fix output ndarray/symbol types with at least one new ndarray/symbol * Modify true_divide doc * Fix flaky copying zero-size arrays via gpus * Fix zero size in gluon hybridize and zeros/ones symbol not creating new symbol type * Fix doc * Enable np op compat check with name prefix (#14897) * [numpy] Numpy dot (#14831) * Numpy Dot case 1-4 + case 3.5 forward and 0.5 backward * Backward computation and test coverage * numpy-compatible mean (#14859) * [numpy] Some np ops for d2l (#14924) * Add np transpose More ops and namespaces for submodules Add relu and sigmoid Add reshape Fix symbolic name mismatch Add maximum and minimum * Add convenience fluent method * Add ndarray.item() * Fix CI * Fix lint * Fix lint * Fix reshape gpu * Add example * Remove python notebook outputs * Remove notebook output * Add one more example * [numpy] Refactor np modules (#14989) * Refactor * Initial refactoring * Fix notebook * Move numpy op check from backend to frontend * Add homogeneous ndarray check * Fix grouping inhomogeneous types of symbols * Improve error handling of different types of symbols as outputs * Fix test * Fix numpy test * Fix ci * Try to fix gpu ci failure * [numpy] Refactor np module (example runs through) (#15055) * Refactor notebook * notebook working with hybrid block * More refactoring * Remove unnecessary use_np_compat * Use class decorator to initialize numpy ndarrays in parameter.py * Clear notebook outputs * Improve np decorator * Remove npe op from optimizer * Fix CI * Fix functools.wraps issue in Python2 * Fix ci * Change np_compat to np_shape * Temporarily disable test_amp * Numpy-compatible stack (#15027) * numpy stack * migrate to use_np_shape * Numpy Unary Ops (#15010) * Unary Ops * new version of unit tests * [numpy] Fix np branch after rebase (#15086) * Add np_array semantics for Gluon Fix notebook Fix sanity Fix gluon deferred infer shape Add np.random.uniform Add random normal Add boolean comparison ops Add np.ndarray indexing Reformat test ndarray indexing Fix unit tests Add one more test of indexing Fix sanity Enable amp test Add np.arange Revert cython unit test to ctypes Delete unnecessary use_np_shape decorator from test Rebase with numpy branch support range as index Fix python2 range type check Add argmax Disable clojure test * Fix ci * Add np.linalg.norm for ord='fro' * Fix pylint * numpy concatenate
(#15104) * [WIP][numpy] Fix for D2L Chapters 2/3/4 (#15139) * Fix * Fix linear regression gluon * More fix * Fix pylint * Fix for chapter 4 * Add np.add mul div mod pow sub and shuffle * Fix model selection, underfitting, overfitting * Fix weight decay * Fix dropout * Fix * Fix chapter 4 * [numpy] Fix d2l performance regression (#15173) * Add np array adapter decorator for layers * Fix performance regression caused by too many conversions between nd.NDArray and np.ndarray * Fix pylint * Fix test backward compatibility issue * Fix test_lambda * Fix (#15188) * fix for chapter6 conv nn (#15224) * [numpy] Fix d2l chapter8 (#15237) * Add np op doc * Fix several issues * Add a N-D dot b 2D support * Simplify array creation api * Add swapaxes * Fix rnn gluon * More fix * Fix pylint * Delete * Fix mp windows * fix for ch11 (#15244) * Numpy-compatible split (#15049) * numpy split * numpy split * unit test * unit test * [numpy] [DO NOT MERGE] Fix d2l chapters 9 and 13 (#15246) * Add npx batch_dot and topk * Text embedding uses numpy * Fix SoftmaxCrossEntropyLoss with np * Fix sentiment cnn * Fix pylint * Fix dot attention * Fix seq2seq attention * Add np.tile * Fix transformer * Fix ci * Fix ci and rebase * [numpy] Fix d2l chapter 5 (#15264) * Fix parameter initializer * Add np.save and np.load * Fix read-write * Fix lint * Numpy compatible max (#15161) * numpy amax * weird cu file diff * fix the unit test error * fix gpu bug * minor fix * fix lint * remove scalar value check * fix the bug on unit test * fix the case () that breaks the kernel launch * add zero dimension unit test * revert the tuple change * use mshadow maximum * remove test zero * change the macro for now * change the cuda to use mshadow op * fix the broadcast_reduce_op_value.cu wrong kernel * add more logic in shape to detect the invalid situation * change back to type switch * change to as_nd_ndarray * add missing @npx.use_np_shape * retrigger CI * address the comment * undo algorithm import * remove the numeric gradient check * Numpy compatible multinomial (#15219) * draft of multinomial * rename to more concise name * finish shape * complete the forward function * complete forward without handle 0 dimension & scalar * handle 0 dimension * add new line * fix lint * fix the build error * fix lint * finish unit test * change the registration * make multinomial support pvals as mx.ndarray * delete newline * fix lint error * support input as list, mx.ndarray, np.ndarray & unit test * fix lint * fix the include error * fix lint * refactor & pass the tensor instead of tuple to kernel * fix lint * update the doc * address the comment * Numpy compatible linspace (#15256) * draft * finish linspace implementation * finish linspace * delete newline * fix pylint * add more unit test * address comment * add more test case * disable too-many-arguments * resolve conflicts * add ctx * numpy-compatible cumsum (#15309) * [numpy] Misc fix for other chapters (#15332) * Add np.prod * Fix ndarray.reshape accepting positional integers as arguments * Rebase * Fix rebase error * Add np.ndarray.flatten * Fix * Add broadcast_to * Add meshgrid and broadcast_arrays * Fix sin, cos, sinh, cosh not supporting scalars * Add more unary ops supporting python scalars * Fix * Fix * Fix ci * Fix sanity * [numpy] Change d2l chapters cv and gan to use numpy (#15368) * Change op name style to lower case underscore * Add ops under image to npx * Add image submodule to npx * Fix split_and_load use np * Fix fine tuning * Fix bbox and anchor * Fix odd * Fix ssd and rcnn
* Remove restriction on binary element-wise scalar * Fix gan * Fix sanity * Try to fix website build failure * Add npx.random.seed * Fix doc * add doc for multinomial, dot, cumsum, clip, abs, exp, arctan (#15386) * [numpy] Fix several places in numpy (#15398) * Fix * More fix * [numpy] fix cython (#15418) * add cython support for numpy * stay with original API for backward compatibility * fix after rebase * get rid of coverage in clang60 mkldnn * fix lint issues * fix flaky test and get rid of extra print * remove numpy examples * revert #15309 #15256 #15219 #15161 * remove numpy docs * remove changes to contrib/text/embedding.py * remove numpy changes to gluon peripherals * Revert "remove numpy docs" This reverts commit c104695b28a26738b8700d80c70814e0f583ac55. * get rid of most operators * Revert "get rid of coverage in clang60 mkldnn" This reverts commit 77dc90520b6a2282716ba41987a1f37522daf078. * remove np-compatible from mxnet.image mxnet.initializer * address comments --- include/mxnet/base.h | 4 +- include/mxnet/c_api.h | 12 + include/mxnet/tuple.h | 15 + python/mxnet/__init__.py | 5 + python/mxnet/_ctypes/ndarray.py | 36 +- python/mxnet/_ctypes/symbol.py | 13 +- python/mxnet/_numpy_op_doc.py | 54 + python/mxnet/base.py | 117 +- python/mxnet/cython/ndarray.pyx | 27 +- python/mxnet/cython/symbol.pyx | 16 +- python/mxnet/gluon/block.py | 49 +- python/mxnet/gluon/data/dataloader.py | 2 + python/mxnet/gluon/parameter.py | 52 +- python/mxnet/gluon/rnn/rnn_layer.py | 2 - python/mxnet/gluon/utils.py | 45 +- python/mxnet/image/image.py | 2 +- python/mxnet/ndarray/__init__.py | 6 +- python/mxnet/ndarray/_internal.py | 11 +- python/mxnet/ndarray/ndarray.py | 26 +- python/mxnet/ndarray/numpy/__init__.py | 26 + python/mxnet/ndarray/numpy/_internal.py | 20 + python/mxnet/ndarray/numpy/_op.py | 295 ++++ python/mxnet/ndarray/numpy/_register.py | 28 + python/mxnet/ndarray/numpy/linalg.py | 22 + python/mxnet/ndarray/numpy/random.py | 21 + .../mxnet/ndarray/numpy_extension/__init__.py | 25 + python/mxnet/ndarray/numpy_extension/_op.py | 21 + .../ndarray/numpy_extension/_register.py | 25 + python/mxnet/ndarray/numpy_extension/image.py | 20 + python/mxnet/ndarray/register.py | 116 +- python/mxnet/ndarray/utils.py | 7 + python/mxnet/numpy/__init__.py | 30 + python/mxnet/numpy/_op.py | 20 + python/mxnet/numpy/_register.py | 27 + python/mxnet/numpy/linalg.py | 22 + python/mxnet/numpy/multiarray.py | 1551 +++++++++++++++++ python/mxnet/numpy/random.py | 22 + python/mxnet/numpy/utils.py | 35 + python/mxnet/numpy_extension/__init__.py | 32 + python/mxnet/numpy_extension/_op.py | 20 + python/mxnet/numpy_extension/_register.py | 27 + python/mxnet/numpy_extension/image.py | 22 + python/mxnet/numpy_extension/utils.py | 122 ++ python/mxnet/optimizer/optimizer.py | 34 +- python/mxnet/symbol/__init__.py | 7 +- python/mxnet/symbol/_internal.py | 10 +- python/mxnet/symbol/numpy/__init__.py | 28 + python/mxnet/symbol/numpy/_internal.py | 20 + python/mxnet/symbol/numpy/_op.py | 20 + python/mxnet/symbol/numpy/_register.py | 28 + python/mxnet/symbol/numpy/_symbol.py | 1013 +++++++++++ python/mxnet/symbol/numpy/linalg.py | 22 + python/mxnet/symbol/numpy/random.py | 22 + .../mxnet/symbol/numpy_extension/__init__.py | 25 + python/mxnet/symbol/numpy_extension/_op.py | 21 + .../mxnet/symbol/numpy_extension/_register.py | 24 + python/mxnet/symbol/numpy_extension/image.py | 20 + python/mxnet/symbol/register.py | 84 +- python/mxnet/symbol/symbol.py | 36 +- python/mxnet/test_utils.py | 28 +- python/mxnet/util.py | 395 ++++- 
src/c_api/c_api.cc | 9 + src/c_api/c_api_common.h | 1 + src/c_api/c_api_symbolic.cc | 13 +- src/imperative/imperative_utils.h | 1 - src/io/image_io.cc | 3 + src/ndarray/ndarray.cc | 17 +- src/operator/contrib/multibox_detection.cc | 4 + src/operator/contrib/multibox_prior.cc | 3 + src/operator/contrib/multibox_target.cc | 4 + src/operator/image/crop.cc | 1 + src/operator/image/image_random.cc | 13 + src/operator/image/resize.cc | 1 + src/operator/leaky_relu.cc | 1 + src/operator/nn/activation.cc | 1 + src/operator/nn/batch_norm.cc | 1 + src/operator/nn/concat.cc | 13 +- src/operator/nn/convolution.cc | 1 + src/operator/nn/deconvolution.cc | 1 + src/operator/nn/dropout.cc | 1 + src/operator/nn/fully_connected.cc | 1 + src/operator/nn/layer_norm.cc | 1 + src/operator/nn/pooling.cc | 3 +- src/operator/nn/softmax.cc | 2 + .../numpy/np_elemwise_broadcast_op.cc | 186 ++ .../numpy/np_elemwise_broadcast_op.cu | 82 + src/operator/numpy/np_init_op.cc | 111 ++ src/operator/numpy/np_init_op.cu | 44 + src/operator/numpy/np_true_divide.cc | 127 ++ src/operator/numpy/np_true_divide.cu | 41 + src/operator/quantization/quantized_concat.cc | 12 +- src/operator/random/sample_op.cc | 2 + src/operator/random/shuffle_op.cc | 1 + src/operator/rnn.cc | 1 + src/operator/roi_pooling.cc | 4 + src/operator/sequence_mask.cc | 3 + src/operator/swapaxis-inl.h | 42 +- src/operator/swapaxis.cc | 2 +- src/operator/tensor/broadcast_reduce_op.h | 183 +- .../tensor/broadcast_reduce_op_index.cc | 1 + src/operator/tensor/dot.cc | 1 + .../tensor/elemwise_binary_broadcast_op.h | 1 + .../elemwise_binary_broadcast_op_logic.cc | 6 + src/operator/tensor/elemwise_binary_op.h | 16 +- .../elemwise_binary_scalar_op_extended.cc | 3 +- .../tensor/elemwise_binary_scalar_op_logic.cc | 6 + src/operator/tensor/elemwise_unary_op.h | 6 +- .../tensor/elemwise_unary_op_basic.cc | 3 + src/operator/tensor/indexing_op.cc | 2 + src/operator/tensor/matrix_op-inl.h | 40 +- src/operator/tensor/matrix_op.cc | 6 + src/operator/tensor/ordering_op.cc | 1 + tests/python/gpu/test_operator_gpu.py | 1 + tests/python/unittest/test_numpy_ndarray.py | 672 +++++++ tests/python/unittest/test_thread_local.py | 36 + 115 files changed, 6394 insertions(+), 231 deletions(-) create mode 100644 python/mxnet/_numpy_op_doc.py create mode 100644 python/mxnet/ndarray/numpy/__init__.py create mode 100644 python/mxnet/ndarray/numpy/_internal.py create mode 100644 python/mxnet/ndarray/numpy/_op.py create mode 100644 python/mxnet/ndarray/numpy/_register.py create mode 100644 python/mxnet/ndarray/numpy/linalg.py create mode 100644 python/mxnet/ndarray/numpy/random.py create mode 100644 python/mxnet/ndarray/numpy_extension/__init__.py create mode 100644 python/mxnet/ndarray/numpy_extension/_op.py create mode 100644 python/mxnet/ndarray/numpy_extension/_register.py create mode 100644 python/mxnet/ndarray/numpy_extension/image.py create mode 100644 python/mxnet/numpy/__init__.py create mode 100644 python/mxnet/numpy/_op.py create mode 100644 python/mxnet/numpy/_register.py create mode 100644 python/mxnet/numpy/linalg.py create mode 100644 python/mxnet/numpy/multiarray.py create mode 100644 python/mxnet/numpy/random.py create mode 100644 python/mxnet/numpy/utils.py create mode 100644 python/mxnet/numpy_extension/__init__.py create mode 100644 python/mxnet/numpy_extension/_op.py create mode 100644 python/mxnet/numpy_extension/_register.py create mode 100644 python/mxnet/numpy_extension/image.py create mode 100644 python/mxnet/numpy_extension/utils.py create mode 100644 
python/mxnet/symbol/numpy/__init__.py create mode 100644 python/mxnet/symbol/numpy/_internal.py create mode 100644 python/mxnet/symbol/numpy/_op.py create mode 100644 python/mxnet/symbol/numpy/_register.py create mode 100644 python/mxnet/symbol/numpy/_symbol.py create mode 100644 python/mxnet/symbol/numpy/linalg.py create mode 100644 python/mxnet/symbol/numpy/random.py create mode 100644 python/mxnet/symbol/numpy_extension/__init__.py create mode 100644 python/mxnet/symbol/numpy_extension/_op.py create mode 100644 python/mxnet/symbol/numpy_extension/_register.py create mode 100644 python/mxnet/symbol/numpy_extension/image.py create mode 100644 src/operator/numpy/np_elemwise_broadcast_op.cc create mode 100644 src/operator/numpy/np_elemwise_broadcast_op.cu create mode 100644 src/operator/numpy/np_init_op.cc create mode 100644 src/operator/numpy/np_init_op.cu create mode 100644 src/operator/numpy/np_true_divide.cc create mode 100644 src/operator/numpy/np_true_divide.cu create mode 100644 tests/python/unittest/test_numpy_ndarray.py diff --git a/include/mxnet/base.h b/include/mxnet/base.h index f568e45bd69c..0c13e4eaa27e 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -421,7 +421,9 @@ inline int32_t Context::GetGPUCount() { #if MXNET_USE_CUDA int32_t count; cudaError_t e = cudaGetDeviceCount(&count); - if (e == cudaErrorNoDevice) { + // TODO(junwu): Remove e == cudaErrorInsufficientDriver + // This is skipped for working around wheel build system with older CUDA driver. + if (e == cudaErrorNoDevice || e == cudaErrorInsufficientDriver) { return 0; } CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e); diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 95d13fe2125c..f0d5e8e55b52 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -2860,6 +2860,18 @@ MXNET_DLL int MXEnginePushSync(EngineSyncFunc sync_func, void* func_param, EngineVarHandle mutable_vars_handle, int num_mutable_vars, EngineFnPropertyHandle prop_handle DEFAULT(NULL), int priority DEFAULT(0), const char* opr_name DEFAULT(NULL)); +/*! + * \brief Create an NDArray from source sharing the same data chunk. + * \param src source NDArray + * \param out new NDArray sharing the same data chunk with src + */ +MXNET_DLL int MXShallowCopyNDArray(NDArrayHandle src, NDArrayHandle* out); +/*! + * \brief Create a Symbol from source sharing the same graph structure. + * \param src source Symbol + * \param out new Symbol sharing the same graph structure with src + */ +MXNET_DLL int MXShallowCopySymbol(SymbolHandle src, SymbolHandle * out); /*! * \brief Push an asynchronous operation to the engine. diff --git a/include/mxnet/tuple.h b/include/mxnet/tuple.h index bc630f153744..f018c8faabea 100644 --- a/include/mxnet/tuple.h +++ b/include/mxnet/tuple.h @@ -272,6 +272,14 @@ class Tuple { is.get(); if (ch == '(' || ch == '[') break; if (!isspace(ch)) { + if (ch == 'N') { + std::string tmp_val; + is >> tmp_val; + if (tmp_val == "one") { // is stores "None" + t.SetDim(-1); + return is; + } + } is.setstate(std::ios::failbit); return is; } @@ -653,6 +661,13 @@ inline bool shape_is_known(const TShape& x) { return true; } +inline bool shape_is_known(const std::vector<TShape>& shapes) { + for (const TShape& shape : shapes) { + if (!shape_is_known(shape)) return false; + } + return true; +} + /*!
\brief helper function to cast type of container elements */ template<typename SrcIter, typename DstIter> inline DstIter ShapeTypeCast(const SrcIter begin, diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index 233bb2a1f57e..e9c1229d7f2f 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -25,11 +25,16 @@ from . import engine from .base import MXNetError from .util import is_np_shape, set_np_shape, np_shape, use_np_shape +from .util import is_np_array, np_array, use_np_array, use_np from . import base from . import library from . import contrib from . import ndarray from . import ndarray as nd +from . import numpy +from . import numpy_extension +from . import numpy as np +from . import numpy_extension as npx from . import name # use mx.sym as short for symbol from . import symbol as sym diff --git a/python/mxnet/_ctypes/ndarray.py b/python/mxnet/_ctypes/ndarray.py index f324545a2352..b1a38c1d2621 100644 --- a/python/mxnet/_ctypes/ndarray.py +++ b/python/mxnet/_ctypes/ndarray.py @@ -55,6 +55,8 @@ def __reduce__(self): _ndarray_cls = None +_np_ndarray_cls = None + def _set_ndarray_class(cls): """Set the ndarray class to be cls""" @@ -62,7 +64,13 @@ def _set_ndarray_class(cls): _ndarray_cls = cls -def _imperative_invoke(handle, ndargs, keys, vals, out): +def _set_np_ndarray_class(cls): + """Set the numpy ndarray class to be cls""" + global _np_ndarray_cls + _np_ndarray_cls = cls + + +def _imperative_invoke(handle, ndargs, keys, vals, out, is_np_op): """ctypes implementation of imperative invoke wrapper""" if out is not None: original_output = out @@ -91,23 +99,27 @@ def _imperative_invoke(handle, ndargs, keys, vals, out): c_str_array([str(s) for s in vals]), ctypes.byref(out_stypes))) + create_ndarray_fn = _np_ndarray_cls if is_np_op else _ndarray_cls if original_output is not None: return original_output if num_output.value == 1: - return _ndarray_cls(ctypes.cast(output_vars[0], NDArrayHandle), - stype=out_stypes[0]) + return create_ndarray_fn(ctypes.cast(output_vars[0], NDArrayHandle), + stype=out_stypes[0]) else: - return [_ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle), - stype=out_stypes[i]) - for i in range(num_output.value)] + return [create_ndarray_fn(ctypes.cast(output_vars[i], NDArrayHandle), + stype=out_stypes[i]) for i in range(num_output.value)] class CachedOp(object): """Cached operator handle.""" - __slots__ = ["handle"] + __slots__ = ["handle", "is_np_sym"] + def __init__(self, sym, flags=()): self.handle = CachedOpHandle() + from ..symbol.numpy._symbol import _Symbol + self.is_np_sym = bool(isinstance(sym, _Symbol)) + check_call(_LIB.MXCreateCachedOpEx( sym.handle, len(flags), @@ -151,10 +163,10 @@ def __call__(self, *args, **kwargs): if original_output is not None: return original_output + create_ndarray_fn = _np_ndarray_cls if self.is_np_sym else _ndarray_cls if num_output.value == 1: - return _ndarray_cls(ctypes.cast(output_vars[0], NDArrayHandle), - stype=out_stypes[0]) + return create_ndarray_fn(ctypes.cast(output_vars[0], NDArrayHandle), - stype=out_stypes[0]) else: - return [_ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle), - stype=out_stypes[i]) - for i in range(num_output.value)] + return [create_ndarray_fn(ctypes.cast(output_vars[i], NDArrayHandle), + stype=out_stypes[i]) for i in range(num_output.value)] diff --git a/python/mxnet/_ctypes/symbol.py b/python/mxnet/_ctypes/symbol.py index fe4cb950ed14..01ba18b38963 100644 --- a/python/mxnet/_ctypes/symbol.py +++ b/python/mxnet/_ctypes/symbol.py @@ -26,7 +26,9 @@ from ..base import SymbolHandle from ..base
import check_call +# The symbol class to be used (Cython or Ctypes) _symbol_cls = None +_np_symbol_cls = None class SymbolBase(object): """Symbol is symbolic graph.""" @@ -115,7 +117,13 @@ def _set_symbol_class(cls): _symbol_cls = cls -def _symbol_creator(handle, args, kwargs, keys, vals, name): +def _set_np_symbol_class(cls): + """Set the numpy-compatible symbolic class to be cls""" + global _np_symbol_cls + _np_symbol_cls = cls + + +def _symbol_creator(handle, args, kwargs, keys, vals, name, is_np_op): sym_handle = SymbolHandle() check_call(_LIB.MXSymbolCreateAtomicSymbol( ctypes.c_void_p(handle), @@ -128,7 +136,8 @@ def _symbol_creator(handle, args, kwargs, keys, vals, name): raise TypeError( 'Operators with variable length input can only accept input ' 'Symbols either as positional or keyword arguments, not both') - s = _symbol_cls(sym_handle) + create_symbol_fn = _np_symbol_cls if is_np_op else _symbol_cls + s = create_symbol_fn(sym_handle) if args: s._compose(*args, name=name) elif kwargs: diff --git a/python/mxnet/_numpy_op_doc.py b/python/mxnet/_numpy_op_doc.py new file mode 100644 index 000000000000..5543ebc8e8c9 --- /dev/null +++ b/python/mxnet/_numpy_op_doc.py @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: skip-file + +"""Doc placeholder for numpy ops with prefix _np.""" + + +def _np_ones_like(a): + """Return an array of ones with the same shape and type as a given array. + + Parameters + ---------- + a : ndarray + The shape and data-type of `a` define these same attributes of + the returned array. + + Returns + ------- + out : ndarray + Array of ones with the same shape and type as `a`. + """ + pass + + +def _np_zeros_like(a): + """Return an array of zeros with the same shape and type as a given array. + + Parameters + ---------- + a : ndarray + The shape and data-type of `a` define these same attributes of + the returned array. + + Returns + ------- + out : ndarray + Array of zeros with the same shape and type as `a`. + """ + pass diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 17819bde28b2..c16c36c7f419 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -16,7 +16,7 @@ # under the License.
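
A hedged usage sketch (an editor's illustration, not part of the patch): the two placeholder functions above carry documentation only; `_init_np_op_module` in base.py below attaches their docstrings to the backend ops registered with the `_np_` prefix, so they would surface as `mxnet.numpy.ones_like`/`zeros_like` once registration runs. Assuming that registration:

import mxnet as mx
from mxnet import numpy as np   # the new mxnet.numpy namespace added by this patch

a = np.ones((2, 3))             # mxnet.numpy.ndarray, float32 by default
b = np.zeros_like(a)            # documented by _np_zeros_like above
c = np.ones_like(a)             # documented by _np_ones_like above
print(b.shape, c.shape)         # (2, 3) (2, 3)
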
# coding: utf-8 -# pylint: disable=invalid-name, no-member, trailing-comma-tuple, bad-mcs-classmethod-argument, unnecessary-pass, wrong-import-position +# pylint: disable=invalid-name, no-member, trailing-comma-tuple, bad-mcs-classmethod-argument, unnecessary-pass, too-many-lines, wrong-import-position """ctypes library of mxnet and helper functions.""" from __future__ import absolute_import @@ -598,7 +598,9 @@ def _init_op_module(root_namespace, module_name, make_op_func): ctypes.byref(plist))) op_names = [] for i in range(size.value): - op_names.append(py_str(plist[i])) + op_name = py_str(plist[i]) + if not _is_np_op(op_name): + op_names.append(op_name) module_op = sys.modules["%s.%s.op" % (root_namespace, module_name)] module_internal = sys.modules["%s.%s._internal" % (root_namespace, module_name)] @@ -692,7 +694,9 @@ def write_all_str(module_file, module_all_list): ctypes.byref(plist))) op_names = [] for i in range(size.value): - op_names.append(py_str(plist[i])) + op_name = py_str(plist[i]) + if not _is_np_op(op_name): + op_names.append(op_name) module_op_file = get_module_file("%s.%s.op" % (root_namespace, module_name)) module_op_all = [] @@ -739,3 +743,110 @@ def write_all_str(module_file, module_all_list): if Features().is_enabled("TVM_OP"): _LIB_TVM_OP = libinfo.find_lib_path("libtvmop") check_call(_LIB.MXLoadTVMOp(c_str(_LIB_TVM_OP[0]))) + + +_NP_OP_PREFIX = '_np_' +_NP_OP_SUBMODULE_LIST = ['_random_', '_linalg_'] + +_NP_EXT_OP_PREFIX = '_npx_' +_NP_EXT_OP_SUBMODULE_LIST = ['_image_'] + +_NP_INTERNAL_OP_PREFIX = '_npi_' + + +def _is_np_op(op_name): + return op_name.startswith(_NP_OP_PREFIX) or op_name.startswith(_NP_EXT_OP_PREFIX)\ + or op_name.startswith(_NP_INTERNAL_OP_PREFIX) + + +def _get_op_submodule_name(op_name, op_name_prefix, submodule_name_list): + """Get the submodule name of a specific op""" + assert op_name.startswith(op_name_prefix) + for submodule_name in submodule_name_list: + if op_name[len(op_name_prefix):].startswith(submodule_name): + return submodule_name + return "" + + +def _init_np_op_module(root_module_name, np_module_name, mx_module_name, make_op_func): + """ + Register numpy operators in namespaces `mxnet.numpy`, `mxnet.ndarray.numpy` + and `mxnet.symbol.numpy`. They are used in imperative mode, Gluon APIs w/o hybridization, + and Gluon APIs w/ hybridization, respectively. Operators registered under the same name + in the three namespaces share the same functionality in the C++ backend. + Different namespaces are needed for dispatching operator calls in Gluon's `HybridBlock` by `F`. + + Parameters + ---------- + root_module_name : str + Top level module name, `mxnet` in the current case. + np_module_name : str + Second level module name, `numpy` or `numpy_extension` in the current case. + mx_module_name : str or None + Second level module name, `ndarray` or `symbol` when registering ops for use in + Gluon, or None when registering ops for imperative use. + make_op_func : function + Function for creating op functions. + """ + from . 
import _numpy_op_doc as _np_op_doc + if np_module_name == 'numpy': + op_name_prefix = _NP_OP_PREFIX + submodule_name_list = _NP_OP_SUBMODULE_LIST + elif np_module_name == 'numpy_extension': + op_name_prefix = _NP_EXT_OP_PREFIX + submodule_name_list = _NP_EXT_OP_SUBMODULE_LIST + elif np_module_name == 'numpy._internal': + op_name_prefix = _NP_INTERNAL_OP_PREFIX + submodule_name_list = [] + else: + raise ValueError('unsupported np module name {}'.format(np_module_name)) + + plist = ctypes.POINTER(ctypes.c_char_p)() + size = ctypes.c_uint() + check_call(_LIB.MXListAllOpNames(ctypes.byref(size), ctypes.byref(plist))) + op_names = [] + for i in range(size.value): + name = py_str(plist[i]) + if name.startswith(op_name_prefix): + op_names.append(name) + + if mx_module_name is None: + # register np/npx ops for imperative programming + op_module_name = "%s.%s._op" % (root_module_name, np_module_name) # e.g. mxnet.numpy._op + op_submodule_name = "%s.%s" % (root_module_name, np_module_name) # e.g. mxnet.numpy.random + elif mx_module_name in ('ndarray', 'symbol'): + # register numpy internal ops and np/npx ops for use in Gluon + # np internal ops are registered in mxnet.ndarray/symbol.numpy._internal + # np ops are registered in mxnet.ndarray/symbol.numpy._op + # npx ops are registered in mxnet.ndarray/symbol.numpy_extension._op + op_module_name = "%s.%s.%s" % (root_module_name, mx_module_name, np_module_name) + if op_name_prefix != _NP_INTERNAL_OP_PREFIX: + op_module_name += '._op' + # e.g. mxnet.symbol.numpy.random + op_submodule_name = "%s.%s.%s" % (root_module_name, mx_module_name, np_module_name) + else: + raise ValueError('unsupported mxnet module {}'.format(mx_module_name)) + op_submodule_name += '.%s' + + op_module = sys.modules[op_module_name] + submodule_dict = {} + for submodule_name in submodule_name_list: + submodule_dict[submodule_name] = sys.modules[op_submodule_name % submodule_name[1:-1]] + for name in op_names: + hdl = OpHandle() + check_call(_LIB.NNGetOpHandle(c_str(name), ctypes.byref(hdl))) + submodule_name = _get_op_submodule_name(name, op_name_prefix, submodule_name_list) + if len(submodule_name) > 0: + func_name = name[(len(op_name_prefix) + len(submodule_name)):] + cur_module = submodule_dict[submodule_name] + module_name_local = op_submodule_name % submodule_name[1:-1] + else: + func_name = name[len(op_name_prefix):] + cur_module = op_module + module_name_local =\ + op_module_name[:-len('._op')] if op_module_name.endswith('._op') else op_module_name + + function = make_op_func(hdl, name, func_name) + function.__module__ = module_name_local + setattr(cur_module, function.__name__, function) + cur_module.__all__.append(function.__name__) + + if hasattr(_np_op_doc, name): + function.__doc__ = getattr(_np_op_doc, name).__doc__ diff --git a/python/mxnet/cython/ndarray.pyx b/python/mxnet/cython/ndarray.pyx index f9279889b504..50791e9b9a86 100644 --- a/python/mxnet/cython/ndarray.pyx +++ b/python/mxnet/cython/ndarray.pyx @@ -64,21 +64,27 @@ cdef class NDArrayBase: _ndarray_cls = None +_np_ndarray_cls = None def _set_ndarray_class(cls): global _ndarray_cls _ndarray_cls = cls -cdef NewArray(NDArrayHandle handle, int stype=-1): +def _set_np_ndarray_class(cls): + global _np_ndarray_cls + _np_ndarray_cls = cls + + +cdef NewArray(NDArrayHandle handle, int stype=-1, int is_np_array=0): """Create a new array given handle""" - return _ndarray_cls(_ctypes.cast(handle, _ctypes.c_void_p), stype=stype) + create_array_fn = _np_ndarray_cls if is_np_array else _ndarray_cls + return 
create_array_fn(_ctypes.cast(handle, _ctypes.c_void_p), stype=stype) cdef class CachedOp: """Cached operator handle.""" cdef CachedOpHandle chandle - cdef _set_handle(self, handle): cdef unsigned long long ptr if handle is None: @@ -96,6 +102,8 @@ cdef class CachedOp: def __set__(self, value): self._set_handle(value) + cdef int is_np_sym + def __init__(self, sym, flags=()): cdef vector[string] s_flag_keys cdef vector[string] s_flag_vals @@ -106,6 +114,9 @@ cdef class CachedOp: cdef vector[const char*] c_flag_keys = SVec2Ptr(s_flag_keys) cdef vector[const char*] c_flag_vals = SVec2Ptr(s_flag_vals) + from ..symbol.numpy._symbol import _Symbol + self.is_np_sym = bool(isinstance(sym, _Symbol)) + CALL(MXCreateCachedOpEx( (sym.handle.value), len(flags), @@ -154,12 +165,12 @@ cdef class CachedOp: if original_output is not None: return original_output if num_output == 1: - return NewArray(p_output_vars[0], p_output_stypes[0]) + return NewArray(p_output_vars[0], p_output_stypes[0], self.is_np_sym) else: - return [NewArray(p_output_vars[i], p_output_stypes[i]) for i in range(num_output)] + return [NewArray(p_output_vars[i], p_output_stypes[i], self.is_np_sym) for i in range(num_output)] -def _imperative_invoke(handle, ndargs, keys, vals, out): +def _imperative_invoke(handle, ndargs, keys, vals, out, is_np_op=0): """cython implementation of imperative invoke wrapper""" cdef unsigned long long ihandle = handle cdef OpHandle chandle = ihandle @@ -211,6 +222,6 @@ def _imperative_invoke(handle, ndargs, keys, vals, out): if original_output is not None: return original_output if num_output == 1: - return NewArray(p_output_vars[0], p_output_stypes[0]) + return NewArray(p_output_vars[0], p_output_stypes[0], is_np_op) else: - return [NewArray(p_output_vars[i], p_output_stypes[i]) for i in range(num_output)] + return [NewArray(p_output_vars[i], p_output_stypes[i], is_np_op) for i in range(num_output)] diff --git a/python/mxnet/cython/symbol.pyx b/python/mxnet/cython/symbol.pyx index 1bdea6c6c547..86fe8ae6db4f 100644 --- a/python/mxnet/cython/symbol.pyx +++ b/python/mxnet/cython/symbol.pyx @@ -84,19 +84,27 @@ cdef SymbolSetAttr(SymbolHandle handle, dict kwargs): _symbol_cls = SymbolBase +_np_symbol_cls = None def _set_symbol_class(cls): global _symbol_cls _symbol_cls = cls -cdef NewSymbol(SymbolHandle handle): + +def _set_np_symbol_class(cls): + global _np_symbol_cls + _np_symbol_cls = cls + + +cdef NewSymbol(SymbolHandle handle, int is_np_sym=0): """Create a new symbol given handle""" - sym = _symbol_cls(None) + create_symbol_fn = _np_symbol_cls if is_np_sym else _symbol_cls + sym = create_symbol_fn(None) (sym).chandle = handle return sym -def _symbol_creator(handle, args, kwargs, keys, vals, name): +def _symbol_creator(handle, args, kwargs, keys, vals, name, is_np_op=0): cdef unsigned long long ihandle = handle cdef OpHandle chandle = ihandle cdef vector[string] ckeys @@ -143,4 +151,4 @@ def _symbol_creator(handle, args, kwargs, keys, vals, name): &csym_keys[0] if csym_keys.size() != 0 else NULL, &sym_args[0] if sym_args.size() != 0 else NULL)) - return NewSymbol(ret_handle) + return NewSymbol(ret_handle, is_np_op) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 3bac3c023bf8..1567c7baa58e 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -26,7 +26,6 @@ import re from collections import OrderedDict - from ..base import mx_real_t, MXNetError from .. import symbol, ndarray, initializer from ..symbol import Symbol @@ -34,6 +33,10 @@ from .. 
import name as _name from .parameter import Parameter, ParameterDict, DeferredInitializationError from .utils import _indent, _brief_print_list, HookHandle +from .utils import _check_same_symbol_type, _check_all_np_ndarrays +from .. import numpy_extension as _mx_npx +from .. import numpy as _mx_np +from ..util import is_np_array, np_shape, np_array class _BlockScope(object): @@ -332,7 +335,8 @@ def save_parameters(self, filename): """ params = self._collect_params_with_prefix() arg_dict = {key : val._reduce() for key, val in params.items()} - ndarray.save(filename, arg_dict) + save_fn = _mx_npx.save if is_np_array() else ndarray.save + save_fn(filename, arg_dict) def save_params(self, filename): """[Deprecated] Please use save_parameters. Note that if you want load @@ -381,7 +385,28 @@ def load_parameters(self, filename, ctx=None, allow_missing=False, `Saving and Loading Gluon Models \ `_ """ - loaded = ndarray.load(filename) + if is_np_array(): + # failure may happen when loading parameters saved as NDArrays within + # NumPy semantics. Check the failure type and recover from it if it happens. + try: + loaded = _mx_npx.load(filename) + except MXNetError as e: + err_msg = str(e) + if 'is_np_shape' in err_msg: + # Loading failure due to parameters saved without numpy semantics. + # Temporarily disable numpy semantics and load parameters. After it's + # done, resume the numpy semantics. This is fine because the cases + # the numpy ndarray covers are a superset of the legacy ndarray's. + with np_array(False): + with np_shape(False): + loaded_nds = ndarray.load(filename) + assert isinstance(loaded_nds, dict),\ + 'expecting a dict type, got {}'.format(str(type(loaded_nds))) + loaded = {k: loaded_nds[k].as_np_ndarray() for k in loaded_nds} + else: + raise ValueError(err_msg) + else: + loaded = ndarray.load(filename) params = self._collect_params_with_prefix() if not loaded and not params: return @@ -549,7 +574,8 @@ def __call__(self, *args): for hook in self._forward_hooks.values(): hook(self, args, out) - + if _mx_npx.is_np_array(): + _check_all_np_ndarrays(out) return out def forward(self, *args): @@ -739,9 +765,13 @@ def _get_graph(self, *args): if not self._cached_graph: args, self._in_format = _flatten(args, "input") if len(args) > 1: - inputs = [symbol.var('data%d'%i) for i in range(len(args))] + inputs = [symbol.var('data%d' % i).as_np_ndarray() + if isinstance(args[i], _mx_np.ndarray) + else symbol.var('data%d' % i) for i in range(len(args))] else: - inputs = [symbol.var('data')] + inputs = [symbol.var('data').as_np_ndarray() + if isinstance(args[0], _mx_np.ndarray) + else symbol.var('data')] grouped_inputs = _regroup(inputs, self._in_format)[0] params = {i: j.var() for i, j in self._reg_params.items()} @@ -749,7 +779,7 @@ def _get_graph(self, *args): out = self.hybrid_forward(symbol, *grouped_inputs, **params) # pylint: disable=no-value-for-parameter out, self._out_format = _flatten(out, "output") - self._cached_graph = inputs, symbol.Group(out) + self._cached_graph = inputs, symbol.Group(out, _check_same_symbol_type(out)) return self._cached_graph @@ -904,7 +934,8 @@ def export(self, path, epoch=0, remove_amp_cast=True): else: assert name in aux_names arg_dict['aux:%s'%name] = param._reduce() - ndarray.save('%s-%04d.params'%(path, epoch), arg_dict) + save_fn = _mx_npx.save if is_np_array() else ndarray.save + save_fn('%s-%04d.params'%(path, epoch), arg_dict) def forward(self, x, *args): """Defines the forward computation. 
Arguments can be either @@ -1057,7 +1088,7 @@ def __init__(self, outputs, inputs, params=None): syms, self._in_format = _flatten(inputs, "input") out, self._out_format = _flatten(outputs, "output") - out = symbol.Group(out) + out = symbol.Group(out, _check_same_symbol_type(out)) input_names = set() for i in syms: diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py index accd968cc9df..04610165094d 100644 --- a/python/mxnet/gluon/data/dataloader.py +++ b/python/mxnet/gluon/data/dataloader.py @@ -124,6 +124,7 @@ def __init__(self, *args, **kwargs): self._send = self._writer.send self._recv = self._reader.recv + def default_batchify_fn(data): """Collate data into batch.""" if isinstance(data[0], nd.NDArray): @@ -382,6 +383,7 @@ def same_process_iter(): def __len__(self): return len(self._batch_sampler) + _worker_dataset = None def _worker_initializer(dataset): """Initializer for processing pool.""" diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 548407584715..ca4be735bcb2 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -18,6 +18,8 @@ # coding: utf-8 # pylint: disable=unnecessary-pass """Neural network parameter.""" +from __future__ import absolute_import + __all__ = ['DeferredInitializationError', 'Parameter', 'Constant', 'ParameterDict', 'tensor_types'] @@ -31,7 +33,8 @@ from ..context import Context, cpu from .. import autograd from .utils import _indent, _brief_print_list, shape_is_known -from .. import is_np_shape +from ..util import is_np_shape, is_np_array +from .. import numpy as _mx_np # pylint: disable=reimported # pylint: disable= invalid-name tensor_types = (symbol.Symbol, ndarray.NDArray) @@ -131,7 +134,6 @@ def __init__(self, name, grad_req='write', shape=None, dtype=mx_real_t, self._grad_stype = grad_stype self._stype = stype - def __repr__(self): s = 'Parameter {name} (shape={shape}, dtype={dtype})' return s.format(name=self.name, shape=self.shape, dtype=self.dtype) @@ -191,9 +193,9 @@ def shape(self, new_shape): return assert len(self._shape) == len(new_shape) and \ - all(j in (0, i) for i, j in zip(new_shape, self._shape)), \ + all(j in (-1, 0, i) for i, j in zip(new_shape, self._shape)), \ "Expected shape %s is incompatible with given shape %s."%( - str(new_shape), str(self._shape)) + str(new_shape), str(self._shape)) # -1 means unknown dim size in np_shape mode self._shape = new_shape @@ -272,12 +274,14 @@ def _load_init(self, data, ctx, cast_dtype=False, dtype_source='current'): if cast_dtype: assert dtype_source in ['current', 'saved'] if self.shape: + unknown_dim_size = -1 if is_np_shape() else 0 for self_dim, data_dim in zip(self.shape, data.shape): - assert self_dim in (0, data_dim), \ + assert self_dim in (unknown_dim_size, data_dim), \ "Failed loading Parameter '%s' from saved params: " \ "shape incompatible expected %s vs saved %s"%( self.name, str(self.shape), str(data.shape)) - self.shape = tuple(i if i != 0 else j for i, j in zip(self.shape, data.shape)) + self.shape = tuple(i if i != unknown_dim_size else j + for i, j in zip(self.shape, data.shape)) if self.dtype: if cast_dtype and np.dtype(self.dtype).type != data.dtype: if dtype_source == 'current': @@ -318,6 +322,7 @@ def _finish_deferred_init(self): return init, ctx, default_init, data = self._deferred_init self._deferred_init = () + assert shape_is_known(self.shape), \ "Cannot initialize Parameter '%s' because it has " \ "invalid shape: %s. 
Please specify in_units, " \ @@ -326,8 +331,16 @@ def _finish_deferred_init(self): with autograd.pause(): if data is None: - data = ndarray.zeros(shape=self.shape, dtype=self.dtype, - ctx=context.cpu(), stype=self._stype) + kwargs = {'shape': self.shape, 'dtype': self.dtype, 'ctx': context.cpu()} + if is_np_array(): + if self._stype != 'default': + raise ValueError("mxnet.numpy.zeros does not support stype = {}" + .format(self._stype)) + zeros_fn = _mx_np.zeros + else: + kwargs['stype'] = self._stype + zeros_fn = ndarray.zeros + data = zeros_fn(**kwargs) initializer.create(default_init)( initializer.InitDesc(self.name, {'__init__': init}), data) @@ -352,8 +365,15 @@ def _init_grad(self): self._grad = None return - self._grad = [ndarray.zeros(shape=i.shape, dtype=i.dtype, ctx=i.context, - stype=self._grad_stype) for i in self._data] + if is_np_array(): + if self._grad_stype != 'default': + raise ValueError("mxnet.numpy.zeros does not support stype = {}" + .format(self._grad_stype)) + self._grad = [_mx_np.zeros(shape=i.shape, dtype=i.dtype, ctx=i.context) + for i in self._data] + else: + self._grad = [ndarray.zeros(shape=i.shape, dtype=i.dtype, ctx=i.context, + stype=self._grad_stype) for i in self._data] autograd.mark_variables(self._check_and_get(self._data, list), self._grad, self.grad_req) @@ -363,7 +383,10 @@ def _reduce(self): ctx = context.cpu() if self._stype == 'default': block = self.list_data() - data = ndarray.add_n(*(w.copyto(ctx) for w in block)) / len(block) + if is_np_array(): + data = sum([w.copyto(ctx) for w in block]) / len(block) + else: + data = ndarray.add_n(*(w.copyto(ctx) for w in block)) / len(block) else: # fetch all rows for 'row_sparse' param all_row_ids = ndarray.arange(0, self.shape[0], dtype='int64', ctx=ctx) @@ -463,7 +486,6 @@ def reset_ctx(self, ctx): raise ValueError("Cannot reset context for Parameter '%s' because it " "has not been initialized."%self.name) - def set_data(self, data): """Sets this parameter's value on all contexts.""" self.shape = data.shape @@ -602,6 +624,8 @@ def var(self): self._var = symbol.var(self.name, shape=self.shape, dtype=self.dtype, lr_mult=self.lr_mult, wd_mult=self.wd_mult, init=self.init, stype=self._stype) + if is_np_array(): + self._var = self._var.as_np_ndarray() return self._var def cast(self, dtype): @@ -763,12 +787,12 @@ def get(self, name, **kwargs): inferred_shape = [] matched = True for dim1, dim2 in zip(v, existing): - if dim1 != dim2 and dim1 * dim2 != 0: + if dim1 != dim2 and dim1 > 0 and dim2 > 0: matched = False break elif dim1 == dim2: inferred_shape.append(dim1) - elif dim1 == 0: + elif dim1 in (0, -1): # -1 means unknown dim size in np_shape mode inferred_shape.append(dim2) else: inferred_shape.append(dim1) diff --git a/python/mxnet/gluon/rnn/rnn_layer.py b/python/mxnet/gluon/rnn/rnn_layer.py index b3cc596282a7..2a9cd88bb214 100644 --- a/python/mxnet/gluon/rnn/rnn_layer.py +++ b/python/mxnet/gluon/rnn/rnn_layer.py @@ -236,7 +236,6 @@ def __call__(self, inputs, states=None, sequence_length=None, **kwargs): else: return super(_RNNLayer, self).__call__(inputs, states, **kwargs) - def hybrid_forward(self, F, inputs, states, sequence_length=None, **kwargs): if F is ndarray: batch_size = inputs.shape[self._layout.find('N')] @@ -285,7 +284,6 @@ def _forward_kernel(self, F, inputs, states, sequence_length, **kwargs): lstm_state_clip_max=self._lstm_state_clip_max, lstm_state_clip_nan=self._lstm_state_clip_nan) - if self._mode == 'lstm': outputs, states = rnn[0], [rnn[1], rnn[2]] else: diff --git 
a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py index 3957b7402688..97c513fb6447 100644 --- a/python/mxnet/gluon/utils.py +++ b/python/mxnet/gluon/utils.py @@ -40,6 +40,7 @@ class requests_failed_to_import(object): from .. import ndarray from ..util import is_np_shape + def split_data(data, num_slice, batch_axis=0, even_split=True): """Splits an NDArray into `num_slice` slices along `batch_axis`. Usually used for data parallelism where each slice is sent to one device (i.e. GPU). @@ -108,7 +109,7 @@ def split_and_load(data, ctx_list, batch_axis=0, even_split=True): Returns ------- - list of NDArray + list of NDArrays Each corresponds to a context in `ctx_list`. """ if not isinstance(data, ndarray.NDArray): @@ -414,6 +415,7 @@ def __enter__(self): def __exit__(self, ptype, value, trace): self.detach() + def shape_is_known(shape): """Check whether a shape is completely known with or without np semantics. @@ -430,3 +432,44 @@ def shape_is_known(shape): assert dim_size > unknown_dim_size, "shape dimension size cannot be less than {}, while " \ "received {}".format(unknown_dim_size, dim_size) return True + + +def _check_same_symbol_type(symbols): + """Check whether all the symbols in the list are of the same type. + Raise type error if the types are different. Return the class of + the symbols.""" + from ..symbol.numpy import _Symbol as np_symbol + from ..symbol import Symbol as nd_symbol + is_np_sym = isinstance(symbols[0], np_symbol) + for s in symbols[1:]: + if is_np_sym != isinstance(s, np_symbol): + raise TypeError('Found both classic symbol (mx.sym.Symbol) and numpy symbol ' + '(mx.sym.np._Symbol) in outputs. This will prevent you from building ' + 'a computation graph by grouping them since different types of symbols ' + 'are not allowed to be grouped in Gluon to form a computation graph. ' + 'You will need to convert them to the same type of symbols, either ' + 'classic or numpy following this rule: if you want numpy ndarray ' + 'output(s) from the computation graph, please convert all the classic ' + 'symbols in the list to numpy symbols by calling `as_np_ndarray()` ' + 'on each of them; if you want classic ndarray output(s) from the ' + 'computation graph, please convert all the numpy symbols in the list ' + 'to classic symbols by calling `as_nd_ndarray()` on each of them.') + return np_symbol if is_np_sym else nd_symbol + + +def _check_all_np_ndarrays(out): + """Check if ndarrays/symbols in out are all np.ndarray/np._Symbol.""" + from ..numpy import ndarray as np_ndarray + from ..symbol.numpy import _Symbol as np_symbol + from ..symbol import Symbol as nd_symbol + from ..ndarray import NDArray as nd_ndarray + + # pylint: disable=no-else-raise + if isinstance(out, (nd_ndarray, nd_symbol)) and not isinstance(out, (np_ndarray, np_symbol)): + raise TypeError("Block's output ndarrays/symbols must be of type `mxnet.numpy.ndarray`" + " or `mxnet.symbol.numpy._Symbol`, while got output type {}" + .format(str(type(out)))) + elif isinstance(out, (list, tuple)): + for i in out: + _check_all_np_ndarrays(i) + # pylint: enable=no-else-raise diff --git a/python/mxnet/image/image.py b/python/mxnet/image/image.py index a142282c83a6..2eb1405b5569 100644 --- a/python/mxnet/image/image.py +++ b/python/mxnet/image/image.py @@ -428,7 +428,7 @@ def fixed_crop(src, x0, y0, w, h, size=None, interp=2): NDArray An `NDArray` containing the cropped image. 
""" - out = nd.slice(src, begin=(y0, x0, 0), end=(y0 + h, x0 + w, int(src.shape[2]))) + out = src[y0:y0+h, x0:x0+w] if size is not None and (w, h) != size: sizes = (h, w, size[1], size[0]) out = imresize(out, *size, interp=_get_interp_method(interp, sizes)) diff --git a/python/mxnet/ndarray/__init__.py b/python/mxnet/ndarray/__init__.py index f09908e894d5..f6b8712a2513 100644 --- a/python/mxnet/ndarray/__init__.py +++ b/python/mxnet/ndarray/__init__.py @@ -17,7 +17,7 @@ """NDArray API of MXNet.""" -from . import _internal, contrib, linalg, op, random, sparse, utils, image, ndarray +from . import _internal, contrib, linalg, op, random, sparse, utils, image, ndarray, numpy # pylint: disable=wildcard-import, redefined-builtin try: from .gen_op import * # pylint: disable=unused-wildcard-import @@ -30,6 +30,8 @@ from .utils import load, load_frombuffer, save, zeros, empty, array from .sparse import _ndarray_cls from .ndarray import _GRAD_REQ_MAP, _DTYPE_MX_TO_NP, _DTYPE_NP_TO_MX, _new_empty_handle +from . import numpy as np +from . import numpy_extension as npx __all__ = op.__all__ + ndarray.__all__ + utils.__all__ + \ - ['contrib', 'linalg', 'random', 'sparse', 'image'] + ['contrib', 'linalg', 'random', 'sparse', 'image', 'numpy', 'numpy_extension'] diff --git a/python/mxnet/ndarray/_internal.py b/python/mxnet/ndarray/_internal.py index 8045d9bd2b14..d48255647939 100644 --- a/python/mxnet/ndarray/_internal.py +++ b/python/mxnet/ndarray/_internal.py @@ -23,18 +23,18 @@ try: if int(_os.environ.get("MXNET_ENABLE_CYTHON", True)) == 0: from .._ctypes.ndarray import NDArrayBase, CachedOp - from .._ctypes.ndarray import _set_ndarray_class, _imperative_invoke + from .._ctypes.ndarray import _set_ndarray_class, _imperative_invoke, _set_np_ndarray_class elif _sys.version_info >= (3, 0): from .._cy3.ndarray import NDArrayBase, CachedOp - from .._cy3.ndarray import _set_ndarray_class, _imperative_invoke + from .._cy3.ndarray import _set_ndarray_class, _imperative_invoke, _set_np_ndarray_class else: from .._cy2.ndarray import NDArrayBase, CachedOp - from .._cy2.ndarray import _set_ndarray_class, _imperative_invoke + from .._cy2.ndarray import _set_ndarray_class, _imperative_invoke, _set_np_ndarray_class except ImportError: if int(_os.environ.get("MXNET_ENFORCE_CYTHON", False)) != 0: raise ImportError("Cython Module cannot be loaded but MXNET_ENFORCE_CYTHON=1") from .._ctypes.ndarray import NDArrayBase, CachedOp - from .._ctypes.ndarray import _set_ndarray_class, _imperative_invoke + from .._ctypes.ndarray import _set_ndarray_class, _imperative_invoke, _set_np_ndarray_class from ..base import _Null try: @@ -42,4 +42,5 @@ except ImportError: pass -__all__ = ['NDArrayBase', 'CachedOp', '_imperative_invoke', '_set_ndarray_class'] +__all__ = ['NDArrayBase', 'CachedOp', '_imperative_invoke', '_set_ndarray_class', + '_set_np_ndarray_class'] diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 7e21daedcde1..d67779e9a0f7 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -184,6 +184,24 @@ class NDArray(NDArrayBase): # See C++ side of definition(kTVMNDArrayTypeCode) at include/mxmet/tensor_blob.h _tvm_tcode = 19 # pylint: disable= no-member, undefined-variable + + def as_np_ndarray(self): + """Convert mxnet.ndarray.NDArray to mxnet.numpy.ndarray.""" + storage_type = self.stype + if storage_type != 'default': + raise ValueError('cannot convert ndarray of stype {} to numpy ndarray' + .format(str(type(storage_type)))) + from ..numpy import ndarray + hdl 
= NDArrayHandle() + check_call(_LIB.MXShallowCopyNDArray(self.handle, ctypes.byref(hdl))) + return ndarray(handle=hdl, writable=self.writable) + + def as_nd_ndarray(self): + """A convenience function for creating a classic ndarray from the current + ndarray with zero copy. For this class, it just returns itself since it is + already a classic ndarray.""" + return self + @property def _tvm_handle(self): return self.handle.value @@ -908,7 +926,7 @@ def _slice(self, start, stop): check_call(_LIB.MXNDArraySlice( self.handle, mx_uint(start), mx_uint(stop), ctypes.byref(handle))) - return NDArray(handle=handle, writable=self.writable) + return self.__class__(handle=handle, writable=self.writable) def _at(self, idx): """Returns a view of the array sliced at `idx` in the first dim. @@ -942,7 +960,7 @@ def _at(self, idx): % (idx-length, length)) check_call(_LIB.MXNDArrayAt( self.handle, mx_uint(idx), ctypes.byref(handle))) - return NDArray(handle=handle, writable=self.writable) + return self.__class__(handle=handle, writable=self.writable) def reshape(self, *shape, **kwargs): """Returns a **view** of this array with a new shape without altering any data. @@ -1065,7 +1083,7 @@ def reshape(self, *shape, **kwargs): c_array(ctypes.c_int64, shape), reverse, ctypes.byref(handle))) - return NDArray(handle=handle, writable=self.writable) + return self.__class__(handle=handle, writable=self.writable) def reshape_like(self, *args, **kwargs): """Convenience fluent method for :py:func:`reshape_like`. @@ -2384,7 +2402,7 @@ def _get_broadcast_shape(shape1, shape2): for a, b in zip(shape1[::-1], shape2[::-1]): if a != 1 and b != 1 and a != b: raise ValueError('shape1=%s is not broadcastable to shape2=%s' % (shape1, shape2)) - shape[i] = max(a, b) + shape[i] = b if a == 1 else a i -= 1 return tuple(shape) diff --git a/python/mxnet/ndarray/numpy/__init__.py b/python/mxnet/ndarray/numpy/__init__.py new file mode 100644 index 000000000000..7eb478f792f5 --- /dev/null +++ b/python/mxnet/ndarray/numpy/__init__.py @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Module for numpy ops under mxnet.ndarray.""" + +from . import random +from . import linalg +from . import _op, _internal +from . import _register +from ._op import * # pylint: disable=wildcard-import + +__all__ = _op.__all__ diff --git a/python/mxnet/ndarray/numpy/_internal.py b/python/mxnet/ndarray/numpy/_internal.py new file mode 100644 index 000000000000..c5f292842b3b --- /dev/null +++ b/python/mxnet/ndarray/numpy/_internal.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
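
A hedged round-trip sketch (an editor's illustration, not part of the patch) for the zero-copy conversion methods added to NDArray above; `as_np_ndarray` goes through `MXShallowCopyNDArray`, so the two front-end arrays share one data chunk, while `as_nd_ndarray` on a classic NDArray is a no-op:

import mxnet as mx

x = mx.nd.ones((2, 3))           # classic mxnet.ndarray.NDArray
y = x.as_np_ndarray()            # mxnet.numpy.ndarray sharing x's data chunk
assert isinstance(y, mx.np.ndarray)
assert x.as_nd_ndarray() is x    # no-op for a classic NDArray, per the docstring
# Only 'default' storage can be viewed this way; a sparse array such as
# mx.nd.zeros((2, 2), stype='csr').as_np_ndarray() raises ValueError.
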
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for numpy internal ops.""" + +__all__ = [] diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py new file mode 100644 index 000000000000..d7c06e76c182 --- /dev/null +++ b/python/mxnet/ndarray/numpy/_op.py @@ -0,0 +1,295 @@ +# pylint: disable=C0302 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for numpy operators used in Gluon dispatched by F=ndarray.""" + +from __future__ import absolute_import +import numpy as _np +from ...base import numeric_types +from ...util import set_module +from ...context import current_context +from . import _internal as _npi + +__all__ = ['zeros', 'ones', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power'] + + +@set_module('mxnet.ndarray.numpy') +def zeros(shape, dtype=_np.float32, order='C', ctx=None): + """Return a new array of given shape and type, filled with zeros. + This function currently only supports storing multi-dimensional data + in row-major (C-style). + + Parameters + ---------- + shape : int or tuple of int + The shape of the empty array. + dtype : str or numpy.dtype, optional + An optional value type. Default is `numpy.float32`. Note that this + behavior is different from NumPy's `zeros` function where `float64` + is the default value, because `float32` is considered as the default + data type in deep learning. + order : {'C'}, optional, default: 'C' + How to store multi-dimensional data in memory, currently only row-major + (C-style) is supported. + ctx : Context, optional + An optional device context (default is the current default context). + + Returns + ------- + out : ndarray + Array of zeros with the given shape, dtype, and ctx. + """ + if order != 'C': + raise NotImplementedError + if ctx is None: + ctx = current_context() + dtype = _np.float32 if dtype is None else dtype + return _npi.zeros(shape=shape, ctx=ctx, dtype=dtype) + + +@set_module('mxnet.ndarray.numpy') +def ones(shape, dtype=_np.float32, order='C', ctx=None): + """Return a new array of given shape and type, filled with ones. + This function currently only supports storing multi-dimensional data + in row-major (C-style). + + Parameters + ---------- + shape : int or tuple of int + The shape of the empty array. 
+ dtype : str or numpy.dtype, optional + An optional value type. Default is `numpy.float32`. Note that this + behavior is different from NumPy's `ones` function where `float64` + is the default value, because `float32` is considered as the default + data type in deep learning. + order : {'C'}, optional, default: 'C' + How to store multi-dimensional data in memory, currently only row-major + (C-style) is supported. + ctx : Context, optional + An optional device context (default is the current default context). + + Returns + ------- + out : ndarray + Array of ones with the given shape, dtype, and ctx. + """ + if order != 'C': + raise NotImplementedError + if ctx is None: + ctx = current_context() + dtype = _np.float32 if dtype is None else dtype + return _npi.ones(shape=shape, ctx=ctx, dtype=dtype) + + +#pylint: disable= too-many-arguments, no-member, protected-access +def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None, out=None): + """ Helper function for element-wise operation. + The function will perform numpy-like broadcasting if needed and call different functions. + + Parameters + -------- + lhs : ndarray or numeric value + Left-hand side operand. + + rhs : ndarray or numeric value + Right-hand side operand. + + fn_array : function + Function to be called if both lhs and rhs are of ``ndarray`` type. + + fn_scalar : function + Function to be called if both lhs and rhs are numeric values. + + lfn_scalar : function + Function to be called if lhs is ``ndarray`` while rhs is numeric value. + + rfn_scalar : function + Function to be called if lhs is numeric value while rhs is ``ndarray``; + if none is provided, then the function is commutative, so rfn_scalar is equal to lfn_scalar. + + Returns + -------- + mxnet.numpy.ndarray or scalar + Result array or scalar. + """ + from ...numpy import ndarray + if isinstance(lhs, numeric_types): + if isinstance(rhs, numeric_types): + return fn_scalar(lhs, rhs, out=out) + else: + if rfn_scalar is None: + # commutative function + return lfn_scalar(rhs, float(lhs), out=out) + else: + return rfn_scalar(rhs, float(lhs), out=out) + elif isinstance(rhs, numeric_types): + return lfn_scalar(lhs, float(rhs), out=out) + elif isinstance(rhs, ndarray): + return fn_array(lhs, rhs, out=out) + else: + raise TypeError('type {} not supported'.format(str(type(rhs)))) +#pylint: enable= too-many-arguments, no-member, protected-access + + +@set_module('mxnet.ndarray.numpy') +def add(x1, x2, out=None): + """Add arguments element-wise. + + Parameters + ---------- + x1, x2 : ndarrays or scalar values + The arrays to be added. If x1.shape != x2.shape, they must be broadcastable to + a common shape (which may be the shape of one or the other). + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + add : ndarray or scalar + The sum of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars. + """ + return _ufunc_helper(x1, x2, _npi.add, _np.add, _npi.add_scalar, None, out) + + +@set_module('mxnet.ndarray.numpy') +def subtract(x1, x2, out=None): + """Subtract arguments element-wise. + + Parameters + ---------- + x1, x2 : ndarrays or scalar values + The arrays to be subtracted from each other. If x1.shape != x2.shape, + they must be broadcastable to a common shape (which may be the shape + of one or the other). + + out : ndarray + A location into which the result is stored.
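A minimal usage sketch of `zeros`/`ones` as specified above (assumes numpy semantics have been activated, e.g. via `npx.set_np()`):

>>> from mxnet import np
>>> np.zeros((2, 2))
array([[0., 0.],
       [0., 0.]])
>>> np.ones((3,), dtype='int32')
array([1, 1, 1], dtype=int32)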
If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + subtract : ndarray or scalar + The difference of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars. + """ + return _ufunc_helper(x1, x2, _npi.subtract, _np.subtract, _npi.subtract_scalar, + _npi.rsubtract_scalar, out) + + +@set_module('mxnet.ndarray.numpy') +def multiply(x1, x2, out=None): + """Multiply arguments element-wise. + + Parameters + ---------- + x1, x2 : ndarrays or scalar values + The arrays to be multiplied. If x1.shape != x2.shape, they must be broadcastable to + a common shape (which may be the shape of one or the other). + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + The multiplication of x1 and x2, element-wise. This is a scalar if both x1 and x2 + are scalars. + """ + return _ufunc_helper(x1, x2, _npi.multiply, _np.multiply, _npi.multiply_scalar, None, out) + + +@set_module('mxnet.ndarray.numpy') +def divide(x1, x2, out=None): + """Returns a true division of the inputs, element-wise. + + Parameters + ---------- + x1 : ndarray or scalar + Dividend array. + + x2 : ndarray or scalar + Divisor array. + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + This is a scalar if both x1 and x2 are scalars. + """ + return _ufunc_helper(x1, x2, _npi.true_divide, _np.divide, _npi.true_divide_scalar, + _npi.rtrue_divide_scalar, out) + + +@set_module('mxnet.ndarray.numpy') +def mod(x1, x2, out=None): + """Return element-wise remainder of division. + + Parameters + ---------- + x1 : ndarray or scalar + Dividend array. + + x2 : ndarray or scalar + Divisor array. + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + This is a scalar if both x1 and x2 are scalars. + """ + return _ufunc_helper(x1, x2, _npi.mod, _np.mod, _npi.mod_scalar, _npi.rmod_scalar, out) + + +@set_module('mxnet.ndarray.numpy') +def power(x1, x2, out=None): + """First array elements raised to powers from second array, element-wise. + + Parameters + ---------- + x1 : ndarray or scalar + The bases. + + x2 : ndarray or scalar + The exponent. + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + The bases in x1 raised to the exponents in x2. + This is a scalar if both x1 and x2 are scalars. + """ + return _ufunc_helper(x1, x2, _npi.power, _np.power, _npi.power_scalar, _npi.rpower_scalar, out) diff --git a/python/mxnet/ndarray/numpy/_register.py b/python/mxnet/ndarray/numpy/_register.py new file mode 100644 index 000000000000..3ac464e24217 --- /dev/null +++ b/python/mxnet/ndarray/numpy/_register.py @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
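To make the `_ufunc_helper` dispatch above concrete, a sketch of which branch each call takes (comments name the function invoked):

>>> a = np.ones((2, 3))
>>> np.add(a, a)       # both ndarrays       -> fn_array   (_npi.add)
>>> np.subtract(a, 1)  # ndarray op scalar   -> lfn_scalar (_npi.subtract_scalar)
>>> np.subtract(1, a)  # scalar op ndarray   -> rfn_scalar (_npi.rsubtract_scalar)
>>> np.add(2, 3)       # both Python scalars -> fn_scalar  (official NumPy add)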
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Registering numpy ops.""" + +from ...base import _init_np_op_module +from ..register import _make_ndarray_function + + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy', + mx_module_name='ndarray', make_op_func=_make_ndarray_function) + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy._internal', + mx_module_name='ndarray', make_op_func=_make_ndarray_function) diff --git a/python/mxnet/ndarray/numpy/linalg.py b/python/mxnet/ndarray/numpy/linalg.py new file mode 100644 index 000000000000..0222bb45d148 --- /dev/null +++ b/python/mxnet/ndarray/numpy/linalg.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for operators used in Gluon dispatched by F=ndarray.""" + +from __future__ import absolute_import + +__all__ = [] diff --git a/python/mxnet/ndarray/numpy/random.py b/python/mxnet/ndarray/numpy/random.py new file mode 100644 index 000000000000..339fb1e77920 --- /dev/null +++ b/python/mxnet/ndarray/numpy/random.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
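A sketch of what the two `_init_np_op_module` calls above produce: generated Python wrappers for backend numpy ops are installed under `mxnet.ndarray.numpy` and `mxnet.ndarray.numpy._internal`, and the hand-written front ends in `_op.py` delegate to the latter as `_npi` (illustration only; exact kwargs follow the generated signatures):

>>> from mxnet.ndarray.numpy import _internal as _npi
>>> _npi.zeros(shape=(2, 2))  # generated wrapper; what the public zeros() calls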
+ +"""Namespace for operators used in Gluon dispatched by F=ndarray.""" +from __future__ import absolute_import + +__all__ = [] diff --git a/python/mxnet/ndarray/numpy_extension/__init__.py b/python/mxnet/ndarray/numpy_extension/__init__.py new file mode 100644 index 000000000000..5be34ac9b3d5 --- /dev/null +++ b/python/mxnet/ndarray/numpy_extension/__init__.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Module for the ops not belonging to the official numpy package.""" + +from . import _op +from . import image +from . import _register +from ._op import * # pylint: disable=wildcard-import + +__all__ = _op.__all__ diff --git a/python/mxnet/ndarray/numpy_extension/_op.py b/python/mxnet/ndarray/numpy_extension/_op.py new file mode 100644 index 000000000000..22738a0f1950 --- /dev/null +++ b/python/mxnet/ndarray/numpy_extension/_op.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for the operators not belonging to the official numpy package +used in Gluon dispatched by F=ndarray module.""" + +__all__ = [] diff --git a/python/mxnet/ndarray/numpy_extension/_register.py b/python/mxnet/ndarray/numpy_extension/_register.py new file mode 100644 index 000000000000..32cd0686551c --- /dev/null +++ b/python/mxnet/ndarray/numpy_extension/_register.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Registering numpy_extension ops.""" + +from ...base import _init_np_op_module +from ..register import _make_ndarray_function + + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy_extension', + mx_module_name='ndarray', make_op_func=_make_ndarray_function) diff --git a/python/mxnet/ndarray/numpy_extension/image.py b/python/mxnet/ndarray/numpy_extension/image.py new file mode 100644 index 000000000000..b3bd27fc503c --- /dev/null +++ b/python/mxnet/ndarray/numpy_extension/image.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Image pre-processing operators.""" + +__all__ = [] diff --git a/python/mxnet/ndarray/register.py b/python/mxnet/ndarray/register.py index 1ccf228698ba..bdbfa1584ca6 100644 --- a/python/mxnet/ndarray/register.py +++ b/python/mxnet/ndarray/register.py @@ -24,12 +24,97 @@ from ._internal import NDArrayBase, _imperative_invoke # pylint: disable=unused-import from ..ndarray_doc import _build_doc -from ..base import mx_uint, check_call, _LIB, py_str, _init_op_module, _Null # pylint: disable=unused-import +from ..base import mx_uint, check_call, _LIB, py_str, _init_op_module, _Null, _is_np_op # pylint: disable=unused-import +from ..util import use_np_shape # pylint: disable=unused-import + + +def _verify_all_np_ndarrays(op_name, func_name, args, out): + """Verify if all the arrays are numpy ndarrays. + + Parameters + ---------- + op_name : str + Operator full name registered in backend. + func_name : str + Operator name exposed to users. This is usually the name by stripping off + the prefix of the full operator names registered in backend. + args : list of arrays + Input ndarray arguments to be checked. + out : ndarray or None or list of ndarrays + User-provided output ndarrays. + """ + from ..numpy import ndarray as np_ndarray + for arr in args: + if (arr is not None) and (not isinstance(arr, np_ndarray)): + raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' + 'This is a numpy operator which can only accept ' + 'MXNet numpy ndarrays, while received a legacy ndarray. ' + 'Please ensure that you have activated numpy semantics by calling ' + '`npx.set_np()` in your code. If you still see this error with numpy ' + 'semantics activated, please call `as_np_ndarray()` upon the legacy ' + 'ndarray to convert it to an MXNet numpy ndarray, and then feed the ' + 'converted array to this operator.' + .format(op_name, func_name)) + if out is None: + return + if not isinstance(out, (list, tuple)): + out = [out] + for arr in out: + if (arr is not None) and (not isinstance(arr, np_ndarray)): + raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. 
' + 'This is a numpy operator which can only accept ' + 'MXNet numpy ndarrays, while received a legacy ndarray. ' + 'Please ensure that you have activated numpy semantics by calling ' + '`npx.set_np()` in your code. If you still see this error with numpy ' + 'semantics activated, please call `as_np_ndarray()` upon the legacy ' + 'ndarray to convert it to an MXNet numpy ndarray, and then feed the ' + 'converted array to this operator.' + .format(op_name, func_name)) + + +def _verify_all_legacy_ndarrays(op_name, func_name, args, out): + """Verify if all the arrays are legacy ndarrays. + + Parameters + ---------- + op_name : str + Operator full name registered in backend. + func_name : str + Operator name exposed to users. This is usually the name by stripping off + the prefix of the full operator names registered in backend. + args : list of arrays + Input ndarray arguments to be checked. + out : ndarray or None or list of ndarrays + User-provided output ndarrays. + """ + from ..numpy import ndarray as np_ndarray + for arr in args: + if (arr is not None) and (isinstance(arr, np_ndarray)): + raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' + 'This is a legacy operator which can only accept ' + 'legacy ndarrays, while received an MXNet numpy ndarray. ' + 'Please call `as_nd_ndarray()` upon the numpy ndarray to ' + 'convert it to a legacy ndarray, and then feed the converted ' + 'array to this operator.' + .format(op_name, func_name)) + if out is None: + return + if not isinstance(out, (list, tuple)): + out = [out] + for arr in out: + if (arr is not None) and (isinstance(arr, np_ndarray)): + raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' + 'This is a legacy operator which can only write to ' + 'legacy ndarrays, while received an MXNet numpy ndarray. ' + 'Please call `as_nd_ndarray()` upon the numpy ndarray to ' + 'convert it to a legacy ndarray, and then feed the converted ' + 'array to this operator.' 
+ .format(op_name, func_name)) # pylint: disable=too-many-locals -def _generate_ndarray_function_code(handle, name, func_name, signature_only=False): - """Generate function for ndarray op by handle and function name.""" +def _generate_ndarray_function_code(handle, op_name, func_name, signature_only=False): + """Generate function for ndarray op by handle and operator name.""" real_name = ctypes.c_char_p() desc = ctypes.c_char_p() num_args = mx_uint() @@ -52,7 +137,7 @@ def _generate_ndarray_function_code(handle, name, func_name, signature_only=Fals arg_types = [py_str(arg_types[i]) for i in range(narg)] key_var_num_args = py_str(key_var_num_args.value) ret_type = py_str(ret_type.value) if ret_type.value is not None else '' - doc_str = _build_doc(name, + doc_str = _build_doc(op_name, py_str(desc.value), arg_names, arg_types, @@ -90,6 +175,10 @@ def _generate_ndarray_function_code(handle, name, func_name, signature_only=Fals signature = ndsignature + signature code = [] + is_np_op = _is_np_op(op_name) + doc_str_idx = 1 + if is_np_op: + doc_str_idx = 2 if arr_name: code.append(""" def %s(*%s, **kwargs):"""%(func_name, arr_name)) @@ -134,15 +223,26 @@ def %s(%s):"""%(func_name, ', '.join(signature))) vals.append(%s)"""%(name, name, name)) # dtype if dtype_name is not None: - code.append(""" + if is_np_op: + code.append(""" + if %s is not _Null and %s is not None: + keys.append('%s') + vals.append(_np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name, dtype_name)) + else: + code.append(""" if %s is not _Null: keys.append('%s') + vals.append(_np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name)) + verify_ndarrays_fn =\ + _verify_all_np_ndarrays.__name__ if is_np_op else _verify_all_legacy_ndarrays.__name__ if not signature_only: code.append(""" - return _imperative_invoke(%d, ndargs, keys, vals, out)"""%( - handle.value)) + {verify_fn}("{op_name}", "{func_name}", ndargs, out) + """.format(verify_fn=verify_ndarrays_fn, op_name=op_name, func_name=func_name)) + code.append(""" + return _imperative_invoke(%d, ndargs, keys, vals, out, %s)"""%( + handle.value, str(is_np_op))) else: code.append(""" return (0,)""") @@ -150,7 +250,7 @@ def %s(%s):"""%(func_name, ', '.join(signature))) doc_str_lines = _os.linesep+''.join([' '+s if s.strip() else s for s in 'r"""{doc_str}"""'.format(doc_str=doc_str) .splitlines(True)]) - code.insert(1, doc_str_lines) + code.insert(doc_str_idx, doc_str_lines) return ''.join(code), doc_str diff --git a/python/mxnet/ndarray/utils.py b/python/mxnet/ndarray/utils.py index ff93d0be6d73..730f2172c4f4 100644 --- a/python/mxnet/ndarray/utils.py +++ b/python/mxnet/ndarray/utils.py @@ -248,6 +248,7 @@ def save(fname, data): >>> mx.nd.load('my_dict') {'y': , 'x': } """ + from ..numpy import ndarray as np_ndarray if isinstance(data, NDArray): data = [data] handles = c_array(NDArrayHandle, []) @@ -257,11 +258,17 @@ def save(fname, data): if any(not isinstance(k, string_types) for k in str_keys) or \ any(not isinstance(v, NDArray) for v in nd_vals): raise TypeError('save only accept dict str->NDArray or list of NDArray') + if any(isinstance(v, np_ndarray) for v in nd_vals): + raise TypeError('cannot save mxnet.numpy.ndarray using mxnet.ndarray.save;' + ' use mxnet.numpy.save instead.') keys = c_str_array(str_keys) handles = c_handle_array(nd_vals) elif isinstance(data, list): if any(not isinstance(v, NDArray) for v in data): raise TypeError('save only accept dict str->NDArray or list of NDArray') + if any(isinstance(v, np_ndarray) for v in data): + raise TypeError('cannot save 
mxnet.numpy.ndarray using mxnet.ndarray.save;' + ' use mxnet.numpy.save instead.') keys = None handles = c_handle_array(data) else: diff --git a/python/mxnet/numpy/__init__.py b/python/mxnet/numpy/__init__.py new file mode 100644 index 000000000000..979e8d61a5ab --- /dev/null +++ b/python/mxnet/numpy/__init__.py @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""MXNet NumPy module.""" + +from __future__ import division, absolute_import, print_function + +from . import random +from . import linalg +from .multiarray import * # pylint: disable=wildcard-import +from . import _op +from . import _register +from ._op import * # pylint: disable=wildcard-import +from .utils import * # pylint: disable=wildcard-import + +__all__ = [] diff --git a/python/mxnet/numpy/_op.py b/python/mxnet/numpy/_op.py new file mode 100644 index 000000000000..8f6f9cc053e4 --- /dev/null +++ b/python/mxnet/numpy/_op.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for registering numpy ops for imperative programming.""" + +__all__ = [] diff --git a/python/mxnet/numpy/_register.py b/python/mxnet/numpy/_register.py new file mode 100644 index 000000000000..8a2d2ea61c24 --- /dev/null +++ b/python/mxnet/numpy/_register.py @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
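The new guards in `save` above reject numpy ndarrays up front; a sketch of the failure mode they produce (the file name is hypothetical):

>>> import mxnet as mx
>>> from mxnet import np
>>> mx.nd.save('params.bin', [np.ones((2,))])
Traceback (most recent call last):
    ...
TypeError: cannot save mxnet.numpy.ndarray using mxnet.ndarray.save; use mxnet.numpy.save instead.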
See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Registering ops in mxnet.numpy for imperative programming.""" + +from __future__ import absolute_import + +from ..base import _init_np_op_module +from ..ndarray.register import _make_ndarray_function + + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy', + mx_module_name=None, make_op_func=_make_ndarray_function) diff --git a/python/mxnet/numpy/linalg.py b/python/mxnet/numpy/linalg.py new file mode 100644 index 000000000000..c4109378e146 --- /dev/null +++ b/python/mxnet/numpy/linalg.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for ops used in imperative programming.""" + +from __future__ import absolute_import + +__all__ = [] diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py new file mode 100644 index 000000000000..54b1069b1029 --- /dev/null +++ b/python/mxnet/numpy/multiarray.py @@ -0,0 +1,1551 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""numpy ndarray and util functions.""" + +from __future__ import absolute_import +from __future__ import division + +try: + from __builtin__ import slice as py_slice +except ImportError: + from builtins import slice as py_slice + +from array import array as native_array +import sys +import ctypes +import warnings +import numpy as _np +from ..ndarray import NDArray, _DTYPE_NP_TO_MX, _GRAD_REQ_MAP +from ..ndarray._internal import _set_np_ndarray_class +from . 
import _op as _mx_np_op +from ..base import check_call, _LIB, NDArrayHandle +from ..base import mx_real_t, c_array_buf, mx_uint, numeric_types, integer_types +from ..util import _sanity_check_params, set_module +from ..context import current_context +from ..ndarray import numpy as _mx_nd_np +from ..ndarray.numpy import _internal as _npi + +__all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones', 'add', 'subtract', 'multiply', 'divide', + 'mod', 'power'] + + +# This function is copied from ndarray.py since pylint +# keeps giving false alarm error of undefined-all-variable +def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t): + """Return a new handle with specified shape and context. + + Empty handle is only used to hold results. + + Returns + ------- + handle + A new empty `ndarray` handle. + """ + hdl = NDArrayHandle() + check_call(_LIB.MXNDArrayCreateEx( + c_array_buf(mx_uint, native_array('I', shape)), + mx_uint(len(shape)), + ctypes.c_int(ctx.device_typeid), + ctypes.c_int(ctx.device_id), + ctypes.c_int(int(delay_alloc)), + ctypes.c_int(int(_DTYPE_NP_TO_MX[_np.dtype(dtype).type])), + ctypes.byref(hdl))) + return hdl + + +# Have to use 0 as default value for stype since pylint does not allow +# importing _STORAGE_TYPE_DEFAULT from ndarray.py. +def _np_ndarray_cls(handle, writable=True, stype=0): + if stype != 0: + raise ValueError('_np_ndarray_cls currently only supports default storage ' + 'type, while received stype = {}'.format(stype)) + return ndarray(handle, writable=writable) + + +_set_np_ndarray_class(_np_ndarray_cls) + + +def _get_index(idx): + if isinstance(idx, NDArray) and not isinstance(idx, ndarray): + raise TypeError('Cannot have mx.nd.NDArray as index') + if isinstance(idx, ndarray): + return idx.as_nd_ndarray() + elif sys.version_info[0] > 2 and isinstance(idx, range): + return array(_np.arange(idx.start, idx.stop, idx.step, dtype=_np.int32)).as_nd_ndarray() + else: + return idx + + +@set_module('mxnet.numpy') # pylint: disable=invalid-name +class ndarray(NDArray): + """An array object represents a multidimensional, homogeneous array of fixed-size items. + An associated data-type object describes the format of each element in the array + (its byte-order, how many bytes it occupies in memory, whether it is an integer, a + floating point number, or something else, etc.). Arrays should be constructed using + `array`, `zeros` or `empty`. 
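A sketch of what `_get_index` buys the indexing code that follows: numpy-ndarray and Python `range` indices are converted to legacy forms before delegating to the base `NDArray` implementation (the index values here are illustrative):

>>> a = np.ones((4, 3))
>>> idx = np.array([0, 2], dtype='int32')
>>> a[idx]        # ndarray index -> idx.as_nd_ndarray() internally
>>> a[range(2)]   # Python 3 range -> int32 arange array internally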
Currently, only c-contiguous arrays are supported.""" + + # pylint: disable=too-many-return-statements + def __getitem__(self, key): + # TODO(junwu): calling base class __getitem__ is a temp solution + ndim = self.ndim + shape = self.shape + if ndim == 0: + if key != (): + raise IndexError('scalar tensor can only accept `()` as index') + if isinstance(key, tuple) and len(key) == 0: + return self + elif isinstance(key, tuple) and len(key) == ndim\ + and all(isinstance(idx, integer_types) for idx in key): + out = self + for idx in key: + out = out[idx] + return out + elif isinstance(key, integer_types): + if key > shape[0] - 1: + raise IndexError( + 'index {} is out of bounds for axis 0 with size {}'.format( + key, shape[0])) + return self._at(key) + elif isinstance(key, py_slice): + if key.step is not None and key.step != 1: + if key.step == 0: + raise ValueError("slice step cannot be zero") + return self.as_nd_ndarray()._get_nd_basic_indexing(key).as_np_ndarray() + elif key.start is not None or key.stop is not None: + return self._slice(key.start, key.stop) + else: + return self + + if isinstance(key, ndarray): + key = key.as_nd_ndarray() + elif isinstance(key, tuple): + key = [_get_index(idx) for idx in key] + key = tuple(key) + elif isinstance(key, list): + key = [_get_index(idx) for idx in key] + elif sys.version_info[0] > 2 and isinstance(key, range): + key = _get_index(key) + return self.as_nd_ndarray().__getitem__(key).as_np_ndarray() + # pylint: enable=too-many-return-statements + + def __setitem__(self, key, value): + # TODO(junwu): calling base class __setitem__ is a temp solution + if isinstance(value, NDArray) and not isinstance(value, ndarray): + raise TypeError('Cannot assign mx.nd.NDArray to mxnet.numpy.ndarray') + if self.ndim == 0: + if not isinstance(key, tuple) or len(key) != 0: + raise IndexError('scalar tensor can only accept `()` as index') + if isinstance(value, ndarray): + value = value.as_nd_ndarray() + # TODO(junwu): Better handling of this situation + if isinstance(key, tuple) and len(key) == 0: + self.as_nd_ndarray().__setitem__(slice(None), value) + return + + if isinstance(key, ndarray): + key = key.as_nd_ndarray() + elif isinstance(key, tuple): + key = [_get_index(idx) for idx in key] + key = tuple(key) + elif isinstance(key, list): + key = [_get_index(idx) for idx in key] + elif sys.version_info[0] > 2 and isinstance(key, range): + key = _get_index(key) + self.as_nd_ndarray().__setitem__(key, value) + + def __add__(self, other): + """x.__add__(y) <=> x + y""" + return add(self, other) + + def __iadd__(self, other): + """x.__iadd__(y) <=> x += y""" + if not self.writable: + raise ValueError('trying to add to a readonly ndarray') + return add(self, other, out=self) + + def __sub__(self, other): + """x.__sub__(y) <=> x - y""" + return subtract(self, other) + + def __isub__(self, other): + """x.__isub__(y) <=> x -= y""" + if not self.writable: + raise ValueError('trying to subtract from a readonly ndarray') + return subtract(self, other, out=self) + + def __rsub__(self, other): + """x.__rsub__(y) <=> y - x""" + return subtract(other, self) + + def __mul__(self, other): + """x.__mul__(y) <=> x * y""" + return multiply(self, other) + + def __neg__(self): + return self.__mul__(-1.0) + + def __imul__(self, other): + """x.__imul__(y) <=> x *= y""" + if not self.writable: + raise ValueError('trying to multiply a readonly ndarray') + return multiply(self, other, out=self) + + def __rmul__(self, other): + """x.__rmul__(y) <=> y * x""" + return self.__mul__(other) + + def 
__div__(self, other): + raise AttributeError('ndarray.__div__ is replaced by __truediv__. If you are using' + ' Python2, please use the statement from __future__ import division' + ' to change the / operator to mean true division throughout the' + ' module. If you are using Python3, this error should not have' + ' been encountered.') + + def __rdiv__(self, other): + raise AttributeError('ndarray.__rdiv__ is replaced by __rtruediv__. If you are using' + ' Python2, please use the statement from __future__ import division' + ' to change the / operator to mean true division throughout the' + ' module. If you are using Python3, this error should not have' + ' been encountered.') + + def __idiv__(self, other): + raise AttributeError('ndarray.__idiv__ is replaced by __itruediv__. If you are using' + ' Python2, please use the statement from __future__ import division' + ' to change the / operator to mean true division throughout the' + ' module. If you are using Python3, this error should not have' + ' been encountered.') + + def __truediv__(self, other): + """x.__truediv__(y) <=> x / y""" + return divide(self, other) + + def __rtruediv__(self, other): + """x.__rtruediv__(y) <=> y / x""" + return divide(other, self) + + def __itruediv__(self, other): + return divide(self, other, out=self) + + def __mod__(self, other): + """x.__mod__(y) <=> x % y""" + return mod(self, other) + + def __rmod__(self, other): + """x.__rmod__(y) <=> y % x""" + return mod(other, self) + + def __imod__(self, other): + """x.__imod__(y) <=> x %= y""" + return mod(self, other, out=self) + + def __pow__(self, other): + """x.__pow__(y) <=> x ** y""" + return power(self, other) + + def __rpow__(self, other): + """x.__rpow__(y) <=> y ** x""" + return power(other, self) + + def __eq__(self, other): + """x.__eq__(y) <=> x == y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, ndarray): + return _npi.equal(self, other) + elif isinstance(other, numeric_types): + return _npi.equal_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + + def __hash__(self): + raise NotImplementedError + + def __ne__(self, other): + """x.__ne__(y) <=> x != y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, ndarray): + return _npi.not_equal(self, other) + elif isinstance(other, numeric_types): + return _npi.not_equal_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + + def __gt__(self, other): + """x.__gt__(y) <=> x > y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, ndarray): + return _npi.greater(self, other) + elif isinstance(other, numeric_types): + return _npi.greater_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + + def __ge__(self, other): + """x.__ge__(y) <=> x >= y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, ndarray): + return _npi.greater_equal(self, other) + elif isinstance(other, numeric_types): + return _npi.greater_equal_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + + def __lt__(self, other): + """x.__lt__(y) <=> x < y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, ndarray): + return _npi.less(self, 
other) + elif isinstance(other, numeric_types): + return _npi.less_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + + def __le__(self, other): + """x.__le__(y) <=> x <= y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, ndarray): + return _npi.less_equal(self, other) + elif isinstance(other, numeric_types): + return _npi.less_equal_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + + def __bool__(self): + num_elements = self.size + if num_elements == 0: + warnings.simplefilter('default') + warnings.warn('The truth value of an empty array is ambiguous. Returning False, but in' + ' future this will result in an error.', DeprecationWarning) + return False + elif num_elements == 1: + return bool(self.item()) + else: + raise ValueError("The truth value of an ndarray with multiple elements is ambiguous.") + + __nonzero__ = __bool__ + + def __float__(self): + num_elements = self.size + if num_elements != 1: + raise TypeError('only size-1 arrays can be converted to Python scalars') + return float(self.item()) + + def __int__(self): + num_elements = self.size + if num_elements != 1: + raise TypeError('only size-1 arrays can be converted to Python scalars') + return int(self.item()) + + def __len__(self): + """Number of elements along the first axis.""" + shape = self.shape + if len(shape) == 0: + raise TypeError('len() of unsized object') + return self.shape[0] + + def __reduce__(self): + return ndarray, (None,), self.__getstate__() + + def item(self, *args): + """Copy an element of an array to a standard Python scalar and return it. + + Parameters + ---------- + *args : Arguments (variable number and type) + none: in this case, the method only works for arrays with one element (a.size == 1), + which element is copied into a standard Python scalar object and returned. + + int_type: this argument is interpreted as a flat index into the array, specifying which + element to copy and return. + + tuple of int_types: functions as does a single int_type argument, except that the + argument is interpreted as an nd-index into the array. + + Returns + ------- + z : Standard Python scalar object + A copy of the specified element of the array as a suitable Python scalar. + """ + # TODO(junwu): no need to call asnumpy() on the whole array. + return self.asnumpy().item(*args) + + @property + # pylint: disable= invalid-name, undefined-variable + def T(self): + """Same as self.transpose(). This always returns a copy of self.""" + return self.transpose() + # pylint: enable= invalid-name, undefined-variable + + def all(self, axis=None, out=None, keepdims=False): + raise NotImplementedError + + def any(self, axis=None, out=None, keepdims=False): + raise NotImplementedError + + def as_nd_ndarray(self): + """Convert mxnet.numpy.ndarray to mxnet.ndarray.NDArray to use its fluent methods.""" + hdl = NDArrayHandle() + check_call(_LIB.MXShallowCopyNDArray(self.handle, ctypes.byref(hdl))) + return NDArray(handle=hdl, writable=self.writable) + + def as_np_ndarray(self): + """A convenience function for creating a numpy ndarray from the current ndarray + with zero copy. For this class, it just returns itself since it's already a + numpy ndarray.""" + return self + + def __repr__(self): + """ + Returns a string representation of the array. The dtype of the ndarray will not + be appended to the string if it is `float32`. 
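The two conversion methods above are zero-copy: both directions share the same underlying handle via `MXShallowCopyNDArray`; a sketch:

>>> a = np.ones((2, 3))
>>> legacy = a.as_nd_ndarray()   # mxnet.ndarray.NDArray sharing a's buffer
>>> b = legacy.as_np_ndarray()   # back to mxnet.numpy.ndarray, still zero copy
>>> b[0, 0] = -1                 # writes are visible through `a` as well (shared storage)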
The context of the ndarray will + be appended for devices other than CPU. + + Examples + -------- + >>> from mxnet import np, npx + >>> a = np.random.uniform(size=(2, 3)) + >>> a + array([[0.5488135 , 0.5928446 , 0.71518934], + [0.84426576, 0.60276335, 0.8579456 ]]) + >>> print(a) + [[0.5488135 0.5928446 0.71518934] + [0.84426576 0.60276335 0.8579456 ]] + >>> a.dtype + + >>> b = a.astype(np.float64) + >>> b + array([[0.54881352, 0.59284461, 0.71518934], + [0.84426576, 0.60276335, 0.85794562]], dtype=float64) + >>> print(b) + [[0.54881352 0.59284461 0.71518934] + [0.84426576 0.60276335 0.85794562]] + >>> b.dtype + + >>> c = a.copyto(npx.gpu(0)) + >>> c + array([[0.5488135 , 0.5928446 , 0.71518934], + [0.84426576, 0.60276335, 0.8579456 ]], ctx=gpu(0)) + >>> print(c) + [[0.5488135 0.5928446 0.71518934] + [0.84426576 0.60276335 0.8579456 ]] @gpu(0) + >>> d = b.copyto(npx.gpu(0)) + >>> d + array([[0.54881352, 0.59284461, 0.71518934], + [0.84426576, 0.60276335, 0.85794562]], dtype=float64, ctx=gpu(0)) + >>> print(d) + [[0.54881352 0.59284461 0.71518934] + [0.84426576 0.60276335 0.85794562]] @gpu(0) + """ + array_str = self.asnumpy().__repr__() + dtype = self.dtype + if 'dtype=' in array_str: + if dtype == _np.float32: + array_str = array_str[:array_str.rindex(',')] + ')' + elif dtype != _np.float32: + array_str = array_str[:-1] + ', dtype={})'.format(dtype.__name__) + + context = self.context + if context.device_type == 'cpu': + return array_str + return array_str[:-1] + ', ctx={})'.format(str(context)) + + def __str__(self): + """Returns a string representation of the array.""" + array_str = self.asnumpy().__str__() + context = self.context + if context.device_type == 'cpu' or self.ndim == 0: + return array_str + return '{array} @{ctx}'.format(array=array_str, ctx=context) + + def attach_grad(self, grad_req='write'): # pylint: disable=arguments-differ + """Attach a gradient buffer to this ndarray, so that `backward` + can compute gradient with respect to it. + + Parameters + ---------- + grad_req : {'write', 'add', 'null'} + How gradient will be accumulated. + - 'write': gradient will be overwritten on every backward. + - 'add': gradient will be added to existing value on every backward. + - 'null': do not compute gradient for this NDArray. + """ + grad = _mx_np_op.zeros_like(self) # pylint: disable=undefined-variable + grad_req = _GRAD_REQ_MAP[grad_req] + check_call(_LIB.MXAutogradMarkVariables( + 1, ctypes.pointer(self.handle), + ctypes.pointer(mx_uint(grad_req)), + ctypes.pointer(grad.handle))) + + @property + def grad(self): + """Returns gradient buffer attached to this ndarray.""" + hdl = NDArrayHandle() + check_call(_LIB.MXNDArrayGetGrad(self.handle, ctypes.byref(hdl))) + if hdl.value is None: + return None + return _np_ndarray_cls(hdl) + + def detach(self): + """Returns a new ndarray, detached from the current graph.""" + hdl = NDArrayHandle() + check_call(_LIB.MXNDArrayDetach(self.handle, ctypes.byref(hdl))) + return _np_ndarray_cls(hdl) + + def astype(self, dtype, *args, **kwargs): # pylint: disable=arguments-differ,unused-argument + """ + Copy of the array, cast to a specified type. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + copy : bool, optional + Default `True`. By default, astype always returns a newly + allocated ndarray on the same context. If this is set to + `False`, and the dtype requested is the same as the ndarray's + dtype, the ndarray is returned instead of a copy. 
+ + Returns + ------- + arr_t : ndarray + Unless `copy` is False and the other conditions for returning the input + array are satisfied (see description for `copy` input parameter), `arr_t` + is a new array of the same shape as the input array with `dtype`. + """ + _sanity_check_params('astype', ['order', 'casting', 'subok'], kwargs) + copy = kwargs.get('copy', True) + if not copy and _np.dtype(dtype) == self.dtype: + return self + + res = empty(self.shape, dtype=dtype, ctx=self.context) + self.copyto(res) + return res + + def copyto(self, other): + """Copies the value of this array to another array. + + If ``other`` is a ``ndarray`` object, then ``other.shape`` and + ``self.shape`` should be the same. This function copies the value from + ``self`` to ``other``. + + If ``other`` is a context, a new ``NDArray`` will first be created on + the target context, and the value of ``self`` is copied. + + Parameters + ---------- + other : ndarray or Context + The destination array or context. + + Returns + ------- + ndarray + The copied array. If ``other`` is an ``ndarray``, then the return value + and ``other`` will point to the same ``ndarray``. + + Examples + -------- + >>> x = np.ones((2,3)) + >>> y = np.zeros((2,3), ctx=mx.gpu(0)) + >>> z = x.copyto(y) + >>> z is y + True + >>> y.asnumpy() + array([[ 1., 1., 1.], + [ 1., 1., 1.]], dtype=float32) + """ + if isinstance(other, ndarray): + other = other.as_nd_ndarray() + return self.as_nd_ndarray().copyto(other).as_np_ndarray() + + def asscalar(self): + raise AttributeError('mxnet.numpy.ndarray object has no attribute asscalar') + + def argmax(self, axis=None, out=None): # pylint: disable=arguments-differ + raise NotImplementedError + + def as_in_context(self, context): + """Returns an array on the target device with the same value as this array. + + If the target context is the same as ``self.context``, then ``self`` is + returned. Otherwise, a copy is made. + + Parameters + ---------- + context : Context + The target context. + + Returns + ------- + ndarray + The target array. + """ + if self.context == context: + return self + return self.copyto(context) + + def copy(self, order='C'): # pylint: disable=arguments-differ + if order != 'C': + raise NotImplementedError('ndarray.copy only supports order=\'C\', while ' + 'received {}'.format(str(order))) + return super(ndarray, self).copy().as_np_ndarray() + + def dot(self, b, out=None): + raise NotImplementedError + + def reshape(self, *args, **kwargs): # pylint: disable=arguments-differ + """Returns an array containing the same data with a new shape. + + Notes + ----- + Unlike the free function `numpy.reshape`, this method on `ndarray` allows + the elements of the shape parameter to be passed in as separate arguments. + For example, ``a.reshape(10, 11)`` is equivalent to + ``a.reshape((10, 11))``. + """ + order = 'C' + if len(kwargs) > 1: + raise TypeError('function takes at most 1 keyword argument') + if len(kwargs) == 1: + if 'order' not in kwargs: + raise TypeError('{} is an invalid keyword argument for this function' + .format(list(kwargs.keys())[0])) + order = kwargs.pop('order', 'C') + if order != 'C': + raise NotImplementedError('only supports C-order,' + ' while received {}'.format(order)) + if len(args) == 0: + raise TypeError('reshape() takes exactly 1 argument (0 given)') + raise NotImplementedError + + def reshape_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`reshape_like`. + + The arguments are the same as for :py:func:`reshape_like`, with + this array as data. 
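The `copy=False` fast path in `astype` above returns `self` when no cast is needed; otherwise a new array is allocated on the same context. Sketched:

>>> a = np.ones((2,))
>>> a.astype('float32', copy=False) is a   # dtype already matches -> same object
True
>>> a.astype('float64') is a               # real cast -> freshly allocated copy
False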
+ """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute reshape_like') + + def zeros_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`zeros_like`. + + The arguments are the same as for :py:func:`zeros_like`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute zeros_like') + + def ones_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`ones_like`. + + The arguments are the same as for :py:func:`ones_like`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute ones_like') + + def broadcast_axes(self, *args, **kwargs): + """Convenience fluent method for :py:func:`broadcast_axes`. + + The arguments are the same as for :py:func:`broadcast_axes`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute broadcast_like') + + def repeat(self, repeats, axis=None): # pylint: disable=arguments-differ + """Repeat elements of an array.""" + raise NotImplementedError + + def pad(self, *args, **kwargs): + """Convenience fluent method for :py:func:`pad`. + + The arguments are the same as for :py:func:`pad`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute pad') + + def swapaxes(self, axis1, axis2): # pylint: disable=arguments-differ + """Return a copy of the array with axis1 and axis2 interchanged. + Refer to `mxnet.numpy.swapaxes` for full documentation. + """ + raise NotImplementedError + + def split(self, *args, **kwargs): + """Convenience fluent method for :py:func:`split`. + + The arguments are the same as for :py:func:`split`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute split') + + def split_v2(self, *args, **kwargs): + """Convenience fluent method for :py:func:`split_v2`. + + The arguments are the same as for :py:func:`split_v2`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute split_v2') + + def slice(self, *args, **kwargs): + """Convenience fluent method for :py:func:`slice`. + + The arguments are the same as for :py:func:`slice`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute slice') + + def slice_axis(self, *args, **kwargs): + """Convenience fluent method for :py:func:`slice_axis`. + + The arguments are the same as for :py:func:`slice_axis`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute slice_axis') + + def slice_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`slice_like`. + + The arguments are the same as for :py:func:`slice_like`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute slice_like') + + def take(self, *args, **kwargs): + """Convenience fluent method for :py:func:`take`. + + The arguments are the same as for :py:func:`take`, with + this array as data. + """ + raise NotImplementedError + + def one_hot(self, *args, **kwargs): + """Convenience fluent method for :py:func:`one_hot`. + + The arguments are the same as for :py:func:`one_hot`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute one_hot') + + def pick(self, *args, **kwargs): + """Convenience fluent method for :py:func:`pick`. + + The arguments are the same as for :py:func:`pick`, with + this array as data. 
+ """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute pick') + + def sort(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sort`. + + The arguments are the same as for :py:func:`sort`, with + this array as data. + """ + raise NotImplementedError + + def topk(self, *args, **kwargs): + """Convenience fluent method for :py:func:`topk`. + + The arguments are the same as for :py:func:`topk`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute topk') + + def argsort(self, *args, **kwargs): + """Convenience fluent method for :py:func:`argsort`. + + The arguments are the same as for :py:func:`argsort`, with + this array as data. + """ + raise NotImplementedError + + def argmax_channel(self, *args, **kwargs): + """Convenience fluent method for :py:func:`argmax_channel`. + + The arguments are the same as for :py:func:`argmax_channel`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute argmax_channel') + + def argmin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`argmin`. + + The arguments are the same as for :py:func:`argmin`, with + this array as data. + """ + raise NotImplementedError + + def clip(self, min=None, max=None, out=None): # pylint: disable=arguments-differ + """Return an array whose values are limited to [min, max]. + One of max or min must be given. + """ + raise NotImplementedError + + def abs(self, *args, **kwargs): + """Convenience fluent method for :py:func:`abs`. + + The arguments are the same as for :py:func:`abs`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute abs') + + def sign(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sign`. + + The arguments are the same as for :py:func:`sign`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute abs') + + def flatten(self, order='C'): # pylint: disable=arguments-differ + """Return a copy of the array collapsed into one dimension.""" + raise NotImplementedError + + def shape_array(self, *args, **kwargs): + """Convenience fluent method for :py:func:`shape_array`. + + The arguments are the same as for :py:func:`shape_array`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute shape_array') + + def size_array(self, *args, **kwargs): + """Convenience fluent method for :py:func:`size_array`. + + The arguments are the same as for :py:func:`size_array`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute size_array') + + def expand_dims(self, *args, **kwargs): # pylint: disable=arguments-differ + """Convenience fluent method for :py:func:`expand_dims`. + + The arguments are the same as for :py:func:`expand_dims`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute expand_dims') + + def tile(self, *args, **kwargs): + """Convenience fluent method for :py:func:`tile`. + + The arguments are the same as for :py:func:`tile`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute tile') + + def transpose(self, *axes): # pylint: disable=arguments-differ + """Permute the dimensions of an array.""" + raise NotImplementedError + + def flip(self, *args, **kwargs): + """Convenience fluent method for :py:func:`flip`. 
+ + The arguments are the same as for :py:func:`flip`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute flip') + + def depth_to_space(self, *args, **kwargs): + """Convenience fluent method for :py:func:`depth_to_space`. + + The arguments are the same as for :py:func:`depth_to_space`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute depth_to_space') + + def space_to_depth(self, *args, **kwargs): + """Convenience fluent method for :py:func:`space_to_depth`. + + The arguments are the same as for :py:func:`space_to_depth`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute space_to_depth') + + def diag(self, k=0, **kwargs): + """Convenience fluent method for :py:func:`diag`. + + The arguments are the same as for :py:func:`diag`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute diag') + + def sum(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """Convenience fluent method for :py:func:`sum`. + + The arguments are the same as for :py:func:`sum`, with + this array as data. + """ + raise NotImplementedError + + def nansum(self, *args, **kwargs): + """Convenience fluent method for :py:func:`nansum`. + + The arguments are the same as for :py:func:`nansum`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute nansum') + + def prod(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """Return the product of the array elements over the given axis.""" + raise NotImplementedError + + def nanprod(self, *args, **kwargs): + """Convenience fluent method for :py:func:`nanprod`. + + The arguments are the same as for :py:func:`nanprod`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute nanprod') + + def mean(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """Returns the average of the array elements along given axis.""" + raise NotImplementedError + + # TODO(junwu): Use mxnet std op instead of onp.std + def std(self, axis=None, dtype=None, out=None, ddof=0, keepdims=False): # pylint: disable=arguments-differ + """Returns the standard deviation of the array elements along given axis.""" + ret_np = self.asnumpy().std(axis=axis, dtype=dtype, out=out, ddof=ddof, keepdims=keepdims) + return array(ret_np, dtype=ret_np.dtype, ctx=self.context) + + def cumsum(self, axis=None, dtype=None, out=None): + """Return the cumulative sum of the elements along the given axis.""" + raise NotImplementedError + + def tolist(self): + return self.asnumpy().tolist() + + def max(self, axis=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """Return the maximum along a given axis.""" + raise NotImplementedError + + def min(self, *args, **kwargs): + """Convenience fluent method for :py:func:`min`. + + The arguments are the same as for :py:func:`min`, with + this array as data. + """ + raise NotImplementedError + + def norm(self, *args, **kwargs): + """Convenience fluent method for :py:func:`norm`. + + The arguments are the same as for :py:func:`norm`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute norm') + + def round(self, *args, **kwargs): + """Convenience fluent method for :py:func:`round`. 
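Note that `std` above is currently a round-trip through official NumPy rather than a backend op; a sketch of the implication:

>>> a = np.array([1., 2., 3., 4.])
>>> a.std()  # ~1.1180; computed by NumPy on a host copy, then wrapped back
...          # into an mxnet.numpy.ndarray on a's context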
+ + The arguments are the same as for :py:func:`round`, with + this array as data. + """ + raise NotImplementedError + + def rint(self, *args, **kwargs): + """Convenience fluent method for :py:func:`rint`. + + The arguments are the same as for :py:func:`rint`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute rint') + + def fix(self, *args, **kwargs): + """Convenience fluent method for :py:func:`fix`. + + The arguments are the same as for :py:func:`fix`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute fix') + + def floor(self, *args, **kwargs): + """Convenience fluent method for :py:func:`floor`. + + The arguments are the same as for :py:func:`floor`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute floor') + + def ceil(self, *args, **kwargs): + """Convenience fluent method for :py:func:`ceil`. + + The arguments are the same as for :py:func:`ceil`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute ceil') + + def trunc(self, *args, **kwargs): + """Convenience fluent method for :py:func:`trunc`. + + The arguments are the same as for :py:func:`trunc`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute trunc') + + def sin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sin`. + + The arguments are the same as for :py:func:`sin`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute sin') + + def cos(self, *args, **kwargs): + """Convenience fluent method for :py:func:`cos`. + + The arguments are the same as for :py:func:`cos`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute cos') + + def tan(self, *args, **kwargs): + """Convenience fluent method for :py:func:`tan`. + + The arguments are the same as for :py:func:`tan`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute tan') + + def arcsin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arcsin`. + + The arguments are the same as for :py:func:`arcsin`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute arcsin') + + def arccos(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arccos`. + + The arguments are the same as for :py:func:`arccos`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute arccos') + + def arctan(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arctan`. + + The arguments are the same as for :py:func:`arctan`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute arctan') + + def degrees(self, *args, **kwargs): + """Convenience fluent method for :py:func:`degrees`. + + The arguments are the same as for :py:func:`degrees`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute degrees') + + def radians(self, *args, **kwargs): + """Convenience fluent method for :py:func:`radians`. + + The arguments are the same as for :py:func:`radians`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute radians') + + def sinh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sinh`. 
+ + The arguments are the same as for :py:func:`sinh`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute sinh') + + def cosh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`cosh`. + + The arguments are the same as for :py:func:`cosh`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute cosh') + + def tanh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`tanh`. + + The arguments are the same as for :py:func:`tanh`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute tanh') + + def arcsinh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arcsinh`. + + The arguments are the same as for :py:func:`arcsinh`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute arcsinh') + + def arccosh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arccosh`. + + The arguments are the same as for :py:func:`arccosh`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute arccosh') + + def arctanh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arctanh`. + + The arguments are the same as for :py:func:`arctanh`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute arctanh') + + def exp(self, *args, **kwargs): + """Convenience fluent method for :py:func:`exp`. + + The arguments are the same as for :py:func:`exp`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute exp') + + def expm1(self, *args, **kwargs): + """Convenience fluent method for :py:func:`expm1`. + + The arguments are the same as for :py:func:`expm1`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute expm1') + + def log(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log`. + + The arguments are the same as for :py:func:`log`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute log') + + def log10(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log10`. + + The arguments are the same as for :py:func:`log10`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute log10') + + def log2(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log2`. + + The arguments are the same as for :py:func:`log2`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute log2') + + def log1p(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log1p`. + + The arguments are the same as for :py:func:`log1p`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute log1p') + + def sqrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sqrt`. + + The arguments are the same as for :py:func:`sqrt`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute sqrt') + + def rsqrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`rsqrt`. + + The arguments are the same as for :py:func:`rsqrt`, with + this array as data. 
+ """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute rsqrt') + + def cbrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`cbrt`. + + The arguments are the same as for :py:func:`cbrt`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute cqrt') + + def rcbrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`rcbrt`. + + The arguments are the same as for :py:func:`rcbrt`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute rcqrt') + + def square(self, *args, **kwargs): + """Convenience fluent method for :py:func:`square`. + + The arguments are the same as for :py:func:`square`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute square') + + def reciprocal(self, *args, **kwargs): + """Convenience fluent method for :py:func:`reciprocal`. + + The arguments are the same as for :py:func:`reciprocal`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute reciprocal') + + def relu(self, *args, **kwargs): + """Convenience fluent method for :py:func:`relu`. + + The arguments are the same as for :py:func:`relu`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute relu') + + def sigmoid(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sigmoid`. + + The arguments are the same as for :py:func:`sigmoid`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute sigmoid') + + def softmax(self, *args, **kwargs): + """Convenience fluent method for :py:func:`softmax`. + + The arguments are the same as for :py:func:`softmax`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute softmax') + + def log_softmax(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log_softmax`. + + The arguments are the same as for :py:func:`log_softmax`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute log_softmax') + + def softmin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`softmin`. + + The arguments are the same as for :py:func:`softmin`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute softmin') + + def squeeze(self, axis=None): # pylint: disable=arguments-differ + """Remove single-dimensional entries from the shape of a. + """ + raise NotImplementedError + + def broadcast_to(self, shape): + raise AttributeError('mxnet.numpy.ndarray object has no attribute broadcast_to') + + def broadcast_like(self, other): + raise AttributeError('mxnet.numpy.ndarray object has no attribute broadcast_like') + + @property + def shape(self): + return super(ndarray, self).shape + + @property + def ndim(self): + """Number of array dimensions.""" + return len(self.shape) + + @property + def size(self): + """Number of elements in the array.""" + return super(ndarray, self).size + + def tostype(self, stype): + raise AttributeError('mxnet.numpy.ndarray object has no attribute tostype') + + +@set_module('mxnet.numpy') +def empty(shape, dtype=float, order='C', ctx=None): + """Return a new array of given shape and type, without initializing entries. + + Parameters + ---------- + shape : int or tuple of int Shape of the empty array, e.g., ``(2, 3)`` or ``2``. 
+ dtype : data-type, optional + Desired output data-type for the array, e.g., `numpy.int8`. Default is + `numpy.float32`. Note that this behavior is different from NumPy's `empty` + function where `float64` is the default value, because `float32` is + considered as the default data type in deep learning. + order : {'C'}, optional, default: 'C' + How to store multi-dimensional data in memory, currently only row-major + (C-style) is supported. + ctx : device context, optional + Device context on which the memory is allocated. Default is + `mxnet.context.current_context()`. + + Returns + ------- + out : ndarray + Array of uninitialized (arbitrary) data of the given shape, dtype, and order. + """ + if order != 'C': + raise NotImplementedError + if ctx is None: + ctx = current_context() + if dtype is None or dtype is float: + # honor the documented `float32` default even when the signature-level + # default `float` is passed through + dtype = _np.float32 + if isinstance(shape, int): + shape = (shape,) + return ndarray(handle=_new_alloc_handle(shape, ctx, False, dtype)) + + +@set_module('mxnet.numpy') +def array(object, dtype=None, ctx=None): + """ + Create an array. + + Parameters + ---------- + object : array_like or `numpy.ndarray` or `mxnet.numpy.ndarray` + An array, any object exposing the array interface, an object whose + __array__ method returns an array, or any (nested) sequence. + dtype : data-type, optional + The desired data-type for the array. Default is `float32`. + ctx : device context, optional + Device context on which the memory is allocated. Default is + `mxnet.context.current_context()`. + + Returns + ------- + out : ndarray + An array object satisfying the specified requirements. + """ + if ctx is None: + ctx = current_context() + if isinstance(object, ndarray): + dtype = object.dtype if dtype is None else dtype + else: + dtype = mx_real_t if dtype is None else dtype + if not isinstance(object, (ndarray, _np.ndarray)): + try: + object = _np.array(object, dtype=dtype) + except Exception as e: + raise TypeError('{}'.format(str(e))) + ret = empty(object.shape, dtype=dtype, ctx=ctx) + if len(object.shape) == 0: + ret[()] = object + else: + ret[:] = object + return ret + + +@set_module('mxnet.numpy') +def zeros(shape, dtype=_np.float32, order='C', ctx=None): + """Return a new array of given shape and type, filled with zeros. + This function currently only supports storing multi-dimensional data + in row-major (C-style). + + Parameters + ---------- + shape : int or tuple of int + The shape of the new array. + dtype : str or numpy.dtype, optional + An optional value type (default is `numpy.float32`). Note that this + behavior is different from NumPy's `zeros` function where `float64` + is the default value, because `float32` is considered as the default + data type in deep learning. + order : {'C'}, optional, default: 'C' + How to store multi-dimensional data in memory, currently only row-major + (C-style) is supported. + ctx : Context, optional + An optional device context (default is the current default context). + + Returns + ------- + out : ndarray + Array of zeros with the given shape, dtype, and ctx. + """ + return _mx_nd_np.zeros(shape, dtype, order, ctx) + + +@set_module('mxnet.numpy') +def ones(shape, dtype=_np.float32, order='C', ctx=None): + """Return a new array of given shape and type, filled with ones. + This function currently only supports storing multi-dimensional data + in row-major (C-style). + + Parameters + ---------- + shape : int or tuple of int + The shape of the new array. + dtype : str or numpy.dtype, optional + An optional value type. Default is `numpy.float32`.
Note that this + behavior is different from NumPy's `ones` function where `float64` + is the default value, because `float32` is considered as the default + data type in deep learning. + order : {'C'}, optional, default: 'C' + How to store multi-dimensional data in memory, currently only row-major + (C-style) is supported. + ctx : Context, optional + An optional device context (default is the current default context). + + Returns + ------- + out : ndarray + Array of ones with the given shape, dtype, and ctx. + """ + return _mx_nd_np.ones(shape, dtype, order, ctx) + + +@set_module('mxnet.numpy') +def add(x1, x2, out=None): + """Add arguments element-wise. + + Parameters + ---------- + x1, x2 : ndarrays or scalar values + The arrays to be added. If x1.shape != x2.shape, they must be broadcastable to + a common shape (which may be the shape of one or the other). + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + add : ndarray or scalar + The sum of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars. + """ + return _mx_nd_np.add(x1, x2, out) + + +@set_module('mxnet.numpy') +def subtract(x1, x2, out=None): + """Subtract arguments element-wise. + + Parameters + ---------- + x1, x2 : ndarrays or scalar values + The arrays to be subtracted from each other. If x1.shape != x2.shape, + they must be broadcastable to a common shape (which may be the shape + of one or the other). + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + subtract : ndarray or scalar + The difference of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars. + """ + return _mx_nd_np.subtract(x1, x2, out) + + +@set_module('mxnet.numpy') +def multiply(x1, x2, out=None): + """Multiply arguments element-wise. + + Parameters + ---------- + x1, x2 : ndarrays or scalar values + The arrays to be multiplied. If x1.shape != x2.shape, they must be broadcastable to + a common shape (which may be the shape of one or the other). + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + The product of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars. + """ + return _mx_nd_np.multiply(x1, x2, out) + + +@set_module('mxnet.numpy') +def divide(x1, x2, out=None): + """Returns a true division of the inputs, element-wise. + + Parameters + ---------- + x1 : ndarray or scalar + Dividend array. + + x2 : ndarray or scalar + Divisor array. + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + The quotient ``x1/x2``, element-wise. This is a scalar if both x1 and x2 are scalars. + """ + return _mx_nd_np.divide(x1, x2, out=out) + + +@set_module('mxnet.numpy') +def mod(x1, x2, out=None): + """Return element-wise remainder of division. + + Parameters + ---------- + x1 : ndarray or scalar + Dividend array. + + x2 : ndarray or scalar + Divisor array. + + out : ndarray + A location into which the result is stored.
If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + The element-wise remainder of the quotient ``x1 / x2``. + This is a scalar if both x1 and x2 are scalars. + """ + return _mx_nd_np.mod(x1, x2, out=out) + + +@set_module('mxnet.numpy') +def power(x1, x2, out=None): + """First array elements raised to powers from second array, element-wise. + + Parameters + ---------- + x1 : ndarray or scalar + The bases. + + x2 : ndarray or scalar + The exponents. + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + The bases in x1 raised to the exponents in x2. + This is a scalar if both x1 and x2 are scalars. + """ + return _mx_nd_np.power(x1, x2, out=out) diff --git a/python/mxnet/numpy/random.py b/python/mxnet/numpy/random.py new file mode 100644 index 000000000000..c4109378e146 --- /dev/null +++ b/python/mxnet/numpy/random.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for random ops used in imperative programming.""" + +from __future__ import absolute_import + +__all__ = [] diff --git a/python/mxnet/numpy/utils.py b/python/mxnet/numpy/utils.py new file mode 100644 index 000000000000..920897efc80b --- /dev/null +++ b/python/mxnet/numpy/utils.py @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
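+
+# A minimal usage sketch of the aliases defined below (illustrative only; it
+# assumes the `mxnet.numpy` package introduced by this patch is importable):
+#
+#   >>> from mxnet import numpy as np
+#   >>> a = np.array([1, 2, 3], dtype=np.float32)  # dtype alias re-exported below
+#   >>> circumference = 2 * np.pi * 10.0           # `pi` re-exported from NumPy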
+ +"""Util functions for the numpy module.""" + + +from __future__ import absolute_import + +import numpy as onp + +__all__ = ['float16', 'float32', 'float64', 'uint8', 'int32', 'int8', 'int64', 'pi'] + +float16 = onp.float16 +float32 = onp.float32 +float64 = onp.float64 +uint8 = onp.uint8 +int32 = onp.int32 +int8 = onp.int8 +int64 = onp.int64 + +pi = onp.pi diff --git a/python/mxnet/numpy_extension/__init__.py b/python/mxnet/numpy_extension/__init__.py new file mode 100644 index 000000000000..d71d65f08de2 --- /dev/null +++ b/python/mxnet/numpy_extension/__init__.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Module for ops not belonging to the official numpy package for imperative programming.""" + +from __future__ import absolute_import +from . import _op +from . import image +from . import _register +from ._op import * # pylint: disable=wildcard-import +from ..context import * # pylint: disable=wildcard-import +from ..util import is_np_shape, is_np_array, set_np, reset_np +from ..ndarray import waitall +from .utils import * # pylint: disable=wildcard-import + +__all__ = [] diff --git a/python/mxnet/numpy_extension/_op.py b/python/mxnet/numpy_extension/_op.py new file mode 100644 index 000000000000..a995e480221a --- /dev/null +++ b/python/mxnet/numpy_extension/_op.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for registering numpy_extension ops for imperative programming.""" + +__all__ = [] diff --git a/python/mxnet/numpy_extension/_register.py b/python/mxnet/numpy_extension/_register.py new file mode 100644 index 000000000000..8abb7254057c --- /dev/null +++ b/python/mxnet/numpy_extension/_register.py @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Registering ops in mxnet.numpy_extension for imperative programming.""" + +from __future__ import absolute_import + +from ..base import _init_np_op_module +from ..ndarray.register import _make_ndarray_function + + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy_extension', + mx_module_name=None, make_op_func=_make_ndarray_function) diff --git a/python/mxnet/numpy_extension/image.py b/python/mxnet/numpy_extension/image.py new file mode 100644 index 000000000000..00a028b3c18f --- /dev/null +++ b/python/mxnet/numpy_extension/image.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Image pre-processing operators.""" + +from ..image import * # pylint: disable=wildcard-import, unused-wildcard-import + +__all__ = [] diff --git a/python/mxnet/numpy_extension/utils.py b/python/mxnet/numpy_extension/utils.py new file mode 100644 index 000000000000..0aa89badbb58 --- /dev/null +++ b/python/mxnet/numpy_extension/utils.py @@ -0,0 +1,122 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Util functions for the numpy module.""" + + +from __future__ import absolute_import + +import ctypes +from .. util import is_np_array, is_np_shape +from .. base import _LIB, check_call, string_types, c_str_array +from .. 
base import c_handle_array, c_str, mx_uint, NDArrayHandle, py_str +from ..numpy import ndarray + +__all__ = ['save', 'load'] + + +def save(file, arr): + """Saves a list of `ndarray`s or a dict of `str`->`ndarray` to file. + + Examples of filenames: + + - ``/path/to/file`` + - ``s3://my-bucket/path/to/file`` (if compiled with AWS S3 support) + - ``hdfs://path/to/file`` (if compiled with HDFS support) + + Parameters + ---------- + file : str + Filename to which the data is saved. + arr : `ndarray` or list of `ndarray`s or dict of `str` to `ndarray` + The data to be saved. + + Notes + ----- + This function can only be called within numpy semantics, i.e., `npx.is_np_shape()` + and `npx.is_np_array()` must both return ``True``. + """ + if not (is_np_shape() and is_np_array()): + raise ValueError('Cannot save `mxnet.numpy.ndarray` in legacy mode. Please activate' + ' numpy semantics by calling `npx.set_np()` in the global scope' + ' before calling this function.') + if isinstance(arr, ndarray): + arr = [arr] + if isinstance(arr, dict): + str_keys = arr.keys() + nd_vals = arr.values() + if any(not isinstance(k, string_types) for k in str_keys) or \ + any(not isinstance(v, ndarray) for v in nd_vals): + raise TypeError('Only accepts dict str->ndarray or list of ndarrays') + keys = c_str_array(str_keys) + handles = c_handle_array(nd_vals) + elif isinstance(arr, list): + if any(not isinstance(v, ndarray) for v in arr): + raise TypeError('Only accepts dict str->ndarray or list of ndarrays') + keys = None + handles = c_handle_array(arr) + else: + raise ValueError("data needs to be either an ndarray, a dict of (str, ndarray) pairs, " + "or a list of ndarrays.") + check_call(_LIB.MXNDArraySave(c_str(file), + mx_uint(len(handles)), + handles, + keys)) + + +def load(file): + """Loads an array from file. + + See more details in ``save``. + + Parameters + ---------- + file : str + The filename. + + Returns + ------- + result : list of ndarrays or dict of str -> ndarray + Data stored in the file. + + Notes + ----- + This function can only be called within numpy semantics, i.e., `npx.is_np_shape()` + and `npx.is_np_array()` must both return ``True``. + """ + if not (is_np_shape() and is_np_array()): + raise ValueError('Cannot load `mxnet.numpy.ndarray` in legacy mode.
Please activate' + ' numpy semantics by calling `npx.set_np()` in the global scope' + ' before calling this function.') + if not isinstance(file, string_types): + raise TypeError('file required to be a string') + out_size = mx_uint() + out_name_size = mx_uint() + handles = ctypes.POINTER(NDArrayHandle)() + names = ctypes.POINTER(ctypes.c_char_p)() + check_call(_LIB.MXNDArrayLoad(c_str(file), + ctypes.byref(out_size), + ctypes.byref(handles), + ctypes.byref(out_name_size), + ctypes.byref(names))) + if out_name_size.value == 0: + return [ndarray(NDArrayHandle(handles[i])) for i in range(out_size.value)] + else: + assert out_name_size.value == out_size.value + return dict( + (py_str(names[i]), ndarray(NDArrayHandle(handles[i]))) + for i in range(out_size.value)) diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py index c2c1aa6a76f4..d953e9247900 100644 --- a/python/mxnet/optimizer/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -18,6 +18,7 @@ # pylint: disable=too-many-lines """Weight updating functions.""" +from __future__ import absolute_import import logging import math import pickle @@ -33,6 +34,7 @@ multi_mp_sgd_mom_update) from ..ndarray import sparse from ..random import normal +from ..util import is_np_array __all__ = [ 'AdaDelta', 'AdaGrad', 'Adam', 'Adamax', 'DCASGD', 'FTML', 'Ftrl', 'LBSGD', @@ -119,6 +121,7 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., self.idx2name = param_idx2name.copy() self.sym_info = (sym.attr_dict(), sym.list_arguments()) if sym is not None else () self.param_dict = param_dict if param_dict else {} + self.allow_np_array = is_np_array() self.set_lr_mult({}) self.set_wd_mult({}) @@ -1644,6 +1647,28 @@ def update(self, index, weight, grad, state): # backward compatibility wrapper for Optimizer.CreateOptimizer create = Optimizer.create_optimizer # pylint: disable=invalid-name + +def _as_classic(a, allow_np): + # TODO(junwu): This is a temp solution for allowing converting + # np.ndarray to mx.nd.NDArray to be fed into the optimizer since + # users may have custom optimizers implemented using mx.nd.NDArray ops. + from ..numpy import ndarray as np_ndarray + if isinstance(a, (tuple, list)): + if any(isinstance(x, np_ndarray) for x in a): + if allow_np: + return [x.as_nd_ndarray() for x in a] + else: + raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') + else: + if isinstance(a, np_ndarray): + if allow_np: + return a.as_nd_ndarray() + else: + raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') + return a + + + class Updater(object): """Updater for kvstore.""" def __init__(self, optimizer): @@ -1654,14 +1679,15 @@ def __init__(self, optimizer): def __call__(self, index, grad, weight): """Updates weight given gradient and index.""" + allow_np = self.optimizer.allow_np_array if not isinstance(index, (list, tuple)): indices = [index] - grads = [grad] - weights = [weight] + grads = [_as_classic(grad, allow_np)] + weights = [_as_classic(weight, allow_np)] else: indices = index - grads = grad - weights = weight + grads = _as_classic(grad, allow_np) + weights = _as_classic(weight, allow_np) if weights: self.optimizer._set_current_context(weights[0].context.device_id) for i, idx in enumerate(indices): diff --git a/python/mxnet/symbol/__init__.py b/python/mxnet/symbol/__init__.py index f438e4954aa9..2ce395bdd279 100644 --- a/python/mxnet/symbol/__init__.py +++ b/python/mxnet/symbol/__init__.py @@ -17,7 +17,7 @@ """Symbol API of MXNet.""" -from . 
import _internal, contrib, linalg, op, random, sparse, image, symbol +from . import _internal, contrib, linalg, op, random, sparse, image, symbol, numpy # pylint: disable=wildcard-import, redefined-builtin try: from .gen_op import * # pylint: disable=unused-wildcard-import @@ -27,5 +27,8 @@ from .op import * from .symbol import * # pylint: enable=wildcard-import +from . import numpy as np +from . import numpy_extension as npx -__all__ = op.__all__ + symbol.__all__ + ['contrib', 'linalg', 'random', 'sparse', 'image'] +__all__ = op.__all__ + symbol.__all__\ + + ['contrib', 'linalg', 'random', 'sparse', 'image', 'numpy', 'numpy_extension'] diff --git a/python/mxnet/symbol/_internal.py b/python/mxnet/symbol/_internal.py index 7e9787e32b1c..d46c0e64e6f1 100644 --- a/python/mxnet/symbol/_internal.py +++ b/python/mxnet/symbol/_internal.py @@ -24,18 +24,18 @@ try: if int(_os.environ.get("MXNET_ENABLE_CYTHON", True)) == 0: - from .._ctypes.symbol import SymbolBase, _set_symbol_class + from .._ctypes.symbol import SymbolBase, _set_symbol_class, _set_np_symbol_class from .._ctypes.symbol import _symbol_creator elif _sys.version_info >= (3, 0): - from .._cy3.symbol import SymbolBase, _set_symbol_class + from .._cy3.symbol import SymbolBase, _set_symbol_class, _set_np_symbol_class from .._cy3.symbol import _symbol_creator else: - from .._cy2.symbol import SymbolBase, _set_symbol_class + from .._cy2.symbol import SymbolBase, _set_symbol_class, _set_np_symbol_class from .._cy2.symbol import _symbol_creator except ImportError: if int(_os.environ.get("MXNET_ENFORCE_CYTHON", False)) != 0: raise ImportError("Cython Module cannot be loaded but MXNET_ENFORCE_CYTHON=1") - from .._ctypes.symbol import SymbolBase, _set_symbol_class + from .._ctypes.symbol import SymbolBase, _set_symbol_class, _set_np_symbol_class from .._ctypes.symbol import _symbol_creator from ..attribute import AttrScope from ..base import _Null @@ -45,4 +45,4 @@ except ImportError: pass -__all__ = ['SymbolBase', '_set_symbol_class', '_symbol_creator'] +__all__ = ['SymbolBase', '_set_symbol_class', '_symbol_creator', '_set_np_symbol_class'] diff --git a/python/mxnet/symbol/numpy/__init__.py b/python/mxnet/symbol/numpy/__init__.py new file mode 100644 index 000000000000..857849c4ae62 --- /dev/null +++ b/python/mxnet/symbol/numpy/__init__.py @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Module for numpy ops under mxnet.symbol.""" + +from . import random +from . import linalg +from . import _op, _symbol, _internal +from ._symbol import _Symbol +from . 
import _register +from ._op import * # pylint: disable=wildcard-import +from ._symbol import * # pylint: disable=wildcard-import + +__all__ = _op.__all__ + _symbol.__all__ diff --git a/python/mxnet/symbol/numpy/_internal.py b/python/mxnet/symbol/numpy/_internal.py new file mode 100644 index 000000000000..c5f292842b3b --- /dev/null +++ b/python/mxnet/symbol/numpy/_internal.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for numpy internal ops.""" + +__all__ = [] diff --git a/python/mxnet/symbol/numpy/_op.py b/python/mxnet/symbol/numpy/_op.py new file mode 100644 index 000000000000..a4a979f30b18 --- /dev/null +++ b/python/mxnet/symbol/numpy/_op.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for operators used in Gluon dispatched by F=symbol module.""" + +__all__ = [] diff --git a/python/mxnet/symbol/numpy/_register.py b/python/mxnet/symbol/numpy/_register.py new file mode 100644 index 000000000000..3245c8d6d638 --- /dev/null +++ b/python/mxnet/symbol/numpy/_register.py @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
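+
+# Illustrative sketch of the effect of the registration calls below (the exact
+# set of generated ops depends on which backend operators carry the numpy
+# prefix): `_make_symbol_function` builds one Python function per registered op
+# and attaches it to `mxnet.symbol.numpy` (public ops) or to
+# `mxnet.symbol.numpy._internal` (the `_npi` ops that `_symbol.py` dispatches
+# to, e.g. `_npi.equal` inside `_Symbol.__eq__`):
+#
+#   >>> from mxnet.symbol import numpy as np_sym
+#   >>> s = np_sym.zeros((2, 3))   # hand-written wrapper defined in _symbol.py
+#   >>> type(s).__name__
+#   '_Symbol'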
+ +"""Registering numpy ops.""" + +from ...base import _init_np_op_module +from ..register import _make_symbol_function + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy', + mx_module_name='symbol', make_op_func=_make_symbol_function) + + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy._internal', + mx_module_name='symbol', make_op_func=_make_symbol_function) diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py new file mode 100644 index 000000000000..616f3066d98d --- /dev/null +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -0,0 +1,1013 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""numpy namespace for operators used in Gluon APIs dispatched by F=symbol module.""" + +from __future__ import absolute_import +import ctypes +import numpy as _np +from ...base import _LIB, SymbolHandle, numeric_types, mx_uint +from ...util import check_call, set_module +from ...context import current_context +from ..symbol import Symbol +from .._internal import _set_np_symbol_class +from . import _internal as _npi + +__all__ = ['zeros', 'ones', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power'] + + +def _num_outputs(sym): + return len(sym.as_nd_ndarray()) + + +@set_module('mxnet.symbol.numpy') +class _Symbol(Symbol): + def __getitem__(self, key): + num_outputs = _num_outputs(self) + if num_outputs == 1: + raise NotImplementedError + if not isinstance(key, int): + raise NotImplementedError + if key >= num_outputs: + # Important, python determines the end by this exception + raise IndexError + handle = SymbolHandle() + check_call(_LIB.MXSymbolGetOutput( + self.handle, mx_uint(key), ctypes.byref(handle))) + return _Symbol(handle=handle) + + def __setitem__(self, key, value): + raise NotImplementedError + + def __iter__(self): + raise AttributeError('_Symbol object has no attribute __iter__') + + def __add__(self, other): + """x.__add__(y) <=> x + y""" + return add(self, other) + + def __sub__(self, other): + """x.__sub__(y) <=> x - y""" + return subtract(self, other) + + def __rsub__(self, other): + """x.__rsub__(y) <=> y - x""" + return subtract(other, self) + + def __mul__(self, other): + """x.__mul__(y) <=> x * y""" + return multiply(self, other) + + def __rmul__(self, other): + """x.__rmul__(y) <=> y * x""" + return multiply(other, self) + + def __div__(self, other): + raise AttributeError('_Symbol.__div__ is replaced by __truediv__. If you are using' + ' Python2, please use the statement from __future__ import division' + ' to change the / operator to mean true division throughout the' + ' module. 
If you are using Python3, this error should not have' + ' been encountered.') + + def __rdiv__(self, other): + raise AttributeError('_Symbol.__rdiv__ is replaced by __rtruediv__. If you are using' + ' Python2, please use the statement from __future__ import division' + ' to change the / operator to mean true division throughout the' + ' module. If you are using Python3, this error should not have' + ' been encountered.') + + def __mod__(self, other): + """x.__mod__(y) <=> x % y""" + return mod(self, other) + + def __rmod__(self, other): + """x.__rmod__(y) <=> y % x""" + return mod(other, self) + + def __idiv__(self, other): + raise NotImplementedError + + def __truediv__(self, other): + """x.__truediv__(y) <=> x / y""" + return divide(self, other) + + def __rtruediv__(self, other): + """x.__rtruediv__(y) <=> y / x""" + return divide(other, self) + + def __itruediv__(self, other): + raise NotImplementedError + + def __pow__(self, other): + """x.__pow__(y) <=> x ** y""" + return power(self, other) + + def __rpow__(self, other): + return power(other, self) + + def __neg__(self): + """x.__neg__() <=> - x""" + return self.__mul__(-1.0) + + def __deepcopy__(self, _): + return super(_Symbol, self).as_np_ndarray() + + def __eq__(self, other): + """x.__eq__(y) <=> x == y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, _Symbol): + return _npi.equal(self, other) + elif isinstance(other, numeric_types): + return _npi.equal_scalar(self, float(other)) + else: + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) + + def __ne__(self, other): + """x.__ne__(y) <=> x != y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, _Symbol): + return _npi.not_equal(self, other) + elif isinstance(other, numeric_types): + return _npi.not_equal_scalar(self, float(other)) + else: + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) + + def __gt__(self, other): + """x.__gt__(y) <=> x > y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, _Symbol): + return _npi.greater(self, other) + elif isinstance(other, numeric_types): + return _npi.greater_scalar(self, float(other)) + else: + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) + + def __ge__(self, other): + """x.__ge__(y) <=> x >= y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, _Symbol): + return _npi.greater_equal(self, other) + elif isinstance(other, numeric_types): + return _npi.greater_equal_scalar(self, float(other)) + else: + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) + + def __lt__(self, other): + """x.__lt__(y) <=> x < y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, _Symbol): + return _npi.less(self, other) + elif isinstance(other, numeric_types): + return _npi.less_scalar(self, float(other)) + else: + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) + + def __le__(self, other): + """x.__le__(y) <=> x <= y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, _Symbol): + return _npi.less_equal(self, other) + elif isinstance(other, numeric_types): + return _npi.less_equal_scalar(self, float(other)) + else: + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) + 
+ + def __len__(self): + raise NotImplementedError + + def as_nd_ndarray(self): + """Convert _Symbol to mxnet.symbol.Symbol to use its convenience fluent methods.""" + hdl = SymbolHandle() + check_call(_LIB.MXShallowCopySymbol(self.handle, ctypes.byref(hdl))) + return Symbol(handle=hdl) + + def as_np_ndarray(self): + """For the convenience of conversion between legacy and np symbols.""" + return self + + @property + # pylint: disable= invalid-name, undefined-variable + def T(self): + """Same as self.transpose().""" + return self.transpose() + # pylint: enable= invalid-name, undefined-variable + + def astype(self, dtype, **kwargs): # pylint: disable=arguments-differ + raise NotImplementedError + + def dot(self, b, out=None): + raise NotImplementedError + + def reshape(self, *args, **kwargs): # pylint: disable=arguments-differ + """Returns an array containing the same data with a new shape. + + Notes + ----- + Unlike the free function `numpy.reshape`, this method on `ndarray` allows + the elements of the shape parameter to be passed in as separate arguments. + For example, ``a.reshape(10, 11)`` is equivalent to + ``a.reshape((10, 11))``. + """ + order = 'C' + if len(kwargs) > 1: + raise TypeError('function takes at most 1 keyword argument') + if len(kwargs) == 1: + if 'order' not in kwargs: + raise TypeError('{} is an invalid keyword argument for this function' + .format(list(kwargs.keys())[0])) + order = kwargs.pop('order', 'C') + if order != 'C': + raise NotImplementedError('only supports C-order,' + ' while received {}'.format(order)) + if len(args) == 0: + raise TypeError('reshape() takes exactly 1 argument (0 given)') + raise NotImplementedError + + def argmax(self, axis=None, out=None): # pylint: disable=arguments-differ + raise NotImplementedError + + def reshape_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`reshape_like`. + + The arguments are the same as for :py:func:`reshape_like`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute reshape_like') + + def zeros_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`zeros_like`. + + The arguments are the same as for :py:func:`zeros_like`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute zeros_like') + + def ones_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`ones_like`. + + The arguments are the same as for :py:func:`ones_like`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute ones_like') + + def broadcast_axes(self, *args, **kwargs): + """Convenience fluent method for :py:func:`broadcast_axes`. + + The arguments are the same as for :py:func:`broadcast_axes`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute broadcast_axes') + + def repeat(self, repeats, axis=None): # pylint: disable=arguments-differ + """Repeat elements of an array.""" + raise NotImplementedError + + def pad(self, *args, **kwargs): + """Convenience fluent method for :py:func:`pad`. + + The arguments are the same as for :py:func:`pad`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute pad') + + def swapaxes(self, axis1, axis2): # pylint: disable=arguments-differ + """Return a copy of the array with axis1 and axis2 interchanged. + Refer to `mxnet.numpy.swapaxes` for full documentation. + """ + raise NotImplementedError + + def split(self, *args, **kwargs): + """Convenience fluent method for :py:func:`split`.
+ + The arguments are the same as for :py:func:`split`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute split') + + def split_v2(self, *args, **kwargs): + """Convenience fluent method for :py:func:`split_v2`. + + The arguments are the same as for :py:func:`split_v2`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute split_v2') + + def slice(self, *args, **kwargs): + """Convenience fluent method for :py:func:`slice`. + + The arguments are the same as for :py:func:`slice`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute slice') + + def slice_axis(self, *args, **kwargs): + """Convenience fluent method for :py:func:`slice_axis`. + + The arguments are the same as for :py:func:`slice_axis`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute slice_axis') + + def slice_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`slice_like`. + + The arguments are the same as for :py:func:`slice_like`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute slice_like') + + def take(self, *args, **kwargs): + """Convenience fluent method for :py:func:`take`. + + The arguments are the same as for :py:func:`take`, with + this array as data. + """ + raise NotImplementedError + + def one_hot(self, *args, **kwargs): + """Convenience fluent method for :py:func:`one_hot`. + + The arguments are the same as for :py:func:`one_hot`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute one_hot') + + def pick(self, *args, **kwargs): + """Convenience fluent method for :py:func:`pick`. + + The arguments are the same as for :py:func:`pick`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute pick') + + def sort(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sort`. + + The arguments are the same as for :py:func:`sort`, with + this array as data. + """ + raise NotImplementedError + + def topk(self, *args, **kwargs): + """Convenience fluent method for :py:func:`topk`. + + The arguments are the same as for :py:func:`topk`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute topk') + + def argsort(self, *args, **kwargs): + """Convenience fluent method for :py:func:`argsort`. + + The arguments are the same as for :py:func:`argsort`, with + this array as data. + """ + raise NotImplementedError + + def argmax_channel(self, *args, **kwargs): + """Convenience fluent method for :py:func:`argmax_channel`. + + The arguments are the same as for :py:func:`argmax_channel`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute argmax_channel') + + def argmin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`argmin`. + + The arguments are the same as for :py:func:`argmin`, with + this array as data. + """ + raise NotImplementedError + + def clip(self, min=None, max=None, out=None): # pylint: disable=arguments-differ + """Return an array whose values are limited to [min, max]. + One of max or min must be given. + """ + raise NotImplementedError + + def abs(self, *args, **kwargs): + """Convenience fluent method for :py:func:`abs`. + + The arguments are the same as for :py:func:`abs`, with + this array as data. 
+ """ + raise AttributeError('_Symbol object has no attribute abs') + + def sign(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sign`. + + The arguments are the same as for :py:func:`sign`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute abs') + + def flatten(self, order='C'): # pylint: disable=arguments-differ + """Return a copy of the array collapsed into one dimension.""" + return self.reshape(-1, order=order) + + def shape_array(self, *args, **kwargs): + """Convenience fluent method for :py:func:`shape_array`. + + The arguments are the same as for :py:func:`shape_array`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute shape_array') + + def size_array(self, *args, **kwargs): + """Convenience fluent method for :py:func:`size_array`. + + The arguments are the same as for :py:func:`size_array`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute size_array') + + def expand_dims(self, *args, **kwargs): # pylint: disable=arguments-differ + """Convenience fluent method for :py:func:`expand_dims`. + + The arguments are the same as for :py:func:`expand_dims`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute expand_dims') + + def tile(self, *args, **kwargs): + """Convenience fluent method for :py:func:`tile`. + + The arguments are the same as for :py:func:`tile`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute tile') + + def transpose(self, *axes): # pylint: disable=arguments-differ + """Convenience fluent method for :py:func:`transpose`. + + The arguments are the same as for :py:func:`transpose`, with + this array as data. + """ + raise NotImplementedError + + def flip(self, *args, **kwargs): + """Convenience fluent method for :py:func:`flip`. + + The arguments are the same as for :py:func:`flip`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute flip') + + def depth_to_space(self, *args, **kwargs): + """Convenience fluent method for :py:func:`depth_to_space`. + + The arguments are the same as for :py:func:`depth_to_space`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute depth_to_space') + + def space_to_depth(self, *args, **kwargs): + """Convenience fluent method for :py:func:`space_to_depth`. + + The arguments are the same as for :py:func:`space_to_depth`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute space_to_depth') + + def diag(self, k=0, **kwargs): + """Convenience fluent method for :py:func:`diag`. + + The arguments are the same as for :py:func:`diag`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute diag') + + def sum(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """Convenience fluent method for :py:func:`sum`. + + The arguments are the same as for :py:func:`sum`, with + this array as data. + """ + raise NotImplementedError + + def nansum(self, *args, **kwargs): + """Convenience fluent method for :py:func:`nansum`. + + The arguments are the same as for :py:func:`nansum`, with + this array as data. 
+ """ + raise AttributeError('_Symbol object has no attribute nansum') + + def prod(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """Return the product of the array elements over the given axis.""" + raise NotImplementedError + + def nanprod(self, *args, **kwargs): + """Convenience fluent method for :py:func:`nanprod`. + + The arguments are the same as for :py:func:`nanprod`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute nanprod') + + def mean(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """Convenience fluent method for :py:func:`mean`. + + The arguments are the same as for :py:func:`mean`, with + this array as data. + """ + raise NotImplementedError + + def cumsum(self, axis=None, dtype=None, out=None): + """Return the cumulative sum of the elements along the given axis.""" + raise NotImplementedError + + def max(self, axis=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """Return the maximum along a given axis.""" + raise NotImplementedError + + def min(self, *args, **kwargs): + """Convenience fluent method for :py:func:`min`. + + The arguments are the same as for :py:func:`min`, with + this array as data. + """ + raise NotImplementedError + + def norm(self, *args, **kwargs): + """Convenience fluent method for :py:func:`norm`. + + The arguments are the same as for :py:func:`norm`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute norm') + + def round(self, *args, **kwargs): + """Convenience fluent method for :py:func:`round`. + + The arguments are the same as for :py:func:`round`, with + this array as data. + """ + raise NotImplementedError + + def rint(self, *args, **kwargs): + """Convenience fluent method for :py:func:`rint`. + + The arguments are the same as for :py:func:`rint`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute rint') + + def fix(self, *args, **kwargs): + """Convenience fluent method for :py:func:`fix`. + + The arguments are the same as for :py:func:`fix`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute fix') + + def floor(self, *args, **kwargs): + """Convenience fluent method for :py:func:`floor`. + + The arguments are the same as for :py:func:`floor`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute floor') + + def ceil(self, *args, **kwargs): + """Convenience fluent method for :py:func:`ceil`. + + The arguments are the same as for :py:func:`ceil`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute ceil') + + def trunc(self, *args, **kwargs): + """Convenience fluent method for :py:func:`trunc`. + + The arguments are the same as for :py:func:`trunc`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute trunc') + + def sin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sin`. + + The arguments are the same as for :py:func:`sin`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute sin') + + def cos(self, *args, **kwargs): + """Convenience fluent method for :py:func:`cos`. + + The arguments are the same as for :py:func:`cos`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute cos') + + def tan(self, *args, **kwargs): + """Convenience fluent method for :py:func:`tan`. 
+ + The arguments are the same as for :py:func:`tan`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute tan') + + def arcsin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arcsin`. + + The arguments are the same as for :py:func:`arcsin`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute arcsin') + + def arccos(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arccos`. + + The arguments are the same as for :py:func:`arccos`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute arccos') + + def arctan(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arctan`. + + The arguments are the same as for :py:func:`arctan`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute arctan') + + def degrees(self, *args, **kwargs): + """Convenience fluent method for :py:func:`degrees`. + + The arguments are the same as for :py:func:`degrees`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute degrees') + + def radians(self, *args, **kwargs): + """Convenience fluent method for :py:func:`radians`. + + The arguments are the same as for :py:func:`radians`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute radians') + + def sinh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sinh`. + + The arguments are the same as for :py:func:`sinh`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute sinh') + + def cosh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`cosh`. + + The arguments are the same as for :py:func:`cosh`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute cosh') + + def tanh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`tanh`. + + The arguments are the same as for :py:func:`tanh`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute tanh') + + def arcsinh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arcsinh`. + + The arguments are the same as for :py:func:`arcsinh`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute arcsinh') + + def arccosh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arccosh`. + + The arguments are the same as for :py:func:`arccosh`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute arccosh') + + def arctanh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arctanh`. + + The arguments are the same as for :py:func:`arctanh`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute arctanh') + + def exp(self, *args, **kwargs): + """Convenience fluent method for :py:func:`exp`. + + The arguments are the same as for :py:func:`exp`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute exp') + + def expm1(self, *args, **kwargs): + """Convenience fluent method for :py:func:`expm1`. + + The arguments are the same as for :py:func:`expm1`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute expm1') + + def log(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log`. + + The arguments are the same as for :py:func:`log`, with + this array as data. 
+ """ + raise AttributeError('_Symbol object has no attribute log') + + def log10(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log10`. + + The arguments are the same as for :py:func:`log10`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute log10') + + def log2(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log2`. + + The arguments are the same as for :py:func:`log2`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute log2') + + def log1p(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log1p`. + + The arguments are the same as for :py:func:`log1p`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute log1p') + + def sqrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sqrt`. + + The arguments are the same as for :py:func:`sqrt`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute sqrt') + + def rsqrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`rsqrt`. + + The arguments are the same as for :py:func:`rsqrt`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute rsqrt') + + def cbrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`cbrt`. + + The arguments are the same as for :py:func:`cbrt`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute cqrt') + + def rcbrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`rcbrt`. + + The arguments are the same as for :py:func:`rcbrt`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute rcqrt') + + def square(self, *args, **kwargs): + """Convenience fluent method for :py:func:`square`. + + The arguments are the same as for :py:func:`square`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute square') + + def reciprocal(self, *args, **kwargs): + """Convenience fluent method for :py:func:`reciprocal`. + + The arguments are the same as for :py:func:`reciprocal`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute reciprocal') + + def relu(self, *args, **kwargs): + """Convenience fluent method for :py:func:`relu`. + + The arguments are the same as for :py:func:`relu`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute relu') + + def sigmoid(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sigmoid`. + + The arguments are the same as for :py:func:`sigmoid`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute sigmoid') + + def softmax(self, *args, **kwargs): + """Convenience fluent method for :py:func:`softmax`. + + The arguments are the same as for :py:func:`softmax`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute softmax') + + def log_softmax(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log_softmax`. + + The arguments are the same as for :py:func:`log_softmax`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute log_softmax') + + def softmin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`softmin`. + + The arguments are the same as for :py:func:`softmin`, with + this array as data. 
+ """ + raise AttributeError('_Symbol object has no attribute softmin') + + def squeeze(self, axis=None): # pylint: disable=arguments-differ + """Remove single-dimensional entries from the shape of a. + """ + raise NotImplementedError + + def broadcast_to(self, *args, **kwargs): + raise AttributeError('_Symbol object has no attribute broadcast_to') + + def broadcast_like(self, *args, **kwargs): + raise AttributeError('_Symbol object has no attribute broadcast_like') + + +@set_module('mxnet.symbol.numpy') +def zeros(shape, dtype=_np.float32, order='C', ctx=None): + """Return a new array of given shape and type, filled with zeros. + This function currently only supports storing multi-dimensional data + in row-major (C-style). + + Parameters + ---------- + shape : int or tuple of int + The shape of the empty array. + dtype : str or numpy.dtype, optional + An optional value type. Default is `numpy.float32`. Note that this + behavior is different from NumPy's `zeros` function where `float64` + is the default value, because `float32` is considered as the default + data type in deep learning. + order : {'C'}, optional, default: 'C' + How to store multi-dimensional data in memory, currently only row-major + (C-style) is supported. + ctx : Context, optional + An optional device context (default is the current default context). + + Returns + ------- + out : Symbol + Array of zeros with the given shape, dtype, and ctx. + """ + if order != 'C': + raise NotImplementedError + if ctx is None: + ctx = current_context() + dtype = _np.float32 if dtype is None else dtype + return _npi.zeros(shape=shape, ctx=ctx, dtype=dtype) + + +@set_module('mxnet.symbol.numpy') +def ones(shape, dtype=_np.float32, order='C', ctx=None): + """Return a new array of given shape and type, filled with zeros. + This function currently only supports storing multi-dimensional data + in row-major (C-style). + + Parameters + ---------- + shape : int or tuple of int + The shape of the empty array. + dtype : str or numpy.dtype, optional + An optional value type. Default is `numpy.float32`. Note that this + behavior is different from NumPy's `ones` function where `float64` + is the default value, because `float32` is considered as the default + data type in deep learning. + order : {'C'}, optional, default: 'C' + How to store multi-dimensional data in memory, currently only row-major + (C-style) is supported. + ctx : Context, optional + An optional device context (default is the current default context). + + Returns + ------- + out : ndarray + Array of zeros with the given shape, dtype, and ctx. + """ + if order != 'C': + raise NotImplementedError + if ctx is None: + ctx = current_context() + dtype = _np.float32 if dtype is None else dtype + return _npi.ones(shape=shape, ctx=ctx, dtype=dtype) + + +#pylint: disable= too-many-arguments, no-member, protected-access +def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None, out=None): + """ Helper function for element-wise operation. + The function will perform numpy-like broadcasting if needed and call different functions. + + Parameters + -------- + lhs : Symbol or numeric value + Left-hand side operand. + + rhs : Symbol or numeric value + Right-hand operand, + + fn_array : function + Function to be called if both lhs and rhs are of ``Symbol`` type. + + fn_scalar : function + Function to be called if both lhs and rhs are numeric values. 
+ + lfn_scalar : function + Function to be called if lhs is ``Symbol`` while rhs is numeric value. + + rfn_scalar : function + Function to be called if lhs is numeric value while rhs is ``Symbol``; + if none is provided, then the function is commutative, so rfn_scalar is equal to lfn_scalar. + + Returns + ------- + _Symbol + result symbol + """ + if isinstance(lhs, numeric_types): + if isinstance(rhs, numeric_types): + return fn_scalar(lhs, rhs, out=out) + else: + if rfn_scalar is None: + # commutative function + return lfn_scalar(rhs, float(lhs), out=out) + else: + return rfn_scalar(rhs, float(lhs), out=out) + elif isinstance(rhs, numeric_types): + return lfn_scalar(lhs, float(rhs), out=out) + elif isinstance(rhs, Symbol): + return fn_array(lhs, rhs, out=out) + else: + raise TypeError('type %s not supported' % str(type(rhs))) +#pylint: enable= too-many-arguments, no-member, protected-access + + +@set_module('mxnet.symbol.numpy') +def add(x1, x2, out=None): + return _ufunc_helper(x1, x2, _npi.add, _np.add, _npi.add_scalar, None, out) + + +@set_module('mxnet.symbol.numpy') +def subtract(x1, x2, out=None): + return _ufunc_helper(x1, x2, _npi.subtract, _np.subtract, _npi.subtract_scalar, + _npi.rsubtract_scalar, out) + + +@set_module('mxnet.symbol.numpy') +def multiply(x1, x2, out=None): + return _ufunc_helper(x1, x2, _npi.multiply, _np.multiply, _npi.multiply_scalar, None, out) + + +@set_module('mxnet.symbol.numpy') +def divide(x1, x2, out=None): + return _ufunc_helper(x1, x2, _npi.true_divide, _np.divide, _npi.true_divide_scalar, + _npi.rtrue_divide_scalar, out) + + +@set_module('mxnet.symbol.numpy') +def mod(x1, x2, out=None): + return _ufunc_helper(x1, x2, _npi.mod, _np.mod, _npi.mod_scalar, _npi.rmod_scalar, out) + + +@set_module('mxnet.symbol.numpy') +def power(x1, x2, out=None): + return _ufunc_helper(x1, x2, _npi.power, _np.power, _npi.power_scalar, _npi.rpower_scalar, out) + + +_set_np_symbol_class(_Symbol) diff --git a/python/mxnet/symbol/numpy/linalg.py b/python/mxnet/symbol/numpy/linalg.py new file mode 100644 index 000000000000..28cfd0f3806a --- /dev/null +++ b/python/mxnet/symbol/numpy/linalg.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for operators used in Gluon dispatched by F=symbol.""" + +from __future__ import absolute_import + +__all__ = [] diff --git a/python/mxnet/symbol/numpy/random.py b/python/mxnet/symbol/numpy/random.py new file mode 100644 index 000000000000..28cfd0f3806a --- /dev/null +++ b/python/mxnet/symbol/numpy/random.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership.
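With the wrappers above in place, `_ufunc_helper` gives every binary arithmetic function NumPy-style scalar handling on the symbolic side. A hedged usage sketch of the four dispatch paths (it assumes this module is exposed as `mx.sym.np`, consistent with the rest of the patch)::

    import mxnet as mx

    x = mx.sym.var('x').as_np_ndarray()  # legacy Symbol -> numpy-style _Symbol
    y = mx.sym.var('y').as_np_ndarray()

    s1 = mx.sym.np.add(x, y)         # both symbols     -> fn_array (_npi.add)
    s2 = mx.sym.np.add(x, 2.0)       # symbol op scalar -> lfn_scalar (_npi.add_scalar)
    s3 = mx.sym.np.subtract(3.0, x)  # scalar op symbol -> rfn_scalar (_npi.rsubtract_scalar)
    v = mx.sym.np.add(1.0, 2.0)      # both scalars     -> fn_scalar (NumPy itself), v == 3.0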
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for operators used in Gluon dispatched by F=symbol.""" + +from __future__ import absolute_import + +__all__ = [] diff --git a/python/mxnet/symbol/numpy_extension/__init__.py b/python/mxnet/symbol/numpy_extension/__init__.py new file mode 100644 index 000000000000..5be34ac9b3d5 --- /dev/null +++ b/python/mxnet/symbol/numpy_extension/__init__.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Module for the ops not belonging to the official numpy package.""" + +from . import _op +from . import image +from . import _register +from ._op import * # pylint: disable=wildcard-import + +__all__ = _op.__all__ diff --git a/python/mxnet/symbol/numpy_extension/_op.py b/python/mxnet/symbol/numpy_extension/_op.py new file mode 100644 index 000000000000..82eaa8e6ec9f --- /dev/null +++ b/python/mxnet/symbol/numpy_extension/_op.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for operators not belonging to the official numpy package +used in Gluon APIs dispatched by F=symbol module.""" + +__all__ = [] diff --git a/python/mxnet/symbol/numpy_extension/_register.py b/python/mxnet/symbol/numpy_extension/_register.py new file mode 100644 index 000000000000..b118987b1fd3 --- /dev/null +++ b/python/mxnet/symbol/numpy_extension/_register.py @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Registering numpy_extension ops.""" + +from ...base import _init_np_op_module +from ..register import _make_symbol_function + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy_extension', + mx_module_name='symbol', make_op_func=_make_symbol_function) diff --git a/python/mxnet/symbol/numpy_extension/image.py b/python/mxnet/symbol/numpy_extension/image.py new file mode 100644 index 000000000000..b3bd27fc503c --- /dev/null +++ b/python/mxnet/symbol/numpy_extension/image.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Image pre-processing operators.""" + +__all__ = [] diff --git a/python/mxnet/symbol/register.py b/python/mxnet/symbol/register.py index ac59f8b97f15..a17dd79048d4 100644 --- a/python/mxnet/symbol/register.py +++ b/python/mxnet/symbol/register.py @@ -27,12 +27,60 @@ from ..attribute import AttrScope from ..base import mx_uint, check_call, _LIB, py_str from ..symbol_doc import _build_doc -from ..base import _Null, _init_op_module +from ..base import _Null, _init_op_module, _is_np_op from ..name import NameManager # pylint: enable=unused-import -def _generate_symbol_function_code(handle, name, func_name, signature_only=False): +def _verify_np_symbol(op_name, func_name, sym): + """Verify if the sym is a numpy symbol. + + Parameters + ---------- + op_name : str + Operator full name registered in backend. + func_name : str + Operator name exposed to users. This is usually the name by stripping off + the prefix of the full operator names registered in backend. + sym : symbol to be verified + """ + from .numpy._symbol import _Symbol as np_symbol + if not isinstance(sym, np_symbol): + raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' + 'This is a numpy operator which can only accept ' + 'MXNet numpy ndarrays, while received a legacy ndarray. ' + 'Please ensure that you have activated numpy semantics by calling ' + '`npx.set_np()` in your code. 
If you still see this error with numpy ' + 'semantics activated, please call `as_np_ndarray()` upon the legacy ' + 'ndarray to convert it to an MXNet numpy ndarray, and then feed the ' + 'converted array to this operator.' + .format(op_name, func_name)) + + +def _verify_legacy_symbol(op_name, func_name, sym): + """Verify if the sym is a legacy symbol. + + Parameters + ---------- + op_name : str + Operator full name registered in backend. + func_name : str + Operator name exposed to users. This is usually the name by stripping off + the prefix of the full operator names registered in backend. + sym : symbol to be verified + """ + from .numpy._symbol import _Symbol as np_symbol + if isinstance(sym, np_symbol): + raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' + 'This is a legacy operator which can only accept ' + 'legacy ndarrays, while received an MXNet numpy ndarray. ' + 'Please call `as_nd_ndarray()` upon the numpy ndarray to ' + 'convert it to a legacy ndarray, and then feed the converted ' + 'array to this operator.' + .format(op_name, func_name)) + + +def _generate_symbol_function_code(handle, op_name, func_name, signature_only=False): """Generate function for symbol op by handle and function name.""" real_name = ctypes.c_char_p() desc = ctypes.c_char_p() @@ -56,7 +104,7 @@ def _generate_symbol_function_code(handle, name, func_name, signature_only=False arg_types = [py_str(arg_types[i]) for i in range(narg)] key_var_num_args = py_str(key_var_num_args.value) ret_type = py_str(ret_type.value) if ret_type.value is not None else '' - doc_str = _build_doc(name, + doc_str = _build_doc(op_name, py_str(desc.value), arg_names, arg_types, @@ -95,6 +143,8 @@ def _generate_symbol_function_code(handle, name, func_name, signature_only=False signature.append('**kwargs') signature = ndsignature + signature + is_np_op = _is_np_op(op_name) + verify_symbol_fn = _verify_np_symbol.__name__ if is_np_op else _verify_legacy_symbol.__name__ code = [] if arr_name: code.append(""" @@ -106,7 +156,8 @@ def %s(*%s, **kwargs):"""%(func_name, arr_name)) assert isinstance(i, SymbolBase), \\ "Positional arguments must be Symbol instances, " \\ "but got %s"%str(i) - sym_args.append(i)""".format(arr_name)) + {}('{}', '{}', i) + sym_args.append(i)""".format(arr_name, verify_symbol_fn, op_name, func_name)) if dtype_name is not None: code.append(""" if '%s' in kwargs: @@ -128,9 +179,10 @@ def %s(*%s, **kwargs):"""%(func_name, arr_name)) for k, v in kwargs.items(): if isinstance(v, SymbolBase): sym_kwargs[k] = v + %s('%s', '%s', v) else: keys.append(k) - vals.append(v)"""%(func_name.lower())) + vals.append(v)"""%(func_name.lower(), verify_symbol_fn, op_name, func_name)) if key_var_num_args: # pylint: disable=using-constant-test code.append(""" if '%s' not in kwargs: @@ -139,8 +191,8 @@ def %s(*%s, **kwargs):"""%(func_name, arr_name)) key_var_num_args, key_var_num_args)) code.append(""" - return _symbol_creator(%d, sym_args, sym_kwargs, keys, vals, name)"""%( - handle.value)) + return _symbol_creator(%d, sym_args, sym_kwargs, keys, vals, name, %s)"""%( + handle.value, str(is_np_op))) else: code.append(""" def %s(%s):"""%(func_name, ', '.join(signature))) @@ -155,9 +207,10 @@ def %s(%s):"""%(func_name, ', '.join(signature))) for _k, _v in kwargs.items(): if isinstance(_v, SymbolBase): sym_kwargs[_k] = _v + {}('{}', '{}', _v) else: _keys.append(_k) - _vals.append(_v)""") + _vals.append(_v)""".format(verify_symbol_fn, op_name, func_name)) # NDArray args for name in ndarg_names: # pylint: 
disable=redefined-argument-from-local code.append(""" @@ -165,6 +218,9 @@ def %s(%s):"""%(func_name, ', '.join(signature))) assert isinstance({name}, SymbolBase), \\ "Argument {name} must be Symbol instances, but got %s"%str({name}) sym_kwargs['{name}'] = {name}""".format(name=name)) + code.append(""" + {}('{}', '{}', {name}) + """.format(verify_symbol_fn, op_name, func_name, name=name)) # kwargs for name in kwarg_names: # pylint: disable=redefined-argument-from-local code.append(""" @@ -173,7 +229,13 @@ def %s(%s):"""%(func_name, ', '.join(signature))) _vals.append(%s)"""%(name, name, name)) # dtype if dtype_name is not None: - code.append(""" + if is_np_op: + code.append(""" + if %s is not _Null and %s is not None: + _keys.append('%s') + _vals.append(_np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name, dtype_name)) + else: + code.append(""" if %s is not _Null: _keys.append('%s') _vals.append(_np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name)) @@ -182,8 +244,8 @@ def %s(%s):"""%(func_name, ', '.join(signature))) if not hasattr(NameManager._current, "value"): NameManager._current.value = NameManager() name = NameManager._current.value.get(name, '%s') - return _symbol_creator(%d, None, sym_kwargs, _keys, _vals, name)"""%( - func_name.lower(), handle.value)) + return _symbol_creator(%d, None, sym_kwargs, _keys, _vals, name, %s)"""%( + func_name.lower(), handle.value, str(is_np_op))) if signature_only: code.append(""" diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index d3cd519b9a8c..f0bc235f6661 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -61,6 +61,17 @@ class Symbol(SymbolBase): # Make numpy functions return Symbol instead of numpy object array __array_priority__ = 1000.0 + def as_np_ndarray(self): + """Convert mx.sym.Symbol to mx.sym.np._Symbol.""" + from .numpy import _Symbol + hdl = SymbolHandle() + check_call(_LIB.MXShallowCopySymbol(self.handle, ctypes.byref(hdl))) + return _Symbol(hdl) + + def as_nd_ndarray(self): + """Returns self. 
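`as_np_ndarray` and `as_nd_ndarray` are the escape hatches that the error messages in `_verify_np_symbol` and `_verify_legacy_symbol` point to. A small sketch of the round trip (it assumes `_Symbol.as_nd_ndarray` performs the reverse shallow copy; only the legacy `Symbol` side is shown in this hunk)::

    import mxnet as mx

    legacy = mx.sym.var('data')      # classic mx.sym.Symbol
    np_sym = legacy.as_np_ndarray()  # shallow copy of the handle as mx.sym.np._Symbol

    # Feeding np_sym straight into a legacy operator would trip
    # _verify_legacy_symbol; converting back first satisfies the check.
    out = mx.sym.relu(np_sym.as_nd_ndarray())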
For the convenience of conversion between legacy and np symbols.""" + return self + def __repr__(self): """Gets a string representation of the symbol.""" name = self.name @@ -144,6 +155,8 @@ def __rsub__(self, other): array([[-2., -2., -2.], [-2., -2., -2.]], dtype=float32) """ + if isinstance(other, Symbol): + return other.__sub__(self) if isinstance(other, Number): return _internal._RMinusScalar(self, scalar=other) else: @@ -192,6 +205,8 @@ def __rdiv__(self, other): array([[ 0.33333334, 0.33333334, 0.33333334], [ 0.33333334, 0.33333334, 0.33333334]], dtype=float32) """ + if isinstance(other, Symbol): + return other.__truediv__(self) if isinstance(other, Number): return _internal._RDivScalar(self, scalar=other) else: @@ -222,6 +237,8 @@ def __rmod__(self, other): array([[ 1., 1., 1., [ 1., 1., 1., dtype=float32) """ + if isinstance(other, Symbol): + return other.__mod__(self) if isinstance(other, Number): return _internal._RModScalar(self, scalar=other) else: @@ -252,7 +269,13 @@ def __pow__(self, other): raise TypeError('type %s not supported' % str(type(other))) def __rpow__(self, other): - raise NotImplementedForSymbol(self.__rpow__, 'y**x', other) + """x.__rpow__(y) <=> y ** x""" + if isinstance(other, Symbol): + return other.__pow__(self) + elif isinstance(other, Number): + return _internal._rpower_scalar(self, scalar=other) + else: + raise TypeError('type %s not supported' % str(type(other))) def __neg__(self): """x.__neg__() <=> -x @@ -2667,8 +2690,12 @@ def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None, Variable = var -def Group(symbols): +def Group(symbols, create_fn=Symbol): """Creates a symbol that contains a collection of other symbols, grouped together. + A classic symbol (`mx.sym.Symbol`) will be returned if all the symbols in the list + are of that type; a numpy symbol (`mx.sym.np._Symbol`) will be returned if all the + symbols in the list are of that type. A type error will be raised if a list of mixed + classic and numpy symbols is provided. Example ------- @@ -2682,6 +2709,9 @@ def Group(symbols): symbols : list List of symbols to be grouped. + create_fn : mx.sym.Symbol or mx.sym.np._Symbol + Symbol class for creating the grouped symbol.
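Per the new docstring, `Group` keeps symbol types homogeneous. A hedged sketch of the three cases (the mixed-type rejection is performed by the caller that chooses `create_fn`, which this hunk does not show)::

    import mxnet as mx

    a, b = mx.sym.var('a'), mx.sym.var('b')

    g_classic = mx.sym.Group([a, b])          # all classic -> mx.sym.Symbol
    g_np = mx.sym.Group([a.as_np_ndarray(),
                         b.as_np_ndarray()])  # all numpy -> mx.sym.np._Symbol
    try:
        mx.sym.Group([a, b.as_np_ndarray()])  # mixed types
    except TypeError as e:
        print(e)                              # rejected, per the docstring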
+ Returns ------- sym : Symbol @@ -2693,7 +2723,7 @@ def Group(symbols): check_call(_LIB.MXSymbolCreateGroup( mx_uint(len(symbols)), c_handle_array(symbols), ctypes.byref(handle))) - return Symbol(handle) + return create_fn(handle) def load(fname): diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index bd102412c6e2..bfe520b0137a 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -47,6 +47,8 @@ from .ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID from .ndarray import array from .symbol import Symbol +from .symbol.numpy import _Symbol as np_symbol +from .util import use_np # pylint: disable=unused-import def default_context(): @@ -88,7 +90,8 @@ def get_etol(etol=None): def random_arrays(*shapes): """Generate some random numpy arrays.""" - arrays = [np.random.randn(*s).astype(default_dtype()) + arrays = [np.array(np.random.randn(), dtype=default_dtype()) + if len(s) == 0 else np.random.randn(*s).astype(default_dtype()) for s in shapes] if len(arrays) == 1: return arrays[0] @@ -407,16 +410,20 @@ def create_sparse_array_zd(shape, stype, density, data_init=None, density=density, shuffle_csr_indices=shuffle_csr_indices) -def rand_shape_2d(dim0=10, dim1=10): - return rnd.randint(1, dim0 + 1), rnd.randint(1, dim1 + 1) +def rand_shape_2d(dim0=10, dim1=10, allow_zero_size=False): + low = 0 if allow_zero_size else 1 + return rnd.randint(low, dim0 + 1), rnd.randint(low, dim1 + 1) -def rand_shape_3d(dim0=10, dim1=10, dim2=10): - return rnd.randint(1, dim0 + 1), rnd.randint(1, dim1 + 1), rnd.randint(1, dim2 + 1) +def rand_shape_3d(dim0=10, dim1=10, dim2=10, allow_zero_size=False): + low = 0 if allow_zero_size else 1 + return rnd.randint(low, dim0 + 1), rnd.randint(low, dim1 + 1), rnd.randint(low, dim2 + 1) -def rand_shape_nd(num_dim, dim=10): - return tuple(rnd.randint(1, dim+1, size=num_dim)) + +def rand_shape_nd(num_dim, dim=10, allow_zero_size=False): + low = 0 if allow_zero_size else 1 + return tuple(rnd.randint(low, dim+1, size=num_dim)) def rand_coord_2d(x_low, x_high, y_low, y_high): @@ -828,7 +835,7 @@ def as_stype(var, stype, dtype): continue stype = executor.arg_dict[k].stype old_value = v.copy() - for i in range(np.prod(v.shape)): + for i in range(int(np.prod(v.shape))): # inplace update v.ravel()[i] += eps/2.0 executor.arg_dict[k][:] = as_stype(v, stype, dtype=dtype) @@ -940,7 +947,12 @@ def random_projection(shape): input_shape = {k: v.shape for k, v in location.items()} _, out_shape, _ = sym.infer_shape(**input_shape) proj = mx.sym.Variable("__random_proj") + is_np_sym = bool(isinstance(sym, np_symbol)) + if is_np_sym: # convert to np symbol for using element-wise multiplication + proj = proj.as_np_ndarray() out = sym * proj + if is_np_sym: # convert to classic symbol so that make_loss can be used + out = out.as_nd_ndarray() out = mx.sym.make_loss(out) location = dict(list(location.items()) + diff --git a/python/mxnet/util.py b/python/mxnet/util.py index 5bc1dc809c88..d4e95e0c0c9c 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -20,6 +20,9 @@ import os import sys import functools +import itertools +import inspect +import threading from .base import _LIB, check_call @@ -76,14 +79,24 @@ def set_np_shape(active): >>> print(mx.is_np_shape()) True """ + if active: + import logging + logging.info('NumPy-shape semantics has been activated in your code. ' + 'This is required for creating and manipulating scalar and zero-size ' + 'tensors, which were not supported in MXNet before, as in the official ' + 'NumPy library. 
Please DO NOT manually deactivate this semantics while ' + 'using `mxnet.numpy` and `mxnet.numpy_extension` modules.') + elif is_np_array(): + raise ValueError('Deactivating NumPy shape semantics while NumPy array semantics is still' + ' active is not allowed. Please consider calling `npx.reset_np()` to' + ' deactivate both of them.') prev = ctypes.c_int() check_call(_LIB.MXSetIsNumpyShape(ctypes.c_int(active), ctypes.byref(prev))) return bool(prev.value) def is_np_shape(): - """ - Checks whether the NumPy shape semantics is currently turned on. + """Checks whether the NumPy shape semantics is currently turned on. In NumPy shape semantics, `()` represents the shape of scalar tensors, and tuples with `0` elements, for example, `(0,)`, `(1, 0, 2)`, represent the shapes of zero-size tensors. This is turned off by default for keeping @@ -213,39 +226,379 @@ def np_shape(active=True): return _NumpyShapeScope(active) +def wraps_safely(wrapped, assigned=functools.WRAPPER_ASSIGNMENTS): + """This function is a safe version of `functools.wraps` in Python2 which skips wrapping functions + for the attributes that do not exist.""" + if sys.version_info[0] > 2: + return functools.wraps(wrapped) + else: + return functools.wraps(wrapped, + assigned=itertools.ifilter( + functools.partial(hasattr, wrapped), assigned)) + + def use_np_shape(func): - """Wraps a function with an activated NumPy-shape scope. This ensures - that the execution of the function is guaranteed with the support of - scalar and zero-size tensors as in NumPy. + """A decorator wrapping a function or class with activated NumPy-shape semantics. + When `func` is a function, this ensures that the execution of the function is scoped with NumPy + shape semantics, such as the support for zero-dim and zero-size tensors. When + `func` is a class, it ensures that all the methods, static functions, and properties + of the class are executed with the NumPy shape semantics. + + Example:: + import mxnet as mx + @mx.use_np_shape + def scalar_one(): + return mx.nd.ones(()) + print(scalar_one()) + + @np.use_np_shape + class ScalarTensor(object): + def __init__(self, val=None): + if val is None: + val = ScalarTensor.random().value + self._scalar = mx.nd.ones(()) * val + + def __repr__(self): + print("Is __repr__ in np_shape semantics? {}!".format(str(np.is_np_shape()))) + return str(self._scalar.asnumpy()) + + @staticmethod + def random(): + val = mx.nd.random.uniform().asnumpy().item() + return ScalarTensor(val) + + @property + def value(self): + print("Is value property in np_shape semantics? {}!".format(str(np.is_np_shape()))) + return self._scalar.asnumpy().item() + + + print("Is global scope of np_shape activated? {}!".format(str(np.is_np_shape()))) + scalar_tensor = ScalarTensor() + print(scalar_tensor) + + Parameters + ---------- + func : a user-provided callable function or class to be scoped by the NumPy-shape semantics. + + Returns + ------- + Function or class + A function or class wrapped in the NumPy-shape scope.
+ """ + + if inspect.isclass(func): + for name, method in inspect.getmembers( + func, + predicate= + lambda f: inspect.isfunction(f) or inspect.ismethod(f) or isinstance(f, property)): + if isinstance(method, property): + setattr(func, name, property(use_np_shape(method.__get__), + method.__set__, + method.__delattr__, + method.__doc__)) + else: + setattr(func, name, use_np_shape(method)) + return func + elif callable(func): + @wraps_safely(func) + def _with_np_shape(*args, **kwargs): + with np_shape(active=True): + return func(*args, **kwargs) + return _with_np_shape + else: + raise TypeError('use_np_shape can only decorate classes and callable objects, ' + 'while received a {}'.format(str(type(func)))) + + +def _sanity_check_params(func_name, unsupported_params, param_dict): + for param_name in unsupported_params: + if param_name in param_dict: + raise NotImplementedError("function {} does not support parameter {}" + .format(func_name, param_name)) + + +def set_module(module): + """Decorator for overriding __module__ on a function or class. + + Example usage:: + + @set_module('mxnet.numpy') + def example(): + pass + + assert example.__module__ == 'numpy' + """ + def decorator(func): + if module is not None: + func.__module__ = module + return func + return decorator + + +class _NumpyArrayScope(object): + """Scope for managing NumPy array creation. This is often used + with `is_np_array=True` in initializer to enforce array creation + as type `mxnet.numpy.ndarray`, instead of `mx.nd.NDArray` in Gluon. + + Do not use this class directly. Use `np_array(active)` instead. + """ + _current = threading.local() + + def __init__(self, is_np_array): # pylint: disable=redefined-outer-name + self._old_scope = None + self._is_np_array = is_np_array + + def __enter__(self): + if not hasattr(_NumpyArrayScope._current, "value"): + _NumpyArrayScope._current.value = _NumpyArrayScope(False) + self._old_scope = _NumpyArrayScope._current.value + _NumpyArrayScope._current.value = self + return self + + def __exit__(self, ptype, value, trace): + assert self._old_scope + _NumpyArrayScope._current.value = self._old_scope + + +def np_array(active=True): + """Returns an activated/deactivated NumPy-array scope to be used in 'with' statement + and captures code that needs the NumPy-array semantics. + + Currently, this is used in Gluon to enforce array creation in `Block`s as type + `mxnet.numpy.ndarray`, instead of `mx.nd.NDArray`. + + It is recommended to use the decorator `use_np_array` to decorate the classes + that need this semantics, instead of using this function in a `with` statement + unless you know exactly what has been scoped by this semantics. Please note that this is designed as an infrastructure for the incoming MXNet-NumPy operators. Legacy operators registered in the modules `mx.nd` and `mx.sym` are not guaranteed to behave like their counterparts in NumPy even within this scope. + Parameters + ---------- + active : bool + Indicates whether to activate NumPy-array semantics. + + Returns + ------- + _NumpyShapeScope + A scope object for wrapping the code w/ or w/o NumPy-shape semantics. + """ + return _NumpyArrayScope(active) + + +def is_np_array(): + """Checks whether the NumPy-array semantics is currently turned on. + This is currently used in Gluon for checking whether an array of type `mxnet.numpy.ndarray` + or `mx.nd.NDArray` should be created. 
For example, at the time when a parameter + is created in a `Block`, an `mxnet.numpy.ndarray` is created if this returns true; else + an `mx.nd.NDArray` is created. + + Normally, users are not recommended to use this API directly unless they know exactly + what is going on under the hood. + + Please note that this is designed as an infrastructure for the incoming + MXNet-NumPy operators. Legacy operators registered in the modules + `mx.nd` and `mx.sym` are not guaranteed to behave like their counterparts + in NumPy within this semantics. + + Returns + ------- + A bool value indicating whether the NumPy-array semantics is currently on. + """ + return _NumpyArrayScope._current.value._is_np_array if hasattr( + _NumpyArrayScope._current, "value") else False + + +def use_np_array(func): + """A decorator wrapping Gluon `Block`s and all its methods, properties, and static functions + with the semantics of NumPy-array, which means that where ndarrays are created, + `mxnet.numpy.ndarray`s should be created, instead of legacy ndarrays of type `mx.nd.NDArray`. + For example, at the time when a parameter is created in a `Block`, an `mxnet.numpy.ndarray` + is created if it's decorated with this decorator. + + Example:: + import mxnet as mx + from mxnet import gluon, np + + + class TestHybridBlock1(gluon.HybridBlock): + def __init__(self): + super(TestHybridBlock1, self).__init__() + self.w = self.params.get('w', shape=(2, 2)) + + def hybrid_forward(self, F, x, w): + return F.dot(x, w) + + + x = mx.nd.ones((2, 2)) + net1 = TestHybridBlock1() + net1.initialize() + out = net1.forward(x) + for _, v in net1.collect_params().items(): + assert type(v.data()) is mx.nd.NDArray + assert type(out) is mx.nd.NDArray + + + @np.use_np_array + class TestHybridBlock2(gluon.HybridBlock): + def __init__(self): + super(TestHybridBlock2, self).__init__() + self.w = self.params.get('w', shape=(2, 2)) + + def hybrid_forward(self, F, x, w): + return F.np.dot(x, w) + + + x = np.ones((2, 2)) + net2 = TestHybridBlock2() + net2.initialize() + out = net2.forward(x) + for _, v in net2.collect_params().items(): + print(type(v.data())) + assert type(v.data()) is np.ndarray + assert type(out) is np.ndarray Parameters ---------- - func : a user-provided callable function to be scoped by the NumPy-shape semantics. + func : a user-provided callable function or class to be scoped by the NumPy-array semantics. Returns ------- - Function - A function for wrapping the user functions in the NumPy-shape semantics. + Function or class + A function or class wrapped in the NumPy-array scope. + """ + if inspect.isclass(func): + for name, method in inspect.getmembers( + func, + predicate= + lambda f: inspect.isfunction(f) or inspect.ismethod(f) or isinstance(f, property)): + if isinstance(method, property): + setattr(func, name, property(use_np_array(method.__get__), + method.__set__, + method.__delattr__, + method.__doc__)) + else: + setattr(func, name, use_np_array(method)) + return func + elif callable(func): + @wraps_safely(func) + def _with_np_array(*args, **kwargs): + with np_array(active=True): + return func(*args, **kwargs) + return _with_np_array + else: + raise TypeError('use_np_array can only decorate classes and callable objects, ' + 'while received a {}'.format(str(type(func)))) - Examples - -------- - >>> import mxnet as mx - >>> @mx.use_np_shape - ... def scalar_one(): - ... return mx.nd.ones(()) - ...
- >>> print(scalar_one()) +def use_np(func): + """A convenience decorator for wrapping user provided functions and classes in the scope of + both NumPy-shape and NumPy-array semantics, which means that (1) empty tuples `()` and tuples + with zeros, such as `(0, 1)`, `(1, 0, 2)`, will be treated as scalar tensors' shapes and + zero-size tensors' shapes in shape inference functions of operators, instead of as unknown + in legacy mode; (2) ndarrays of type `mxnet.numpy.ndarray` should be created instead of + `mx.nd.NDArray`. + + Example:: + import mxnet as mx + from mxnet import gluon, np + + + class TestHybridBlock1(gluon.HybridBlock): + def __init__(self): + super(TestHybridBlock1, self).__init__() + self.w = self.params.get('w', shape=(2, 2)) + + def hybrid_forward(self, F, x, w): + return F.dot(x, w) + F.ones((1,)) + + + x = mx.nd.ones((2, 2)) + net1 = TestHybridBlock1() + net1.initialize() + out = net1.forward(x) + for _, v in net1.collect_params().items(): + assert type(v.data()) is mx.nd.NDArray + assert type(out) is mx.nd.NDArray + + + @np.use_np + class TestHybridBlock2(gluon.HybridBlock): + def __init__(self): + super(TestHybridBlock2, self).__init__() + self.w = self.params.get('w', shape=(2, 2)) + + def hybrid_forward(self, F, x, w): + return F.np.dot(x, w) + F.np.ones(()) + + + x = np.ones((2, 2)) + net2 = TestHybridBlock2() + net2.initialize() + out = net2.forward(x) + for _, v in net2.collect_params().items(): + print(type(v.data())) + assert type(v.data()) is np.ndarray + assert type(out) is np.ndarray + + Parameters + ---------- + func : a user-provided callable function or class to be scoped by the + NumPy-shape and NumPy-array semantics. + + Returns + ------- + Function or class + A function or class wrapped in the Numpy-shape and NumPy-array scope. """ - @functools.wraps(func) - def _with_np_shape(*args, **kwargs): - with np_shape(active=True): - return func(*args, **kwargs) + return use_np_shape(use_np_array(func)) + + +def _set_np_array(active): + """Turns on/off NumPy array semantics for the current thread in which `mxnet.numpy.ndarray` + is expected to be created, instead of the legacy `mx.nd.NDArray`. + + Parameters + --------- + active : bool + A boolean value indicating whether the NumPy-array semantics should be turned on or off. + + Returns + ------- + A bool value indicating the previous state of NumPy array semantics. + """ + if active: + import logging + logging.info('NumPy array semantics has been activated in your code. This allows you' + ' to use operators from MXNet NumPy and NumPy Extension modules as well' + ' as MXNet NumPy `ndarray`s.') + cur_state = is_np_array() + _NumpyArrayScope._current.value = _NumpyArrayScope(active) + return cur_state + + +def set_np(shape=True, array=True): + """Setting NumPy shape and array semantics at the same time. + It is required to keep NumPy shape semantics active when activating NumPy array semantics. + Deactivating NumPy shape semantics while NumPy array semantics is still active is not allowed. + + Parameters + ---------- + shape : bool + A boolean value indicating whether the NumPy-shape semantics should be turned on or off. + array : bool + A boolean value indicating whether the NumPy-array semantics should be turned on or off. 
+ """ + if not shape and array: + raise ValueError('NumPy Shape semantics is required in using NumPy array semantics.') + _set_np_array(array) + set_np_shape(shape) + - return _with_np_shape +def reset_np(): + """Deactivate NumPy shape and array semantics at the same time.""" + set_np(shape=False, array=False) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index ffe6d8dcdbdc..13f22191955d 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1604,3 +1604,12 @@ int MXStorageEmptyCache(int dev_type, int dev_id) { Storage::Get()->ReleaseAll(ctx); API_END(); } + +int MXShallowCopyNDArray(NDArrayHandle src_handle, NDArrayHandle* out) { + NDArray* ret = nullptr; + API_BEGIN(); + NDArray* src_array = static_cast(src_handle); + ret = new NDArray(*src_array); + *out = ret; + API_END_HANDLE_ERROR(delete ret); +} diff --git a/src/c_api/c_api_common.h b/src/c_api/c_api_common.h index 013ecab93da8..233acc85f36b 100644 --- a/src/c_api/c_api_common.h +++ b/src/c_api/c_api_common.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 4c6229ee29b0..930b03c4d366 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -855,11 +855,20 @@ int MXGenAtomicSymbolFromSymbol(SymbolHandle sym_handle, SymbolHandle *ret_sym_h API_BEGIN(); nnvm::Symbol *source = static_cast(sym_handle); CHECK_EQ(source->outputs.size(), 1U) - << "Generating atomic symbol from other symbol only works for nongrouped symbol."; - const auto& node = source->outputs[0]; + << "Generating atomic symbol from other symbol only works for nongrouped symbol."; + const auto &node = source->outputs[0]; const auto *op = node.node->op(); const auto attrs = source->ListAttrs(nnvm::Symbol::ListAttrOption::kShallow); *s = nnvm::Symbol::CreateFunctor(op, attrs); *ret_sym_handle = s; API_END_HANDLE_ERROR(delete s); } + +int MXShallowCopySymbol(SymbolHandle src, SymbolHandle* out) { + nnvm::Symbol* out_sym = new nnvm::Symbol; + API_BEGIN(); + nnvm::Symbol* src_sym = static_cast(src); + *out_sym = *src_sym; + *out = out_sym; + API_END_HANDLE_ERROR(delete out_sym); +} diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 067bb2ea750f..562e71775774 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -884,7 +884,6 @@ inline std::multimap AllocateMemory( } CHECK_EQ(stypes[i], kDefaultStorage); if (mem_plan[i].root == i) { - CHECK_GT(mem_plan[i].size, 0); auto iter = pool.lower_bound(mem_plan[i].size); if (iter != pool.end()) { *arrays[i] = iter->second.AsArray(shapes[i], dtypes[i]); diff --git a/src/io/image_io.cc b/src/io/image_io.cc index c0357998f31c..db9ac7682287 100644 --- a/src/io/image_io.cc +++ b/src/io/image_io.cc @@ -357,6 +357,7 @@ inline void copyMakeBorder(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(_cvimdecode) +.add_alias("_npi_cvimdecode") .describe("Decode image with OpenCV. \n" "Note: return image in RGB by default, " "instead of OpenCV's default BGR.") @@ -368,6 +369,7 @@ NNVM_REGISTER_OP(_cvimdecode) .add_arguments(ImdecodeParam::__FIELDS__()); NNVM_REGISTER_OP(_cvimread) +.add_alias("_npi_cvimread") .describe("Read and decode image with OpenCV. \n" "Note: return image in RGB by default, " "instead of OpenCV's default BGR.") @@ -378,6 +380,7 @@ NNVM_REGISTER_OP(_cvimread) .add_arguments(ImreadParam::__FIELDS__()); NNVM_REGISTER_OP(_cvimresize) +.add_alias("_npi_cvimresize") .describe("Resize image with OpenCV. 
\n") .set_num_inputs(1) .set_num_outputs(1) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index bee8bef37b44..d8cb9317342e 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -312,7 +312,7 @@ NDArray NDArray::AtWithRecord(index_t idx) { CHECK(storage_type() == kDefaultStorage) << "Storage type " << storage_type() << " doesn't support At()"; NDArray ret = this->SliceWithRecord(idx, idx+1); - if (shape_.ndim() > 1) { + if (shape_.ndim() > 1 || Imperative::Get()->is_np_shape()) { return ret.ReshapeWithRecord(mxnet::TShape(shape_.data()+1, shape_.data()+shape_.ndim())); } else { return ret; @@ -1205,7 +1205,10 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority, bool is_op << "from.shape = " << from.shape() << " to.shape=" << to.shape(); CHECK(!mxnet::op::shape_is_none(from.shape())) << "source operands have undefined shape"; - if (from.shape().Size() == 0U) return; + // zero-size array, no need to copy + if (from.shape().Size() == 0U) { + return; + } // important: callback must always capture by value const Context from_ctx = from.ctx(); const int a = from_ctx.dev_mask(); @@ -1725,7 +1728,7 @@ bool NDArray::Load(dmlc::Stream *strm) { CHECK(!Imperative::Get()->is_np_shape()) << "ndarray was not saved in np shape semantics, but being loaded in np shape semantics." " Please turn off np shape semantics in Python using `with np_shape(False)`" - " to scope of the code of loading the ndarray."; + " to scope the code of loading the ndarray."; } if (magic != NDARRAY_V2_MAGIC && magic != NDARRAY_V3_MAGIC) { return LegacyLoad(strm, magic); @@ -1865,6 +1868,10 @@ void NDArray::SyncCopyFromCPU(const void *data, size_t size) const { mxnet::TShape dshape = this->shape(); CHECK_EQ(dshape.Size(), size) << "Memory size do not match"; + // zero-size array, no need to copy + if (size == 0U) { + return; + } TBlob src((void*)data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*) if (this->ctx().dev_mask() == cpu::kDevMask) { @@ -1996,6 +2003,10 @@ void NDArray::SyncCopyToCPU(void *data, size_t size) const { mxnet::TShape dshape = this->shape(); CHECK_EQ(dshape.Size(), size) << "Memory size do not match"; + // zero-size array, no need to copy + if (size == 0U) { + return; + } TBlob dst(data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*) if (this->ctx().dev_mask() == cpu::kDevMask) { diff --git a/src/operator/contrib/multibox_detection.cc b/src/operator/contrib/multibox_detection.cc index 65fe5f1208bb..adb254853d2f 100644 --- a/src/operator/contrib/multibox_detection.cc +++ b/src/operator/contrib/multibox_detection.cc @@ -221,5 +221,9 @@ MXNET_REGISTER_OP_PROPERTY(_contrib_MultiBoxDetection, MultiBoxDetectionProp) .add_argument("loc_pred", "NDArray-or-Symbol", "Location regression predictions.") .add_argument("anchor", "NDArray-or-Symbol", "Multibox prior anchor boxes") .add_arguments(MultiBoxDetectionParam::__FIELDS__()); + +NNVM_REGISTER_OP(_contrib_MultiBoxDetection) +.add_alias("_npx_multibox_detection"); + } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/multibox_prior.cc b/src/operator/contrib/multibox_prior.cc index 2ad173a2dd93..66fd2c11517a 100644 --- a/src/operator/contrib/multibox_prior.cc +++ b/src/operator/contrib/multibox_prior.cc @@ -100,5 +100,8 @@ MXNET_REGISTER_OP_PROPERTY(_contrib_MultiBoxPrior, MultiBoxPriorProp) .add_arguments(MultiBoxPriorParam::__FIELDS__()) .describe("Generate prior(anchor) boxes from data, sizes and ratios."); +NNVM_REGISTER_OP(_contrib_MultiBoxPrior) +.add_alias("_npx_multibox_prior"); + 
} // namespace op } // namespace mxnet diff --git a/src/operator/contrib/multibox_target.cc b/src/operator/contrib/multibox_target.cc index a1808c5a7c81..feab3977f82c 100644 --- a/src/operator/contrib/multibox_target.cc +++ b/src/operator/contrib/multibox_target.cc @@ -307,5 +307,9 @@ MXNET_REGISTER_OP_PROPERTY(_contrib_MultiBoxTarget, MultiBoxTargetProp) .add_argument("label", "NDArray-or-Symbol", "Object detection labels.") .add_argument("cls_pred", "NDArray-or-Symbol", "Class predictions.") .add_arguments(MultiBoxTargetParam::__FIELDS__()); + +NNVM_REGISTER_OP(_contrib_MultiBoxTarget) +.add_alias("_npx_multibox_target"); + } // namespace op } // namespace mxnet diff --git a/src/operator/image/crop.cc b/src/operator/image/crop.cc index 52d2f11a464b..6067f89d7033 100644 --- a/src/operator/image/crop.cc +++ b/src/operator/image/crop.cc @@ -35,6 +35,7 @@ namespace image { DMLC_REGISTER_PARAMETER(CropParam); NNVM_REGISTER_OP(_image_crop) +.add_alias("_npx__image_crop") .describe(R"code(Crop an image NDArray of shape (H x W x C) or (N x H x W x C) to the given size. Example: diff --git a/src/operator/image/image_random.cc b/src/operator/image/image_random.cc index 34f4cb4d395c..0c4603ecc475 100644 --- a/src/operator/image/image_random.cc +++ b/src/operator/image/image_random.cc @@ -39,6 +39,7 @@ DMLC_REGISTER_PARAMETER(RandomLightingParam); DMLC_REGISTER_PARAMETER(RandomColorJitterParam); NNVM_REGISTER_OP(_image_to_tensor) +.add_alias("_npx__image_to_tensor") .describe(R"code(Converts an image NDArray of shape (H x W x C) or (N x H x W x C) with values in the range [0, 255] to a tensor NDArray of shape (C x H x W) or (N x C x H x W) with values in the range [0, 1] @@ -102,6 +103,7 @@ with values in the range [0, 1] .add_argument("data", "NDArray-or-Symbol", "Input ndarray"); NNVM_REGISTER_OP(_image_normalize) +.add_alias("_npx__image_normalize") .describe(R"code(Normalize an tensor of shape (C x H x W) or (N x C x H x W) with mean and standard deviation. 
@@ -189,28 +191,34 @@ NNVM_REGISTER_OP(_backward_image_normalize) .set_attr("FCompute", NormalizeOpBackward); MXNET_REGISTER_IMAGE_AUG_OP(_image_flip_left_right) +.add_alias("_npx__image_flip_left_right") .describe(R"code()code" ADD_FILELINE) .set_attr("FCompute", FlipLeftRight); MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_flip_left_right) +.add_alias("_npx__image_random_flip_left_right") .describe(R"code()code" ADD_FILELINE) .set_attr("FCompute", RandomFlipLeftRight); MXNET_REGISTER_IMAGE_AUG_OP(_image_flip_top_bottom) +.add_alias("_npx__image_flip_top_bottom") .describe(R"code()code" ADD_FILELINE) .set_attr("FCompute", FlipTopBottom); MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_flip_top_bottom) +.add_alias("_npx__image_random_flip_top_bottom") .describe(R"code()code" ADD_FILELINE) .set_attr("FCompute", RandomFlipTopBottom); MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_brightness) +.add_alias("_npx__image_random_brightness") .describe(R"code()code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", RandomBrightness) .add_arguments(RandomEnhanceParam::__FIELDS__()); MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_contrast) +.add_alias("_npx__image_random_contrast") .describe(R"code()code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", RandomContrast) @@ -218,6 +226,7 @@ MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_contrast) MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_saturation) +.add_alias("_npx__image_random_saturation") .describe(R"code()code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", RandomSaturation) @@ -225,6 +234,7 @@ MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_saturation) MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_hue) +.add_alias("_npx__image_random_hue") .describe(R"code()code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", RandomHue) @@ -232,6 +242,7 @@ MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_hue) MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_color_jitter) +.add_alias("_npx__image_random_color_jitter") .describe(R"code()code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", RandomColorJitter) @@ -239,6 +250,7 @@ MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_color_jitter) MXNET_REGISTER_IMAGE_AUG_OP(_image_adjust_lighting) +.add_alias("_npx__image_adjust_lighting") .describe(R"code(Adjust the lighting level of the input. Follow the AlexNet style.)code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", AdjustLighting) @@ -246,6 +258,7 @@ MXNET_REGISTER_IMAGE_AUG_OP(_image_adjust_lighting) MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_lighting) +.add_alias("_npx__image_random_lighting") .describe(R"code(Randomly add PCA noise. 
Follow the AlexNet style.)code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", RandomLighting) diff --git a/src/operator/image/resize.cc b/src/operator/image/resize.cc index d93769faa8b3..d2397ea72685 100644 --- a/src/operator/image/resize.cc +++ b/src/operator/image/resize.cc @@ -34,6 +34,7 @@ namespace image { DMLC_REGISTER_PARAMETER(ResizeParam); NNVM_REGISTER_OP(_image_resize) +.add_alias("_npx__image_resize") .describe(R"code(Resize an image NDArray of shape (H x W x C) or (N x H x W x C) to the given size Example: diff --git a/src/operator/leaky_relu.cc b/src/operator/leaky_relu.cc index 214e41a84611..c25833b799d0 100644 --- a/src/operator/leaky_relu.cc +++ b/src/operator/leaky_relu.cc @@ -71,6 +71,7 @@ The following modified ReLU Activation functions are supported: .add_arguments(LeakyReLUParam::__FIELDS__()); NNVM_REGISTER_OP(LeakyReLU) +.add_alias("_npx_leaky_relu") .set_attr("FSetInputVarAttrOnCompose", [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) { if (index == 1 && var->attrs.dict.find("__init__") == var->attrs.dict.end()) { diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc index 5b6cece4a92e..5abb6670c9b0 100644 --- a/src/operator/nn/activation.cc +++ b/src/operator/nn/activation.cc @@ -154,6 +154,7 @@ inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs, MXNET_OPERATOR_REGISTER_UNARY(Activation) +.add_alias("_npx_activation") .describe(R"code(Applies an activation function element-wise to the input. The following activation functions are supported: diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index 2564609c6b90..6382d46d272d 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -520,6 +520,7 @@ std::vector BatchNormGrad(const nnvm::NodePtr& n, } NNVM_REGISTER_OP(BatchNorm) +.add_alias("_npx_batch_norm") .describe(R"code(Batch normalization. Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc index 8fb229889332..80469b5385eb 100644 --- a/src/operator/nn/concat.cc +++ b/src/operator/nn/concat.cc @@ -32,9 +32,9 @@ namespace mxnet { namespace op { -static bool ConcatShape(const nnvm::NodeAttrs& attrs, - mxnet::ShapeVector *in_shape, - mxnet::ShapeVector *out_shape) { +bool ConcatShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_shape, + mxnet::ShapeVector *out_shape) { using namespace mshadow; const ConcatParam& param_ = nnvm::get(attrs.parsed); CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); @@ -138,9 +138,9 @@ static bool RNNParamConcatShape(const nnvm::NodeAttrs& attrs, return shape_is_known(dshape); } -static bool ConcatType(const nnvm::NodeAttrs& attrs, - std::vector *in_type, - std::vector *out_type) { +bool ConcatType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, + std::vector *out_type) { const ConcatParam& param_ = nnvm::get(attrs.parsed); int dtype = -1; @@ -403,6 +403,7 @@ NNVM_REGISTER_OP(_backward_Concat) // which handles the case where the first one or two inputs may have // unknown shape that can be inferred from output shape. 
NNVM_REGISTER_OP(_rnn_param_concat) +.add_alias("_npi_rnn_param_concat") #if MXNET_USE_MKLDNN == 1 .set_attr("FResourceRequest", [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index 536e9a731171..32ed93e4a463 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -397,6 +397,7 @@ struct ConvolutionGrad { }; NNVM_REGISTER_OP(Convolution) +.add_alias("_npx_convolution") .describe(R"code(Compute *N*-D convolution on *(N+2)*-D input. In the 2-D convolution, given input data with shape *(batch_size, diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index 09b255d009e0..9f461f4e9de3 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -408,6 +408,7 @@ struct DeconvolutionGrad { DMLC_REGISTER_PARAMETER(DeconvolutionParam); NNVM_REGISTER_OP(Deconvolution) +.add_alias("_npx_deconvolution") .describe("Computes 1D or 2D transposed convolution (aka fractionally strided convolution) of the " "input tensor. This operation can be seen as the gradient of Convolution operation with " "respect to its input. Convolution usually reduces the size of the input. Transposed " diff --git a/src/operator/nn/dropout.cc b/src/operator/nn/dropout.cc index d2fe3a5651a4..c4ad4250853e 100644 --- a/src/operator/nn/dropout.cc +++ b/src/operator/nn/dropout.cc @@ -63,6 +63,7 @@ struct DropoutGrad { DMLC_REGISTER_PARAMETER(DropoutParam); NNVM_REGISTER_OP(Dropout) +.add_alias("_npx_dropout") .describe(R"(Applies dropout operation to input array. - During training, each element of the input is set to zero with probability p. diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index 27f6595aee9e..06ad6d034398 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -244,6 +244,7 @@ DMLC_REGISTER_PARAMETER(FullyConnectedParam); NNVM_REGISTER_OP(FullyConnected) MXNET_ADD_SPARSE_OP_ALIAS(FullyConnected) +.add_alias("_npx_fully_connected") .describe(R"code(Applies a linear transformation: :math:`Y = XW^T + b`. If ``flatten`` is set to be true, then the shapes are: diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc index e95f47255d7a..0b53d5091194 100644 --- a/src/operator/nn/layer_norm.cc +++ b/src/operator/nn/layer_norm.cc @@ -127,6 +127,7 @@ void LayerNormGradCompute(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(LayerNorm) +.add_alias("_npx_layer_norm") .describe(R"code(Layer normalization. Normalizes the channels of the input tensor by mean and variance, and applies a scale ``gamma`` as diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 870557756128..8a3e90da3e71 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -364,7 +364,8 @@ inline static bool BackwardPoolingStorageType(const nnvm::NodeAttrs &attrs, DMLC_REGISTER_PARAMETER(PoolingParam); NNVM_REGISTER_OP(Pooling) - .describe(R"code(Performs pooling on the input. +.add_alias("_npx_pooling") +.describe(R"code(Performs pooling on the input. The shapes for 1-D pooling are diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc index e44bbbb6b8f6..8f1b2e06c371 100644 --- a/src/operator/nn/softmax.cc +++ b/src/operator/nn/softmax.cc @@ -68,6 +68,7 @@ inline static bool SoftmaxStorageType(const nnvm::NodeAttrs& attrs, #endif NNVM_REGISTER_OP(softmax) +.add_alias("_npx_softmax") .describe(R"code(Applies the softmax function. 
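The `_npx_*` aliases registered in the hunks above (convolution, deconvolution, dropout, fully-connected, layer norm, pooling, softmax, together with the activation and batch-norm aliases in the preceding hunks) expose the classic NN operators inside the `npx` namespace so they can be applied to `mxnet.np` arrays. A sketch of the intended call pattern, assuming frontend wrappers that match the alias names::

    from mxnet import np, npx
    npx.set_np()

    x = np.random.uniform(size=(2, 3))
    y = npx.softmax(x, axis=-1)   # dispatches to the _npx_softmax alias registered above
    assert y.shape == (2, 3)      # rows of y sum to 1, as the docstring below describes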
The resulting array contains elements in the range (0,1) and the elements along the given axis sum up to 1. @@ -182,6 +183,7 @@ NNVM_REGISTER_OP(_backward_softmin) mxnet_op::softmax_bwd, true>); NNVM_REGISTER_OP(log_softmax) +.add_alias("_npx_log_softmax") .describe(R"code(Computes the log softmax of the input. This is equivalent to computing softmax followed by log. diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cc b/src/operator/numpy/np_elemwise_broadcast_op.cc new file mode 100644 index 000000000000..c36423dff9fd --- /dev/null +++ b/src/operator/numpy/np_elemwise_broadcast_op.cc @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_elemwise_broadcast_op.cc + * \brief CPU Implementation of basic functions for elementwise numpy binary broadcast operator. + */ + +#include "../tensor/elemwise_binary_broadcast_op.h" +#include "../tensor/elemwise_binary_scalar_op.h" + +namespace mxnet { +namespace op { + +bool NumpyBinaryScalarType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + return in_attrs->at(0) != -1; +} + +#define MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(name) \ + NNVM_REGISTER_OP(name) \ + .set_num_inputs(1) \ + .set_num_outputs(1) \ + .set_attr_parser([](NodeAttrs* attrs) { \ + attrs->parsed = std::stod(attrs->dict["scalar"]); \ + }) \ + .set_attr("FInferShape", ElemwiseShape<1, 1>) \ + .set_attr("FInferType", NumpyBinaryScalarType) \ + .set_attr("FInplaceOption", \ + [](const NodeAttrs& attrs){ \ + return std::vector >{{0, 0}}; \ + }) \ + .add_argument("data", "NDArray-or-Symbol", "source input") \ + .add_argument("scalar", "float", "scalar input") + + +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_add) +.describe(R"code(Add arguments element-wise with broadcasting if necessary. + +Example:: + + x = [[ 1., 1., 1.], + [ 1., 1., 1.]] + + y = [[ 0.], + [ 1.]] + + add(x, y) = [[ 1., 1., 1.], + [ 2., 2., 2.]] + +)code" ADD_FILELINE) +.set_attr("FCompute", BinaryBroadcastCompute) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_broadcast_add"}); + +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_subtract) +.describe(R"code(Subtract arguments element-wise with broadcasting if necessary.
+ +Example:: + + x = [[ 1., 1., 1.], + [ 1., 1., 1.]] + + y = [[ 0.], + [ 1.]] + + subtract(x, y) = [[ 1., 1., 1.], + [ 0., 0., 0.]] + +)code" ADD_FILELINE) +.set_attr("FCompute", BinaryBroadcastCompute) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_broadcast_sub"}); + +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_multiply) +.describe(R"code(Multiply arguments with broadcasting if necessary. + +Example:: + + x = [[ 1., 1., 1.], + [ 1., 1., 1.]] + + y = [[ 0.], + [ 1.]] + + multiply(x, y) = [[ 0., 0., 0.], + [ 1., 1., 1.]] + +)code" ADD_FILELINE) +.set_attr("FCompute", BinaryBroadcastCompute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_mul"}); + +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_mod) +.describe(R"code(Return element-wise remainder of division. +It is equivalent to the Python modulus operator ``x1 % x2`` and has the same sign as the divisor x2. + +Example:: + + x = [[ 8., 8., 8.], + [ 8., 8., 8.]] + + y = [[ 2.], + [ 3.]] + + mod(x, y) = [[ 0., 0., 0.], + [ 2., 2., 2.]] + +)code" ADD_FILELINE) +.set_attr("FCompute", BinaryBroadcastCompute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_mod"}); + +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_power) +.describe(R"code(First array elements raised to powers from second array, element-wise. + +Raise each base in x1 to the positionally-corresponding power in x2. x1 and x2 must be +broadcastable to the same shape. + +Example:: + + x = [[ 2., 2., 2.], + [ 2., 2., 2.]] + + y = [[ 1.], + [ 2.]] + + power(x, y) = [[ 2., 2., 2.], + [ 4., 4., 4.]] + +)code" ADD_FILELINE) +.set_attr("FCompute", BinaryBroadcastCompute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_power"}); + +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_add_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseNone{"_copy"}); + +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_subtract_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseNone{"_copy"}); + +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rsubtract_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseNone{"negative"}); + +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_multiply_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_mul_scalar"}); + +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_mod_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_mod_scalar"}); + +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rmod_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_rmod_scalar"}); + +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_power_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_power_scalar"}); + +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rpower_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseOut{"_backward_rpower_scalar"}); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cu b/src/operator/numpy/np_elemwise_broadcast_op.cu new file mode 100644 index 000000000000..c858b3a4987a --- /dev/null +++ b/src/operator/numpy/np_elemwise_broadcast_op.cu @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_elemwise_broadcast_op.cu + * \brief GPU Implementation of basic functions for elementwise binary broadcast operator. + */ +#include "../tensor/elemwise_binary_broadcast_op.h" +#include "../tensor/elemwise_binary_scalar_op.h" + +namespace mxnet { +namespace op { +NNVM_REGISTER_OP(_npi_add) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_npi_subtract) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_npi_multiply) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_npi_mod) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_npi_power) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_npi_maximum) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_npi_minimum) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_npi_add_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_npi_subtract_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_npi_rsubtract_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_npi_multiply_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_npi_mod_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_npi_rmod_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_npi_power_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_npi_rpower_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_npi_maximum_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_npi_minimum_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_init_op.cc b/src/operator/numpy/np_init_op.cc new file mode 100644 index 000000000000..83a44c8ae280 --- /dev/null +++ b/src/operator/numpy/np_init_op.cc @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_init_op.cc + * \brief CPU Implementation of numpy init op + */ +#include "../tensor/init_op.h" +#include "../tensor/elemwise_unary_op.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_npi_zeros) +.describe("Return a new array of given shape, type, and context, filled with zeros.") +.set_num_inputs(0) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", InitShape) +.set_attr("FInferType", InitType) +.set_attr("FInferStorageType", InitStorageType) +.set_attr("FCompute", FillCompute) +.add_arguments(InitOpParam::__FIELDS__()); + +NNVM_REGISTER_OP(_npi_ones) +.describe("Return a new array of given shape, type, and context, filled with ones.") +.set_num_inputs(0) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", InitShape) +.set_attr("FInferType", InitType) +.set_attr("FCompute", FillCompute) +.add_arguments(InitOpParam::__FIELDS__()); + +NNVM_REGISTER_OP(_np_zeros_like) +.describe(R"code(Return an array of zeros with the same shape and type as a given array. + +Examples:: + + x = [[ 1., 1., 1.], + [ 1., 1., 1.]] + + zeros_like(x) = [[ 0., 0., 0.], + [ 0., 0., 0.]] + +)code") +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FIgnoreInputs", + [](const NodeAttrs& attrs) { + return std::vector(1, 0); + }) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"a"}; + }) +.set_attr("FCompute", FillCompute) +.set_attr("FGradient", MakeZeroGradNodes) +.add_argument("a", "NDArray-or-Symbol", + "The shape and data-type of a define these same attributes of the returned array."); + +NNVM_REGISTER_OP(_np_ones_like) +.describe(R"code(Return an array of ones with the same shape and type as a given array. + +Examples:: + + x = [[ 0., 0., 0.], + [ 0., 0., 0.]] + + ones_like(x) = [[ 1., 1., 1.], + [ 1., 1., 1.]] + +)code") +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FIgnoreInputs", + [](const NodeAttrs& attrs) { + return std::vector(1, 0); + }) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"a"}; + }) +.set_attr("FCompute", FillCompute) +.set_attr("FGradient", MakeZeroGradNodes) +.add_argument("a", "NDArray-or-Symbol", + "The shape and data-type of a define these same attributes of the returned array."); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_init_op.cu b/src/operator/numpy/np_init_op.cu new file mode 100644 index 000000000000..2eb8ed6d83b7 --- /dev/null +++ b/src/operator/numpy/np_init_op.cu @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_init_op.cu + * \brief GPU Implementation of numpy init op + */ + +#include "../tensor/init_op.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_npi_zeros) +.set_attr("FCompute", FillCompute); + +NNVM_REGISTER_OP(_npi_ones) +.set_attr("FCompute", FillCompute); + +NNVM_REGISTER_OP(_np_zeros_like) +.set_attr("FCompute", FillCompute); + +NNVM_REGISTER_OP(_np_ones_like) +.set_attr("FCompute", FillCompute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_true_divide.cc b/src/operator/numpy/np_true_divide.cc new file mode 100644 index 000000000000..429762778700 --- /dev/null +++ b/src/operator/numpy/np_true_divide.cc @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_true_divide.cc + * \brief CPU Implementation of true_divide operator. + */ +#include "../tensor/elemwise_binary_broadcast_op.h" +#include "../tensor/elemwise_binary_scalar_op.h" + +namespace mxnet { +namespace op { + +template +bool TrueDivideType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), static_cast(num_inputs)); + CHECK_EQ(out_attrs->size(), 1U); + for (const int dtype : *in_attrs) { + if (dtype == -1) return false; + } + if (num_inputs == 2) { + const int lhs_dtype = in_attrs->at(0); + const int rhs_dtype = in_attrs->at(1); + CHECK_EQ(lhs_dtype, rhs_dtype) + << "_true_divide currently only supports same dtype for dividend and divisor"; + } + auto is_float = [](const int dtype) { + return dtype == mshadow::kFloat32 || dtype == mshadow::kFloat64 || dtype == mshadow::kFloat16; + }; + + for (const int dtype : *in_attrs) { + CHECK(is_float(dtype)) << "_true_divide currently only supports float dtype"; + } + TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + return true; +} + +NNVM_REGISTER_OP(_npi_true_divide) +.describe(R"code( +Returns a true division of the inputs, element-wise. + +It currently only supports dtype float16, float32, and float64. 
+ +Example:: + + x = [[ 6., 6., 6.], + [ 6., 6., 6.]] + + y = [[ 2.], + [ 3.]] + + _true_divide(x, y) = [[ 3., 3., 3.], + [ 2., 2., 2.]] + +)code" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"lhs", "rhs"}; + }) +.set_attr("FInferShape", BinaryBroadcastShape) +.set_attr("FInferType", TrueDivideType<2>) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}, {1, 0}}; + }) +.set_attr("FCompute", BinaryBroadcastCompute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_div"}) +.add_argument("lhs", "NDArray-or-Symbol", "Dividend array") +.add_argument("rhs", "NDArray-or-Symbol", "Divisor array"); + +NNVM_REGISTER_OP(_npi_true_divide_scalar) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser([](NodeAttrs* attrs) { + attrs->parsed = std::stod(attrs->dict["scalar"]); + }) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr("FInferType", TrueDivideType<1>) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 0}}; + }) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_div_scalar"}) +.add_argument("data", "NDArray-or-Symbol", "source input") +.add_argument("scalar", "float", "scalar input"); + +NNVM_REGISTER_OP(_npi_rtrue_divide_scalar) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser([](NodeAttrs* attrs) { + attrs->parsed = std::stod(attrs->dict["scalar"]); + }) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr("FInferType", TrueDivideType<1>) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 0}}; + }) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_rdiv_scalar"}) +.add_argument("data", "NDArray-or-Symbol", "source input") +.add_argument("scalar", "float", "scalar input"); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_true_divide.cu b/src/operator/numpy/np_true_divide.cu new file mode 100644 index 000000000000..be10c44f92a1 --- /dev/null +++ b/src/operator/numpy/np_true_divide.cu @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_true_divide.cu + * \brief GPU Implementation of true_divide operator. 
+ */ +#include "../tensor/elemwise_binary_broadcast_op.h" +#include "../tensor/elemwise_binary_scalar_op.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_npi_true_divide) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_npi_true_divide_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_npi_rtrue_divide_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/quantization/quantized_concat.cc b/src/operator/quantization/quantized_concat.cc index f7a810b1e404..5835701497d9 100644 --- a/src/operator/quantization/quantized_concat.cc +++ b/src/operator/quantization/quantized_concat.cc @@ -28,8 +28,8 @@ namespace mxnet { namespace op { -static bool ConcatShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_shape, - mxnet::ShapeVector* out_shape) { +static bool QuantizedConcatShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_shape, + mxnet::ShapeVector* out_shape) { const ConcatParam& param_ = nnvm::get(attrs.parsed); CHECK_EQ(in_shape->size(), static_cast(param_.num_args * 3)); CHECK_EQ(out_shape->size(), 3U); @@ -74,8 +74,8 @@ static bool ConcatShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_sha return shape_is_known(dshape); } -static bool ConcatType(const nnvm::NodeAttrs& attrs, std::vector* in_type, - std::vector* out_type) { +static bool QuantizedConcatType(const nnvm::NodeAttrs& attrs, std::vector* in_type, + std::vector* out_type) { const ConcatParam& param_ = nnvm::get(attrs.parsed); CHECK_EQ(in_type->size(), static_cast(param_.num_args * 3)); CHECK_EQ(out_type->size(), 3U); @@ -130,8 +130,8 @@ If any input holds int8, then the output will be int8. Otherwise output will be // TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, // will be reverted after the improvement of CachedOP is done. .set_attr("FGradient", MakeZeroGradNodes) -.set_attr("FInferType", ConcatType) -.set_attr("FInferShape", ConcatShape) +.set_attr("FInferType", QuantizedConcatType) +.set_attr("FInferShape", QuantizedConcatShape) .set_attr("key_var_num_args", "num_args") .add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate") .add_arguments(ConcatParam::__FIELDS__()); diff --git a/src/operator/random/sample_op.cc b/src/operator/random/sample_op.cc index 56a162be5da4..543146257ddf 100644 --- a/src/operator/random/sample_op.cc +++ b/src/operator/random/sample_op.cc @@ -81,6 +81,7 @@ DMLC_REGISTER_PARAMETER(SampleGenNegBinomialLikeParam); MXNET_OPERATOR_REGISTER_SAMPLE(_random_uniform, SampleUniformParam) .add_alias("uniform") .add_alias("random_uniform") +.add_alias("_npi_random_uniform") .describe(R"code(Draw random samples from a uniform distribution. .. note:: The existing alias ``uniform`` is deprecated. @@ -99,6 +100,7 @@ Example:: MXNET_OPERATOR_REGISTER_SAMPLE(_random_normal, SampleNormalParam) .add_alias("normal") .add_alias("random_normal") +.add_alias("_npi_random_normal") .describe(R"code(Draw random samples from a normal (Gaussian) distribution. .. note:: The existing alias ``normal`` is deprecated. diff --git a/src/operator/random/shuffle_op.cc b/src/operator/random/shuffle_op.cc index 70315716dea2..86797c136bab 100644 --- a/src/operator/random/shuffle_op.cc +++ b/src/operator/random/shuffle_op.cc @@ -122,6 +122,7 @@ void ShuffleForwardCPU(const nnvm::NodeAttrs& attrs, NNVM_REGISTER_OP(_shuffle) .add_alias("shuffle") +.add_alias("_np__random_shuffle") .describe(R"code(Randomly shuffle the elements. 
This shuffles the array along the first axis. diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 6a0dbd7a4e23..244e39335a91 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -634,6 +634,7 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, #endif NNVM_REGISTER_OP(RNN) +.add_alias("_npx_rnn") .describe(R"code(Applies recurrent layers to input data. Currently, vanilla RNN, LSTM and GRU are implemented, with both multi-layer and bidirectional support. diff --git a/src/operator/roi_pooling.cc b/src/operator/roi_pooling.cc index 8862d0db1401..c72b203292fe 100644 --- a/src/operator/roi_pooling.cc +++ b/src/operator/roi_pooling.cc @@ -300,5 +300,9 @@ Example:: "corners of designated region of interest. `batch_index` indicates the index of corresponding " "image in the input array") .add_arguments(ROIPoolingParam::__FIELDS__()); + +NNVM_REGISTER_OP(ROIPooling) +.add_alias("_npx_roi_pooling"); + } // namespace op } // namespace mxnet diff --git a/src/operator/sequence_mask.cc b/src/operator/sequence_mask.cc index f4f81a801e70..d7731026ce21 100644 --- a/src/operator/sequence_mask.cc +++ b/src/operator/sequence_mask.cc @@ -191,5 +191,8 @@ Example:: "vector of sequence lengths of the form [batch_size]") .add_arguments(SequenceMaskParam::__FIELDS__()); +NNVM_REGISTER_OP(SequenceMask) +.add_alias("_npx_sequence_mask"); + } // namespace op } // namespace mxnet diff --git a/src/operator/swapaxis-inl.h b/src/operator/swapaxis-inl.h index b17a81f75bc6..fd9872db6ec8 100644 --- a/src/operator/swapaxis-inl.h +++ b/src/operator/swapaxis-inl.h @@ -47,7 +47,7 @@ enum SwapAxisOpOutputs {kOut}; struct SwapAxisParam : public dmlc::Parameter { // use int for enumeration - uint32_t dim1, dim2; + int dim1, dim2; DMLC_DECLARE_PARAMETER(SwapAxisParam) { DMLC_DECLARE_FIELD(dim1) .set_default(0) @@ -106,8 +106,6 @@ class SwapAxisOp : public Operator { const std::vector &req) { using namespace mshadow; using namespace mshadow::expr; - int dim1 = param_.dim1; - int dim2 = param_.dim2; TBlob data_in = in_data[swapaxisenum::kData]; TBlob data_out = out_data[swapaxisenum::kData]; @@ -115,10 +113,27 @@ class SwapAxisOp : public Operator { mxnet::TShape shape_in = data_in.shape_; mxnet::TShape shape_out = data_out.shape_; + int axis1 = param_.dim1; + if (axis1 < 0) { + axis1 += shape_in.ndim(); + } + CHECK(axis1 >= 0 && axis1 < shape_in.ndim()) + << "axis1: axis " << param_.dim1 << " is out of bounds for array of ndim " + << shape_in.ndim(); + + int axis2 = param_.dim2; + if (axis2 < 0) { + axis2 += shape_in.ndim(); + } + CHECK(axis2 >= 0 && axis2 < shape_in.ndim()) + << "axis2: axis " << param_.dim2 << " is out of bounds for array of ndim " + << shape_in.ndim(); + + if (shape_in.Size() == 0U) return; Shape<5> inter_shape; - Reshape2Five(&inter_shape, shape_in, dim1, dim2); + Reshape2Five(&inter_shape, shape_in, axis1, axis2); Tensor inter_data_in = data_in.get_with_shape(inter_shape, s); @@ -187,13 +202,28 @@ class SwapAxisProp : public OperatorProperty { CHECK_EQ(in_shape->size(), 1U); mxnet::TShape &shape0 = (*in_shape)[swapaxisenum::kData]; + if (!ndim_is_known(shape0)) return false; + int axis1 = param_.dim1; + if (axis1 < 0) { + axis1 += shape0.ndim(); + } + CHECK(axis1 >= 0 && axis1 < shape0.ndim()) + << "axis1: axis " << param_.dim1 << " is out of bounds for array of ndim " << shape0.ndim(); + + int axis2 = param_.dim2; + if (axis2 < 0) { + axis2 += shape0.ndim(); + } + CHECK(axis2 >= 0 && axis2 < shape0.ndim()) + << "axis2: axis " << param_.dim2 << " is out of bounds for array of ndim 
" << shape0.ndim(); + out_shape->clear(); out_shape->push_back(shape0); mxnet::TShape &shape1 = (*out_shape)[swapaxisenum::kOut]; - std::swap(shape1[param_.dim1], shape1[param_.dim2]); + std::swap(shape1[axis1], shape1[axis2]); - return true; + return shape_is_known(*out_shape); } bool InferType(std::vector *in_type, diff --git a/src/operator/swapaxis.cc b/src/operator/swapaxis.cc index 45bcca4db9ae..32b26cc14f0c 100644 --- a/src/operator/swapaxis.cc +++ b/src/operator/swapaxis.cc @@ -69,6 +69,6 @@ Examples:: [ 3, 7]]] )code" ADD_FILELINE); -NNVM_REGISTER_OP(SwapAxis).add_alias("swapaxes"); +NNVM_REGISTER_OP(SwapAxis).add_alias("swapaxes").add_alias("_npi_swapaxes"); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/broadcast_reduce_op.h b/src/operator/tensor/broadcast_reduce_op.h index c7c49937730c..27edf0195a1c 100644 --- a/src/operator/tensor/broadcast_reduce_op.h +++ b/src/operator/tensor/broadcast_reduce_op.h @@ -168,15 +168,27 @@ struct BroadcastLikeParam : public dmlc::Parameter { } }; -inline int CheckAxis(int axis, int ndim) { - CHECK(axis < ndim && axis >= -ndim) - << "axis " << axis << " exceeds the input dimension of " << ndim; - return (axis + ndim)%ndim; +/* + * Check whether the axis is within the legal range. + */ +inline int CheckAxis(const int axis, const int ndim) { + if (ndim == 0) { + CHECK(axis == 0 || axis == -1) << "axis " << axis + << " is out of bounds for 0-dimension tensor"; + return 0; + } else { + CHECK(axis < ndim && axis >= -ndim) + << "axis " << axis << " exceeds the input dimension of " << ndim; + return (axis + ndim) % ndim; + } } inline mxnet::TShape AxisShapeCompact(mxnet::TShape shape, int *axis, bool allow_2d) { int ndim = shape.ndim(); - index_t leading = 1, trailing = 1, M = shape[*axis]; + index_t leading = 1, trailing = 1, M = 1; + if (shape.ndim() > *axis) { + M = shape[*axis]; + } for (int i = 0; i < *axis; ++i) leading *= shape[i]; for (int i = *axis + 1; i < ndim; ++i) trailing *= shape[i]; if (allow_2d && trailing == 1) { @@ -553,14 +565,37 @@ void SearchAxisCompute(const nnvm::NodeAttrs& attrs, using namespace mshadow::expr; const ReduceAxisParam& param = nnvm::get(attrs.parsed); Stream *s = ctx.get_stream(); - if (!param.axis) LOG(FATAL) << "Global reduction not supported yet"; + int axis = inputs[0].ndim(); + TBlob input = inputs[0]; + if (param.axis.has_value()) { + axis = param.axis.value(); + } else { + // If global reduction, reshape the input tensor into 2D shape (1, inputs[0].shape_.Size()) + // and search on axis = 1. 
+ mxnet::TShape shape_2d(2, 1); + shape_2d[1] = input.shape_.Size(); + input = TBlob(input.dptr_, shape_2d, input.dev_mask(), input.type_flag_, input.dev_id()); + axis = 1; + } - int axis = CheckAxis(param.axis.value(), inputs[0].shape_.ndim()); - mxnet::TShape shape = AxisShapeCompact(inputs[0].shape_, &axis, false); + axis = CheckAxis(axis, input.shape_.ndim()); + if (inputs[0].shape_.ndim() != 0) { + if (param.axis.has_value()) { + // cannot do argmax in an empty dimension + CHECK_NE(inputs[0].shape_[axis], 0) + << "searching input tensor of shape " << inputs[0].shape_ + << " along axis = " << axis << " of zero dim-size is not allowed"; + } else { + // cannot do argmax on an empty array + CHECK_NE(inputs[0].shape_.Size(), 0U) << "attempt to search an empty sequence"; + } + } + if (input.shape_.Size() == 0U) return; // zero-size tensor + mxnet::TShape shape = AxisShapeCompact(input.shape_, &axis, false); MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { Tensor out = outputs[0].get_with_shape( Shape2(shape[0], shape[2]), s); - Tensor in = inputs[0].get_with_shape( + Tensor in = input.get_with_shape( shape.get<3>(), s); CHECK(req[0] != kAddTo) << "AddTo is not supported"; ASSIGN_DISPATCH(out, req[0], (reduce_with_axis(in, 1))); @@ -865,8 +900,8 @@ struct reduce_axes_backward_broadcast { OType *out, DType *igrad, OType *ograd, - mshadow::Shape<5> in_shape, - mshadow::Shape<5> out_shape, + mshadow::Shape in_shape, + mshadow::Shape out_shape, const uint32_t ndim) { size_t in_stride = 1; size_t out_stride = 1; @@ -902,9 +937,9 @@ void ReduceAxesBackwardUseInOutImpl(const OpContext& ctx, MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, OType, { - mshadow::Shape<5> in_shape; - mshadow::Shape<5> out_shape; - for (int i = 0; i < 5; ++i) { + mshadow::Shape in_shape; + mshadow::Shape out_shape; + for (int i = 0; i < MXNET_SPECIAL_MAX_NDIM; ++i) { if (i < dst_shape.ndim()) { in_shape[i] = src_shape[i]; out_shape[i] = dst_shape[i]; @@ -914,36 +949,36 @@ void ReduceAxesBackwardUseInOutImpl(const OpContext& ctx, } } if (dst_shape.ndim() == 2) { - Tensor igrad = - outputs[0].get_with_shape(src_shape.get<2>(), s); - Tensor ograd = - inputs[0].get_with_shape(dst_shape.get<2>(), s); - Tensor data = - inputs[1].get_with_shape(src_shape.get<2>(), s); - Tensor out = - inputs[2].get_with_shape(dst_shape.get<2>(), s); + Tensor igrad = + outputs[0].get_with_shape(src_shape.get<2>(), s); + Tensor ograd = + inputs[0].get_with_shape(dst_shape.get<2>(), s); + Tensor data = + inputs[1].get_with_shape(src_shape.get<2>(), s); + Tensor out = + inputs[2].get_with_shape(dst_shape.get<2>(), s); MXNET_REQ_TYPE_SWITCH(req[0], Req, { Kernel, xpu>::Launch( s, outputs[0].shape_.Size(), data.dptr_, out.dptr_, igrad.dptr_, ograd.dptr_, in_shape, out_shape, src_shape.ndim()); }); - if (normalize) igrad /= scalar(src_shape.Size()/dst_shape.Size()); + if (normalize) igrad /= scalar(src_shape.Size()/dst_shape.Size()); } else { const int ndim = MXNET_SPECIAL_MAX_NDIM; - Tensor igrad = - outputs[0].get_with_shape(src_shape.get(), s); - Tensor ograd = - inputs[0].get_with_shape(dst_shape.get(), s); - Tensor data = - inputs[1].get_with_shape(src_shape.get(), s); - Tensor out = - inputs[2].get_with_shape(dst_shape.get(), s); + Tensor igrad = + outputs[0].get_with_shape(src_shape.get(), s); + Tensor ograd = + inputs[0].get_with_shape(dst_shape.get(), s); + Tensor data = + inputs[1].get_with_shape(src_shape.get(), s); + Tensor out = + inputs[2].get_with_shape(dst_shape.get(), s); 
MXNET_REQ_TYPE_SWITCH(req[0], Req, { Kernel, xpu>::Launch( s, outputs[0].shape_.Size(), data.dptr_, out.dptr_, igrad.dptr_, ograd.dptr_, in_shape, out_shape, src_shape.ndim()); }); - if (normalize) igrad /= scalar(src_shape.Size()/dst_shape.Size()); + if (normalize) igrad /= scalar(src_shape.Size()/dst_shape.Size()); } }); }); @@ -968,6 +1003,34 @@ void ReduceAxesBackwardUseInOut(const nnvm::NodeAttrs& attrs, ReduceAxesBackwardUseInOutImpl(ctx, small, inputs, req, outputs); } +template +struct broadcast_kernel { + template + MSHADOW_XINLINE static void Map(index_t i, + IType *input, + OType *output, + mshadow::Shape in_shape, + mshadow::Shape out_shape, + const OpReqType req, + const uint32_t ndim) { + size_t in_stride = 1; + size_t out_stride = 1; + index_t idx = i; + index_t in_idx = i; + for (int iter = ndim - 1; iter >= 0; --iter) { + size_t dim_idx = idx % out_shape[iter]; + in_idx -= dim_idx * out_stride; + if (in_shape[iter] != 1) { + in_idx += dim_idx * in_stride; + } + idx /= out_shape[iter]; + in_stride *= in_shape[iter]; + out_stride *= out_shape[iter]; + } + KERNEL_ASSIGN(output[i], req, OP::Map(input[in_idx])); + } +}; + template inline void BroadcastComputeImpl(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -977,24 +1040,40 @@ inline void BroadcastComputeImpl(const nnvm::NodeAttrs& attrs, const mxnet::TShape& small) { using namespace mshadow; using namespace mshadow::expr; + using namespace mxnet_op; mxnet::TShape src_shape, dst_shape; BroadcastReduceShapeCompact(outputs[0].shape_, small, &dst_shape, &src_shape); Stream *s = ctx.get_stream(); - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - if (dst_shape.ndim() == 2) { - Tensor out = - outputs[0].get_with_shape(dst_shape.get<2>(), s); - Tensor data = - inputs[0].get_with_shape(src_shape.get<2>(), s); - ASSIGN_DISPATCH(out, req[0], broadcast_to(data, dst_shape)); - } else { - const int ndim = MXNET_SPECIAL_MAX_NDIM; - Tensor out = - outputs[0].get_with_shape(dst_shape.get(), s); - Tensor data = - inputs[0].get_with_shape(src_shape.get(), s); - ASSIGN_DISPATCH(out, req[0], broadcast_to(data, dst_shape)); - } + MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, IType, { + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, OType, { + mshadow::Shape in_shape; + mshadow::Shape out_shape; + for (int i = 0; i < MXNET_SPECIAL_MAX_NDIM; ++i) { + if (i < dst_shape.ndim()) { + in_shape[i] = src_shape[i]; + out_shape[i] = dst_shape[i]; + } else { + in_shape[i] = 1; + out_shape[i] = 1; + } + } + if (dst_shape.ndim() == 2) { + Tensor out = + outputs[0].get_with_shape(dst_shape.get<2>(), s); + Tensor data = + inputs[0].get_with_shape(src_shape.get<2>(), s); + Kernel, xpu>::Launch( + s, out.shape_.Size(), data.dptr_, out.dptr_, in_shape, out_shape, req[0], 2); + } else { + const int ndim = MXNET_SPECIAL_MAX_NDIM; + Tensor out = + outputs[0].get_with_shape(dst_shape.get(), s); + Tensor data = + inputs[0].get_with_shape(src_shape.get(), s); + Kernel, xpu>::Launch( + s, out.shape_.Size(), data.dptr_, out.dptr_, in_shape, out_shape, req[0], ndim); + } + }); }); } @@ -1215,8 +1294,8 @@ struct norm_backward_broadcast { DType *igrad, OType *ograd, DType *data, - mshadow::Shape<5> in_shape, - mshadow::Shape<5> out_shape, + mshadow::Shape in_shape, + mshadow::Shape out_shape, const uint32_t ndim) { size_t in_stride = 1; size_t out_stride = 1; @@ -1258,9 +1337,9 @@ void LpNormGradCompute(const nnvm::NodeAttrs& attrs, mxnet::TShape src_shape, dst_shape; BroadcastReduceShapeCompact(outputs[0].shape_, small, &src_shape, &dst_shape); Stream *s = 
ctx.get_stream(); - mshadow::Shape<5> in_shape; - mshadow::Shape<5> out_shape; - for (int i = 0; i < 5; ++i) { + mshadow::Shape in_shape; + mshadow::Shape out_shape; + for (int i = 0; i < MXNET_SPECIAL_MAX_NDIM; ++i) { if (i < dst_shape.ndim()) { in_shape[i] = src_shape[i]; out_shape[i] = dst_shape[i]; diff --git a/src/operator/tensor/broadcast_reduce_op_index.cc b/src/operator/tensor/broadcast_reduce_op_index.cc index 56af3887c763..52082f759e7a 100644 --- a/src/operator/tensor/broadcast_reduce_op_index.cc +++ b/src/operator/tensor/broadcast_reduce_op_index.cc @@ -110,6 +110,7 @@ Examples:: NNVM_REGISTER_OP(pick) .add_alias("choose_element_0index") +.add_alias("_npx_pick") .describe(R"code(Picks elements from an input array according to the input indices along the given axis. Given an input array of shape ``(d0, d1)`` and indices of shape ``(i0,)``, the result will be diff --git a/src/operator/tensor/dot.cc b/src/operator/tensor/dot.cc index 7d7b6c06c846..11a056146e1d 100644 --- a/src/operator/tensor/dot.cc +++ b/src/operator/tensor/dot.cc @@ -111,6 +111,7 @@ NNVM_REGISTER_OP(_backward_dot) .add_arguments(DotParam::__FIELDS__()); NNVM_REGISTER_OP(batch_dot) +.add_alias("_npx_batch_dot") .describe(R"doc(Batchwise dot product. ``batch_dot`` is used to compute dot product of ``x`` and ``y`` when ``x`` and diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index f84767dd4b2f..8a81bbc1c475 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -292,6 +292,7 @@ void BinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { + if (outputs[0].shape_.Size() == 0U) return; mxnet::TShape new_lshape, new_rshape, new_oshape; int ndim = BinaryBroadcastShapeCompact(inputs[0].shape_, inputs[1].shape_, outputs[0].shape_, &new_lshape, &new_rshape, &new_oshape); diff --git a/src/operator/tensor/elemwise_binary_broadcast_op_logic.cc b/src/operator/tensor/elemwise_binary_broadcast_op_logic.cc index cd433e00a770..e3c2e0e898d9 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op_logic.cc +++ b/src/operator/tensor/elemwise_binary_broadcast_op_logic.cc @@ -30,6 +30,7 @@ namespace mxnet { namespace op { MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(broadcast_equal) +.add_alias("_npi_equal") .describe(R"code(Returns the result of element-wise **equal to** (==) comparison operation with broadcasting. Example:: @@ -48,6 +49,7 @@ Example:: .set_attr("FGradient", MakeZeroGradNodes); MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(broadcast_not_equal) +.add_alias("_npi_not_equal") .describe(R"code(Returns the result of element-wise **not equal to** (!=) comparison operation with broadcasting. Example:: @@ -66,6 +68,7 @@ Example:: .set_attr("FGradient", MakeZeroGradNodes); MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(broadcast_greater) +.add_alias("_npi_greater") .describe(R"code(Returns the result of element-wise **greater than** (>) comparison operation with broadcasting. Example:: @@ -84,6 +87,7 @@ Example:: .set_attr("FGradient", MakeZeroGradNodes); MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(broadcast_greater_equal) +.add_alias("_npi_greater_equal") .describe(R"code(Returns the result of element-wise **greater than or equal to** (>=) comparison operation with broadcasting. 
Example:: @@ -102,6 +106,7 @@ Example:: .set_attr("FGradient", MakeZeroGradNodes); MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(broadcast_lesser) +.add_alias("_npi_less") .describe(R"code(Returns the result of element-wise **lesser than** (<) comparison operation with broadcasting. Example:: @@ -120,6 +125,7 @@ Example:: .set_attr("FGradient", MakeZeroGradNodes); MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(broadcast_lesser_equal) +.add_alias("_npi_less_equal") .describe(R"code(Returns the result of element-wise **lesser than or equal to** (<=) comparison operation with broadcasting. Example:: diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h index 2fe3fd9919cf..9c1d8b17fdea 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -487,9 +487,11 @@ class ElemwiseBinaryOp : public OpBase { MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { const size_t size = (minthree(outputs[0].Size(), inputs[0].Size(), inputs[1].Size()) + DataType::kLanes - 1) / DataType::kLanes; - Kernel, xpu>::Launch(s, size, - outputs[0].dptr(), - inputs[0].dptr(), inputs[1].dptr()); + if (size != 0) { + Kernel, xpu>::Launch(s, size, + outputs[0].dptr(), + inputs[0].dptr(), inputs[1].dptr()); + } }); }); } @@ -510,9 +512,11 @@ class ElemwiseBinaryOp : public OpBase { MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, { const size_t size = (minthree(outputs[0].Size(), inputs[0].Size(), inputs[1].Size()) + DataType::kLanes - 1) / DataType::kLanes; - Kernel, xpu>::Launch(s, size, - outputs[0].dptr(), - inputs[0].dptr(), inputs[1].dptr()); + if (size != 0) { + Kernel, xpu>::Launch(s, size, + outputs[0].dptr(), + inputs[0].dptr(), inputs[1].dptr()); + } }); }); } diff --git a/src/operator/tensor/elemwise_binary_scalar_op_extended.cc b/src/operator/tensor/elemwise_binary_scalar_op_extended.cc index f027665a549b..3a687c2aa062 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_extended.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op_extended.cc @@ -84,7 +84,8 @@ MXNET_OPERATOR_REGISTER_BINARY(_backward_hypot_scalar) cpu, mshadow_op::hypot_grad_left>); NNVM_REGISTER_OP(smooth_l1) - .describe(R"code(Calculate Smooth L1 Loss(lhs, scalar) by summing +.add_alias("_npx_smooth_l1") +.describe(R"code(Calculate Smooth L1 Loss(lhs, scalar) by summing .. 
math:: diff --git a/src/operator/tensor/elemwise_binary_scalar_op_logic.cc b/src/operator/tensor/elemwise_binary_scalar_op_logic.cc index 17e76153ebb2..87ba394c99b2 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_logic.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op_logic.cc @@ -71,26 +71,32 @@ static bool BinaryScalarLogicStorageType(const nnvm::NodeAttrs& attrs, MXNET_OPERATOR_REGISTER_BINARY_SCALAR_LOGIC(_equal_scalar, mshadow_op::eq) +.add_alias("_npi_equal_scalar") .set_attr("FGradient", MakeZeroGradNodes) .add_alias("_EqualScalar"); MXNET_OPERATOR_REGISTER_BINARY_SCALAR_LOGIC(_not_equal_scalar, mshadow_op::ne) +.add_alias("_npi_not_equal_scalar") .set_attr("FGradient", MakeZeroGradNodes) .add_alias("_NotEqualScalar"); MXNET_OPERATOR_REGISTER_BINARY_SCALAR_LOGIC(_greater_scalar, mshadow_op::gt) +.add_alias("_npi_greater_scalar") .set_attr("FGradient", MakeZeroGradNodes) .add_alias("_GreaterScalar"); MXNET_OPERATOR_REGISTER_BINARY_SCALAR_LOGIC(_greater_equal_scalar, mshadow_op::ge) +.add_alias("_npi_greater_equal_scalar") .set_attr("FGradient", MakeZeroGradNodes) .add_alias("_GreaterEqualScalar"); MXNET_OPERATOR_REGISTER_BINARY_SCALAR_LOGIC(_lesser_scalar, mshadow_op::lt) +.add_alias("_npi_less_scalar") .set_attr("FGradient", MakeZeroGradNodes) .add_alias("_LesserScalar"); MXNET_OPERATOR_REGISTER_BINARY_SCALAR_LOGIC(_lesser_equal_scalar, mshadow_op::le) +.add_alias("_npi_less_equal_scalar") .set_attr("FGradient", MakeZeroGradNodes) .add_alias("_LesserEqualScalar"); diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 458106e02671..87964ac246f0 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -243,8 +243,10 @@ class UnaryOp : public OpBase { mshadow::Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - mxnet_op::Kernel, xpu>::Launch( - s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr()); + if (inputs[0].Size() != 0) { + mxnet_op::Kernel, xpu>::Launch( + s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr()); + } }); }); } diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index f2b8dd6b1314..729c9f5ce6e5 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ -483,6 +483,7 @@ Negative indices are supported, and `None` can be used for either `lhs_end` or ` - lhs shape = (30, 12), rhs shape = (4, 2, 2, 3), lhs_begin=-1, lhs_end=None, rhs_begin=1, rhs_end=None, output shape = (30, 2, 2, 3) )code" ADD_FILELINE) +.add_alias("_npx_reshape_like") .set_num_inputs(2) .set_attr_parser(ParamParser) .set_attr("FListInputNames", @@ -621,6 +622,7 @@ Example:: DMLC_REGISTER_PARAMETER(CastParam); NNVM_REGISTER_OP(Cast) .add_alias("cast") +.add_alias("_npx_cast") .describe(R"code(Casts all elements of the input to a new type. .. note:: ``Cast`` is deprecated. Use ``cast`` instead. 
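The `_npx_cast` alias above wires the Cast operator into the new namespace; dtype conversion on `mxnet.np` arrays is expected to route through it via `ndarray.astype` (an assumption about the frontend plumbing, not shown in this diff)::

    from mxnet import np, npx
    npx.set_np()

    x = np.array([1.5, 2.5, 3.5])
    y = x.astype('int32')        # cast element type; fractional parts truncate toward zero
    print(y.dtype)               # int32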
@@ -1195,6 +1197,7 @@ MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_expm1, unary_bwd *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(ret.type_flag_, DType, { switch (axes.ndim()) { - case 0: + case 0: { + Tensor in = src.get_with_shape(mshadow::Shape1(1), s); + Tensor out = ret.get_with_shape(mshadow::Shape1(1), s); + Copy(out, in, s); break; + } case 1: { Tensor in = src.get(s); Tensor out = ret.get(s); @@ -1781,9 +1787,6 @@ inline bool TileOpShape(const nnvm::NodeAttrs& attrs, SHAPE_ASSIGN_CHECK(*out_attrs, 0, ishape); return true; } - for (int i = 0; i < reps.ndim(); ++i) { - CHECK_GT(reps[i], 0) << "invalid reps=" << i << ", dim size must be greater than zero"; - } mxnet::TShape oshape(std::max(ishape.ndim(), reps.ndim()), -1); int i1 = ishape.ndim() - 1; int i2 = reps.ndim() - 1; @@ -1796,6 +1799,11 @@ inline bool TileOpShape(const nnvm::NodeAttrs& attrs, oshape[i] = reps[i2--]; } } + // If reps contains 0s, oshape is a zero-size shape. + // Need to distinguish between np_shape mode and legacy mode. + if (!Imperative::Get()->is_np_shape()) { + common::ConvertToNumpyShape(&oshape); + } SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); return shape_is_known(oshape); } @@ -1814,7 +1822,7 @@ inline bool TileOpType(const nnvm::NodeAttrs& attrs, /*! * \brief Reshape the input and output tensors for - * using broadcast_to to achieve the funcitonality + * using broadcast_to to achieve the functionality * of operator tile. * \return a pair of mxnet::TShape's, first is the reshaped * input shape, second is the reshaped output shape. @@ -1822,7 +1830,7 @@ inline bool TileOpType(const nnvm::NodeAttrs& attrs, inline std::pair ReshapeInputOutputForTileOp( const mxnet::TShape& ishape, const mxnet::Tuple& reps) { - if (ishape.ndim() == 0 || reps.ndim() == 0) { + if (reps.ndim() == 0) { return std::make_pair(ishape, ishape); } @@ -2177,7 +2185,7 @@ inline size_t SqueezeShapeHelper(mxnet::TShape* shape) { CHECK(shape != nullptr); size_t count = 0; for (int i = 0; i < shape->ndim(); ++i) { - if ((*shape)[i] == 0) { + if ((*shape)[i] == -1) { ++count; } else { std::swap((*shape)[i], (*shape)[i-count]); @@ -2210,12 +2218,12 @@ inline bool SqueezeShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(dshape[axes[i]], 1) << "cannot select an axis to squeeze out which has size=" << dshape[axes[i]] << " not equal to one"; - CHECK_NE(oshape[axes[i]], 0) << "duplicate value in axis"; - oshape[axes[i]] = 0; + CHECK_NE(oshape[axes[i]], -1) << "duplicate value in axis"; + oshape[axes[i]] = -1; } } else { for (int i = 0; i < oshape.ndim(); ++i) { - if (oshape[i] == 1) oshape[i] = 0; + if (oshape[i] == 1) oshape[i] = -1; } } size_t oshape_size = SqueezeShapeHelper(&oshape); @@ -2631,10 +2639,14 @@ inline bool SplitOpShape(const nnvm::NodeAttrs& attrs, for (int i = 0; i < num_outputs; ++i) { int start = indices[i]; int end = (i < num_outputs - 1) ? 
@@ -2631,10 +2639,14 @@ inline bool SplitOpShape(const nnvm::NodeAttrs& attrs,
   for (int i = 0; i < num_outputs; ++i) {
     int start = indices[i];
     int end = (i < num_outputs - 1) ? indices[i + 1] : ishape[real_axis];
-    CHECK(start < end)
-      << "start " << start << " is not less than end " << end << "for subarray " << i;
-    CHECK(end <= ishape[real_axis])
-      << "end " << end << " is no less than the size of the axis " << ishape[real_axis];
+    if (ishape[real_axis] == 0U) {
+      end = start;
+    } else {
+      CHECK(start < end)
+        << "start " << start << " is not less than end " << end << " for subarray " << i;
+      CHECK(end <= ishape[real_axis])
+        << "end " << end << " is no less than the size of the axis " << ishape[real_axis];
+    }
     dshape[real_axis] = (end - start);
     if (param.squeeze_axis) {
       CHECK_EQ(end - start, 1U) << "expected axis size of 1 but got " << end - start;
diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc
index c2bcb29193a7..bff76bc6bbb0 100644
--- a/src/operator/tensor/matrix_op.cc
+++ b/src/operator/tensor/matrix_op.cc
@@ -263,6 +263,7 @@ static inline bool FlattenStorageType(const nnvm::NodeAttrs& attrs,
 
 NNVM_REGISTER_OP(Flatten)
 .add_alias("flatten")
+.add_alias("_npx_batch_flatten")
 .describe(R"code(Flattens the input array into a 2-D array by collapsing the higher dimensions.
 
 .. note:: `Flatten` is deprecated. Use `flatten` instead.
@@ -408,6 +409,7 @@ Examples::
 
 NNVM_REGISTER_OP(expand_dims)
+.add_alias("_npi_expand_dims")
 .describe(R"code(Inserts a new axis of size 1 into the array shape
 
 For example, given ``x`` with shape ``(2,3,4)``, then ``expand_dims(x, axis=1)``
@@ -696,6 +698,7 @@ NNVM_REGISTER_OP(_backward_slice_like)
 
 NNVM_REGISTER_OP(clip)
 MXNET_ADD_SPARSE_OP_ALIAS(clip)
+.add_alias("_npi_clip")
 .describe(R"code(Clips (limits) the values in an array.
 
 Given an interval, values outside the interval are clipped to the interval edges.
@@ -770,6 +773,7 @@ NNVM_REGISTER_OP(_backward_clip)
 .set_attr<FCompute>("FCompute", ClipGrad_<cpu>);
 
 NNVM_REGISTER_OP(repeat)
+.add_alias("_np_repeat")
 .describe(R"code(Repeats elements of an array.
 
 By default, ``repeat`` flattens the input array into 1-D and then repeats the
@@ -820,6 +824,7 @@ NNVM_REGISTER_OP(_backward_repeat)
 });
 
 NNVM_REGISTER_OP(tile)
+.add_alias("_npi_tile")
 .describe(R"code(Repeats the whole array multiple times.
 
 If ``reps`` has length *d*, and input array has dimension of *n*. There are
@@ -1121,6 +1126,7 @@ Example::
 .add_arguments(DepthToSpaceParam::__FIELDS__());
 
 NNVM_REGISTER_OP(_split_v2)
+.add_alias("_npi_split")
 .describe(R"code(Splits an array along a particular axis into multiple sub-arrays.
 
 Example::
diff --git a/src/operator/tensor/ordering_op.cc b/src/operator/tensor/ordering_op.cc
index e2f014d1ad41..f693601a8822 100644
--- a/src/operator/tensor/ordering_op.cc
+++ b/src/operator/tensor/ordering_op.cc
@@ -34,6 +34,7 @@ DMLC_REGISTER_PARAMETER(SortParam);
 DMLC_REGISTER_PARAMETER(ArgSortParam);
 
 NNVM_REGISTER_OP(topk)
+.add_alias("_npx_topk")
 .describe(R"code(Returns the top *k* elements in an input array along the given axis.
 The returned elements will be sorted.
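The aliases registered in matrix_op.cc and ordering_op.cc surface existing legacy kernels under the new `np`/`npx` front-end namespaces without duplicating any compute code. A short sketch of how they are reached from Python, assuming a build of this branch:

    from mxnet import np, npx

    x = np.array([[1., 3., 2.], [6., 5., 4.]])
    print(npx.topk(x, k=2, axis=-1))     # backed by _npx_topk
    print(np.clip(x, 2., 5.))            # backed by _npi_clip
    print(np.split(x, 3, axis=1))        # backed by _npi_split
    print(npx.batch_flatten(x).shape)    # backed by _npx_batch_flatten -> (2, 3)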
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index e4ec98f9f1bd..84fbdc8b49a5 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -35,6 +35,7 @@
 from common import setup_module, with_seed, teardown, assert_raises_cudnn_not_satisfied
 from common import run_in_spawned_process
 from test_operator import *
+from test_numpy_ndarray import *
 from test_optimizer import *
 from test_random import *
 from test_exc_handling import *
diff --git a/tests/python/unittest/test_numpy_ndarray.py b/tests/python/unittest/test_numpy_ndarray.py
new file mode 100644
index 000000000000..6024ac9b4acd
--- /dev/null
+++ b/tests/python/unittest/test_numpy_ndarray.py
@@ -0,0 +1,672 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# pylint: skip-file
+from __future__ import absolute_import
+from __future__ import division
+import os
+import numpy as _np
+import mxnet as mx
+from mxnet import np, npx, autograd
+from mxnet.gluon import HybridBlock
+from mxnet.test_utils import same, assert_almost_equal, rand_shape_nd, rand_ndarray, retry, assert_exception, use_np
+from common import with_seed, TemporaryDirectory
+
+
+@with_seed()
+@use_np
+def test_array_creation():
+    dtypes = [_np.int8, _np.int32, _np.float16, _np.float32, _np.float64, None]
+    objects = [
+        [],
+        (),
+        [[1, 2], [3, 4]],
+        _np.random.uniform(size=rand_shape_nd(3)),
+        _np.random.uniform(size=(3, 0, 4))
+    ]
+    for dtype in dtypes:
+        for src in objects:
+            mx_arr = np.array(src, dtype=dtype)
+            assert mx_arr.context == mx.current_context()
+            if isinstance(src, mx.nd.NDArray):
+                np_arr = _np.array(src.asnumpy(), dtype=dtype if dtype is not None else _np.float32)
+            else:
+                np_arr = _np.array(src, dtype=dtype if dtype is not None else _np.float32)
+            assert mx_arr.dtype == np_arr.dtype
+            assert same(mx_arr.asnumpy(), np_arr)
+
+
+@with_seed()
+@use_np
+def test_zeros():
+    # test np.zeros in Gluon
+    class TestZeros(HybridBlock):
+        def __init__(self, shape, dtype=None):
+            super(TestZeros, self).__init__()
+            self._shape = shape
+            self._dtype = dtype
+
+        def hybrid_forward(self, F, x, *args, **kwargs):
+            return x + F.np.zeros(self._shape, self._dtype)
+
+    class TestZerosOutputType(HybridBlock):
+        def hybrid_forward(self, F, x, *args, **kwargs):
+            return x, F.np.zeros(shape=())
+
+    # test np.zeros in imperative
+    def check_zero_array_creation(shape, dtype):
+        np_out = _np.zeros(shape=shape, dtype=dtype)
+        mx_out = np.zeros(shape=shape, dtype=dtype)
+        assert same(mx_out.asnumpy(), np_out)
+        if dtype is None:
+            assert mx_out.dtype == _np.float32
+            assert np_out.dtype == _np.float64
+
+    shapes = [(0,), (2, 0, 2), (0, 0, 0, 0), ()]
+    shapes += [rand_shape_nd(ndim, allow_zero_size=True) for ndim in range(5)]
+    dtypes = [_np.int8, _np.int32, _np.float16, _np.float32, _np.float64, None]
+    for shape in shapes:
+        for dtype in dtypes:
+            check_zero_array_creation(shape, dtype)
+            x = np.array(_np.random.uniform(size=shape), dtype=dtype)
+            if dtype is None:
+                x = x.astype('float32')
+            for hybridize in [True, False]:
+                test_zeros = TestZeros(shape, dtype)
+                test_zeros_output_type = TestZerosOutputType()
+                if hybridize:
+                    test_zeros.hybridize()
+                    test_zeros_output_type.hybridize()
+                y = test_zeros(x)
+                assert type(y) == np.ndarray
+                assert same(x.asnumpy(), y.asnumpy())
+                y = test_zeros_output_type(x)
+                assert type(y[1]) == np.ndarray
+
+
+@with_seed()
+@use_np
+def test_ones():
+    # test np.ones in Gluon
+    class TestOnes(HybridBlock):
+        def __init__(self, shape, dtype=None):
+            super(TestOnes, self).__init__()
+            self._shape = shape
+            self._dtype = dtype
+
+        def hybrid_forward(self, F, x, *args, **kwargs):
+            return x * F.np.ones(self._shape, self._dtype)
+
+    class TestOnesOutputType(HybridBlock):
+        def hybrid_forward(self, F, x, *args, **kwargs):
+            return x, F.np.ones(shape=())
+
+    # test np.ones in imperative
+    def check_ones_array_creation(shape, dtype):
+        np_out = _np.ones(shape=shape, dtype=dtype)
+        mx_out = np.ones(shape=shape, dtype=dtype)
+        assert same(mx_out.asnumpy(), np_out)
+        if dtype is None:
+            assert mx_out.dtype == _np.float32
+            assert np_out.dtype == _np.float64
+
+    shapes = [(0,), (2, 0, 2), (0, 0, 0, 0), ()]
+    shapes += [rand_shape_nd(ndim, allow_zero_size=True) for ndim in range(5)]
+    dtypes = [_np.int8, _np.int32, _np.float16, _np.float32, _np.float64, None]
+    for shape in shapes:
+        for dtype in dtypes:
+            check_ones_array_creation(shape, dtype)
+            x = mx.nd.array(_np.random.uniform(size=shape), dtype=dtype).as_np_ndarray()
+            if dtype is None:
+                x = x.astype('float32')
+            for hybridize in [True, False]:
+                test_ones = TestOnes(shape, dtype)
+                test_ones_output_type = TestOnesOutputType()
+                if hybridize:
+                    test_ones.hybridize()
+                    test_ones_output_type.hybridize()
+                y = test_ones(x)
+                assert type(y) == np.ndarray
+                assert same(x.asnumpy(), y.asnumpy())
+                y = test_ones_output_type(x)
+                assert type(y[1]) == np.ndarray
+
+
+@with_seed()
+def test_ndarray_binary_element_wise_ops():
+    np_op_map = {
+        '+': _np.add,
+        '*': _np.multiply,
+        '-': _np.subtract,
+        '/': _np.divide,
+        'mod': _np.mod,
+        'pow': _np.power,
+        '==': _np.equal,
+        '>': _np.greater,
+        '>=': _np.greater_equal,
+        '<': _np.less,
+        '<=': _np.less_equal
+    }
+
+    def get_np_ret(x1, x2, op):
+        return np_op_map[op](x1, x2)
+
+    @use_np
+    class TestBinaryElementWiseOp(HybridBlock):
+        def __init__(self, op, scalar=None, reverse=False):
+            super(TestBinaryElementWiseOp, self).__init__()
+            self._op = op
+            self._scalar = scalar
+            self._reverse = reverse  # if False, scalar is the right operand
+
+        def hybrid_forward(self, F, x, *args):
+            if self._op == '+':
+                if self._scalar is not None:
+                    return x + self._scalar if not self._reverse else self._scalar + x
+                else:
+                    return x + args[0] if not self._reverse else args[0] + x
+            elif self._op == '*':
+                if self._scalar is not None:
+                    return x * self._scalar if not self._reverse else self._scalar * x
+                else:
+                    return x * args[0] if not self._reverse else args[0] * x
+            elif self._op == '-':
+                if self._scalar is not None:
+                    return x - self._scalar if not self._reverse else self._scalar - x
+                else:
+                    return x - args[0] if not self._reverse else args[0] - x
+            elif self._op == '/':
+                if self._scalar is not None:
+                    return x / self._scalar if not self._reverse else self._scalar / x
+                else:
+                    return x / args[0] if not self._reverse else args[0] / x
+            elif self._op == 'mod':
+                if self._scalar is not None:
+                    return x % self._scalar if not self._reverse else self._scalar % x
+                else:
+                    return x % args[0] if not self._reverse else args[0] % x
+            elif self._op == 'pow':
+                if self._scalar is not None:
+                    return x ** self._scalar if not self._reverse else self._scalar ** x
+                else:
+                    return x ** args[0] if not self._reverse else args[0] ** x
+            elif self._op == '>':
+                if self._scalar is not None:
+                    return x > self._scalar if not self._reverse else self._scalar > x
+                else:
+                    return x > args[0]
+            elif self._op == '>=':
+                if self._scalar is not None:
+                    return x >= self._scalar if not self._reverse else self._scalar >= x
+                else:
+                    return x >= args[0]
+            elif self._op == '<':
+                if self._scalar is not None:
+                    return x < self._scalar if not self._reverse else self._scalar < x
+                else:
+                    return x < args[0]
+            elif self._op == '<=':
+                if self._scalar is not None:
+                    return x <= self._scalar if not self._reverse else self._scalar <= x
+                else:
+                    return x <= args[0]
+            elif self._op == '==':
+                if self._scalar is not None:
+                    return x == self._scalar if not self._reverse else self._scalar == x
+                else:
+                    return x == args[0]
+            else:
+                print(self._op)
+                assert False
+
+    @use_np
+    def check_binary_op_result(shape1, shape2, op, dtype=None):
+        if shape1 is None:
+            mx_input1 = abs(_np.random.uniform()) + 1
+            np_input1 = mx_input1
+        else:
+            mx_input1 = rand_ndarray(shape1, dtype=dtype).abs() + 1
+            np_input1 = mx_input1.asnumpy()
+        if shape2 is None:
+            mx_input2 = abs(_np.random.uniform()) + 1
+            np_input2 = mx_input2
+        else:
+            mx_input2 = rand_ndarray(shape2, dtype=dtype).abs() + 1
+            np_input2 = mx_input2.asnumpy()
+
+        scalar = None
+        reverse = False
+        if isinstance(mx_input1, mx.nd.NDArray) and not isinstance(mx_input2, mx.nd.NDArray):
+            scalar = mx_input2
+            reverse = False
+        elif isinstance(mx_input2, mx.nd.NDArray) and not isinstance(mx_input1, mx.nd.NDArray):
+            scalar = mx_input1
+            reverse = True
+
+        np_out = get_np_ret(np_input1, np_input2, op)
+        for hybridize in [True, False]:
+            if scalar is None:
+                get_mx_ret_np = TestBinaryElementWiseOp(op)
+                get_mx_ret_classic = TestBinaryElementWiseOp(op)
+                if hybridize:
+                    get_mx_ret_np.hybridize()
+                    get_mx_ret_classic.hybridize()
+                mx_out = get_mx_ret_np(mx_input1.as_np_ndarray(), mx_input2.as_np_ndarray())
+                assert type(mx_out) == np.ndarray
+                assert np_out.shape == mx_out.shape
+                assert_almost_equal(mx_out.asnumpy(), np_out, atol=1e-6, rtol=1e-5)
+            else:
+                get_mx_ret = TestBinaryElementWiseOp(op, scalar=scalar, reverse=reverse)
+                if hybridize:
+                    get_mx_ret.hybridize()
+                if reverse:
+                    mx_out = get_mx_ret(mx_input2.as_np_ndarray())
+                    assert type(mx_out) == np.ndarray
+                else:
+                    mx_out = get_mx_ret(mx_input1.as_np_ndarray())
+                    assert type(mx_out) == np.ndarray
+                assert np_out.shape == mx_out.shape
+                assert_almost_equal(mx_out.asnumpy(), np_out, atol=1e-6, rtol=1e-5)
+
+    dtypes = [_np.float32, _np.float64, None]
+    ops = np_op_map.keys()
+    for dtype in dtypes:
+        for op in ops:
+            check_binary_op_result((3, 4), (3, 4), op, dtype)
+            check_binary_op_result(None, (3, 4), op, dtype)
+            check_binary_op_result((3, 4), None, op, dtype)
+            check_binary_op_result((1, 4), (3, 1), op, dtype)
+            check_binary_op_result(None, (3, 1), op, dtype)
+            check_binary_op_result((1, 4), None, op, dtype)
+            check_binary_op_result((1, 4), (3, 5, 4), op, dtype)
+            check_binary_op_result((), (3, 5, 4), op, dtype)
+            check_binary_op_result((), None, op, dtype)
+            check_binary_op_result(None, (), op, dtype)
+            check_binary_op_result((0, 2), (1, 1), op, dtype)
+            check_binary_op_result((0, 2), None, op, dtype)
+            check_binary_op_result(None, (0, 2), op, dtype)
+
+
+@with_seed()
+def test_hybrid_block_multiple_outputs():
+    @use_np
+    class TestAllNumpyOutputs(HybridBlock):
+        def hybrid_forward(self, F, x, *args, **kwargs):
+            return F.np.add(x, x), F.np.multiply(x, x)
+
+    class TestAllClassicOutputs(HybridBlock):
+        def hybrid_forward(self, F, x, *args, **kwargs):
+            return x.as_nd_ndarray() + x.as_nd_ndarray(), x.as_nd_ndarray() * x.as_nd_ndarray()
+
+    data_np = np.ones((2, 3))
+    for block, expected_out_type in [(TestAllClassicOutputs, mx.nd.NDArray),
+                                     (TestAllNumpyOutputs, np.ndarray)]:
+        net = block()
+        for hybridize in [True, False]:
+            if hybridize:
+                net.hybridize()
+            out1, out2 = net(data_np)
+            assert type(out1) is expected_out_type
+            assert type(out2) is expected_out_type
+
+    @use_np
+    class TestMixedTypeOutputsFailure(HybridBlock):
+        def hybrid_forward(self, F, x, *args, **kwargs):
+            return x.as_nd_ndarray() + x.as_nd_ndarray(), F.np.multiply(x, x)
+
+    net = TestMixedTypeOutputsFailure()
+    assert_exception(net, TypeError, data_np)
+    net.hybridize()
+    assert_exception(net, TypeError, data_np)
+
+
+@with_seed()
+@use_np
+def test_grad_ndarray_type():
+    data = np.array(2, dtype=_np.float32)
+    data.attach_grad()
+    assert type(data.grad) == np.ndarray
+    assert type(data.detach()) == np.ndarray
+
+
+@with_seed()
+def test_np_ndarray_astype():
+    mx_data = np.array([2, 3, 4, 5], dtype=_np.int32)
+    np_data = mx_data.asnumpy()
+
+    def check_astype_equal(dtype, copy, expect_zero_copy=False):
+        mx_ret = mx_data.astype(dtype=dtype, copy=copy)
+        assert type(mx_ret) is np.ndarray
+        np_ret = np_data.astype(dtype=dtype, copy=copy)
+        assert mx_ret.dtype == np_ret.dtype
+        assert same(mx_ret.asnumpy(), np_ret)
+        if expect_zero_copy:
+            assert id(mx_ret) == id(mx_data)
+            assert id(np_ret) == id(np_data)
+
+    for dtype in [_np.int8, _np.uint8, _np.int32, _np.float16, _np.float32, _np.float64]:
+        for copy in [True, False]:
+            check_astype_equal(dtype, copy, copy is False and mx_data.dtype == dtype)
+
+
+@with_seed()
+def test_np_ndarray_copy():
+    mx_data = np.array([2, 3, 4, 5], dtype=_np.int32)
+    assert_exception(mx_data.copy, NotImplementedError, order='F')
+    mx_ret = mx_data.copy()
+    np_ret = mx_data.asnumpy().copy()
+    assert same(mx_ret.asnumpy(), np_ret)
+
+
+@with_seed()
+@use_np
+def test_np_ndarray_indexing():
+    def test_getitem(np_array, index):
+        """Check that indexing an mxnet np.ndarray with `index` matches
+        indexing the corresponding NumPy array (ndarray indices are
+        converted to NumPy arrays before comparison)."""
+        np_index = index
+        if isinstance(index, np.ndarray):
+            np_index = index.asnumpy()
+        if isinstance(index, tuple):
+            np_index = []
+            for idx in index:
+                if isinstance(idx, np.ndarray):
+                    np_index.append(idx.asnumpy())
+                else:
+                    np_index.append(idx)
+            np_index = tuple(np_index)
+
+        np_indexed_array = np_array[np_index]
+        mx_array = np.array(np_array, dtype=np_array.dtype)
+        mx_indexed_array = mx_array[index].asnumpy()
+        assert same(np_indexed_array, mx_indexed_array), 'Failed with index=%s' % str(index)
+
+    def test_setitem(np_array, index):
+        def assert_same(np_array, np_index, mx_array, mx_index, mx_value, np_value=None):
+            if np_value is not None:
+                np_array[np_index] = np_value
+            elif isinstance(mx_value, np.ndarray):
+                np_array[np_index] = mx_value.asnumpy()
+            else:
+                np_array[np_index] = mx_value
+            mx_array[mx_index] = mx_value
+            assert same(np_array, mx_array.asnumpy())
+
+        np_index = index
+        if isinstance(index, np.ndarray):
+            np_index = index.asnumpy()
+        if isinstance(index, tuple):
+            np_index = []
+            for idx in index:
+                if isinstance(idx, np.ndarray):
+                    np_index.append(idx.asnumpy())
+                else:
+                    np_index.append(idx)
+            np_index = tuple(np_index)
+
+        mx_array = np.array(np_array, dtype=np_array.dtype)
+        np_array = mx_array.asnumpy()
+        indexed_array_shape = np_array[np_index].shape
+        np_indexed_array = _np.random.randint(low=-10000, high=0, size=indexed_array_shape)
+        # test value is a numpy array without broadcast
+        assert_same(np_array, np_index, mx_array, index, np_indexed_array)
+        # test value is a numeric_type
+        assert_same(np_array, np_index, mx_array, index, _np.random.randint(low=-10000, high=0))
+        if len(indexed_array_shape) > 1:
+            # test ndarray with broadcast
+            assert_same(np_array, np_index, mx_array, index,
+                        _np.random.uniform(low=-10000, high=0, size=(indexed_array_shape[-1],)))
+            # test numpy array with broadcast
+            assert_same(np_array, np_index, mx_array, index,
+                        _np.random.randint(low=-10000, high=0, size=(indexed_array_shape[-1],)))
+            # test list with broadcast
+            assert_same(np_array, np_index, mx_array, index,
+                        [_np.random.randint(low=-10000, high=0)] * indexed_array_shape[-1])
+
+    def test_getitem_autograd(np_array, index):
+        x = np.array(np_array, dtype=np_array.dtype)
+        x.attach_grad()
+        with autograd.record():
+            y = x[index]
+        y.backward()
+        value = np.ones_like(y)
+        x_grad = np.zeros_like(x)
+        x_grad[index] = value
+        assert same(x_grad.asnumpy(), x.grad.asnumpy())
+
+    def test_setitem_autograd(np_array, index):
+        x = np.array(np_array, dtype=np_array.dtype)
+        out_shape = x[index].shape
+        y = np.array(_np.random.uniform(size=out_shape))
+        y.attach_grad()
+        try:
+            with autograd.record():
+                x[index] = y
+                assert False  # should not reach here
+        except mx.base.MXNetError as err:
+            assert str(err).find('Inplace operations (+=, -=, x[:]=, etc) are not supported when recording with') != -1
+
+    def np_int(index, int_type=_np.int32):
+        def convert(num):
+            if num is None:
+                return num
+            else:
+                return int_type(num)
+
+        if isinstance(index, slice):
+            return slice(convert(index.start), convert(index.stop), convert(index.step))
+        elif isinstance(index, tuple):  # tuple of slices and integers
+            ret = []
+            for elem in index:
+                if isinstance(elem, slice):
+                    ret.append(slice(convert(elem.start), convert(elem.stop), convert(elem.step)))
+                else:
+                    ret.append(convert(elem))
+            return tuple(ret)
+        else:
+            assert False
+
+    shape = (8, 16, 9, 9)
+    np_array = _np.arange(_np.prod(shape), dtype='int32').reshape(shape)
+    index_list = [
+        (),
+        0,
+        _np.int32(0),
+        _np.int64(0),
+        5,
+        _np.int32(5),
+        _np.int64(5),
+        -1,
+        _np.int32(-1),
+        _np.int64(-1),
+        slice(5),
+        np_int(slice(5), _np.int32),
+        np_int(slice(5), _np.int64),
+        slice(1, 5),
+        np_int(slice(1, 5), _np.int32),
+        np_int(slice(1, 5), _np.int64),
+        slice(1, 5, 2),
+        np_int(slice(1, 5, 2), _np.int32),
+        np_int(slice(1, 5, 2), _np.int64),
+        slice(7, 0, -1),
+        np_int(slice(7, 0, -1)),
+        np_int(slice(7, 0, -1), _np.int64),
+        slice(None, 6),
+        np_int(slice(None, 6)),
+        np_int(slice(None, 6), _np.int64),
+        slice(None, 6, 3),
+        np_int(slice(None, 6, 3)),
+        np_int(slice(None, 6, 3), _np.int64),
+        slice(1, None),
+        np_int(slice(1, None)),
+        np_int(slice(1, None), _np.int64),
+        slice(1, None, 3),
+        np_int(slice(1, None, 3)),
+        np_int(slice(1, None, 3), _np.int64),
+        slice(None, None, 2),
+        np_int(slice(None, None, 2)),
+        np_int(slice(None, None, 2), _np.int64),
+        slice(None, None, -1),
+        np_int(slice(None, None, -1)),
+        np_int(slice(None, None, -1), _np.int64),
+        slice(None, None, -2),
+        np_int(slice(None, None, -2), _np.int32),
+        np_int(slice(None, None, -2), _np.int64),
+        (slice(None), slice(None), 1, 8),
+        (slice(None), slice(None), -1, 8),
+        (slice(None), slice(None), 1, -8),
+        (slice(None), slice(None), -1, -8),
+        np_int((slice(None), slice(None), 1, 8)),
+        np_int((slice(None), slice(None), 1, 8), _np.int64),
+        (slice(None), slice(None), 1, 8),
+        np_int((slice(None), slice(None), -1, -8)),
+        np_int((slice(None), slice(None), -1, -8), _np.int64),
+        (slice(None), 2, slice(1, 5), 1),
+        np_int((slice(None), 2, slice(1, 5), 1)),
+        np_int((slice(None), 2, slice(1, 5), 1), _np.int64),
+        (1, 2, 3),
+        np_int((1, 2, 3)),
+        np_int((1, 2, 3), _np.int64),
+        (-1, -2, -3),
+        np_int((-1, -2, -3)),
+        np_int((-1, -2, -3), _np.int64),
+        (1, 2, 3, 4),
+        np_int((1, 2, 3, 4)),
+        np_int((1, 2, 3, 4), _np.int64),
+        (-4, -3, -2, -1),
+        np_int((-4, -3, -2, -1)),
+        np_int((-4, -3, -2, -1), _np.int64),
+        (slice(None, None, -1), 2, slice(1, 5), 1),
+        np_int((slice(None, None, -1), 2, slice(1, 5), 1)),
+        np_int((slice(None, None, -1), 2, slice(1, 5), 1), _np.int64),
+        (slice(None, None, -1), 2, slice(1, 7, 2), 1),
+        np_int((slice(None, None, -1), 2, slice(1, 7, 2), 1)),
+        np_int((slice(None, None, -1), 2, slice(1, 7, 2), 1), _np.int64),
+        (slice(1, 8, 2), slice(14, 2, -2), slice(3, 8), slice(0, 7, 3)),
+        np_int((slice(1, 8, 2), slice(14, 2, -2), slice(3, 8), slice(0, 7, 3))),
+        np_int((slice(1, 8, 2), slice(14, 2, -2), slice(3, 8), slice(0, 7, 3)), _np.int64),
+        (slice(1, 8, 2), 1, slice(3, 8), 2),
+        np_int((slice(1, 8, 2), 1, slice(3, 8), 2)),
+        np_int((slice(1, 8, 2), 1, slice(3, 8), 2), _np.int64),
+        [1],
+        [1, 2],
+        [2, 1, 3],
+        [7, 5, 0, 3, 6, 2, 1],
+        _np.array([6, 3], dtype=_np.int32),
+        _np.array([[3, 4], [0, 6]], dtype=_np.int32),
+        _np.array([[7, 3], [2, 6], [0, 5], [4, 1]], dtype=_np.int32),
+        _np.array([[7, 3], [2, 6], [0, 5], [4, 1]], dtype=_np.int64),
+        _np.array([[2], [0], [1]], dtype=_np.int32),
+        _np.array([[2], [0], [1]], dtype=_np.int64),
+        np.array([4, 7], dtype=_np.int32),
+        np.array([4, 7], dtype=_np.int64),
+        np.array([[3, 6], [2, 1]], dtype=_np.int32),
+        np.array([[3, 6], [2, 1]], dtype=_np.int64),
+        np.array([[7, 3], [2, 6], [0, 5], [4, 1]], dtype=_np.int32),
+        np.array([[7, 3], [2, 6], [0, 5], [4, 1]], dtype=_np.int64),
+        (1, [2, 3]),
+        (1, [2, 3], _np.array([[3], [0]], dtype=_np.int32)),
+        (1, [2, 3]),
+        (1, [2, 3], _np.array([[3], [0]], dtype=_np.int64)),
+        (1, [2], _np.array([[5], [3]], dtype=_np.int32), slice(None)),
+        (1, [2], _np.array([[5], [3]], dtype=_np.int64), slice(None)),
+        (1, [2, 3], _np.array([[6], [0]], dtype=_np.int32), slice(2, 5)),
+        (1, [2, 3], _np.array([[6], [0]], dtype=_np.int64), slice(2, 5)),
+        (1, [2, 3], _np.array([[4], [7]], dtype=_np.int32), slice(2, 5, 2)),
+        (1, [2, 3], _np.array([[4], [7]], dtype=_np.int64), slice(2, 5, 2)),
+        (1, [2], _np.array([[3]], dtype=_np.int32), slice(None, None, -1)),
+        (1, [2], _np.array([[3]], dtype=_np.int64), slice(None, None, -1)),
+        (1, [2], _np.array([[3]], dtype=_np.int32), np.array([[5, 7], [2, 4]], dtype=_np.int64)),
+        (1, [2], np.array([[4]], dtype=_np.int32), np.array([[1, 3], [5, 7]], dtype='int64')),
+        [0],
+        [0, 1],
+        [1, 2, 3],
+        [2, 0, 5, 6],
+        ([1, 1], [2, 3]),
+        ([1], [4], [5]),
+        ([1], [4], [5], [6]),
+        ([[1]], [[2]]),
+        ([[1]], [[2]], [[3]], [[4]]),
+        (slice(0, 2), [[1], [6]], slice(0, 2), slice(0, 5, 2)),
+        ([[[[1]]]], [[1]], slice(0, 3), [1, 5]),
+        ([[[[1]]]], 3, slice(0, 3), [1, 3]),
+        ([[[[1]]]], 3, slice(0, 3), 0),
+        ([[[[1]]]], [[2], [12]], slice(0, 3), slice(None)),
+        ([1, 2], slice(3, 5), [2, 3], [3, 4]),
+        ([1, 2], slice(3, 5), (2, 3), [3, 4]),
+        range(4),
+        range(3, 0, -1),
+        (range(4,), [1]),
+        # slice(0, 0) does not support output zero-size tensor yet
+    ]
+    for index in index_list:
+        test_getitem(np_array, index)
+        test_setitem(np_array, index)
+        test_getitem_autograd(np_array, index)
+        if not isinstance(index, tuple) or len(index) != 0:
+            # When index = (), a[()] = b is equivalent to b.copyto(a),
+            # which poses no problem for autograd
+            test_setitem_autograd(np_array, index)
+
+
+@with_seed()
+@use_np
+def test_np_save_load_ndarrays():
+    shapes = [(2, 0, 1), (0,), (), (), (0, 4), (), (3, 0, 0, 0), (2, 1), (0, 5, 0), (4, 5, 6), (0, 0, 0)]
+    array_list = [_np.random.randint(0, 10, size=shape) for shape in shapes]
+    array_list = [np.array(arr, dtype=arr.dtype) for arr in array_list]
+    # test save/load single ndarray
+    for i, arr in enumerate(array_list):
+        with TemporaryDirectory() as work_dir:
+            fname = os.path.join(work_dir, 'dataset.npy')
+            npx.save(fname, arr)
+            arr_loaded = npx.load(fname)
+            assert isinstance(arr_loaded, list)
+            assert len(arr_loaded) == 1
+            assert _np.array_equal(arr_loaded[0].asnumpy(), array_list[i].asnumpy())
+
+    # test save/load a list of ndarrays
+    with TemporaryDirectory() as work_dir:
+        fname = os.path.join(work_dir, 'dataset.npy')
+        npx.save(fname, array_list)
+        array_list_loaded = npx.load(fname)
+        assert isinstance(array_list_loaded, list)
+        assert len(array_list) == len(array_list_loaded)
+        assert all(isinstance(arr, np.ndarray) for arr in array_list_loaded)
+        for a1, a2 in zip(array_list, array_list_loaded):
+            assert _np.array_equal(a1.asnumpy(), a2.asnumpy())
+
+    # test save/load a dict of str->ndarray
+    arr_dict = {}
+    keys = [str(i) for i in range(len(array_list))]
+    for k, v in zip(keys, array_list):
+        arr_dict[k] = v
+    with TemporaryDirectory() as work_dir:
+        fname = os.path.join(work_dir, 'dataset.npy')
+        npx.save(fname, arr_dict)
+        arr_dict_loaded = npx.load(fname)
+        assert isinstance(arr_dict_loaded, dict)
+        assert len(arr_dict_loaded) == len(arr_dict)
+        for k, v in arr_dict_loaded.items():
+            assert k in arr_dict
+            assert _np.array_equal(v.asnumpy(), arr_dict[k].asnumpy())
+
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule()
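The new test file above exercises the intended user-facing pattern: under `use_np`, a HybridBlock consumes and produces `np.ndarray`, zero-size shapes flow through unchanged, and `npx.save`/`npx.load` round-trip them. A condensed sketch of that pattern, assuming a build of this branch (`SquarePlus` is a hypothetical example block, not part of the patch):

    from mxnet import np
    from mxnet.gluon import HybridBlock
    from mxnet.test_utils import use_np

    @use_np
    class SquarePlus(HybridBlock):
        def hybrid_forward(self, F, x, *args, **kwargs):
            # x squared plus x, built entirely from np-compatible ops
            return F.np.add(F.np.multiply(x, x), x)

    net = SquarePlus()
    net.hybridize()
    y = net(np.ones((2, 0, 3)))       # zero-size input is preserved
    print(type(y).__name__, y.shape)  # ndarray (2, 0, 3)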
diff --git a/tests/python/unittest/test_thread_local.py b/tests/python/unittest/test_thread_local.py
index b553299ab4d7..ee56ba780a95 100644
--- a/tests/python/unittest/test_thread_local.py
+++ b/tests/python/unittest/test_thread_local.py
@@ -23,6 +23,7 @@
 from mxnet.attribute import AttrScope
 from mxnet.name import NameManager
 from mxnet.test_utils import set_default_context
+from mxnet.util import _NumpyArrayScope
 
 def test_context():
     ctx_list = []
@@ -163,6 +164,41 @@ def f():
     thread.join()
     assert status[0], "Failed to execute a symbolic graph within a thread"
 
+
+def test_np_array_scope():
+    np_array_scope_list = []
+    _NumpyArrayScope._current = _NumpyArrayScope(False)
+    np_array_scope_list.append(_NumpyArrayScope._current)
+
+    def f():
+        _NumpyArrayScope._current = _NumpyArrayScope(True)
+        np_array_scope_list.append(_NumpyArrayScope._current)
+
+    thread = threading.Thread(target=f)
+    thread.start()
+    thread.join()
+    assert len(np_array_scope_list) == 2
+    assert not np_array_scope_list[0]._is_np_array
+    assert np_array_scope_list[1]._is_np_array
+
+    event = threading.Event()
+    status = [False]
+
+    def g():
+        with mx.np_array(False):
+            event.wait()
+            if not mx.is_np_array():
+                status[0] = True
+
+    thread = threading.Thread(target=g)
+    thread.start()
+    _NumpyArrayScope._current = _NumpyArrayScope(True)
+    event.set()
+    thread.join()
+    event.clear()
+    assert status[0], "Spawned thread didn't set status correctly"
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()
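test_np_array_scope above pins down the thread-locality of the array-semantics switch: a `_NumpyArrayScope` entered in one thread is invisible to others. A small sketch of the same behavior through the public helpers, assuming a build of this branch:

    import threading
    import mxnet as mx

    def worker(out):
        # A spawned thread starts from the default scope, not from the
        # parent thread's np_array(True) scope.
        out.append(mx.is_np_array())

    with mx.np_array(True):
        print(mx.is_np_array())   # True in this thread
        results = []
        t = threading.Thread(target=worker, args=(results,))
        t.start()
        t.join()
        print(results[0])         # False: the scope did not leak across threads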