diff --git a/include/mxnet/base.h b/include/mxnet/base.h index f568e45bd69c..0c13e4eaa27e 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -421,7 +421,9 @@ inline int32_t Context::GetGPUCount() { #if MXNET_USE_CUDA int32_t count; cudaError_t e = cudaGetDeviceCount(&count); - if (e == cudaErrorNoDevice) { + // TODO(junwu): Remove e == cudaErrorInsufficientDriver + // This is a workaround for the wheel build system, which runs with an older CUDA driver. + if (e == cudaErrorNoDevice || e == cudaErrorInsufficientDriver) { return 0; } CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e); diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 95d13fe2125c..f0d5e8e55b52 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -2860,6 +2860,18 @@ MXNET_DLL int MXEnginePushSync(EngineSyncFunc sync_func, void* func_param, EngineVarHandle mutable_vars_handle, int num_mutable_vars, EngineFnPropertyHandle prop_handle DEFAULT(NULL), int priority DEFAULT(0), const char* opr_name DEFAULT(NULL)); +/*! + * \brief Create an NDArray from source sharing the same data chunk. + * \param src source NDArray + * \param out new NDArray sharing the same data chunk with src + */ +MXNET_DLL int MXShallowCopyNDArray(NDArrayHandle src, NDArrayHandle* out); +/*! + * \brief Create a Symbol from source sharing the same graph structure. + * \param src source Symbol + * \param out new Symbol sharing the same graph structure with src + */ +MXNET_DLL int MXShallowCopySymbol(SymbolHandle src, SymbolHandle* out); /*! * \brief Push an asynchronous operation to the engine. diff --git a/include/mxnet/tuple.h b/include/mxnet/tuple.h index bc630f153744..f018c8faabea 100644 --- a/include/mxnet/tuple.h +++ b/include/mxnet/tuple.h @@ -272,6 +272,14 @@ class Tuple { is.get(); if (ch == '(' || ch == '[') break; if (!isspace(ch)) { + if (ch == 'N') { + std::string tmp_val; + is >> tmp_val; + if (tmp_val == "one") { // the stream held "None" + t.SetDim(-1); + return is; + } + } is.setstate(std::ios::failbit); return is; } @@ -653,6 +661,13 @@ inline bool shape_is_known(const TShape& x) { return true; } +inline bool shape_is_known(const std::vector<TShape>& shapes) { + for (const TShape& shape : shapes) { + if (!shape_is_known(shape)) return false; + } + return true; +} + /*! \brief helper function to cast type of container elements */ template<typename SrcIter, typename DstIter> inline DstIter ShapeTypeCast(const SrcIter begin, diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index 233bb2a1f57e..e9c1229d7f2f 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -25,11 +25,16 @@ from . import engine from .base import MXNetError from .util import is_np_shape, set_np_shape, np_shape, use_np_shape +from .util import is_np_array, np_array, use_np_array, use_np from . import base from . import library from . import contrib from . import ndarray from . import ndarray as nd +from . import numpy +from . import numpy_extension +from . import numpy as np +from . import numpy_extension as npx from . import name # use mx.sym as short for symbol from . 
import symbol as sym diff --git a/python/mxnet/_ctypes/ndarray.py b/python/mxnet/_ctypes/ndarray.py index f324545a2352..b1a38c1d2621 100644 --- a/python/mxnet/_ctypes/ndarray.py +++ b/python/mxnet/_ctypes/ndarray.py @@ -55,6 +55,8 @@ def __reduce__(self): _ndarray_cls = None +_np_ndarray_cls = None + def _set_ndarray_class(cls): """Set the symbolic class to be cls""" @@ -62,7 +64,13 @@ def _set_ndarray_class(cls): _ndarray_cls = cls -def _imperative_invoke(handle, ndargs, keys, vals, out): +def _set_np_ndarray_class(cls): + """Set the numpy ndarray class to be cls""" + global _np_ndarray_cls + _np_ndarray_cls = cls + + +def _imperative_invoke(handle, ndargs, keys, vals, out, is_np_op): """ctypes implementation of imperative invoke wrapper""" if out is not None: original_output = out @@ -91,23 +99,27 @@ def _imperative_invoke(handle, ndargs, keys, vals, out): c_str_array([str(s) for s in vals]), ctypes.byref(out_stypes))) + create_ndarray_fn = _np_ndarray_cls if is_np_op else _ndarray_cls if original_output is not None: return original_output if num_output.value == 1: - return _ndarray_cls(ctypes.cast(output_vars[0], NDArrayHandle), - stype=out_stypes[0]) + return create_ndarray_fn(ctypes.cast(output_vars[0], NDArrayHandle), + stype=out_stypes[0]) else: - return [_ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle), - stype=out_stypes[i]) - for i in range(num_output.value)] + return [create_ndarray_fn(ctypes.cast(output_vars[i], NDArrayHandle), + stype=out_stypes[i]) for i in range(num_output.value)] class CachedOp(object): """Cached operator handle.""" - __slots__ = ["handle"] + __slots__ = ["handle", "is_np_sym"] + def __init__(self, sym, flags=()): self.handle = CachedOpHandle() + from ..symbol.numpy._symbol import _Symbol + self.is_np_sym = bool(isinstance(sym, _Symbol)) + check_call(_LIB.MXCreateCachedOpEx( sym.handle, len(flags), @@ -151,10 +163,10 @@ def __call__(self, *args, **kwargs): if original_output is not None: return original_output + create_ndarray_fn = _np_ndarray_cls if self.is_np_sym else _ndarray_cls if num_output.value == 1: - return _ndarray_cls(ctypes.cast(output_vars[0], NDArrayHandle), - stype=out_stypes[0]) + return create_ndarray_fn(ctypes.cast(output_vars[0], NDArrayHandle), + stype=out_stypes[0]) else: - return [_ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle), - stype=out_stypes[i]) - for i in range(num_output.value)] + return [create_ndarray_fn(ctypes.cast(output_vars[i], NDArrayHandle), + stype=out_stypes[i]) for i in range(num_output.value)] diff --git a/python/mxnet/_ctypes/symbol.py b/python/mxnet/_ctypes/symbol.py index fe4cb950ed14..01ba18b38963 100644 --- a/python/mxnet/_ctypes/symbol.py +++ b/python/mxnet/_ctypes/symbol.py @@ -26,7 +26,9 @@ from ..base import SymbolHandle from ..base import check_call +# The symbol class to be used (Cython or Ctypes) _symbol_cls = None +_np_symbol_cls = None class SymbolBase(object): """Symbol is symbolic graph.""" @@ -115,7 +117,13 @@ def _set_symbol_class(cls): _symbol_cls = cls -def _symbol_creator(handle, args, kwargs, keys, vals, name): +def _set_np_symbol_class(cls): + """Set the numpy-compatible symbolic class to be cls""" + global _np_symbol_cls + _np_symbol_cls = cls + + +def _symbol_creator(handle, args, kwargs, keys, vals, name, is_np_op): sym_handle = SymbolHandle() check_call(_LIB.MXSymbolCreateAtomicSymbol( ctypes.c_void_p(handle), @@ -128,7 +136,8 @@ def _symbol_creator(handle, args, kwargs, keys, vals, name): raise TypeError( 'Operators with variable length input can only accept input ' 'Symbols 
either as positional or keyword arguments, not both') - s = _symbol_cls(sym_handle) + create_symbol_fn = _np_symbol_cls if is_np_op else _symbol_cls + s = create_symbol_fn(sym_handle) if args: s._compose(*args, name=name) elif kwargs: diff --git a/python/mxnet/_numpy_op_doc.py b/python/mxnet/_numpy_op_doc.py new file mode 100644 index 000000000000..5543ebc8e8c9 --- /dev/null +++ b/python/mxnet/_numpy_op_doc.py @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: skip-file + +"""Doc placeholder for numpy ops with prefix _np.""" + + +def _np_ones_like(a): + """Return an array of ones with the same shape and type as a given array. + + Parameters + ---------- + a : ndarray + The shape and data-type of `a` define these same attributes of + the returned array. + + Returns + ------- + out : ndarray + Array of ones with the same shape and type as `a`. + """ + pass + + +def _np_zeros_like(a): + """Return an array of zeros with the same shape and type as a given array. + + Parameters + ---------- + a : ndarray + The shape and data-type of `a` define these same attributes of + the returned array. + + Returns + ------- + out : ndarray + Array of zeros with the same shape and type as `a`. + """ + pass diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 17819bde28b2..c16c36c7f419 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -16,7 +16,7 @@ # under the License. 
# coding: utf-8 -# pylint: disable=invalid-name, no-member, trailing-comma-tuple, bad-mcs-classmethod-argument, unnecessary-pass, wrong-import-position +# pylint: disable=invalid-name, no-member, trailing-comma-tuple, bad-mcs-classmethod-argument, unnecessary-pass, too-many-lines, wrong-import-position """ctypes library of mxnet and helper functions.""" from __future__ import absolute_import @@ -598,7 +598,9 @@ def _init_op_module(root_namespace, module_name, make_op_func): ctypes.byref(plist))) op_names = [] for i in range(size.value): - op_names.append(py_str(plist[i])) + op_name = py_str(plist[i]) + if not _is_np_op(op_name): + op_names.append(op_name) module_op = sys.modules["%s.%s.op" % (root_namespace, module_name)] module_internal = sys.modules["%s.%s._internal" % (root_namespace, module_name)] @@ -692,7 +694,9 @@ def write_all_str(module_file, module_all_list): ctypes.byref(plist))) op_names = [] for i in range(size.value): - op_names.append(py_str(plist[i])) + op_name = py_str(plist[i]) + if not _is_np_op(op_name): + op_names.append(op_name) module_op_file = get_module_file("%s.%s.op" % (root_namespace, module_name)) module_op_all = [] @@ -739,3 +743,110 @@ def write_all_str(module_file, module_all_list): if Features().is_enabled("TVM_OP"): _LIB_TVM_OP = libinfo.find_lib_path("libtvmop") check_call(_LIB.MXLoadTVMOp(c_str(_LIB_TVM_OP[0]))) + + +_NP_OP_PREFIX = '_np_' +_NP_OP_SUBMODULE_LIST = ['_random_', '_linalg_'] + +_NP_EXT_OP_PREFIX = '_npx_' +_NP_EXT_OP_SUBMODULE_LIST = ['_image_'] + +_NP_INTERNAL_OP_PREFIX = '_npi_' + + +def _is_np_op(op_name): + return op_name.startswith(_NP_OP_PREFIX) or op_name.startswith(_NP_EXT_OP_PREFIX)\ + or op_name.startswith(_NP_INTERNAL_OP_PREFIX) + + +def _get_op_submodule_name(op_name, op_name_prefix, submodule_name_list): + """Get the submodule name of a specific op""" + assert op_name.startswith(op_name_prefix) + for submodule_name in submodule_name_list: + if op_name[len(op_name_prefix):].startswith(submodule_name): + return submodule_name + return "" + + +def _init_np_op_module(root_module_name, np_module_name, mx_module_name, make_op_func): + """ + Register numpy operators in the namespaces `mxnet.numpy`, `mxnet.ndarray.numpy` + and `mxnet.symbol.numpy`. They are used in imperative mode, Gluon APIs w/o hybridization, + and Gluon APIs w/ hybridization, respectively. Essentially, operators with the same name + registered in the three namespaces share the same functionality in the C++ backend. + Different namespaces are needed for dispatching operator calls in Gluon's `HybridBlock` by `F`. + + Parameters + ---------- + root_module_name : str + Top-level module name, `mxnet` in the current case. + np_module_name : str + Second-level module name, `numpy`, `numpy._internal` or `numpy_extension` in the current case. + mx_module_name : str or None + Third-level module name, `ndarray` or `symbol` in the current case, or None when + registering ops for imperative use. + make_op_func : function + Function for creating op functions. + """ + from . 
import _numpy_op_doc as _np_op_doc + if np_module_name == 'numpy': + op_name_prefix = _NP_OP_PREFIX + submodule_name_list = _NP_OP_SUBMODULE_LIST + elif np_module_name == 'numpy_extension': + op_name_prefix = _NP_EXT_OP_PREFIX + submodule_name_list = _NP_EXT_OP_SUBMODULE_LIST + elif np_module_name == 'numpy._internal': + op_name_prefix = _NP_INTERNAL_OP_PREFIX + submodule_name_list = [] + else: + raise ValueError('unsupported np module name {}'.format(np_module_name)) + + plist = ctypes.POINTER(ctypes.c_char_p)() + size = ctypes.c_uint() + check_call(_LIB.MXListAllOpNames(ctypes.byref(size), ctypes.byref(plist))) + op_names = [] + for i in range(size.value): + name = py_str(plist[i]) + if name.startswith(op_name_prefix): + op_names.append(name) + + if mx_module_name is None: + # register np/npx ops for imperative programming + op_module_name = "%s.%s._op" % (root_module_name, np_module_name) # e.g. mxnet.numpy._op + op_submodule_name = "%s.%s" % (root_module_name, np_module_name) # e.g. mxnet.numpy.random + elif mx_module_name in ('ndarray', 'symbol'): + # register numpy internal ops and np/npx ops for use in Gluon + # np internal ops are registered in mxnet.ndarray/symbol.numpy._internal + # np ops are registered in mxnet.ndarray/symbol.numpy._op + # npx ops are registered in mxnet.ndarray/symbol.numpy_extension._op + op_module_name = "%s.%s.%s" % (root_module_name, mx_module_name, np_module_name) + if op_name_prefix != _NP_INTERNAL_OP_PREFIX: + op_module_name += '._op' + # e.g. mxnet.symbol.numpy.random + op_submodule_name = "%s.%s.%s" % (root_module_name, mx_module_name, np_module_name) + else: + raise ValueError('unsupported mxnet module {}'.format(mx_module_name)) + op_submodule_name += '.%s' + + op_module = sys.modules[op_module_name] + submodule_dict = {} + for submodule_name in submodule_name_list: + submodule_dict[submodule_name] = sys.modules[op_submodule_name % submodule_name[1:-1]] + for name in op_names: + hdl = OpHandle() + check_call(_LIB.NNGetOpHandle(c_str(name), ctypes.byref(hdl))) + submodule_name = _get_op_submodule_name(name, op_name_prefix, submodule_name_list) + if len(submodule_name) > 0: + func_name = name[(len(op_name_prefix) + len(submodule_name)):] + cur_module = submodule_dict[submodule_name] + module_name_local = op_submodule_name % submodule_name[1:-1] + else: + func_name = name[len(op_name_prefix):] + cur_module = op_module + module_name_local =\ + op_module_name[:-len('._op')] if op_module_name.endswith('._op') else op_module_name + + function = make_op_func(hdl, name, func_name) + function.__module__ = module_name_local + setattr(cur_module, function.__name__, function) + cur_module.__all__.append(function.__name__) + + if hasattr(_np_op_doc, name): + function.__doc__ = getattr(_np_op_doc, name).__doc__ diff --git a/python/mxnet/cython/ndarray.pyx b/python/mxnet/cython/ndarray.pyx index f9279889b504..50791e9b9a86 100644 --- a/python/mxnet/cython/ndarray.pyx +++ b/python/mxnet/cython/ndarray.pyx @@ -64,21 +64,27 @@ cdef class NDArrayBase: _ndarray_cls = None +_np_ndarray_cls = None def _set_ndarray_class(cls): global _ndarray_cls _ndarray_cls = cls -cdef NewArray(NDArrayHandle handle, int stype=-1): +def _set_np_ndarray_class(cls): + global _np_ndarray_cls + _np_ndarray_cls = cls + + +cdef NewArray(NDArrayHandle handle, int stype=-1, int is_np_array=0): """Create a new array given handle""" - return _ndarray_cls(_ctypes.cast(handle, _ctypes.c_void_p), stype=stype) + create_array_fn = _np_ndarray_cls if is_np_array else _ndarray_cls + return 
create_array_fn(_ctypes.cast(handle, _ctypes.c_void_p), stype=stype) cdef class CachedOp: """Cached operator handle.""" cdef CachedOpHandle chandle - cdef _set_handle(self, handle): cdef unsigned long long ptr if handle is None: @@ -96,6 +102,8 @@ cdef class CachedOp: def __set__(self, value): self._set_handle(value) + cdef int is_np_sym + def __init__(self, sym, flags=()): cdef vector[string] s_flag_keys cdef vector[string] s_flag_vals @@ -106,6 +114,9 @@ cdef class CachedOp: cdef vector[const char*] c_flag_keys = SVec2Ptr(s_flag_keys) cdef vector[const char*] c_flag_vals = SVec2Ptr(s_flag_vals) + from ..symbol.numpy._symbol import _Symbol + self.is_np_sym = bool(isinstance(sym, _Symbol)) + CALL(MXCreateCachedOpEx( (sym.handle.value), len(flags), @@ -154,12 +165,12 @@ cdef class CachedOp: if original_output is not None: return original_output if num_output == 1: - return NewArray(p_output_vars[0], p_output_stypes[0]) + return NewArray(p_output_vars[0], p_output_stypes[0], self.is_np_sym) else: - return [NewArray(p_output_vars[i], p_output_stypes[i]) for i in range(num_output)] + return [NewArray(p_output_vars[i], p_output_stypes[i], self.is_np_sym) for i in range(num_output)] -def _imperative_invoke(handle, ndargs, keys, vals, out): +def _imperative_invoke(handle, ndargs, keys, vals, out, is_np_op=0): """cython implementation of imperative invoke wrapper""" cdef unsigned long long ihandle = handle cdef OpHandle chandle = ihandle @@ -211,6 +222,6 @@ def _imperative_invoke(handle, ndargs, keys, vals, out): if original_output is not None: return original_output if num_output == 1: - return NewArray(p_output_vars[0], p_output_stypes[0]) + return NewArray(p_output_vars[0], p_output_stypes[0], is_np_op) else: - return [NewArray(p_output_vars[i], p_output_stypes[i]) for i in range(num_output)] + return [NewArray(p_output_vars[i], p_output_stypes[i], is_np_op) for i in range(num_output)] diff --git a/python/mxnet/cython/symbol.pyx b/python/mxnet/cython/symbol.pyx index 1bdea6c6c547..86fe8ae6db4f 100644 --- a/python/mxnet/cython/symbol.pyx +++ b/python/mxnet/cython/symbol.pyx @@ -84,19 +84,27 @@ cdef SymbolSetAttr(SymbolHandle handle, dict kwargs): _symbol_cls = SymbolBase +_np_symbol_cls = None def _set_symbol_class(cls): global _symbol_cls _symbol_cls = cls -cdef NewSymbol(SymbolHandle handle): + +def _set_np_symbol_class(cls): + global _np_symbol_cls + _np_symbol_cls = cls + + +cdef NewSymbol(SymbolHandle handle, int is_np_sym=0): """Create a new symbol given handle""" - sym = _symbol_cls(None) + create_symbol_fn = _np_symbol_cls if is_np_sym else _symbol_cls + sym = create_symbol_fn(None) (sym).chandle = handle return sym -def _symbol_creator(handle, args, kwargs, keys, vals, name): +def _symbol_creator(handle, args, kwargs, keys, vals, name, is_np_op=0): cdef unsigned long long ihandle = handle cdef OpHandle chandle = ihandle cdef vector[string] ckeys @@ -143,4 +151,4 @@ def _symbol_creator(handle, args, kwargs, keys, vals, name): &csym_keys[0] if csym_keys.size() != 0 else NULL, &sym_args[0] if sym_args.size() != 0 else NULL)) - return NewSymbol(ret_handle) + return NewSymbol(ret_handle, is_np_op) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 3bac3c023bf8..1567c7baa58e 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -26,7 +26,6 @@ import re from collections import OrderedDict - from ..base import mx_real_t, MXNetError from .. import symbol, ndarray, initializer from ..symbol import Symbol @@ -34,6 +33,10 @@ from .. 
import name as _name from .parameter import Parameter, ParameterDict, DeferredInitializationError from .utils import _indent, _brief_print_list, HookHandle +from .utils import _check_same_symbol_type, _check_all_np_ndarrays +from .. import numpy_extension as _mx_npx +from .. import numpy as _mx_np +from ..util import is_np_array, np_shape, np_array class _BlockScope(object): @@ -332,7 +335,8 @@ def save_parameters(self, filename): """ params = self._collect_params_with_prefix() arg_dict = {key : val._reduce() for key, val in params.items()} - ndarray.save(filename, arg_dict) + save_fn = _mx_npx.save if is_np_array() else ndarray.save + save_fn(filename, arg_dict) def save_params(self, filename): """[Deprecated] Please use save_parameters. Note that if you want load @@ -381,7 +385,28 @@ def load_parameters(self, filename, ctx=None, allow_missing=False, `Saving and Loading Gluon Models \ `_ """ - loaded = ndarray.load(filename) + if is_np_array(): + # Loading parameters saved as legacy NDArrays may fail under + # NumPy semantics. Check the failure type and recover from it if it happens. + try: + loaded = _mx_npx.load(filename) + except MXNetError as e: + err_msg = str(e) + if 'is_np_shape' in err_msg: + # Loading failure due to parameters saved without numpy semantics. + # Temporarily disable numpy semantics and load parameters. After it's + # done, resume the numpy semantics. This is fine because the set of cases + # the numpy ndarray covers is a superset of the legacy ndarray's. + with np_array(False): + with np_shape(False): + loaded_nds = ndarray.load(filename) + assert isinstance(loaded_nds, dict),\ + 'expecting a dict type, got {}'.format(str(type(loaded_nds))) + loaded = {k: loaded_nds[k].as_np_ndarray() for k in loaded_nds} + else: + raise ValueError(err_msg) + else: + loaded = ndarray.load(filename) params = self._collect_params_with_prefix() if not loaded and not params: return @@ -549,7 +574,8 @@ def __call__(self, *args): for hook in self._forward_hooks.values(): hook(self, args, out) - + if _mx_npx.is_np_array(): + _check_all_np_ndarrays(out) return out def forward(self, *args): @@ -739,9 +765,13 @@ def _get_graph(self, *args): if not self._cached_graph: args, self._in_format = _flatten(args, "input") if len(args) > 1: - inputs = [symbol.var('data%d'%i) for i in range(len(args))] + inputs = [symbol.var('data%d' % i).as_np_ndarray() + if isinstance(args[i], _mx_np.ndarray) + else symbol.var('data%d' % i) for i in range(len(args))] else: - inputs = [symbol.var('data')] + inputs = [symbol.var('data').as_np_ndarray() + if isinstance(args[0], _mx_np.ndarray) + else symbol.var('data')] grouped_inputs = _regroup(inputs, self._in_format)[0] params = {i: j.var() for i, j in self._reg_params.items()} @@ -749,7 +779,7 @@ def _get_graph(self, *args): out = self.hybrid_forward(symbol, *grouped_inputs, **params) # pylint: disable=no-value-for-parameter out, self._out_format = _flatten(out, "output") - self._cached_graph = inputs, symbol.Group(out) + self._cached_graph = inputs, symbol.Group(out, _check_same_symbol_type(out)) return self._cached_graph @@ -904,7 +934,8 @@ def export(self, path, epoch=0, remove_amp_cast=True): else: assert name in aux_names arg_dict['aux:%s'%name] = param._reduce() - ndarray.save('%s-%04d.params'%(path, epoch), arg_dict) + save_fn = _mx_npx.save if is_np_array() else ndarray.save + save_fn('%s-%04d.params'%(path, epoch), arg_dict) def forward(self, x, *args): """Defines the forward computation. 
Arguments can be either @@ -1057,7 +1088,7 @@ def __init__(self, outputs, inputs, params=None): syms, self._in_format = _flatten(inputs, "input") out, self._out_format = _flatten(outputs, "output") - out = symbol.Group(out) + out = symbol.Group(out, _check_same_symbol_type(out)) input_names = set() for i in syms: diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py index accd968cc9df..04610165094d 100644 --- a/python/mxnet/gluon/data/dataloader.py +++ b/python/mxnet/gluon/data/dataloader.py @@ -124,6 +124,7 @@ def __init__(self, *args, **kwargs): self._send = self._writer.send self._recv = self._reader.recv + def default_batchify_fn(data): """Collate data into batch.""" if isinstance(data[0], nd.NDArray): @@ -382,6 +383,7 @@ def same_process_iter(): def __len__(self): return len(self._batch_sampler) + _worker_dataset = None def _worker_initializer(dataset): """Initializer for processing pool.""" diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 548407584715..ca4be735bcb2 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -18,6 +18,8 @@ # coding: utf-8 # pylint: disable=unnecessary-pass """Neural network parameter.""" +from __future__ import absolute_import + __all__ = ['DeferredInitializationError', 'Parameter', 'Constant', 'ParameterDict', 'tensor_types'] @@ -31,7 +33,8 @@ from ..context import Context, cpu from .. import autograd from .utils import _indent, _brief_print_list, shape_is_known -from .. import is_np_shape +from ..util import is_np_shape, is_np_array +from .. import numpy as _mx_np # pylint: disable=reimported # pylint: disable= invalid-name tensor_types = (symbol.Symbol, ndarray.NDArray) @@ -131,7 +134,6 @@ def __init__(self, name, grad_req='write', shape=None, dtype=mx_real_t, self._grad_stype = grad_stype self._stype = stype - def __repr__(self): s = 'Parameter {name} (shape={shape}, dtype={dtype})' return s.format(name=self.name, shape=self.shape, dtype=self.dtype) @@ -191,9 +193,9 @@ def shape(self, new_shape): return assert len(self._shape) == len(new_shape) and \ - all(j in (0, i) for i, j in zip(new_shape, self._shape)), \ + all(j in (-1, 0, i) for i, j in zip(new_shape, self._shape)), \ "Expected shape %s is incompatible with given shape %s."%( - str(new_shape), str(self._shape)) + str(new_shape), str(self._shape)) # -1 means unknown dim size in np_shape mode self._shape = new_shape @@ -272,12 +274,14 @@ def _load_init(self, data, ctx, cast_dtype=False, dtype_source='current'): if cast_dtype: assert dtype_source in ['current', 'saved'] if self.shape: + unknown_dim_size = -1 if is_np_shape() else 0 for self_dim, data_dim in zip(self.shape, data.shape): - assert self_dim in (0, data_dim), \ + assert self_dim in (unknown_dim_size, data_dim), \ "Failed loading Parameter '%s' from saved params: " \ "shape incompatible expected %s vs saved %s"%( self.name, str(self.shape), str(data.shape)) - self.shape = tuple(i if i != 0 else j for i, j in zip(self.shape, data.shape)) + self.shape = tuple(i if i != unknown_dim_size else j + for i, j in zip(self.shape, data.shape)) if self.dtype: if cast_dtype and np.dtype(self.dtype).type != data.dtype: if dtype_source == 'current': @@ -318,6 +322,7 @@ def _finish_deferred_init(self): return init, ctx, default_init, data = self._deferred_init self._deferred_init = () + assert shape_is_known(self.shape), \ "Cannot initialize Parameter '%s' because it has " \ "invalid shape: %s. 
Please specify in_units, " \ @@ -326,8 +331,16 @@ def _finish_deferred_init(self): with autograd.pause(): if data is None: - data = ndarray.zeros(shape=self.shape, dtype=self.dtype, - ctx=context.cpu(), stype=self._stype) + kwargs = {'shape': self.shape, 'dtype': self.dtype, 'ctx': context.cpu()} + if is_np_array(): + if self._stype != 'default': + raise ValueError("mxnet.numpy.zeros does not support stype = {}" + .format(self._stype)) + zeros_fn = _mx_np.zeros + else: + kwargs['stype'] = self._stype + zeros_fn = ndarray.zeros + data = zeros_fn(**kwargs) initializer.create(default_init)( initializer.InitDesc(self.name, {'__init__': init}), data) @@ -352,8 +365,15 @@ def _init_grad(self): self._grad = None return - self._grad = [ndarray.zeros(shape=i.shape, dtype=i.dtype, ctx=i.context, - stype=self._grad_stype) for i in self._data] + if is_np_array(): + if self._grad_stype != 'default': + raise ValueError("mxnet.numpy.zeros does not support stype = {}" + .format(self._grad_stype)) + self._grad = [_mx_np.zeros(shape=i.shape, dtype=i.dtype, ctx=i.context) + for i in self._data] + else: + self._grad = [ndarray.zeros(shape=i.shape, dtype=i.dtype, ctx=i.context, + stype=self._grad_stype) for i in self._data] autograd.mark_variables(self._check_and_get(self._data, list), self._grad, self.grad_req) @@ -363,7 +383,10 @@ def _reduce(self): ctx = context.cpu() if self._stype == 'default': block = self.list_data() - data = ndarray.add_n(*(w.copyto(ctx) for w in block)) / len(block) + if is_np_array(): + data = sum([w.copyto(ctx) for w in block]) / len(block) + else: + data = ndarray.add_n(*(w.copyto(ctx) for w in block)) / len(block) else: # fetch all rows for 'row_sparse' param all_row_ids = ndarray.arange(0, self.shape[0], dtype='int64', ctx=ctx) @@ -463,7 +486,6 @@ def reset_ctx(self, ctx): raise ValueError("Cannot reset context for Parameter '%s' because it " "has not been initialized."%self.name) - def set_data(self, data): """Sets this parameter's value on all contexts.""" self.shape = data.shape @@ -602,6 +624,8 @@ def var(self): self._var = symbol.var(self.name, shape=self.shape, dtype=self.dtype, lr_mult=self.lr_mult, wd_mult=self.wd_mult, init=self.init, stype=self._stype) + if is_np_array(): + self._var = self._var.as_np_ndarray() return self._var def cast(self, dtype): @@ -763,12 +787,12 @@ def get(self, name, **kwargs): inferred_shape = [] matched = True for dim1, dim2 in zip(v, existing): - if dim1 != dim2 and dim1 * dim2 != 0: + if dim1 != dim2 and dim1 > 0 and dim2 > 0: matched = False break elif dim1 == dim2: inferred_shape.append(dim1) - elif dim1 == 0: + elif dim1 in (0, -1): # -1 means unknown dim size in np_shape mode inferred_shape.append(dim2) else: inferred_shape.append(dim1) diff --git a/python/mxnet/gluon/rnn/rnn_layer.py b/python/mxnet/gluon/rnn/rnn_layer.py index b3cc596282a7..2a9cd88bb214 100644 --- a/python/mxnet/gluon/rnn/rnn_layer.py +++ b/python/mxnet/gluon/rnn/rnn_layer.py @@ -236,7 +236,6 @@ def __call__(self, inputs, states=None, sequence_length=None, **kwargs): else: return super(_RNNLayer, self).__call__(inputs, states, **kwargs) - def hybrid_forward(self, F, inputs, states, sequence_length=None, **kwargs): if F is ndarray: batch_size = inputs.shape[self._layout.find('N')] @@ -285,7 +284,6 @@ def _forward_kernel(self, F, inputs, states, sequence_length, **kwargs): lstm_state_clip_max=self._lstm_state_clip_max, lstm_state_clip_nan=self._lstm_state_clip_nan) - if self._mode == 'lstm': outputs, states = rnn[0], [rnn[1], rnn[2]] else: diff --git 
a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py index 3957b7402688..97c513fb6447 100644 --- a/python/mxnet/gluon/utils.py +++ b/python/mxnet/gluon/utils.py @@ -40,6 +40,7 @@ class requests_failed_to_import(object): from .. import ndarray from ..util import is_np_shape + def split_data(data, num_slice, batch_axis=0, even_split=True): """Splits an NDArray into `num_slice` slices along `batch_axis`. Usually used for data parallelism where each slice is sent @@ -108,7 +109,7 @@ def split_and_load(data, ctx_list, batch_axis=0, even_split=True): Returns ------- - list of NDArray + list of NDArrays Each corresponds to a context in `ctx_list`. """ if not isinstance(data, ndarray.NDArray): @@ -414,6 +415,7 @@ def __enter__(self): def __exit__(self, ptype, value, trace): self.detach() + def shape_is_known(shape): """Check whether a shape is completely known with or without np semantics. @@ -430,3 +432,44 @@ def shape_is_known(shape): assert dim_size > unknown_dim_size, "shape dimension size cannot be less than {}, while " \ "received {}".format(unknown_dim_size, dim_size) return True + + +def _check_same_symbol_type(symbols): + """Check whether all the symbols in the list are of the same type. + Raise a TypeError if the types are different. Return the class of + the symbols.""" + from ..symbol.numpy import _Symbol as np_symbol + from ..symbol import Symbol as nd_symbol + is_np_sym = isinstance(symbols[0], np_symbol) + for s in symbols[1:]: + if is_np_sym != isinstance(s, np_symbol): + raise TypeError('Found both classic symbol (mx.sym.Symbol) and numpy symbol ' + '(mx.sym.np._Symbol) in outputs. This will prevent you from building ' + 'a computation graph by grouping them since different types of symbols ' + 'are not allowed to be grouped in Gluon to form a computation graph. ' + 'You will need to convert them to the same type of symbols, either ' + 'classic or numpy following this rule: if you want numpy ndarray ' + 'output(s) from the computation graph, please convert all the classic ' + 'symbols in the list to numpy symbols by calling `as_np_ndarray()` ' + 'on each of them; if you want classic ndarray output(s) from the ' + 'computation graph, please convert all the numpy symbols in the list ' + 'to classic symbols by calling `as_nd_ndarray()` on each of them.') + return np_symbol if is_np_sym else nd_symbol + + +def _check_all_np_ndarrays(out): + """Check if ndarrays/symbols in out are all np.ndarray/np._Symbol.""" + from ..numpy import ndarray as np_ndarray + from ..symbol.numpy import _Symbol as np_symbol + from ..symbol import Symbol as nd_symbol + from ..ndarray import NDArray as nd_ndarray + + # pylint: disable=no-else-raise + if isinstance(out, (nd_ndarray, nd_symbol)) and not isinstance(out, (np_ndarray, np_symbol)): + raise TypeError("Block's output ndarrays/symbols must be of type `mxnet.numpy.ndarray`" + " or `mxnet.symbol.numpy._Symbol`, but got output type {}" + .format(str(type(out)))) + elif isinstance(out, (list, tuple)): + for i in out: + _check_all_np_ndarrays(i) + # pylint: enable=no-else-raise
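
For illustration, a minimal sketch of what the two checks above enforce (not part of the patch; the variable names are hypothetical, and `as_np_ndarray()` on a symbol is assumed to behave as in the `block.py` changes above):

    import mxnet as mx
    from mxnet.gluon.utils import _check_same_symbol_type

    a = mx.sym.var('a')                  # classic symbol
    b = mx.sym.var('b').as_np_ndarray()  # numpy symbol
    _check_same_symbol_type([a, a])      # OK, returns mx.sym.Symbol
    _check_same_symbol_type([a, b])      # raises TypeError: mixed symbol types

diff --git a/python/mxnet/image/image.py b/python/mxnet/image/image.py index a142282c83a6..2eb1405b5569 100644 --- a/python/mxnet/image/image.py +++ b/python/mxnet/image/image.py @@ -428,7 +428,7 @@ def fixed_crop(src, x0, y0, w, h, size=None, interp=2): NDArray An `NDArray` containing the cropped image. 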
""" - out = nd.slice(src, begin=(y0, x0, 0), end=(y0 + h, x0 + w, int(src.shape[2]))) + out = src[y0:y0+h, x0:x0+w] if size is not None and (w, h) != size: sizes = (h, w, size[1], size[0]) out = imresize(out, *size, interp=_get_interp_method(interp, sizes)) diff --git a/python/mxnet/ndarray/__init__.py b/python/mxnet/ndarray/__init__.py index f09908e894d5..f6b8712a2513 100644 --- a/python/mxnet/ndarray/__init__.py +++ b/python/mxnet/ndarray/__init__.py @@ -17,7 +17,7 @@ """NDArray API of MXNet.""" -from . import _internal, contrib, linalg, op, random, sparse, utils, image, ndarray +from . import _internal, contrib, linalg, op, random, sparse, utils, image, ndarray, numpy # pylint: disable=wildcard-import, redefined-builtin try: from .gen_op import * # pylint: disable=unused-wildcard-import @@ -30,6 +30,8 @@ from .utils import load, load_frombuffer, save, zeros, empty, array from .sparse import _ndarray_cls from .ndarray import _GRAD_REQ_MAP, _DTYPE_MX_TO_NP, _DTYPE_NP_TO_MX, _new_empty_handle +from . import numpy as np +from . import numpy_extension as npx __all__ = op.__all__ + ndarray.__all__ + utils.__all__ + \ - ['contrib', 'linalg', 'random', 'sparse', 'image'] + ['contrib', 'linalg', 'random', 'sparse', 'image', 'numpy', 'numpy_extension'] diff --git a/python/mxnet/ndarray/_internal.py b/python/mxnet/ndarray/_internal.py index 8045d9bd2b14..d48255647939 100644 --- a/python/mxnet/ndarray/_internal.py +++ b/python/mxnet/ndarray/_internal.py @@ -23,18 +23,18 @@ try: if int(_os.environ.get("MXNET_ENABLE_CYTHON", True)) == 0: from .._ctypes.ndarray import NDArrayBase, CachedOp - from .._ctypes.ndarray import _set_ndarray_class, _imperative_invoke + from .._ctypes.ndarray import _set_ndarray_class, _imperative_invoke, _set_np_ndarray_class elif _sys.version_info >= (3, 0): from .._cy3.ndarray import NDArrayBase, CachedOp - from .._cy3.ndarray import _set_ndarray_class, _imperative_invoke + from .._cy3.ndarray import _set_ndarray_class, _imperative_invoke, _set_np_ndarray_class else: from .._cy2.ndarray import NDArrayBase, CachedOp - from .._cy2.ndarray import _set_ndarray_class, _imperative_invoke + from .._cy2.ndarray import _set_ndarray_class, _imperative_invoke, _set_np_ndarray_class except ImportError: if int(_os.environ.get("MXNET_ENFORCE_CYTHON", False)) != 0: raise ImportError("Cython Module cannot be loaded but MXNET_ENFORCE_CYTHON=1") from .._ctypes.ndarray import NDArrayBase, CachedOp - from .._ctypes.ndarray import _set_ndarray_class, _imperative_invoke + from .._ctypes.ndarray import _set_ndarray_class, _imperative_invoke, _set_np_ndarray_class from ..base import _Null try: @@ -42,4 +42,5 @@ except ImportError: pass -__all__ = ['NDArrayBase', 'CachedOp', '_imperative_invoke', '_set_ndarray_class'] +__all__ = ['NDArrayBase', 'CachedOp', '_imperative_invoke', '_set_ndarray_class', + '_set_np_ndarray_class'] diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 7e21daedcde1..d67779e9a0f7 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -184,6 +184,24 @@ class NDArray(NDArrayBase): # See C++ side of definition(kTVMNDArrayTypeCode) at include/mxmet/tensor_blob.h _tvm_tcode = 19 # pylint: disable= no-member, undefined-variable + + def as_np_ndarray(self): + """Convert mxnet.ndarray.NDArray to mxnet.numpy.ndarray.""" + storage_type = self.stype + if storage_type != 'default': + raise ValueError('cannot convert ndarray of stype {} to numpy ndarray' + .format(str(type(storage_type)))) + from ..numpy import ndarray + hdl 
= NDArrayHandle() + check_call(_LIB.MXShallowCopyNDArray(self.handle, ctypes.byref(hdl))) + return ndarray(handle=hdl, writable=self.writable) + + def as_nd_ndarray(self): + """A convenience function for creating a classic ndarray from the current + ndarray with zero copy. For this class, it just returns itself since it is + already a classic ndarray.""" + return self + @property def _tvm_handle(self): return self.handle.value @@ -908,7 +926,7 @@ def _slice(self, start, stop): check_call(_LIB.MXNDArraySlice( self.handle, mx_uint(start), mx_uint(stop), ctypes.byref(handle))) - return NDArray(handle=handle, writable=self.writable) + return self.__class__(handle=handle, writable=self.writable) def _at(self, idx): """Returns a view of the array sliced at `idx` in the first dim. @@ -942,7 +960,7 @@ def _at(self, idx): % (idx-length, length)) check_call(_LIB.MXNDArrayAt( self.handle, mx_uint(idx), ctypes.byref(handle))) - return NDArray(handle=handle, writable=self.writable) + return self.__class__(handle=handle, writable=self.writable) def reshape(self, *shape, **kwargs): """Returns a **view** of this array with a new shape without altering any data. @@ -1065,7 +1083,7 @@ def reshape(self, *shape, **kwargs): c_array(ctypes.c_int64, shape), reverse, ctypes.byref(handle))) - return NDArray(handle=handle, writable=self.writable) + return self.__class__(handle=handle, writable=self.writable) def reshape_like(self, *args, **kwargs): """Convenience fluent method for :py:func:`reshape_like`. @@ -2384,7 +2402,7 @@ def _get_broadcast_shape(shape1, shape2): for a, b in zip(shape1[::-1], shape2[::-1]): if a != 1 and b != 1 and a != b: raise ValueError('shape1=%s is not broadcastable to shape2=%s' % (shape1, shape2)) - shape[i] = max(a, b) + shape[i] = b if a == 1 else a i -= 1 return tuple(shape) diff --git a/python/mxnet/ndarray/numpy/__init__.py b/python/mxnet/ndarray/numpy/__init__.py new file mode 100644 index 000000000000..7eb478f792f5 --- /dev/null +++ b/python/mxnet/ndarray/numpy/__init__.py @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Module for numpy ops under mxnet.ndarray.""" + +from . import random +from . import linalg +from . import _op, _internal +from . import _register +from ._op import * # pylint: disable=wildcard-import + +__all__ = _op.__all__ diff --git a/python/mxnet/ndarray/numpy/_internal.py b/python/mxnet/ndarray/numpy/_internal.py new file mode 100644 index 000000000000..c5f292842b3b --- /dev/null +++ b/python/mxnet/ndarray/numpy/_internal.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for numpy internal ops.""" + +__all__ = [] diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py new file mode 100644 index 000000000000..d7c06e76c182 --- /dev/null +++ b/python/mxnet/ndarray/numpy/_op.py @@ -0,0 +1,295 @@ +# pylint: disable=C0302 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for numpy operators used in Gluon dispatched by F=ndarray.""" + +from __future__ import absolute_import +import numpy as _np +from ...base import numeric_types +from ...util import set_module +from ...context import current_context +from . import _internal as _npi + +__all__ = ['zeros', 'ones', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power'] + + +@set_module('mxnet.ndarray.numpy') +def zeros(shape, dtype=_np.float32, order='C', ctx=None): + """Return a new array of given shape and type, filled with zeros. + This function currently only supports storing multi-dimensional data + in row-major (C-style). + + Parameters + ---------- + shape : int or tuple of int + The shape of the new array. + dtype : str or numpy.dtype, optional + An optional value type. Default is `numpy.float32`. Note that this + behavior is different from NumPy's `zeros` function where `float64` + is the default value, because `float32` is considered as the default + data type in deep learning. + order : {'C'}, optional, default: 'C' + How to store multi-dimensional data in memory, currently only row-major + (C-style) is supported. + ctx : Context, optional + An optional device context (default is the current default context). + + Returns + ------- + out : ndarray + Array of zeros with the given shape, dtype, and ctx. + """ + if order != 'C': + raise NotImplementedError + if ctx is None: + ctx = current_context() + dtype = _np.float32 if dtype is None else dtype + return _npi.zeros(shape=shape, ctx=ctx, dtype=dtype) + + +@set_module('mxnet.ndarray.numpy') +def ones(shape, dtype=_np.float32, order='C', ctx=None): + """Return a new array of given shape and type, filled with ones. + This function currently only supports storing multi-dimensional data + in row-major (C-style). + + Parameters + ---------- + shape : int or tuple of int + The shape of the new array. 
+ dtype : str or numpy.dtype, optional + An optional value type. Default is `numpy.float32`. Note that this + behavior is different from NumPy's `ones` function where `float64` + is the default value, because `float32` is considered as the default + data type in deep learning. + order : {'C'}, optional, default: 'C' + How to store multi-dimensional data in memory, currently only row-major + (C-style) is supported. + ctx : Context, optional + An optional device context (default is the current default context). + + Returns + ------- + out : ndarray + Array of ones with the given shape, dtype, and ctx. + """ + if order != 'C': + raise NotImplementedError + if ctx is None: + ctx = current_context() + dtype = _np.float32 if dtype is None else dtype + return _npi.ones(shape=shape, ctx=ctx, dtype=dtype) + + +#pylint: disable= too-many-arguments, no-member, protected-access +def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None, out=None): + """ Helper function for element-wise operation. + The function will perform numpy-like broadcasting if needed and call different functions. + + Parameters + -------- + lhs : ndarray or numeric value + Left-hand side operand. + + rhs : ndarray or numeric value + Right-hand side operand. + + fn_array : function + Function to be called if both lhs and rhs are of ``ndarray`` type. + + fn_scalar : function + Function to be called if both lhs and rhs are numeric values. + + lfn_scalar : function + Function to be called if lhs is ``ndarray`` while rhs is numeric value. + + rfn_scalar : function + Function to be called if lhs is numeric value while rhs is ``ndarray``; + if None is provided, the operation is assumed to be commutative and + lfn_scalar is used in its place. + + Returns + -------- + mxnet.numpy.ndarray or scalar + result array or scalar + """ + from ...numpy import ndarray + if isinstance(lhs, numeric_types): + if isinstance(rhs, numeric_types): + return fn_scalar(lhs, rhs, out=out) + else: + if rfn_scalar is None: + # commutative function + return lfn_scalar(rhs, float(lhs), out=out) + else: + return rfn_scalar(rhs, float(lhs), out=out) + elif isinstance(rhs, numeric_types): + return lfn_scalar(lhs, float(rhs), out=out) + elif isinstance(rhs, ndarray): + return fn_array(lhs, rhs, out=out) + else: + raise TypeError('type {} not supported'.format(str(type(rhs)))) +#pylint: enable= too-many-arguments, no-member, protected-access
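
For illustration only (not part of the patch; `arr`, `arr1` and `arr2` stand for any `mxnet.numpy.ndarray`), the branches above resolve like this for `add` as defined below:

    add(2, 3)        # fn_scalar:  _np.add(2, 3), plain NumPy scalar arithmetic
    add(arr, 3)      # lfn_scalar: _npi.add_scalar(arr, 3.0)
    add(3, arr)      # also lfn_scalar, since rfn_scalar is None (add is commutative)
    add(arr1, arr2)  # fn_array:   _npi.add(arr1, arr2), with broadcasting if needed

+ + +@set_module('mxnet.ndarray.numpy') +def add(x1, x2, out=None): + """Add arguments element-wise. + + Parameters + ---------- + x1, x2 : ndarrays or scalar values + The arrays to be added. If x1.shape != x2.shape, they must be broadcastable to + a common shape (which may be the shape of one or the other). + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + add : ndarray or scalar + The sum of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars. + """ + return _ufunc_helper(x1, x2, _npi.add, _np.add, _npi.add_scalar, None, out) + + +@set_module('mxnet.ndarray.numpy') +def subtract(x1, x2, out=None): + """Subtract arguments element-wise. + + Parameters + ---------- + x1, x2 : ndarrays or scalar values + The arrays to be subtracted from each other. If x1.shape != x2.shape, + they must be broadcastable to a common shape (which may be the shape + of one or the other). + + out : ndarray + A location into which the result is stored. 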
If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + subtract : ndarray or scalar + The difference of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars. + """ + return _ufunc_helper(x1, x2, _npi.subtract, _np.subtract, _npi.subtract_scalar, + _npi.rsubtract_scalar, out) + + +@set_module('mxnet.ndarray.numpy') +def multiply(x1, x2, out=None): + """Multiply arguments element-wise. + + Parameters + ---------- + x1, x2 : ndarrays or scalar values + The arrays to be multiplied. If x1.shape != x2.shape, they must be broadcastable to + a common shape (which may be the shape of one or the other). + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + The multiplication of x1 and x2, element-wise. This is a scalar if both x1 and x2 + are scalars. + """ + return _ufunc_helper(x1, x2, _npi.multiply, _np.multiply, _npi.multiply_scalar, None, out) + + +@set_module('mxnet.ndarray.numpy') +def divide(x1, x2, out=None): + """Returns a true division of the inputs, element-wise. + + Parameters + ---------- + x1 : ndarray or scalar + Dividend array. + + x2 : ndarray or scalar + Divisor array. + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + This is a scalar if both x1 and x2 are scalars. + """ + return _ufunc_helper(x1, x2, _npi.true_divide, _np.divide, _npi.true_divide_scalar, + _npi.rtrue_divide_scalar, out) + + +@set_module('mxnet.ndarray.numpy') +def mod(x1, x2, out=None): + """Return element-wise remainder of division. + + Parameters + ---------- + x1 : ndarray or scalar + Dividend array. + + x2 : ndarray or scalar + Divisor array. + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + This is a scalar if both x1 and x2 are scalars. + """ + return _ufunc_helper(x1, x2, _npi.mod, _np.mod, _npi.mod_scalar, _npi.rmod_scalar, out) + + +@set_module('mxnet.ndarray.numpy') +def power(x1, x2, out=None): + """First array elements raised to powers from second array, element-wise. + + Parameters + ---------- + x1 : ndarray or scalar + The bases. + + x2 : ndarray or scalar + The exponent. + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + The bases in x1 raised to the exponents in x2. + This is a scalar if both x1 and x2 are scalars. + """ + return _ufunc_helper(x1, x2, _npi.power, _np.power, _npi.power_scalar, _npi.rpower_scalar, out) diff --git a/python/mxnet/ndarray/numpy/_register.py b/python/mxnet/ndarray/numpy/_register.py new file mode 100644 index 000000000000..3ac464e24217 --- /dev/null +++ b/python/mxnet/ndarray/numpy/_register.py @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Registering numpy ops.""" + +from ...base import _init_np_op_module +from ..register import _make_ndarray_function + + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy', + mx_module_name='ndarray', make_op_func=_make_ndarray_function) + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy._internal', + mx_module_name='ndarray', make_op_func=_make_ndarray_function) diff --git a/python/mxnet/ndarray/numpy/linalg.py b/python/mxnet/ndarray/numpy/linalg.py new file mode 100644 index 000000000000..0222bb45d148 --- /dev/null +++ b/python/mxnet/ndarray/numpy/linalg.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for operators used in Gluon dispatched by F=ndarray.""" + +from __future__ import absolute_import + +__all__ = [] diff --git a/python/mxnet/ndarray/numpy/random.py b/python/mxnet/ndarray/numpy/random.py new file mode 100644 index 000000000000..339fb1e77920 --- /dev/null +++ b/python/mxnet/ndarray/numpy/random.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""Namespace for operators used in Gluon dispatched by F=ndarray.""" +from __future__ import absolute_import + +__all__ = [] diff --git a/python/mxnet/ndarray/numpy_extension/__init__.py b/python/mxnet/ndarray/numpy_extension/__init__.py new file mode 100644 index 000000000000..5be34ac9b3d5 --- /dev/null +++ b/python/mxnet/ndarray/numpy_extension/__init__.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Module for the ops not belonging to the official numpy package.""" + +from . import _op +from . import image +from . import _register +from ._op import * # pylint: disable=wildcard-import + +__all__ = _op.__all__ diff --git a/python/mxnet/ndarray/numpy_extension/_op.py b/python/mxnet/ndarray/numpy_extension/_op.py new file mode 100644 index 000000000000..22738a0f1950 --- /dev/null +++ b/python/mxnet/ndarray/numpy_extension/_op.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for the operators not belonging to the official numpy package +used in Gluon dispatched by F=ndarray module.""" + +__all__ = [] diff --git a/python/mxnet/ndarray/numpy_extension/_register.py b/python/mxnet/ndarray/numpy_extension/_register.py new file mode 100644 index 000000000000..32cd0686551c --- /dev/null +++ b/python/mxnet/ndarray/numpy_extension/_register.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Registering numpy_extension ops.""" + +from ...base import _init_np_op_module +from ..register import _make_ndarray_function + + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy_extension', + mx_module_name='ndarray', make_op_func=_make_ndarray_function) diff --git a/python/mxnet/ndarray/numpy_extension/image.py b/python/mxnet/ndarray/numpy_extension/image.py new file mode 100644 index 000000000000..b3bd27fc503c --- /dev/null +++ b/python/mxnet/ndarray/numpy_extension/image.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Image pre-processing operators.""" + +__all__ = [] diff --git a/python/mxnet/ndarray/register.py b/python/mxnet/ndarray/register.py index 1ccf228698ba..bdbfa1584ca6 100644 --- a/python/mxnet/ndarray/register.py +++ b/python/mxnet/ndarray/register.py @@ -24,12 +24,97 @@ from ._internal import NDArrayBase, _imperative_invoke # pylint: disable=unused-import from ..ndarray_doc import _build_doc -from ..base import mx_uint, check_call, _LIB, py_str, _init_op_module, _Null # pylint: disable=unused-import +from ..base import mx_uint, check_call, _LIB, py_str, _init_op_module, _Null, _is_np_op # pylint: disable=unused-import +from ..util import use_np_shape # pylint: disable=unused-import + + +def _verify_all_np_ndarrays(op_name, func_name, args, out): + """Verify if all the arrays are numpy ndarrays. + + Parameters + ---------- + op_name : str + Operator full name registered in backend. + func_name : str + Operator name exposed to users. This is usually the name by stripping off + the prefix of the full operator names registered in backend. + args : list of arrays + Input ndarray arguments to be checked. + out : ndarray or None or list of ndarrays + User-provided output ndarrays. + """ + from ..numpy import ndarray as np_ndarray + for arr in args: + if (arr is not None) and (not isinstance(arr, np_ndarray)): + raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' + 'This is a numpy operator which can only accept ' + 'MXNet numpy ndarrays, while received a legacy ndarray. ' + 'Please ensure that you have activated numpy semantics by calling ' + '`npx.set_np()` in your code. If you still see this error with numpy ' + 'semantics activated, please call `as_np_ndarray()` upon the legacy ' + 'ndarray to convert it to an MXNet numpy ndarray, and then feed the ' + 'converted array to this operator.' + .format(op_name, func_name)) + if out is None: + return + if not isinstance(out, (list, tuple)): + out = [out] + for arr in out: + if (arr is not None) and (not isinstance(arr, np_ndarray)): + raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. 
' + 'This is a numpy operator which can only accept ' + 'MXNet numpy ndarrays, while received a legacy ndarray. ' + 'Please ensure that you have activated numpy semantics by calling ' + '`npx.set_np()` in your code. If you still see this error with numpy ' + 'semantics activated, please call `as_np_ndarray()` upon the legacy ' + 'ndarray to convert it to an MXNet numpy ndarray, and then feed the ' + 'converted array to this operator.' + .format(op_name, func_name)) + + +def _verify_all_legacy_ndarrays(op_name, func_name, args, out): + """Verify if all the arrays are legacy ndarrays. + + Parameters + ---------- + op_name : str + Operator full name registered in backend. + func_name : str + Operator name exposed to users. This is usually the name by stripping off + the prefix of the full operator names registered in backend. + args : list of arrays + Input ndarray arguments to be checked. + out : ndarray or None or list of ndarrays + User-provided output ndarrays. + """ + from ..numpy import ndarray as np_ndarray + for arr in args: + if (arr is not None) and (isinstance(arr, np_ndarray)): + raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' + 'This is a legacy operator which can only accept ' + 'legacy ndarrays, while received an MXNet numpy ndarray. ' + 'Please call `as_nd_ndarray()` upon the numpy ndarray to ' + 'convert it to a legacy ndarray, and then feed the converted ' + 'array to this operator.' + .format(op_name, func_name)) + if out is None: + return + if not isinstance(out, (list, tuple)): + out = [out] + for arr in out: + if (arr is not None) and (isinstance(arr, np_ndarray)): + raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' + 'This is a legacy operator which can only write to ' + 'legacy ndarrays, while received an MXNet numpy ndarray. ' + 'Please call `as_nd_ndarray()` upon the numpy ndarray to ' + 'convert it to a legacy ndarray, and then feed the converted ' + 'array to this operator.' 
+ .format(op_name, func_name)) # pylint: disable=too-many-locals -def _generate_ndarray_function_code(handle, name, func_name, signature_only=False): - """Generate function for ndarray op by handle and function name.""" +def _generate_ndarray_function_code(handle, op_name, func_name, signature_only=False): + """Generate function for ndarray op by handle and function name.""" real_name = ctypes.c_char_p() desc = ctypes.c_char_p() num_args = mx_uint() @@ -52,7 +137,7 @@ def _generate_ndarray_function_code(handle, name, func_name, signature_only=Fals arg_types = [py_str(arg_types[i]) for i in range(narg)] key_var_num_args = py_str(key_var_num_args.value) ret_type = py_str(ret_type.value) if ret_type.value is not None else '' - doc_str = _build_doc(name, + doc_str = _build_doc(op_name, py_str(desc.value), arg_names, arg_types, @@ -90,6 +175,10 @@ def _generate_ndarray_function_code(handle, name, func_name, signature_only=Fals signature = ndsignature + signature code = [] + is_np_op = _is_np_op(op_name) + doc_str_idx = 1 + if is_np_op: + doc_str_idx = 2 if arr_name: code.append(""" def %s(*%s, **kwargs):"""%(func_name, arr_name)) @@ -134,15 +223,26 @@ def %s(%s):"""%(func_name, ', '.join(signature))) vals.append(%s)"""%(name, name, name)) # dtype if dtype_name is not None: - code.append(""" + if is_np_op: + code.append(""" + if %s is not _Null and %s is not None: + keys.append('%s') + vals.append(_np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name, dtype_name)) + else: + code.append(""" if %s is not _Null: keys.append('%s') vals.append(_np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name)) + verify_ndarrays_fn =\ + _verify_all_np_ndarrays.__name__ if is_np_op else _verify_all_legacy_ndarrays.__name__ if not signature_only: code.append(""" - return _imperative_invoke(%d, ndargs, keys, vals, out)"""%( - handle.value)) + {verify_fn}("{op_name}", "{func_name}", ndargs, out) + """.format(verify_fn=verify_ndarrays_fn, op_name=op_name, func_name=func_name)) + code.append(""" + return _imperative_invoke(%d, ndargs, keys, vals, out, %s)"""%( + handle.value, str(is_np_op))) else: code.append(""" return (0,)""") @@ -150,7 +250,7 @@ def %s(%s):"""%(func_name, ', '.join(signature))) doc_str_lines = _os.linesep+''.join([' '+s if s.strip() else s for s in 'r"""{doc_str}"""'.format(doc_str=doc_str) .splitlines(True)]) - code.insert(1, doc_str_lines) + code.insert(doc_str_idx, doc_str_lines) return ''.join(code), doc_str diff --git a/python/mxnet/ndarray/utils.py b/python/mxnet/ndarray/utils.py index ff93d0be6d73..730f2172c4f4 100644 --- a/python/mxnet/ndarray/utils.py +++ b/python/mxnet/ndarray/utils.py @@ -248,6 +248,7 @@ def save(fname, data): >>> mx.nd.load('my_dict') {'y': , 'x': } """ + from ..numpy import ndarray as np_ndarray if isinstance(data, NDArray): data = [data] handles = c_array(NDArrayHandle, []) @@ -257,11 +258,17 @@ def save(fname, data): if any(not isinstance(k, string_types) for k in str_keys) or \ any(not isinstance(v, NDArray) for v in nd_vals): raise TypeError('save only accept dict str->NDArray or list of NDArray') + if any(isinstance(v, np_ndarray) for v in nd_vals): + raise TypeError('cannot save mxnet.numpy.ndarray using mxnet.ndarray.save;' + ' use mxnet.numpy.save instead.') keys = c_str_array(str_keys) handles = c_handle_array(nd_vals) elif isinstance(data, list): if any(not isinstance(v, NDArray) for v in data): raise TypeError('save only accept dict str->NDArray or list of NDArray') + if any(isinstance(v, np_ndarray) for v in data): + raise TypeError('cannot save
mxnet.numpy.ndarray using mxnet.ndarray.save;' + ' use mxnet.numpy.save instead.') keys = None handles = c_handle_array(data) else: diff --git a/python/mxnet/numpy/__init__.py b/python/mxnet/numpy/__init__.py new file mode 100644 index 000000000000..979e8d61a5ab --- /dev/null +++ b/python/mxnet/numpy/__init__.py @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""MXNet NumPy module.""" + +from __future__ import division, absolute_import, print_function + +from . import random +from . import linalg +from .multiarray import * # pylint: disable=wildcard-import +from . import _op +from . import _register +from ._op import * # pylint: disable=wildcard-import +from .utils import * # pylint: disable=wildcard-import + +__all__ = [] diff --git a/python/mxnet/numpy/_op.py b/python/mxnet/numpy/_op.py new file mode 100644 index 000000000000..8f6f9cc053e4 --- /dev/null +++ b/python/mxnet/numpy/_op.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for registering numpy ops for imperative programming.""" + +__all__ = [] diff --git a/python/mxnet/numpy/_register.py b/python/mxnet/numpy/_register.py new file mode 100644 index 000000000000..8a2d2ea61c24 --- /dev/null +++ b/python/mxnet/numpy/_register.py @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
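The new guards in `mxnet.ndarray.save` above turn a silent type mix-up into an explicit error. A rough sketch of the intended behavior (the file name is illustrative, and `as_np_ndarray` on the legacy `NDArray` is the zero-copy converter this patch series adds):

```python
import mxnet as mx

legacy = mx.nd.ones((2, 2))
np_style = legacy.as_np_ndarray()        # zero-copy mxnet.numpy.ndarray view

mx.nd.save('weights.params', [legacy])   # legacy ndarrays are still accepted
try:
    mx.nd.save('weights.params', [np_style])
except TypeError as err:
    print(err)  # cannot save mxnet.numpy.ndarray using mxnet.ndarray.save; ...
```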
See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Registering ops in mxnet.numpy for imperative programming.""" + +from __future__ import absolute_import + +from ..base import _init_np_op_module +from ..ndarray.register import _make_ndarray_function + + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy', + mx_module_name=None, make_op_func=_make_ndarray_function) diff --git a/python/mxnet/numpy/linalg.py b/python/mxnet/numpy/linalg.py new file mode 100644 index 000000000000..c4109378e146 --- /dev/null +++ b/python/mxnet/numpy/linalg.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for ops used in imperative programming.""" + +from __future__ import absolute_import + +__all__ = [] diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py new file mode 100644 index 000000000000..54b1069b1029 --- /dev/null +++ b/python/mxnet/numpy/multiarray.py @@ -0,0 +1,1551 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""numpy ndarray and util functions.""" + +from __future__ import absolute_import +from __future__ import division + +try: + from __builtin__ import slice as py_slice +except ImportError: + from builtins import slice as py_slice + +from array import array as native_array +import sys +import ctypes +import warnings +import numpy as _np +from ..ndarray import NDArray, _DTYPE_NP_TO_MX, _GRAD_REQ_MAP +from ..ndarray._internal import _set_np_ndarray_class +from . 
import _op as _mx_np_op +from ..base import check_call, _LIB, NDArrayHandle +from ..base import mx_real_t, c_array_buf, mx_uint, numeric_types, integer_types +from ..util import _sanity_check_params, set_module +from ..context import current_context +from ..ndarray import numpy as _mx_nd_np +from ..ndarray.numpy import _internal as _npi + +__all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones', 'add', 'subtract', 'multiply', 'divide', + 'mod', 'power'] + + +# This function is copied from ndarray.py since pylint +# keeps giving false alarm error of undefined-all-variable +def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t): + """Return a new handle with specified shape and context. + + Empty handle is only used to hold results. + + Returns + ------- + handle + A new empty `ndarray` handle. + """ + hdl = NDArrayHandle() + check_call(_LIB.MXNDArrayCreateEx( + c_array_buf(mx_uint, native_array('I', shape)), + mx_uint(len(shape)), + ctypes.c_int(ctx.device_typeid), + ctypes.c_int(ctx.device_id), + ctypes.c_int(int(delay_alloc)), + ctypes.c_int(int(_DTYPE_NP_TO_MX[_np.dtype(dtype).type])), + ctypes.byref(hdl))) + return hdl + + +# Have to use 0 as default value for stype since pylint does not allow +# importing _STORAGE_TYPE_DEFAULT from ndarray.py. +def _np_ndarray_cls(handle, writable=True, stype=0): + if stype != 0: + raise ValueError('_np_ndarray_cls currently only supports default storage ' + 'type, while received stype = {}'.format(stype)) + return ndarray(handle, writable=writable) + + +_set_np_ndarray_class(_np_ndarray_cls) + + +def _get_index(idx): + if isinstance(idx, NDArray) and not isinstance(idx, ndarray): + raise TypeError('Cannot have mx.nd.NDArray as index') + if isinstance(idx, ndarray): + return idx.as_nd_ndarray() + elif sys.version_info[0] > 2 and isinstance(idx, range): + return array(_np.arange(idx.start, idx.stop, idx.step, dtype=_np.int32)).as_nd_ndarray() + else: + return idx + + +@set_module('mxnet.numpy') # pylint: disable=invalid-name +class ndarray(NDArray): + """An array object represents a multidimensional, homogeneous array of fixed-size items. + An associated data-type object describes the format of each element in the array + (its byte-order, how many bytes it occupies in memory, whether it is an integer, a + floating point number, or something else, etc.). Arrays should be constructed using + `array`, `zeros` or `empty`. 
Currently, only c-contiguous arrays are supported.""" + + # pylint: disable=too-many-return-statements + def __getitem__(self, key): + # TODO(junwu): calling base class __getitem__ is a temp solution + ndim = self.ndim + shape = self.shape + if ndim == 0: + if key != (): + raise IndexError('scalar tensor can only accept `()` as index') + if isinstance(key, tuple) and len(key) == 0: + return self + elif isinstance(key, tuple) and len(key) == ndim\ + and all(isinstance(idx, integer_types) for idx in key): + out = self + for idx in key: + out = out[idx] + return out + elif isinstance(key, integer_types): + if key > shape[0] - 1: + raise IndexError( + 'index {} is out of bounds for axis 0 with size {}'.format( + key, shape[0])) + return self._at(key) + elif isinstance(key, py_slice): + if key.step is not None and key.step != 1: + if key.step == 0: + raise ValueError("slice step cannot be zero") + return self.as_nd_ndarray()._get_nd_basic_indexing(key).as_np_ndarray() + elif key.start is not None or key.stop is not None: + return self._slice(key.start, key.stop) + else: + return self + + if isinstance(key, ndarray): + key = key.as_nd_ndarray() + elif isinstance(key, tuple): + key = [_get_index(idx) for idx in key] + key = tuple(key) + elif isinstance(key, list): + key = [_get_index(idx) for idx in key] + elif sys.version_info[0] > 2 and isinstance(key, range): + key = _get_index(key) + return self.as_nd_ndarray().__getitem__(key).as_np_ndarray() + # pylint: enable=too-many-return-statements + + def __setitem__(self, key, value): + # TODO(junwu): calling base class __setitem__ is a temp solution + if isinstance(value, NDArray) and not isinstance(value, ndarray): + raise TypeError('Cannot assign mx.nd.NDArray to mxnet.numpy.ndarray') + if self.ndim == 0: + if not isinstance(key, tuple) or len(key) != 0: + raise IndexError('scalar tensor can only accept `()` as index') + if isinstance(value, ndarray): + value = value.as_nd_ndarray() + # TODO(junwu): Better handling of this situation + if isinstance(key, tuple) and len(key) == 0: + self.as_nd_ndarray().__setitem__(slice(None), value) + return + + if isinstance(key, ndarray): + key = key.as_nd_ndarray() + elif isinstance(key, tuple): + key = [_get_index(idx) for idx in key] + key = tuple(key) + elif isinstance(key, list): + key = [_get_index(idx) for idx in key] + elif sys.version_info[0] > 2 and isinstance(key, range): + key = _get_index(key) + self.as_nd_ndarray().__setitem__(key, value) + + def __add__(self, other): + """x.__add__(y) <=> x + y""" + return add(self, other) + + def __iadd__(self, other): + """x.__iadd__(y) <=> x += y""" + if not self.writable: + raise ValueError('trying to add to a readonly ndarray') + return add(self, other, out=self) + + def __sub__(self, other): + """x.__sub__(y) <=> x - y""" + return subtract(self, other) + + def __isub__(self, other): + """x.__isub__(y) <=> x -= y""" + if not self.writable: + raise ValueError('trying to subtract from a readonly ndarray') + return subtract(self, other, out=self) + + def __rsub__(self, other): + """x.__rsub__(y) <=> y - x""" + return subtract(other, self) + + def __mul__(self, other): + """x.__mul__(y) <=> x * y""" + return multiply(self, other) + + def __neg__(self): + return self.__mul__(-1.0) + + def __imul__(self, other): + """x.__imul__(y) <=> x *= y""" + if not self.writable: + raise ValueError('trying to multiply a readonly ndarray') + return multiply(self, other, out=self) + + def __rmul__(self, other): + """x.__rmul__(y) <=> y * x""" + return self.__mul__(other) + + def
__div__(self, other): + raise AttributeError('ndarray.__div__ is replaced by __truediv__. If you are using' + ' Python2, please use the statement from __future__ import division' + ' to change the / operator to mean true division throughout the' + ' module. If you are using Python3, this error should not have' + ' been encountered.') + + def __rdiv__(self, other): + raise AttributeError('ndarray.__rdiv__ is replaced by __rtruediv__. If you are using' + ' Python2, please use the statement from __future__ import division' + ' to change the / operator to mean true division throughout the' + ' module. If you are using Python3, this error should not have' + ' been encountered.') + + def __idiv__(self, other): + raise AttributeError('ndarray.__idiv__ is replaced by __itruediv__. If you are using' + ' Python2, please use the statement from __future__ import division' + ' to change the / operator to mean true division throughout the' + ' module. If you are using Python3, this error should not have' + ' been encountered.') + + def __truediv__(self, other): + """x.__truediv__(y) <=> x / y""" + return divide(self, other) + + def __rtruediv__(self, other): + """x.__rtruediv__(y) <=> y / x""" + return divide(other, self) + + def __itruediv__(self, other): + return divide(self, other, out=self) + + def __mod__(self, other): + """x.__mod__(y) <=> x % y""" + return mod(self, other) + + def __rmod__(self, other): + """x.__rmod__(y) <=> y % x""" + return mod(other, self) + + def __imod__(self, other): + """x.__imod__(y) <=> x %= y""" + return mod(self, other, out=self) + + def __pow__(self, other): + """x.__pow__(y) <=> x ** y""" + return power(self, other) + + def __rpow__(self, other): + """x.__rpow__(y) <=> y ** x""" + return power(other, self) + + def __eq__(self, other): + """x.__eq__(y) <=> x == y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, ndarray): + return _npi.equal(self, other) + elif isinstance(other, numeric_types): + return _npi.equal_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + + def __hash__(self): + raise NotImplementedError + + def __ne__(self, other): + """x.__ne__(y) <=> x != y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, ndarray): + return _npi.not_equal(self, other) + elif isinstance(other, numeric_types): + return _npi.not_equal_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + + def __gt__(self, other): + """x.__gt__(y) <=> x > y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, ndarray): + return _npi.greater(self, other) + elif isinstance(other, numeric_types): + return _npi.greater_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + + def __ge__(self, other): + """x.__ge__(y) <=> x >= y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, ndarray): + return _npi.greater_equal(self, other) + elif isinstance(other, numeric_types): + return _npi.greater_equal_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + + def __lt__(self, other): + """x.__lt__(y) <=> x < y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, ndarray): + return _npi.less(self,
other) + elif isinstance(other, numeric_types): + return _npi.less_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + + def __le__(self, other): + """x.__le__(y) <=> x <= y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, ndarray): + return _npi.less_equal(self, other) + elif isinstance(other, numeric_types): + return _npi.less_equal_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + + def __bool__(self): + num_elements = self.size + if num_elements == 0: + warnings.simplefilter('default') + warnings.warn('The truth value of an empty array is ambiguous. Returning False, but in' + ' future this will result in an error.', DeprecationWarning) + return False + elif num_elements == 1: + return bool(self.item()) + else: + raise ValueError("The truth value of an ndarray with multiple elements is ambiguous.") + + __nonzero__ = __bool__ + + def __float__(self): + num_elements = self.size + if num_elements != 1: + raise TypeError('only size-1 arrays can be converted to Python scalars') + return float(self.item()) + + def __int__(self): + num_elements = self.size + if num_elements != 1: + raise TypeError('only size-1 arrays can be converted to Python scalars') + return int(self.item()) + + def __len__(self): + """Number of elements along the first axis.""" + shape = self.shape + if len(shape) == 0: + raise TypeError('len() of unsized object') + return self.shape[0] + + def __reduce__(self): + return ndarray, (None,), self.__getstate__() + + def item(self, *args): + """Copy an element of an array to a standard Python scalar and return it. + + Parameters + ---------- + *args : Arguments (variable number and type) + none: in this case, the method only works for arrays with one element (a.size == 1), + which element is copied into a standard Python scalar object and returned. + + int_type: this argument is interpreted as a flat index into the array, specifying which + element to copy and return. + + tuple of int_types: functions as does a single int_type argument, except that the + argument is interpreted as an nd-index into the array. + + Returns + ------- + z : Standard Python scalar object + A copy of the specified element of the array as a suitable Python scalar. + """ + # TODO(junwu): no need to call asnumpy() on the whole array. + return self.asnumpy().item(*args) + + @property + # pylint: disable= invalid-name, undefined-variable + def T(self): + """Same as self.transpose(). This always returns a copy of self.""" + return self.transpose() + # pylint: enable= invalid-name, undefined-variable + + def all(self, axis=None, out=None, keepdims=False): + raise NotImplementedError + + def any(self, axis=None, out=None, keepdims=False): + raise NotImplementedError + + def as_nd_ndarray(self): + """Convert mxnet.numpy.ndarray to mxnet.ndarray.NDArray to use its fluent methods.""" + hdl = NDArrayHandle() + check_call(_LIB.MXShallowCopyNDArray(self.handle, ctypes.byref(hdl))) + return NDArray(handle=hdl, writable=self.writable) + + def as_np_ndarray(self): + """A convenience function for creating a numpy ndarray from the current ndarray + with zero copy. For this class, it just returns itself since it's already a + numpy ndarray.""" + return self + + def __repr__(self): + """ + Returns a string representation of the array. The dtype of the ndarray will not + be appended to the string if it is `float32`. 
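Since `as_nd_ndarray` is implemented with `MXShallowCopyNDArray`, converting between the two array types shares a single data chunk instead of copying. A small sketch of the expected aliasing behavior:

```python
from mxnet import np

a = np.array([[1.0, 2.0], [3.0, 4.0]])
b = a.as_nd_ndarray()   # legacy NDArray interface over the same data chunk
b[0, 0] = 100.0         # mutate through the legacy view
print(a)                # the numpy-style array observes the write
```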
The context of the ndarray will + be appended for devices other than CPU. + + Examples + -------- + >>> from mxnet import np, npx + >>> a = np.random.uniform(size=(2, 3)) + >>> a + array([[0.5488135 , 0.5928446 , 0.71518934], + [0.84426576, 0.60276335, 0.8579456 ]]) + >>> print(a) + [[0.5488135 0.5928446 0.71518934] + [0.84426576 0.60276335 0.8579456 ]] + >>> a.dtype + + >>> b = a.astype(np.float64) + >>> b + array([[0.54881352, 0.59284461, 0.71518934], + [0.84426576, 0.60276335, 0.85794562]], dtype=float64) + >>> print(b) + [[0.54881352 0.59284461 0.71518934] + [0.84426576 0.60276335 0.85794562]] + >>> b.dtype + + >>> c = a.copyto(npx.gpu(0)) + >>> c + array([[0.5488135 , 0.5928446 , 0.71518934], + [0.84426576, 0.60276335, 0.8579456 ]], ctx=gpu(0)) + >>> print(c) + [[0.5488135 0.5928446 0.71518934] + [0.84426576 0.60276335 0.8579456 ]] @gpu(0) + >>> d = b.copyto(npx.gpu(0)) + >>> d + array([[0.54881352, 0.59284461, 0.71518934], + [0.84426576, 0.60276335, 0.85794562]], dtype=float64, ctx=gpu(0)) + >>> print(d) + [[0.54881352 0.59284461 0.71518934] + [0.84426576 0.60276335 0.85794562]] @gpu(0) + """ + array_str = self.asnumpy().__repr__() + dtype = self.dtype + if 'dtype=' in array_str: + if dtype == _np.float32: + array_str = array_str[:array_str.rindex(',')] + ')' + elif dtype != _np.float32: + array_str = array_str[:-1] + ', dtype={})'.format(dtype.__name__) + + context = self.context + if context.device_type == 'cpu': + return array_str + return array_str[:-1] + ', ctx={})'.format(str(context)) + + def __str__(self): + """Returns a string representation of the array.""" + array_str = self.asnumpy().__str__() + context = self.context + if context.device_type == 'cpu' or self.ndim == 0: + return array_str + return '{array} @{ctx}'.format(array=array_str, ctx=context) + + def attach_grad(self, grad_req='write'): # pylint: disable=arguments-differ + """Attach a gradient buffer to this ndarray, so that `backward` + can compute gradient with respect to it. + + Parameters + ---------- + grad_req : {'write', 'add', 'null'} + How gradient will be accumulated. + - 'write': gradient will be overwritten on every backward. + - 'add': gradient will be added to existing value on every backward. + - 'null': do not compute gradient for this NDArray. + """ + grad = _mx_np_op.zeros_like(self) # pylint: disable=undefined-variable + grad_req = _GRAD_REQ_MAP[grad_req] + check_call(_LIB.MXAutogradMarkVariables( + 1, ctypes.pointer(self.handle), + ctypes.pointer(mx_uint(grad_req)), + ctypes.pointer(grad.handle))) + + @property + def grad(self): + """Returns gradient buffer attached to this ndarray.""" + hdl = NDArrayHandle() + check_call(_LIB.MXNDArrayGetGrad(self.handle, ctypes.byref(hdl))) + if hdl.value is None: + return None + return _np_ndarray_cls(hdl) + + def detach(self): + """Returns a new ndarray, detached from the current graph.""" + hdl = NDArrayHandle() + check_call(_LIB.MXNDArrayDetach(self.handle, ctypes.byref(hdl))) + return _np_ndarray_cls(hdl) + + def astype(self, dtype, *args, **kwargs): # pylint: disable=arguments-differ,unused-argument + """ + Copy of the array, cast to a specified type. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + copy : bool, optional + Default `True`. By default, astype always returns a newly + allocated ndarray on the same context. If this is set to + `False`, and the dtype requested is the same as the ndarray's + dtype, the ndarray is returned instead of a copy. 
+ + Returns + ------- + arr_t : ndarray + Unless `copy` is False and the other conditions for returning the input + array are satisfied (see description for `copy` input parameter), `arr_t` + is a new array of the same shape as the input array with `dtype`. + """ + _sanity_check_params('astype', ['order', 'casting', 'subok'], kwargs) + copy = kwargs.get('copy', True) + if not copy and _np.dtype(dtype) == self.dtype: + return self + + res = empty(self.shape, dtype=dtype, ctx=self.context) + self.copyto(res) + return res + + def copyto(self, other): + """Copies the value of this array to another array. + + If ``other`` is an ``ndarray`` object, then ``other.shape`` and + ``self.shape`` should be the same. This function copies the value from + ``self`` to ``other``. + + If ``other`` is a context, a new ``NDArray`` will be first created on + the target context, and the value of ``self`` is copied. + + Parameters + ---------- + other : ndarray or Context + The destination array or context. + + Returns + ------- + ndarray + The copied array. If ``other`` is an ``ndarray``, then the return value + and ``other`` will point to the same ``ndarray``. + + Examples + -------- + >>> x = np.ones((2,3)) + >>> y = np.zeros((2,3), ctx=mx.gpu(0)) + >>> z = x.copyto(y) + >>> z is y + True + >>> y.asnumpy() + array([[ 1., 1., 1.], + [ 1., 1., 1.]], dtype=float32) + """ + if isinstance(other, ndarray): + other = other.as_nd_ndarray() + return self.as_nd_ndarray().copyto(other).as_np_ndarray() + + def asscalar(self): + raise AttributeError('mxnet.numpy.ndarray object has no attribute asscalar') + + def argmax(self, axis=None, out=None): # pylint: disable=arguments-differ + raise NotImplementedError + + def as_in_context(self, context): + """Returns an array on the target device with the same value as this array. + + If the target context is the same as ``self.context``, then ``self`` is + returned. Otherwise, a copy is made. + + Parameters + ---------- + context : Context + The target context. + + Returns + ------- + ndarray + The target array. + """ + if self.context == context: + return self + return self.copyto(context) + + def copy(self, order='C'): # pylint: disable=arguments-differ + if order != 'C': + raise NotImplementedError('ndarray.copy only supports order=\'C\', while ' + 'received {}'.format(str(order))) + return super(ndarray, self).copy().as_np_ndarray() + + def dot(self, b, out=None): + raise NotImplementedError + + def reshape(self, *args, **kwargs): # pylint: disable=arguments-differ + """Returns an array containing the same data with a new shape. + + Notes + ----- + Unlike the free function `numpy.reshape`, this method on `ndarray` allows + the elements of the shape parameter to be passed in as separate arguments. + For example, ``a.reshape(10, 11)`` is equivalent to + ``a.reshape((10, 11))``. + """ + order = 'C' + if len(kwargs) > 1: + raise TypeError('function takes at most 1 keyword argument') + if len(kwargs) == 1: + if 'order' not in kwargs: + raise TypeError('{} is an invalid keyword argument for this function' + .format(list(kwargs.keys())[0])) + order = kwargs.pop('order', 'C') + if order != 'C': + raise NotImplementedError('only supports C-order,' + ' while received {}'.format(order)) + if len(args) == 0: + raise TypeError('reshape() takes exactly 1 argument (0 given)') + raise NotImplementedError + + def reshape_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`reshape_like`. + + The arguments are the same as for :py:func:`reshape_like`, with + this array as data.
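As a usage note on the `astype` semantics above: `copy=False` only short-circuits when the requested dtype already matches the array's dtype; otherwise a new array is still allocated. A sketch:

```python
from mxnet import np

a = np.ones((2, 3))                   # float32 by default in MXNet
b = a.astype('float64')               # always a new array on the same context
c = b.astype('float64', copy=False)   # dtype already matches: returns b itself
print(b is a, c is b)                 # False True
```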
+ """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute reshape_like') + + def zeros_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`zeros_like`. + + The arguments are the same as for :py:func:`zeros_like`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute zeros_like') + + def ones_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`ones_like`. + + The arguments are the same as for :py:func:`ones_like`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute ones_like') + + def broadcast_axes(self, *args, **kwargs): + """Convenience fluent method for :py:func:`broadcast_axes`. + + The arguments are the same as for :py:func:`broadcast_axes`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute broadcast_like') + + def repeat(self, repeats, axis=None): # pylint: disable=arguments-differ + """Repeat elements of an array.""" + raise NotImplementedError + + def pad(self, *args, **kwargs): + """Convenience fluent method for :py:func:`pad`. + + The arguments are the same as for :py:func:`pad`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute pad') + + def swapaxes(self, axis1, axis2): # pylint: disable=arguments-differ + """Return a copy of the array with axis1 and axis2 interchanged. + Refer to `mxnet.numpy.swapaxes` for full documentation. + """ + raise NotImplementedError + + def split(self, *args, **kwargs): + """Convenience fluent method for :py:func:`split`. + + The arguments are the same as for :py:func:`split`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute split') + + def split_v2(self, *args, **kwargs): + """Convenience fluent method for :py:func:`split_v2`. + + The arguments are the same as for :py:func:`split_v2`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute split_v2') + + def slice(self, *args, **kwargs): + """Convenience fluent method for :py:func:`slice`. + + The arguments are the same as for :py:func:`slice`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute slice') + + def slice_axis(self, *args, **kwargs): + """Convenience fluent method for :py:func:`slice_axis`. + + The arguments are the same as for :py:func:`slice_axis`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute slice_axis') + + def slice_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`slice_like`. + + The arguments are the same as for :py:func:`slice_like`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute slice_like') + + def take(self, *args, **kwargs): + """Convenience fluent method for :py:func:`take`. + + The arguments are the same as for :py:func:`take`, with + this array as data. + """ + raise NotImplementedError + + def one_hot(self, *args, **kwargs): + """Convenience fluent method for :py:func:`one_hot`. + + The arguments are the same as for :py:func:`one_hot`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute one_hot') + + def pick(self, *args, **kwargs): + """Convenience fluent method for :py:func:`pick`. + + The arguments are the same as for :py:func:`pick`, with + this array as data. 
+ """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute pick') + + def sort(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sort`. + + The arguments are the same as for :py:func:`sort`, with + this array as data. + """ + raise NotImplementedError + + def topk(self, *args, **kwargs): + """Convenience fluent method for :py:func:`topk`. + + The arguments are the same as for :py:func:`topk`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute topk') + + def argsort(self, *args, **kwargs): + """Convenience fluent method for :py:func:`argsort`. + + The arguments are the same as for :py:func:`argsort`, with + this array as data. + """ + raise NotImplementedError + + def argmax_channel(self, *args, **kwargs): + """Convenience fluent method for :py:func:`argmax_channel`. + + The arguments are the same as for :py:func:`argmax_channel`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute argmax_channel') + + def argmin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`argmin`. + + The arguments are the same as for :py:func:`argmin`, with + this array as data. + """ + raise NotImplementedError + + def clip(self, min=None, max=None, out=None): # pylint: disable=arguments-differ + """Return an array whose values are limited to [min, max]. + One of max or min must be given. + """ + raise NotImplementedError + + def abs(self, *args, **kwargs): + """Convenience fluent method for :py:func:`abs`. + + The arguments are the same as for :py:func:`abs`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute abs') + + def sign(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sign`. + + The arguments are the same as for :py:func:`sign`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute abs') + + def flatten(self, order='C'): # pylint: disable=arguments-differ + """Return a copy of the array collapsed into one dimension.""" + raise NotImplementedError + + def shape_array(self, *args, **kwargs): + """Convenience fluent method for :py:func:`shape_array`. + + The arguments are the same as for :py:func:`shape_array`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute shape_array') + + def size_array(self, *args, **kwargs): + """Convenience fluent method for :py:func:`size_array`. + + The arguments are the same as for :py:func:`size_array`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute size_array') + + def expand_dims(self, *args, **kwargs): # pylint: disable=arguments-differ + """Convenience fluent method for :py:func:`expand_dims`. + + The arguments are the same as for :py:func:`expand_dims`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute expand_dims') + + def tile(self, *args, **kwargs): + """Convenience fluent method for :py:func:`tile`. + + The arguments are the same as for :py:func:`tile`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute tile') + + def transpose(self, *axes): # pylint: disable=arguments-differ + """Permute the dimensions of an array.""" + raise NotImplementedError + + def flip(self, *args, **kwargs): + """Convenience fluent method for :py:func:`flip`. 
+ + The arguments are the same as for :py:func:`flip`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute flip') + + def depth_to_space(self, *args, **kwargs): + """Convenience fluent method for :py:func:`depth_to_space`. + + The arguments are the same as for :py:func:`depth_to_space`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute depth_to_space') + + def space_to_depth(self, *args, **kwargs): + """Convenience fluent method for :py:func:`space_to_depth`. + + The arguments are the same as for :py:func:`space_to_depth`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute space_to_depth') + + def diag(self, k=0, **kwargs): + """Convenience fluent method for :py:func:`diag`. + + The arguments are the same as for :py:func:`diag`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute diag') + + def sum(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """Convenience fluent method for :py:func:`sum`. + + The arguments are the same as for :py:func:`sum`, with + this array as data. + """ + raise NotImplementedError + + def nansum(self, *args, **kwargs): + """Convenience fluent method for :py:func:`nansum`. + + The arguments are the same as for :py:func:`nansum`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute nansum') + + def prod(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """Return the product of the array elements over the given axis.""" + raise NotImplementedError + + def nanprod(self, *args, **kwargs): + """Convenience fluent method for :py:func:`nanprod`. + + The arguments are the same as for :py:func:`nanprod`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute nanprod') + + def mean(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """Returns the average of the array elements along given axis.""" + raise NotImplementedError + + # TODO(junwu): Use mxnet std op instead of onp.std + def std(self, axis=None, dtype=None, out=None, ddof=0, keepdims=False): # pylint: disable=arguments-differ + """Returns the standard deviation of the array elements along given axis.""" + ret_np = self.asnumpy().std(axis=axis, dtype=dtype, out=out, ddof=ddof, keepdims=keepdims) + return array(ret_np, dtype=ret_np.dtype, ctx=self.context) + + def cumsum(self, axis=None, dtype=None, out=None): + """Return the cumulative sum of the elements along the given axis.""" + raise NotImplementedError + + def tolist(self): + return self.asnumpy().tolist() + + def max(self, axis=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """Return the maximum along a given axis.""" + raise NotImplementedError + + def min(self, *args, **kwargs): + """Convenience fluent method for :py:func:`min`. + + The arguments are the same as for :py:func:`min`, with + this array as data. + """ + raise NotImplementedError + + def norm(self, *args, **kwargs): + """Convenience fluent method for :py:func:`norm`. + + The arguments are the same as for :py:func:`norm`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute norm') + + def round(self, *args, **kwargs): + """Convenience fluent method for :py:func:`round`. 
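Note one consequence of the `std` implementation above: it round-trips through `asnumpy()`, so the reduction happens in NumPy on CPU and the result is re-wrapped on the source array's context. For example:

```python
from mxnet import np

a = np.array([[1.0, 2.0], [3.0, 4.0]])
s = a.std(axis=0)   # evaluated by NumPy, then copied back as an mxnet ndarray
print(s)            # expect array([1., 1.])
```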
+ + The arguments are the same as for :py:func:`round`, with + this array as data. + """ + raise NotImplementedError + + def rint(self, *args, **kwargs): + """Convenience fluent method for :py:func:`rint`. + + The arguments are the same as for :py:func:`rint`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute rint') + + def fix(self, *args, **kwargs): + """Convenience fluent method for :py:func:`fix`. + + The arguments are the same as for :py:func:`fix`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute fix') + + def floor(self, *args, **kwargs): + """Convenience fluent method for :py:func:`floor`. + + The arguments are the same as for :py:func:`floor`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute floor') + + def ceil(self, *args, **kwargs): + """Convenience fluent method for :py:func:`ceil`. + + The arguments are the same as for :py:func:`ceil`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute ceil') + + def trunc(self, *args, **kwargs): + """Convenience fluent method for :py:func:`trunc`. + + The arguments are the same as for :py:func:`trunc`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute trunc') + + def sin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sin`. + + The arguments are the same as for :py:func:`sin`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute sin') + + def cos(self, *args, **kwargs): + """Convenience fluent method for :py:func:`cos`. + + The arguments are the same as for :py:func:`cos`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute cos') + + def tan(self, *args, **kwargs): + """Convenience fluent method for :py:func:`tan`. + + The arguments are the same as for :py:func:`tan`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute tan') + + def arcsin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arcsin`. + + The arguments are the same as for :py:func:`arcsin`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute arcsin') + + def arccos(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arccos`. + + The arguments are the same as for :py:func:`arccos`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute arccos') + + def arctan(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arctan`. + + The arguments are the same as for :py:func:`arctan`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute arctan') + + def degrees(self, *args, **kwargs): + """Convenience fluent method for :py:func:`degrees`. + + The arguments are the same as for :py:func:`degrees`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute degrees') + + def radians(self, *args, **kwargs): + """Convenience fluent method for :py:func:`radians`. + + The arguments are the same as for :py:func:`radians`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute radians') + + def sinh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sinh`. 
+ + The arguments are the same as for :py:func:`sinh`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute sinh') + + def cosh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`cosh`. + + The arguments are the same as for :py:func:`cosh`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute cosh') + + def tanh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`tanh`. + + The arguments are the same as for :py:func:`tanh`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute tanh') + + def arcsinh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arcsinh`. + + The arguments are the same as for :py:func:`arcsinh`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute arcsinh') + + def arccosh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arccosh`. + + The arguments are the same as for :py:func:`arccosh`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute arccosh') + + def arctanh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arctanh`. + + The arguments are the same as for :py:func:`arctanh`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute arctanh') + + def exp(self, *args, **kwargs): + """Convenience fluent method for :py:func:`exp`. + + The arguments are the same as for :py:func:`exp`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute exp') + + def expm1(self, *args, **kwargs): + """Convenience fluent method for :py:func:`expm1`. + + The arguments are the same as for :py:func:`expm1`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute expm1') + + def log(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log`. + + The arguments are the same as for :py:func:`log`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute log') + + def log10(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log10`. + + The arguments are the same as for :py:func:`log10`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute log10') + + def log2(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log2`. + + The arguments are the same as for :py:func:`log2`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute log2') + + def log1p(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log1p`. + + The arguments are the same as for :py:func:`log1p`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute log1p') + + def sqrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sqrt`. + + The arguments are the same as for :py:func:`sqrt`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute sqrt') + + def rsqrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`rsqrt`. + + The arguments are the same as for :py:func:`rsqrt`, with + this array as data. 
+ """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute rsqrt') + + def cbrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`cbrt`. + + The arguments are the same as for :py:func:`cbrt`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute cqrt') + + def rcbrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`rcbrt`. + + The arguments are the same as for :py:func:`rcbrt`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute rcqrt') + + def square(self, *args, **kwargs): + """Convenience fluent method for :py:func:`square`. + + The arguments are the same as for :py:func:`square`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute square') + + def reciprocal(self, *args, **kwargs): + """Convenience fluent method for :py:func:`reciprocal`. + + The arguments are the same as for :py:func:`reciprocal`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute reciprocal') + + def relu(self, *args, **kwargs): + """Convenience fluent method for :py:func:`relu`. + + The arguments are the same as for :py:func:`relu`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute relu') + + def sigmoid(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sigmoid`. + + The arguments are the same as for :py:func:`sigmoid`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute sigmoid') + + def softmax(self, *args, **kwargs): + """Convenience fluent method for :py:func:`softmax`. + + The arguments are the same as for :py:func:`softmax`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute softmax') + + def log_softmax(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log_softmax`. + + The arguments are the same as for :py:func:`log_softmax`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute log_softmax') + + def softmin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`softmin`. + + The arguments are the same as for :py:func:`softmin`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute softmin') + + def squeeze(self, axis=None): # pylint: disable=arguments-differ + """Remove single-dimensional entries from the shape of a. + """ + raise NotImplementedError + + def broadcast_to(self, shape): + raise AttributeError('mxnet.numpy.ndarray object has no attribute broadcast_to') + + def broadcast_like(self, other): + raise AttributeError('mxnet.numpy.ndarray object has no attribute broadcast_like') + + @property + def shape(self): + return super(ndarray, self).shape + + @property + def ndim(self): + """Number of array dimensions.""" + return len(self.shape) + + @property + def size(self): + """Number of elements in the array.""" + return super(ndarray, self).size + + def tostype(self, stype): + raise AttributeError('mxnet.numpy.ndarray object has no attribute tostype') + + +@set_module('mxnet.numpy') +def empty(shape, dtype=float, order='C', ctx=None): + """Return a new array of given shape and type, without initializing entries. + + Parameters + ---------- + shape : int or tuple of int Shape of the empty array, e.g., ``(2, 3)`` or ``2``. 
+ dtype : data-type, optional + Desired output data-type for the array, e.g., `numpy.int8`. Default is + `numpy.float32`. Note that this behavior is different from NumPy's `empty` + function where `float64` is the default value, because `float32` is + considered as the default data type in deep learning. + order : {'C'}, optional, default: 'C' + How to store multi-dimensional data in memory, currently only row-major + (C-style) is supported. + ctx : device context, optional + Device context on which the memory is allocated. Default is + `mxnet.context.current_context()`. + + Returns + ------- + out : ndarray + Array of uninitialized (arbitrary) data of the given shape, dtype, and order. + """ + if order != 'C': + raise NotImplementedError + if ctx is None: + ctx = current_context() + if dtype is None: + dtype = _np.float32 + if isinstance(shape, int): + shape = (shape,) + return ndarray(handle=_new_alloc_handle(shape, ctx, False, dtype)) + + +@set_module('mxnet.numpy') +def array(object, dtype=None, ctx=None): + """ + Create an array. + + Parameters + ---------- + object : array_like or `numpy.ndarray` or `mxnet.numpy.ndarray` + An array, any object exposing the array interface, an object whose + __array__ method returns an array, or any (nested) sequence. + dtype : data-type, optional + The desired data-type for the array. Default is `float32`. + ctx : device context, optional + Device context on which the memory is allocated. Default is + `mxnet.context.current_context()`. + + Returns + ------- + out : ndarray + An array object satisfying the specified requirements. + """ + if ctx is None: + ctx = current_context() + if isinstance(object, ndarray): + dtype = object.dtype if dtype is None else dtype + else: + dtype = mx_real_t if dtype is None else dtype + if not isinstance(object, (ndarray, _np.ndarray)): + try: + object = _np.array(object, dtype=dtype) + except Exception as e: + raise TypeError('{}'.format(str(e))) + ret = empty(object.shape, dtype=dtype, ctx=ctx) + if len(object.shape) == 0: + ret[()] = object + else: + ret[:] = object + return ret + + +@set_module('mxnet.numpy') +def zeros(shape, dtype=_np.float32, order='C', ctx=None): + """Return a new array of given shape and type, filled with zeros. + This function currently only supports storing multi-dimensional data + in row-major (C-style). + + Parameters + ---------- + shape : int or tuple of int + The shape of the new array. + dtype : str or numpy.dtype, optional + An optional value type (default is `numpy.float32`). Note that this + behavior is different from NumPy's `zeros` function where `float64` + is the default value, because `float32` is considered as the default + data type in deep learning. + order : {'C'}, optional, default: 'C' + How to store multi-dimensional data in memory, currently only row-major + (C-style) is supported. + ctx : Context, optional + An optional device context (default is the current default context). + + Returns + ------- + out : ndarray + Array of zeros with the given shape, dtype, and ctx. + """ + return _mx_nd_np.zeros(shape, dtype, order, ctx) + + +@set_module('mxnet.numpy') +def ones(shape, dtype=_np.float32, order='C', ctx=None): + """Return a new array of given shape and type, filled with ones. + This function currently only supports storing multi-dimensional data + in row-major (C-style). + + Parameters + ---------- + shape : int or tuple of int + The shape of the new array. + dtype : str or numpy.dtype, optional + An optional value type. Default is `numpy.float32`.
Note that this + behavior is different from NumPy's `ones` function where `float64` + is the default value, because `float32` is considered as the default + data type in deep learning. + order : {'C'}, optional, default: 'C' + How to store multi-dimensional data in memory, currently only row-major + (C-style) is supported. + ctx : Context, optional + An optional device context (default is the current default context). + + Returns + ------- + out : ndarray + Array of ones with the given shape, dtype, and ctx. + """ + return _mx_nd_np.ones(shape, dtype, order, ctx) + + +@set_module('mxnet.numpy') +def add(x1, x2, out=None): + """Add arguments element-wise. + + Parameters + ---------- + x1, x2 : ndarrays or scalar values + The arrays to be added. If x1.shape != x2.shape, they must be broadcastable to + a common shape (which may be the shape of one or the other). + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + add : ndarray or scalar + The sum of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars. + """ + return _mx_nd_np.add(x1, x2, out) + + +@set_module('mxnet.numpy') +def subtract(x1, x2, out=None): + """Subtract arguments element-wise. + + Parameters + ---------- + x1, x2 : ndarrays or scalar values + The arrays to be subtracted from each other. If x1.shape != x2.shape, + they must be broadcastable to a common shape (which may be the shape + of one or the other). + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + subtract : ndarray or scalar + The difference of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars. + """ + return _mx_nd_np.subtract(x1, x2, out) + + +@set_module('mxnet.numpy') +def multiply(x1, x2, out=None): + """Multiply arguments element-wise. + + Parameters + ---------- + x1, x2 : ndarrays or scalar values + The arrays to be multiplied. If x1.shape != x2.shape, they must be broadcastable to + a common shape (which may be the shape of one or the other). + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + The product of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars. + """ + return _mx_nd_np.multiply(x1, x2, out) + + +@set_module('mxnet.numpy') +def divide(x1, x2, out=None): + """Returns a true division of the inputs, element-wise. + + Parameters + ---------- + x1 : ndarray or scalar + Dividend array. + + x2 : ndarray or scalar + Divisor array. + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + The quotient ``x1/x2``, element-wise. This is a scalar if both x1 and x2 are scalars. + """ + return _mx_nd_np.divide(x1, x2, out=out) + + +@set_module('mxnet.numpy') +def mod(x1, x2, out=None): + """Return element-wise remainder of division. + + Parameters + ---------- + x1 : ndarray or scalar + Dividend array. + + x2 : ndarray or scalar + Divisor array. + + out : ndarray + A location into which the result is stored. 
If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + This is a scalar if both x1 and x2 are scalars. + """ + return _mx_nd_np.mod(x1, x2, out=out) + + +@set_module('mxnet.numpy') +def power(x1, x2, out=None): + """First array elements raised to powers from second array, element-wise. + + Parameters + ---------- + x1 : ndarray or scalar + The bases. + + x2 : ndarray or scalar + The exponent. + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + The bases in x1 raised to the exponents in x2. + This is a scalar if both x1 and x2 are scalars. + """ + return _mx_nd_np.power(x1, x2, out=out) diff --git a/python/mxnet/numpy/random.py b/python/mxnet/numpy/random.py new file mode 100644 index 000000000000..c4109378e146 --- /dev/null +++ b/python/mxnet/numpy/random.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for ops used in imperative programming.""" + +from __future__ import absolute_import + +__all__ = [] diff --git a/python/mxnet/numpy/utils.py b/python/mxnet/numpy/utils.py new file mode 100644 index 000000000000..920897efc80b --- /dev/null +++ b/python/mxnet/numpy/utils.py @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
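The creation routines above (`empty`, `array`, `zeros`, `ones`) and the element-wise functions (`add` through `power`) give `mxnet.numpy` a NumPy-style imperative surface backed by `_mx_nd_np`. A minimal usage sketch, assuming a build that includes this patch and that numpy semantics have been switched on first; the array values are illustrative only:

```python
from mxnet import np, npx

npx.set_np()                             # enable numpy shape/array semantics

a = np.array([[1, 2, 3], [4, 5, 6]])     # float32 by default, on the current context
b = np.ones(a.shape)                     # same-shape array of ones
c = np.add(a, b)                         # element-wise add; broadcasts like NumPy
d = np.power(c, 2)                       # element-wise power
print(c.shape, d.shape)                  # (2, 3) (2, 3)
```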
+ +"""Util functions for the numpy module.""" + + +from __future__ import absolute_import + +import numpy as onp + +__all__ = ['float16', 'float32', 'float64', 'uint8', 'int32', 'int8', 'int64', 'pi'] + +float16 = onp.float16 +float32 = onp.float32 +float64 = onp.float64 +uint8 = onp.uint8 +int32 = onp.int32 +int8 = onp.int8 +int64 = onp.int64 + +pi = onp.pi diff --git a/python/mxnet/numpy_extension/__init__.py b/python/mxnet/numpy_extension/__init__.py new file mode 100644 index 000000000000..d71d65f08de2 --- /dev/null +++ b/python/mxnet/numpy_extension/__init__.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Module for ops not belonging to the official numpy package for imperative programming.""" + +from __future__ import absolute_import +from . import _op +from . import image +from . import _register +from ._op import * # pylint: disable=wildcard-import +from ..context import * # pylint: disable=wildcard-import +from ..util import is_np_shape, is_np_array, set_np, reset_np +from ..ndarray import waitall +from .utils import * # pylint: disable=wildcard-import + +__all__ = [] diff --git a/python/mxnet/numpy_extension/_op.py b/python/mxnet/numpy_extension/_op.py new file mode 100644 index 000000000000..a995e480221a --- /dev/null +++ b/python/mxnet/numpy_extension/_op.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for registering numpy_extension ops for imperative programming.""" + +__all__ = [] diff --git a/python/mxnet/numpy_extension/_register.py b/python/mxnet/numpy_extension/_register.py new file mode 100644 index 000000000000..8abb7254057c --- /dev/null +++ b/python/mxnet/numpy_extension/_register.py @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Registering ops in mxnet.numpy_extension for imperative programming.""" + +from __future__ import absolute_import + +from ..base import _init_np_op_module +from ..ndarray.register import _make_ndarray_function + + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy_extension', + mx_module_name=None, make_op_func=_make_ndarray_function) diff --git a/python/mxnet/numpy_extension/image.py b/python/mxnet/numpy_extension/image.py new file mode 100644 index 000000000000..00a028b3c18f --- /dev/null +++ b/python/mxnet/numpy_extension/image.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Image pre-processing operators.""" + +from ..image import * # pylint: disable=wildcard-import, unused-wildcard-import + +__all__ = [] diff --git a/python/mxnet/numpy_extension/utils.py b/python/mxnet/numpy_extension/utils.py new file mode 100644 index 000000000000..0aa89badbb58 --- /dev/null +++ b/python/mxnet/numpy_extension/utils.py @@ -0,0 +1,122 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Util functions for the numpy module.""" + + +from __future__ import absolute_import + +import ctypes +from .. util import is_np_array, is_np_shape +from .. base import _LIB, check_call, string_types, c_str_array +from .. 
base import c_handle_array, c_str, mx_uint, NDArrayHandle, py_str +from ..numpy import ndarray + +__all__ = ['save', 'load'] + + +def save(file, arr): + """Saves a list of `ndarray`s or a dict of `str`->`ndarray` to file. + + Examples of filenames: + + - ``/path/to/file`` + - ``s3://my-bucket/path/to/file`` (if compiled with AWS S3 support) + - ``hdfs://path/to/file`` (if compiled with HDFS support) + + Parameters + ---------- + file : str + Filename to which the data is saved. + arr : `ndarray` or list of `ndarray`s or dict of `str` to `ndarray` + The data to be saved. + + Notes + ----- + This function can only be called within numpy semantics, i.e., `npx.is_np_shape()` + and `npx.is_np_array()` must both return true. + """ + if not (is_np_shape() and is_np_array()): + raise ValueError('Cannot save `mxnet.numpy.ndarray` in legacy mode. Please activate' + ' numpy semantics by calling `npx.set_np()` in the global scope' + ' before calling this function.') + if isinstance(arr, ndarray): + arr = [arr] + if isinstance(arr, dict): + str_keys = arr.keys() + nd_vals = arr.values() + if any(not isinstance(k, string_types) for k in str_keys) or \ + any(not isinstance(v, ndarray) for v in nd_vals): + raise TypeError('Only accepts dict str->ndarray or list of ndarrays') + keys = c_str_array(str_keys) + handles = c_handle_array(nd_vals) + elif isinstance(arr, list): + if any(not isinstance(v, ndarray) for v in arr): + raise TypeError('Only accepts dict str->ndarray or list of ndarrays') + keys = None + handles = c_handle_array(arr) + else: + raise ValueError("data needs to be either an ndarray, dict of (str, ndarray) pairs " + "or a list of ndarrays.") + check_call(_LIB.MXNDArraySave(c_str(file), + mx_uint(len(handles)), + handles, + keys)) + + +def load(file): + """Loads an array from file. + + See more details in ``save``. + + Parameters + ---------- + file : str + The filename. + + Returns + ------- + result : list of ndarrays or dict of str -> ndarray + Data stored in the file. + + Notes + ----- + This function can only be called within numpy semantics, i.e., `npx.is_np_shape()` + and `npx.is_np_array()` must both return true. + """ + if not (is_np_shape() and is_np_array()): + raise ValueError('Cannot load `mxnet.numpy.ndarray` in legacy mode. 
Please activate' + ' numpy semantics by calling `npx.set_np()` in the global scope' + ' before calling this function.') + if not isinstance(file, string_types): + raise TypeError('file required to be a string') + out_size = mx_uint() + out_name_size = mx_uint() + handles = ctypes.POINTER(NDArrayHandle)() + names = ctypes.POINTER(ctypes.c_char_p)() + check_call(_LIB.MXNDArrayLoad(c_str(file), + ctypes.byref(out_size), + ctypes.byref(handles), + ctypes.byref(out_name_size), + ctypes.byref(names))) + if out_name_size.value == 0: + return [ndarray(NDArrayHandle(handles[i])) for i in range(out_size.value)] + else: + assert out_name_size.value == out_size.value + return dict( + (py_str(names[i]), ndarray(NDArrayHandle(handles[i]))) + for i in range(out_size.value)) diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py index c2c1aa6a76f4..d953e9247900 100644 --- a/python/mxnet/optimizer/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -18,6 +18,7 @@ # pylint: disable=too-many-lines """Weight updating functions.""" +from __future__ import absolute_import import logging import math import pickle @@ -33,6 +34,7 @@ multi_mp_sgd_mom_update) from ..ndarray import sparse from ..random import normal +from ..util import is_np_array __all__ = [ 'AdaDelta', 'AdaGrad', 'Adam', 'Adamax', 'DCASGD', 'FTML', 'Ftrl', 'LBSGD', @@ -119,6 +121,7 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., self.idx2name = param_idx2name.copy() self.sym_info = (sym.attr_dict(), sym.list_arguments()) if sym is not None else () self.param_dict = param_dict if param_dict else {} + self.allow_np_array = is_np_array() self.set_lr_mult({}) self.set_wd_mult({}) @@ -1644,6 +1647,28 @@ def update(self, index, weight, grad, state): # backward compatibility wrapper for Optimizer.CreateOptimizer create = Optimizer.create_optimizer # pylint: disable=invalid-name + +def _as_classic(a, allow_np): + # TODO(junwu): This is a temp solution for allowing converting + # np.ndarray to mx.nd.NDArray to be fed into the optimizer since + # users may have custom optimizers implemented using mx.nd.NDArray ops. + from ..numpy import ndarray as np_ndarray + if isinstance(a, (tuple, list)): + if any(isinstance(x, np_ndarray) for x in a): + if allow_np: + return [x.as_nd_ndarray() for x in a] + else: + raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') + else: + if isinstance(a, np_ndarray): + if allow_np: + return a.as_nd_ndarray() + else: + raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') + return a + + + class Updater(object): """Updater for kvstore.""" def __init__(self, optimizer): @@ -1654,14 +1679,15 @@ def __init__(self, optimizer): def __call__(self, index, grad, weight): """Updates weight given gradient and index.""" + allow_np = self.optimizer.allow_np_array if not isinstance(index, (list, tuple)): indices = [index] - grads = [grad] - weights = [weight] + grads = [_as_classic(grad, allow_np)] + weights = [_as_classic(weight, allow_np)] else: indices = index - grads = grad - weights = weight + grads = _as_classic(grad, allow_np) + weights = _as_classic(weight, allow_np) if weights: self.optimizer._set_current_context(weights[0].context.device_id) for i, idx in enumerate(indices): diff --git a/python/mxnet/symbol/__init__.py b/python/mxnet/symbol/__init__.py index f438e4954aa9..2ce395bdd279 100644 --- a/python/mxnet/symbol/__init__.py +++ b/python/mxnet/symbol/__init__.py @@ -17,7 +17,7 @@ """Symbol API of MXNet.""" -from . 
import _internal, contrib, linalg, op, random, sparse, image, symbol +from . import _internal, contrib, linalg, op, random, sparse, image, symbol, numpy # pylint: disable=wildcard-import, redefined-builtin try: from .gen_op import * # pylint: disable=unused-wildcard-import @@ -27,5 +27,8 @@ from .op import * from .symbol import * # pylint: enable=wildcard-import +from . import numpy as np +from . import numpy_extension as npx -__all__ = op.__all__ + symbol.__all__ + ['contrib', 'linalg', 'random', 'sparse', 'image'] +__all__ = op.__all__ + symbol.__all__\ + + ['contrib', 'linalg', 'random', 'sparse', 'image', 'numpy', 'numpy_extension'] diff --git a/python/mxnet/symbol/_internal.py b/python/mxnet/symbol/_internal.py index 7e9787e32b1c..d46c0e64e6f1 100644 --- a/python/mxnet/symbol/_internal.py +++ b/python/mxnet/symbol/_internal.py @@ -24,18 +24,18 @@ try: if int(_os.environ.get("MXNET_ENABLE_CYTHON", True)) == 0: - from .._ctypes.symbol import SymbolBase, _set_symbol_class + from .._ctypes.symbol import SymbolBase, _set_symbol_class, _set_np_symbol_class from .._ctypes.symbol import _symbol_creator elif _sys.version_info >= (3, 0): - from .._cy3.symbol import SymbolBase, _set_symbol_class + from .._cy3.symbol import SymbolBase, _set_symbol_class, _set_np_symbol_class from .._cy3.symbol import _symbol_creator else: - from .._cy2.symbol import SymbolBase, _set_symbol_class + from .._cy2.symbol import SymbolBase, _set_symbol_class, _set_np_symbol_class from .._cy2.symbol import _symbol_creator except ImportError: if int(_os.environ.get("MXNET_ENFORCE_CYTHON", False)) != 0: raise ImportError("Cython Module cannot be loaded but MXNET_ENFORCE_CYTHON=1") - from .._ctypes.symbol import SymbolBase, _set_symbol_class + from .._ctypes.symbol import SymbolBase, _set_symbol_class, _set_np_symbol_class from .._ctypes.symbol import _symbol_creator from ..attribute import AttrScope from ..base import _Null @@ -45,4 +45,4 @@ except ImportError: pass -__all__ = ['SymbolBase', '_set_symbol_class', '_symbol_creator'] +__all__ = ['SymbolBase', '_set_symbol_class', '_symbol_creator', '_set_np_symbol_class'] diff --git a/python/mxnet/symbol/numpy/__init__.py b/python/mxnet/symbol/numpy/__init__.py new file mode 100644 index 000000000000..857849c4ae62 --- /dev/null +++ b/python/mxnet/symbol/numpy/__init__.py @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Module for numpy ops under mxnet.symbol.""" + +from . import random +from . import linalg +from . import _op, _symbol, _internal +from ._symbol import _Symbol +from . 
import _register +from ._op import * # pylint: disable=wildcard-import +from ._symbol import * # pylint: disable=wildcard-import + +__all__ = _op.__all__ + _symbol.__all__ diff --git a/python/mxnet/symbol/numpy/_internal.py b/python/mxnet/symbol/numpy/_internal.py new file mode 100644 index 000000000000..c5f292842b3b --- /dev/null +++ b/python/mxnet/symbol/numpy/_internal.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for numpy internal ops.""" + +__all__ = [] diff --git a/python/mxnet/symbol/numpy/_op.py b/python/mxnet/symbol/numpy/_op.py new file mode 100644 index 000000000000..a4a979f30b18 --- /dev/null +++ b/python/mxnet/symbol/numpy/_op.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for operators used in Gluon dispatched by F=symbol module.""" + +__all__ = [] diff --git a/python/mxnet/symbol/numpy/_register.py b/python/mxnet/symbol/numpy/_register.py new file mode 100644 index 000000000000..3245c8d6d638 --- /dev/null +++ b/python/mxnet/symbol/numpy/_register.py @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
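Looking back at the `save`/`load` helpers added in `python/mxnet/numpy_extension/utils.py` above: they round-trip `mxnet.numpy` arrays through the existing `MXNDArraySave`/`MXNDArrayLoad` C API and deliberately refuse to run outside numpy semantics. A hedged sketch of the intended round trip, assuming a build with this patch; the file name is illustrative:

```python
from mxnet import np, npx

npx.set_np()                           # required; save/load raise ValueError otherwise

params = {'weight': np.ones((2, 2)), 'bias': np.zeros((2,))}
npx.save('params.bin', params)         # dict of str -> ndarray
restored = npx.load('params.bin')      # a dict comes back because keys were saved
assert restored['weight'].shape == (2, 2)
```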
+ +"""Registering numpy ops.""" + +from ...base import _init_np_op_module +from ..register import _make_symbol_function + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy', + mx_module_name='symbol', make_op_func=_make_symbol_function) + + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy._internal', + mx_module_name='symbol', make_op_func=_make_symbol_function) diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py new file mode 100644 index 000000000000..616f3066d98d --- /dev/null +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -0,0 +1,1013 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""numpy namespace for operators used in Gluon APIs dispatched by F=symbol module.""" + +from __future__ import absolute_import +import ctypes +import numpy as _np +from ...base import _LIB, SymbolHandle, numeric_types, mx_uint +from ...util import check_call, set_module +from ...context import current_context +from ..symbol import Symbol +from .._internal import _set_np_symbol_class +from . import _internal as _npi + +__all__ = ['zeros', 'ones', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power'] + + +def _num_outputs(sym): + return len(sym.as_nd_ndarray()) + + +@set_module('mxnet.symbol.numpy') +class _Symbol(Symbol): + def __getitem__(self, key): + num_outputs = _num_outputs(self) + if num_outputs == 1: + raise NotImplementedError + if not isinstance(key, int): + raise NotImplementedError + if key >= num_outputs: + # Important, python determines the end by this exception + raise IndexError + handle = SymbolHandle() + check_call(_LIB.MXSymbolGetOutput( + self.handle, mx_uint(key), ctypes.byref(handle))) + return _Symbol(handle=handle) + + def __setitem__(self, key, value): + raise NotImplementedError + + def __iter__(self): + raise AttributeError('_Symbol object has no attribute __iter__') + + def __add__(self, other): + """x.__add__(y) <=> x + y""" + return add(self, other) + + def __sub__(self, other): + """x.__sub__(y) <=> x - y""" + return subtract(self, other) + + def __rsub__(self, other): + """x.__rsub__(y) <=> y - x""" + return subtract(other, self) + + def __mul__(self, other): + """x.__mul__(y) <=> x * y""" + return multiply(self, other) + + def __rmul__(self, other): + """x.__rmul__(y) <=> y * x""" + return multiply(other, self) + + def __div__(self, other): + raise AttributeError('_Symbol.__div__ is replaced by __truediv__. If you are using' + ' Python2, please use the statement from __future__ import division' + ' to change the / operator to mean true division throughout the' + ' module. 
If you are using Python3, this error should not have' + ' been encountered.') + + def __rdiv__(self, other): + raise AttributeError('_Symbol.__rdiv__ is replaced by __rtruediv__. If you are using' + ' Python2, please use the statement from __future__ import division' + ' to change the / operator to mean true division throughout the' + ' module. If you are using Python3, this error should not have' + ' been encountered.') + + def __mod__(self, other): + """x.__mod__(y) <=> x % y""" + return mod(self, other) + + def __rmod__(self, other): + """x.__rmod__(y) <=> y % x""" + return mod(other, self) + + def __idiv__(self, other): + raise NotImplementedError + + def __truediv__(self, other): + """x.__truediv__(y) <=> x / y""" + return divide(self, other) + + def __rtruediv__(self, other): + """x.__rtruediv__(y) <=> y / x""" + return divide(other, self) + + def __itruediv__(self, other): + raise NotImplementedError + + def __pow__(self, other): + """x.__pow__(y) <=> x ** y""" + return power(self, other) + + def __rpow__(self, other): + return power(other, self) + + def __neg__(self): + """x.__neg__() <=> - x""" + return self.__mul__(-1.0) + + def __deepcopy__(self, _): + return super(_Symbol, self).as_np_ndarray() + + def __eq__(self, other): + """x.__eq__(y) <=> x == y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, _Symbol): + return _npi.equal(self, other) + elif isinstance(other, numeric_types): + return _npi.equal_scalar(self, float(other)) + else: + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) + + def __ne__(self, other): + """x.__ne__(y) <=> x != y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, _Symbol): + return _npi.not_equal(self, other) + elif isinstance(other, numeric_types): + return _npi.not_equal_scalar(self, float(other)) + else: + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) + + def __gt__(self, other): + """x.__gt__(y) <=> x > y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, _Symbol): + return _npi.greater(self, other) + elif isinstance(other, numeric_types): + return _npi.greater_scalar(self, float(other)) + else: + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) + + def __ge__(self, other): + """x.__ge__(y) <=> x >= y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, _Symbol): + return _npi.greater_equal(self, other) + elif isinstance(other, numeric_types): + return _npi.greater_equal_scalar(self, float(other)) + else: + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) + + def __lt__(self, other): + """x.__lt__(y) <=> x < y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, _Symbol): + return _npi.less(self, other) + elif isinstance(other, numeric_types): + return _npi.less_scalar(self, float(other)) + else: + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) + + def __le__(self, other): + """x.__le__(y) <=> x <= y""" + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, _Symbol): + return _npi.less_equal(self, other) + elif isinstance(other, numeric_types): + return _npi.less_equal_scalar(self, float(other)) + else: + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) + 
+ + def __len__(self): + raise NotImplementedError + + def as_nd_ndarray(self): + """Convert _Symbol to mxnet.symbol.Symbol to use its convenience fluent methods.""" + hdl = SymbolHandle() + check_call(_LIB.MXShallowCopySymbol(self.handle, ctypes.byref(hdl))) + return Symbol(handle=hdl) + + def as_np_ndarray(self): + """For the convenience of conversion between legacy and np symbols.""" + return self + + @property + # pylint: disable= invalid-name, undefined-variable + def T(self): + """Same as self.transpose().""" + return self.transpose() + # pylint: enable= invalid-name, undefined-variable + + def astype(self, dtype, **kwargs): # pylint: disable=arguments-differ + raise NotImplementedError + + def dot(self, b, out=None): + raise NotImplementedError + + def reshape(self, *args, **kwargs): # pylint: disable=arguments-differ + """Returns an array containing the same data with a new shape. + + Notes + ----- + Unlike the free function `numpy.reshape`, this method on `ndarray` allows + the elements of the shape parameter to be passed in as separate arguments. + For example, ``a.reshape(10, 11)`` is equivalent to + ``a.reshape((10, 11))``. + """ + order = 'C' + if len(kwargs) > 1: + raise TypeError('function takes at most 1 keyword argument') + if len(kwargs) == 1: + if 'order' not in kwargs: + raise TypeError('{} is an invalid keyword argument for this function' + .format(list(kwargs.keys())[0])) + order = kwargs.pop('order', 'C') + if order != 'C': + raise NotImplementedError('only supports C-order,' + ' while received {}'.format(order)) + if len(args) == 0: + raise TypeError('reshape() takes exactly 1 argument (0 given)') + raise NotImplementedError + + def argmax(self, axis=None, out=None): # pylint: disable=arguments-differ + raise NotImplementedError + + def reshape_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`reshape_like`. + + The arguments are the same as for :py:func:`reshape_like`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute reshape_like') + + def zeros_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`zeros_like`. + + The arguments are the same as for :py:func:`zeros_like`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute zeros_like') + + def ones_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`ones_like`. + + The arguments are the same as for :py:func:`ones_like`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute ones_like') + + def broadcast_axes(self, *args, **kwargs): + """Convenience fluent method for :py:func:`broadcast_axes`. + + The arguments are the same as for :py:func:`broadcast_axes`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute broadcast_axes') + + def repeat(self, repeats, axis=None): # pylint: disable=arguments-differ + """Repeat elements of an array.""" + raise NotImplementedError + + def pad(self, *args, **kwargs): + """Convenience fluent method for :py:func:`pad`. + + The arguments are the same as for :py:func:`pad`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute pad') + + def swapaxes(self, axis1, axis2): # pylint: disable=arguments-differ + """Return a copy of the array with axis1 and axis2 interchanged. + Refer to `mxnet.numpy.swapaxes` for full documentation. + """ + raise NotImplementedError + + def split(self, *args, **kwargs): + """Convenience fluent method for :py:func:`split`. 
+ + The arguments are the same as for :py:func:`split`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute split') + + def split_v2(self, *args, **kwargs): + """Convenience fluent method for :py:func:`split_v2`. + + The arguments are the same as for :py:func:`split_v2`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute split_v2') + + def slice(self, *args, **kwargs): + """Convenience fluent method for :py:func:`slice`. + + The arguments are the same as for :py:func:`slice`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute slice') + + def slice_axis(self, *args, **kwargs): + """Convenience fluent method for :py:func:`slice_axis`. + + The arguments are the same as for :py:func:`slice_axis`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute slice_axis') + + def slice_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`slice_like`. + + The arguments are the same as for :py:func:`slice_like`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute slice_like') + + def take(self, *args, **kwargs): + """Convenience fluent method for :py:func:`take`. + + The arguments are the same as for :py:func:`take`, with + this array as data. + """ + raise NotImplementedError + + def one_hot(self, *args, **kwargs): + """Convenience fluent method for :py:func:`one_hot`. + + The arguments are the same as for :py:func:`one_hot`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute one_hot') + + def pick(self, *args, **kwargs): + """Convenience fluent method for :py:func:`pick`. + + The arguments are the same as for :py:func:`pick`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute pick') + + def sort(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sort`. + + The arguments are the same as for :py:func:`sort`, with + this array as data. + """ + raise NotImplementedError + + def topk(self, *args, **kwargs): + """Convenience fluent method for :py:func:`topk`. + + The arguments are the same as for :py:func:`topk`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute topk') + + def argsort(self, *args, **kwargs): + """Convenience fluent method for :py:func:`argsort`. + + The arguments are the same as for :py:func:`argsort`, with + this array as data. + """ + raise NotImplementedError + + def argmax_channel(self, *args, **kwargs): + """Convenience fluent method for :py:func:`argmax_channel`. + + The arguments are the same as for :py:func:`argmax_channel`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute argmax_channel') + + def argmin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`argmin`. + + The arguments are the same as for :py:func:`argmin`, with + this array as data. + """ + raise NotImplementedError + + def clip(self, min=None, max=None, out=None): # pylint: disable=arguments-differ + """Return an array whose values are limited to [min, max]. + One of max or min must be given. + """ + raise NotImplementedError + + def abs(self, *args, **kwargs): + """Convenience fluent method for :py:func:`abs`. + + The arguments are the same as for :py:func:`abs`, with + this array as data. 
+ """ + raise AttributeError('_Symbol object has no attribute abs') + + def sign(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sign`. + + The arguments are the same as for :py:func:`sign`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute abs') + + def flatten(self, order='C'): # pylint: disable=arguments-differ + """Return a copy of the array collapsed into one dimension.""" + return self.reshape(-1, order=order) + + def shape_array(self, *args, **kwargs): + """Convenience fluent method for :py:func:`shape_array`. + + The arguments are the same as for :py:func:`shape_array`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute shape_array') + + def size_array(self, *args, **kwargs): + """Convenience fluent method for :py:func:`size_array`. + + The arguments are the same as for :py:func:`size_array`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute size_array') + + def expand_dims(self, *args, **kwargs): # pylint: disable=arguments-differ + """Convenience fluent method for :py:func:`expand_dims`. + + The arguments are the same as for :py:func:`expand_dims`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute expand_dims') + + def tile(self, *args, **kwargs): + """Convenience fluent method for :py:func:`tile`. + + The arguments are the same as for :py:func:`tile`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute tile') + + def transpose(self, *axes): # pylint: disable=arguments-differ + """Convenience fluent method for :py:func:`transpose`. + + The arguments are the same as for :py:func:`transpose`, with + this array as data. + """ + raise NotImplementedError + + def flip(self, *args, **kwargs): + """Convenience fluent method for :py:func:`flip`. + + The arguments are the same as for :py:func:`flip`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute flip') + + def depth_to_space(self, *args, **kwargs): + """Convenience fluent method for :py:func:`depth_to_space`. + + The arguments are the same as for :py:func:`depth_to_space`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute depth_to_space') + + def space_to_depth(self, *args, **kwargs): + """Convenience fluent method for :py:func:`space_to_depth`. + + The arguments are the same as for :py:func:`space_to_depth`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute space_to_depth') + + def diag(self, k=0, **kwargs): + """Convenience fluent method for :py:func:`diag`. + + The arguments are the same as for :py:func:`diag`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute diag') + + def sum(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """Convenience fluent method for :py:func:`sum`. + + The arguments are the same as for :py:func:`sum`, with + this array as data. + """ + raise NotImplementedError + + def nansum(self, *args, **kwargs): + """Convenience fluent method for :py:func:`nansum`. + + The arguments are the same as for :py:func:`nansum`, with + this array as data. 
+ """ + raise AttributeError('_Symbol object has no attribute nansum') + + def prod(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """Return the product of the array elements over the given axis.""" + raise NotImplementedError + + def nanprod(self, *args, **kwargs): + """Convenience fluent method for :py:func:`nanprod`. + + The arguments are the same as for :py:func:`nanprod`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute nanprod') + + def mean(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """Convenience fluent method for :py:func:`mean`. + + The arguments are the same as for :py:func:`mean`, with + this array as data. + """ + raise NotImplementedError + + def cumsum(self, axis=None, dtype=None, out=None): + """Return the cumulative sum of the elements along the given axis.""" + raise NotImplementedError + + def max(self, axis=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """Return the maximum along a given axis.""" + raise NotImplementedError + + def min(self, *args, **kwargs): + """Convenience fluent method for :py:func:`min`. + + The arguments are the same as for :py:func:`min`, with + this array as data. + """ + raise NotImplementedError + + def norm(self, *args, **kwargs): + """Convenience fluent method for :py:func:`norm`. + + The arguments are the same as for :py:func:`norm`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute norm') + + def round(self, *args, **kwargs): + """Convenience fluent method for :py:func:`round`. + + The arguments are the same as for :py:func:`round`, with + this array as data. + """ + raise NotImplementedError + + def rint(self, *args, **kwargs): + """Convenience fluent method for :py:func:`rint`. + + The arguments are the same as for :py:func:`rint`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute rint') + + def fix(self, *args, **kwargs): + """Convenience fluent method for :py:func:`fix`. + + The arguments are the same as for :py:func:`fix`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute fix') + + def floor(self, *args, **kwargs): + """Convenience fluent method for :py:func:`floor`. + + The arguments are the same as for :py:func:`floor`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute floor') + + def ceil(self, *args, **kwargs): + """Convenience fluent method for :py:func:`ceil`. + + The arguments are the same as for :py:func:`ceil`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute ceil') + + def trunc(self, *args, **kwargs): + """Convenience fluent method for :py:func:`trunc`. + + The arguments are the same as for :py:func:`trunc`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute trunc') + + def sin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sin`. + + The arguments are the same as for :py:func:`sin`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute sin') + + def cos(self, *args, **kwargs): + """Convenience fluent method for :py:func:`cos`. + + The arguments are the same as for :py:func:`cos`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute cos') + + def tan(self, *args, **kwargs): + """Convenience fluent method for :py:func:`tan`. 
+ + The arguments are the same as for :py:func:`tan`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute tan') + + def arcsin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arcsin`. + + The arguments are the same as for :py:func:`arcsin`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute arcsin') + + def arccos(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arccos`. + + The arguments are the same as for :py:func:`arccos`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute arccos') + + def arctan(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arctan`. + + The arguments are the same as for :py:func:`arctan`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute arctan') + + def degrees(self, *args, **kwargs): + """Convenience fluent method for :py:func:`degrees`. + + The arguments are the same as for :py:func:`degrees`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute degrees') + + def radians(self, *args, **kwargs): + """Convenience fluent method for :py:func:`radians`. + + The arguments are the same as for :py:func:`radians`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute radians') + + def sinh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sinh`. + + The arguments are the same as for :py:func:`sinh`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute sinh') + + def cosh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`cosh`. + + The arguments are the same as for :py:func:`cosh`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute cosh') + + def tanh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`tanh`. + + The arguments are the same as for :py:func:`tanh`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute tanh') + + def arcsinh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arcsinh`. + + The arguments are the same as for :py:func:`arcsinh`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute arcsinh') + + def arccosh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arccosh`. + + The arguments are the same as for :py:func:`arccosh`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute arccosh') + + def arctanh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arctanh`. + + The arguments are the same as for :py:func:`arctanh`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute arctanh') + + def exp(self, *args, **kwargs): + """Convenience fluent method for :py:func:`exp`. + + The arguments are the same as for :py:func:`exp`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute exp') + + def expm1(self, *args, **kwargs): + """Convenience fluent method for :py:func:`expm1`. + + The arguments are the same as for :py:func:`expm1`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute expm1') + + def log(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log`. + + The arguments are the same as for :py:func:`log`, with + this array as data. 
+ """ + raise AttributeError('_Symbol object has no attribute log') + + def log10(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log10`. + + The arguments are the same as for :py:func:`log10`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute log10') + + def log2(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log2`. + + The arguments are the same as for :py:func:`log2`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute log2') + + def log1p(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log1p`. + + The arguments are the same as for :py:func:`log1p`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute log1p') + + def sqrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sqrt`. + + The arguments are the same as for :py:func:`sqrt`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute sqrt') + + def rsqrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`rsqrt`. + + The arguments are the same as for :py:func:`rsqrt`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute rsqrt') + + def cbrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`cbrt`. + + The arguments are the same as for :py:func:`cbrt`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute cqrt') + + def rcbrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`rcbrt`. + + The arguments are the same as for :py:func:`rcbrt`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute rcqrt') + + def square(self, *args, **kwargs): + """Convenience fluent method for :py:func:`square`. + + The arguments are the same as for :py:func:`square`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute square') + + def reciprocal(self, *args, **kwargs): + """Convenience fluent method for :py:func:`reciprocal`. + + The arguments are the same as for :py:func:`reciprocal`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute reciprocal') + + def relu(self, *args, **kwargs): + """Convenience fluent method for :py:func:`relu`. + + The arguments are the same as for :py:func:`relu`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute relu') + + def sigmoid(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sigmoid`. + + The arguments are the same as for :py:func:`sigmoid`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute sigmoid') + + def softmax(self, *args, **kwargs): + """Convenience fluent method for :py:func:`softmax`. + + The arguments are the same as for :py:func:`softmax`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute softmax') + + def log_softmax(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log_softmax`. + + The arguments are the same as for :py:func:`log_softmax`, with + this array as data. + """ + raise AttributeError('_Symbol object has no attribute log_softmax') + + def softmin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`softmin`. + + The arguments are the same as for :py:func:`softmin`, with + this array as data. 
+ """ + raise AttributeError('_Symbol object has no attribute softmin') + + def squeeze(self, axis=None): # pylint: disable=arguments-differ + """Remove single-dimensional entries from the shape of a. + """ + raise NotImplementedError + + def broadcast_to(self, *args, **kwargs): + raise AttributeError('_Symbol object has no attribute broadcast_to') + + def broadcast_like(self, *args, **kwargs): + raise AttributeError('_Symbol object has no attribute broadcast_like') + + +@set_module('mxnet.symbol.numpy') +def zeros(shape, dtype=_np.float32, order='C', ctx=None): + """Return a new array of given shape and type, filled with zeros. + This function currently only supports storing multi-dimensional data + in row-major (C-style). + + Parameters + ---------- + shape : int or tuple of int + The shape of the empty array. + dtype : str or numpy.dtype, optional + An optional value type. Default is `numpy.float32`. Note that this + behavior is different from NumPy's `zeros` function where `float64` + is the default value, because `float32` is considered as the default + data type in deep learning. + order : {'C'}, optional, default: 'C' + How to store multi-dimensional data in memory, currently only row-major + (C-style) is supported. + ctx : Context, optional + An optional device context (default is the current default context). + + Returns + ------- + out : Symbol + Array of zeros with the given shape, dtype, and ctx. + """ + if order != 'C': + raise NotImplementedError + if ctx is None: + ctx = current_context() + dtype = _np.float32 if dtype is None else dtype + return _npi.zeros(shape=shape, ctx=ctx, dtype=dtype) + + +@set_module('mxnet.symbol.numpy') +def ones(shape, dtype=_np.float32, order='C', ctx=None): + """Return a new array of given shape and type, filled with zeros. + This function currently only supports storing multi-dimensional data + in row-major (C-style). + + Parameters + ---------- + shape : int or tuple of int + The shape of the empty array. + dtype : str or numpy.dtype, optional + An optional value type. Default is `numpy.float32`. Note that this + behavior is different from NumPy's `ones` function where `float64` + is the default value, because `float32` is considered as the default + data type in deep learning. + order : {'C'}, optional, default: 'C' + How to store multi-dimensional data in memory, currently only row-major + (C-style) is supported. + ctx : Context, optional + An optional device context (default is the current default context). + + Returns + ------- + out : ndarray + Array of zeros with the given shape, dtype, and ctx. + """ + if order != 'C': + raise NotImplementedError + if ctx is None: + ctx = current_context() + dtype = _np.float32 if dtype is None else dtype + return _npi.ones(shape=shape, ctx=ctx, dtype=dtype) + + +#pylint: disable= too-many-arguments, no-member, protected-access +def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None, out=None): + """ Helper function for element-wise operation. + The function will perform numpy-like broadcasting if needed and call different functions. + + Parameters + -------- + lhs : Symbol or numeric value + Left-hand side operand. + + rhs : Symbol or numeric value + Right-hand operand, + + fn_array : function + Function to be called if both lhs and rhs are of ``Symbol`` type. + + fn_scalar : function + Function to be called if both lhs and rhs are numeric values. 
+
+    lfn_scalar : function
+        Function to be called if lhs is ``Symbol`` while rhs is numeric value.
+
+    rfn_scalar : function
+        Function to be called if lhs is numeric value while rhs is ``Symbol``;
+        if None is provided, the function is assumed to be commutative and
+        lfn_scalar is used in its place.
+
+    Returns
+    -------
+    mxnet.symbol.numpy._Symbol or scalar
+        Result symbol (or scalar when both operands are numeric values).
+    """
+    if isinstance(lhs, numeric_types):
+        if isinstance(rhs, numeric_types):
+            return fn_scalar(lhs, rhs, out=out)
+        else:
+            if rfn_scalar is None:
+                # commutative function
+                return lfn_scalar(rhs, float(lhs), out=out)
+            else:
+                return rfn_scalar(rhs, float(lhs), out=out)
+    elif isinstance(rhs, numeric_types):
+        return lfn_scalar(lhs, float(rhs), out=out)
+    elif isinstance(rhs, Symbol):
+        return fn_array(lhs, rhs, out=out)
+    else:
+        raise TypeError('type %s not supported' % str(type(rhs)))
+#pylint: enable= too-many-arguments, no-member, protected-access
+
+
+@set_module('mxnet.symbol.numpy')
+def add(x1, x2, out=None):
+    return _ufunc_helper(x1, x2, _npi.add, _np.add, _npi.add_scalar, None, out)
+
+
+@set_module('mxnet.symbol.numpy')
+def subtract(x1, x2, out=None):
+    return _ufunc_helper(x1, x2, _npi.subtract, _np.subtract, _npi.subtract_scalar,
+                         _npi.rsubtract_scalar, out)
+
+
+@set_module('mxnet.symbol.numpy')
+def multiply(x1, x2, out=None):
+    return _ufunc_helper(x1, x2, _npi.multiply, _np.multiply, _npi.multiply_scalar, None, out)
+
+
+@set_module('mxnet.symbol.numpy')
+def divide(x1, x2, out=None):
+    return _ufunc_helper(x1, x2, _npi.true_divide, _np.divide, _npi.true_divide_scalar,
+                         _npi.rtrue_divide_scalar, out)
+
+
+@set_module('mxnet.symbol.numpy')
+def mod(x1, x2, out=None):
+    return _ufunc_helper(x1, x2, _npi.mod, _np.mod, _npi.mod_scalar, _npi.rmod_scalar, out)
+
+
+@set_module('mxnet.symbol.numpy')
+def power(x1, x2, out=None):
+    return _ufunc_helper(x1, x2, _npi.power, _np.power, _npi.power_scalar, _npi.rpower_scalar, out)
+
+
+_set_np_symbol_class(_Symbol)
diff --git a/python/mxnet/symbol/numpy/linalg.py b/python/mxnet/symbol/numpy/linalg.py
new file mode 100644
index 000000000000..28cfd0f3806a
--- /dev/null
+++ b/python/mxnet/symbol/numpy/linalg.py
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Namespace for operators used in Gluon dispatched by F=symbol."""
+
+from __future__ import absolute_import
+
+__all__ = []
diff --git a/python/mxnet/symbol/numpy/random.py b/python/mxnet/symbol/numpy/random.py
new file mode 100644
index 000000000000..28cfd0f3806a
--- /dev/null
+++ b/python/mxnet/symbol/numpy/random.py
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for operators used in Gluon dispatched by F=symbol.""" + +from __future__ import absolute_import + +__all__ = [] diff --git a/python/mxnet/symbol/numpy_extension/__init__.py b/python/mxnet/symbol/numpy_extension/__init__.py new file mode 100644 index 000000000000..5be34ac9b3d5 --- /dev/null +++ b/python/mxnet/symbol/numpy_extension/__init__.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Module for the ops not belonging to the official numpy package.""" + +from . import _op +from . import image +from . import _register +from ._op import * # pylint: disable=wildcard-import + +__all__ = _op.__all__ diff --git a/python/mxnet/symbol/numpy_extension/_op.py b/python/mxnet/symbol/numpy_extension/_op.py new file mode 100644 index 000000000000..82eaa8e6ec9f --- /dev/null +++ b/python/mxnet/symbol/numpy_extension/_op.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for operators not belonging to the official numpy package +used in Gluon APIs dispatched by F=symbol module.""" + +__all__ = [] diff --git a/python/mxnet/symbol/numpy_extension/_register.py b/python/mxnet/symbol/numpy_extension/_register.py new file mode 100644 index 000000000000..b118987b1fd3 --- /dev/null +++ b/python/mxnet/symbol/numpy_extension/_register.py @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Registering numpy_extension ops.""" + +from ...base import _init_np_op_module +from ..register import _make_symbol_function + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy_extension', + mx_module_name='symbol', make_op_func=_make_symbol_function) diff --git a/python/mxnet/symbol/numpy_extension/image.py b/python/mxnet/symbol/numpy_extension/image.py new file mode 100644 index 000000000000..b3bd27fc503c --- /dev/null +++ b/python/mxnet/symbol/numpy_extension/image.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Image pre-processing operators.""" + +__all__ = [] diff --git a/python/mxnet/symbol/register.py b/python/mxnet/symbol/register.py index ac59f8b97f15..a17dd79048d4 100644 --- a/python/mxnet/symbol/register.py +++ b/python/mxnet/symbol/register.py @@ -27,12 +27,60 @@ from ..attribute import AttrScope from ..base import mx_uint, check_call, _LIB, py_str from ..symbol_doc import _build_doc -from ..base import _Null, _init_op_module +from ..base import _Null, _init_op_module, _is_np_op from ..name import NameManager # pylint: enable=unused-import -def _generate_symbol_function_code(handle, name, func_name, signature_only=False): +def _verify_np_symbol(op_name, func_name, sym): + """Verify if the sym is a numpy symbol. + + Parameters + ---------- + op_name : str + Operator full name registered in backend. + func_name : str + Operator name exposed to users. This is usually the name by stripping off + the prefix of the full operator names registered in backend. + sym : symbol to be verified + """ + from .numpy._symbol import _Symbol as np_symbol + if not isinstance(sym, np_symbol): + raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' + 'This is a numpy operator which can only accept ' + 'MXNet numpy ndarrays, while received a legacy ndarray. ' + 'Please ensure that you have activated numpy semantics by calling ' + '`npx.set_np()` in your code. 
If you still see this error with numpy ' + 'semantics activated, please call `as_np_ndarray()` upon the legacy ' + 'ndarray to convert it to an MXNet numpy ndarray, and then feed the ' + 'converted array to this operator.' + .format(op_name, func_name)) + + +def _verify_legacy_symbol(op_name, func_name, sym): + """Verify if the sym is a legacy symbol. + + Parameters + ---------- + op_name : str + Operator full name registered in backend. + func_name : str + Operator name exposed to users. This is usually the name by stripping off + the prefix of the full operator names registered in backend. + sym : symbol to be verified + """ + from .numpy._symbol import _Symbol as np_symbol + if isinstance(sym, np_symbol): + raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' + 'This is a legacy operator which can only accept ' + 'legacy ndarrays, while received an MXNet numpy ndarray. ' + 'Please call `as_nd_ndarray()` upon the numpy ndarray to ' + 'convert it to a legacy ndarray, and then feed the converted ' + 'array to this operator.' + .format(op_name, func_name)) + + +def _generate_symbol_function_code(handle, op_name, func_name, signature_only=False): """Generate function for symbol op by handle and function name.""" real_name = ctypes.c_char_p() desc = ctypes.c_char_p() @@ -56,7 +104,7 @@ def _generate_symbol_function_code(handle, name, func_name, signature_only=False arg_types = [py_str(arg_types[i]) for i in range(narg)] key_var_num_args = py_str(key_var_num_args.value) ret_type = py_str(ret_type.value) if ret_type.value is not None else '' - doc_str = _build_doc(name, + doc_str = _build_doc(op_name, py_str(desc.value), arg_names, arg_types, @@ -95,6 +143,8 @@ def _generate_symbol_function_code(handle, name, func_name, signature_only=False signature.append('**kwargs') signature = ndsignature + signature + is_np_op = _is_np_op(op_name) + verify_symbol_fn = _verify_np_symbol.__name__ if is_np_op else _verify_legacy_symbol.__name__ code = [] if arr_name: code.append(""" @@ -106,7 +156,8 @@ def %s(*%s, **kwargs):"""%(func_name, arr_name)) assert isinstance(i, SymbolBase), \\ "Positional arguments must be Symbol instances, " \\ "but got %s"%str(i) - sym_args.append(i)""".format(arr_name)) + {}('{}', '{}', i) + sym_args.append(i)""".format(arr_name, verify_symbol_fn, op_name, func_name)) if dtype_name is not None: code.append(""" if '%s' in kwargs: @@ -128,9 +179,10 @@ def %s(*%s, **kwargs):"""%(func_name, arr_name)) for k, v in kwargs.items(): if isinstance(v, SymbolBase): sym_kwargs[k] = v + %s('%s', '%s', v) else: keys.append(k) - vals.append(v)"""%(func_name.lower())) + vals.append(v)"""%(func_name.lower(), verify_symbol_fn, op_name, func_name)) if key_var_num_args: # pylint: disable=using-constant-test code.append(""" if '%s' not in kwargs: @@ -139,8 +191,8 @@ def %s(*%s, **kwargs):"""%(func_name, arr_name)) key_var_num_args, key_var_num_args)) code.append(""" - return _symbol_creator(%d, sym_args, sym_kwargs, keys, vals, name)"""%( - handle.value)) + return _symbol_creator(%d, sym_args, sym_kwargs, keys, vals, name, %s)"""%( + handle.value, str(is_np_op))) else: code.append(""" def %s(%s):"""%(func_name, ', '.join(signature))) @@ -155,9 +207,10 @@ def %s(%s):"""%(func_name, ', '.join(signature))) for _k, _v in kwargs.items(): if isinstance(_v, SymbolBase): sym_kwargs[_k] = _v + {}('{}', '{}', _v) else: _keys.append(_k) - _vals.append(_v)""") + _vals.append(_v)""".format(verify_symbol_fn, op_name, func_name)) # NDArray args for name in ndarg_names: # pylint: 
disable=redefined-argument-from-local code.append(""" @@ -165,6 +218,9 @@ def %s(%s):"""%(func_name, ', '.join(signature))) assert isinstance({name}, SymbolBase), \\ "Argument {name} must be Symbol instances, but got %s"%str({name}) sym_kwargs['{name}'] = {name}""".format(name=name)) + code.append(""" + {}('{}', '{}', {name}) + """.format(verify_symbol_fn, op_name, func_name, name=name)) # kwargs for name in kwarg_names: # pylint: disable=redefined-argument-from-local code.append(""" @@ -173,7 +229,13 @@ def %s(%s):"""%(func_name, ', '.join(signature))) _vals.append(%s)"""%(name, name, name)) # dtype if dtype_name is not None: - code.append(""" + if is_np_op: + code.append(""" + if %s is not _Null and %s is not None: + _keys.append('%s') + _vals.append(_np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name, dtype_name)) + else: + code.append(""" if %s is not _Null: _keys.append('%s') _vals.append(_np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name)) @@ -182,8 +244,8 @@ def %s(%s):"""%(func_name, ', '.join(signature))) if not hasattr(NameManager._current, "value"): NameManager._current.value = NameManager() name = NameManager._current.value.get(name, '%s') - return _symbol_creator(%d, None, sym_kwargs, _keys, _vals, name)"""%( - func_name.lower(), handle.value)) + return _symbol_creator(%d, None, sym_kwargs, _keys, _vals, name, %s)"""%( + func_name.lower(), handle.value, str(is_np_op))) if signature_only: code.append(""" diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index d3cd519b9a8c..f0bc235f6661 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -61,6 +61,17 @@ class Symbol(SymbolBase): # Make numpy functions return Symbol instead of numpy object array __array_priority__ = 1000.0 + def as_np_ndarray(self): + """Convert mx.sym.Symbol to mx.sym.np._Symbol.""" + from .numpy import _Symbol + hdl = SymbolHandle() + check_call(_LIB.MXShallowCopySymbol(self.handle, ctypes.byref(hdl))) + return _Symbol(hdl) + + def as_nd_ndarray(self): + """Returns self. 
For the convenience of conversion between legacy and np symbols."""
+        return self
+
     def __repr__(self):
         """Gets a string representation of the symbol."""
         name = self.name
@@ -144,6 +155,8 @@ def __rsub__(self, other):
            array([[-2., -2., -2.],
                   [-2., -2., -2.]], dtype=float32)
         """
+        if isinstance(other, Symbol):
+            return other.__sub__(self)
         if isinstance(other, Number):
             return _internal._RMinusScalar(self, scalar=other)
         else:
@@ -192,6 +205,8 @@ def __rdiv__(self, other):
            array([[ 0.33333334,  0.33333334,  0.33333334],
                   [ 0.33333334,  0.33333334,  0.33333334]], dtype=float32)
         """
+        if isinstance(other, Symbol):
+            return other.__truediv__(self)
         if isinstance(other, Number):
             return _internal._RDivScalar(self, scalar=other)
         else:
@@ -222,6 +237,8 @@ def __rmod__(self, other):
            array([[ 1.,  1.,  1.],
                   [ 1.,  1.,  1.]], dtype=float32)
         """
+        if isinstance(other, Symbol):
+            return other.__mod__(self)
         if isinstance(other, Number):
             return _internal._RModScalar(self, scalar=other)
         else:
@@ -252,7 +269,13 @@ def __pow__(self, other):
             raise TypeError('type %s not supported' % str(type(other)))
 
     def __rpow__(self, other):
-        raise NotImplementedForSymbol(self.__rpow__, 'y**x', other)
+        """x.__rpow__(y) <=> y ** x"""
+        if isinstance(other, Symbol):
+            return other.__pow__(self)
+        elif isinstance(other, Number):
+            return _internal._rpower_scalar(self, scalar=other)
+        else:
+            raise TypeError('type %s not supported' % str(type(other)))
 
     def __neg__(self):
         """x.__neg__() <=> -x
@@ -2667,8 +2690,12 @@ def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None,
 Variable = var
 
 
-def Group(symbols):
+def Group(symbols, create_fn=Symbol):
     """Creates a symbol that contains a collection of other symbols, grouped together.
+    A classic symbol (`mx.sym.Symbol`) will be returned if all the symbols in the list
+    are of that type; a numpy symbol (`mx.sym.np._Symbol`) will be returned if all the
+    symbols in the list are of that type. A type error will be raised if a list of
+    mixed classic and numpy symbols is provided.
 
     Example
     -------
@@ -2682,6 +2709,9 @@ def Group(symbols):
     symbols : list
         List of symbols to be grouped.
 
+    create_fn : mx.sym.Symbol or mx.sym.np._Symbol
+        Symbol class for creating the grouped symbol; see the usage sketch below.
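With the reflected-operator changes above, expressions such as `2 ** x` now dispatch to backend operators instead of raising `NotImplementedForSymbol`, and `Group` grows a `create_fn` hook for the numpy symbol class. A minimal usage sketch of the new behavior (assuming an MXNet build that includes this patch)::

    import mxnet as mx

    a = mx.sym.var('a')
    b = mx.sym.var('b')

    # __rpow__ now routes scalar ** symbol through _internal._rpower_scalar:
    c = 2.0 ** a

    # Grouping classic symbols keeps the classic class (create_fn defaults to Symbol):
    g = mx.sym.Group([a, b])
    print(type(g))   # <class 'mxnet.symbol.symbol.Symbol'>
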
+ Returns ------- sym : Symbol @@ -2693,7 +2723,7 @@ def Group(symbols): check_call(_LIB.MXSymbolCreateGroup( mx_uint(len(symbols)), c_handle_array(symbols), ctypes.byref(handle))) - return Symbol(handle) + return create_fn(handle) def load(fname): diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index bd102412c6e2..bfe520b0137a 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -47,6 +47,8 @@ from .ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID from .ndarray import array from .symbol import Symbol +from .symbol.numpy import _Symbol as np_symbol +from .util import use_np # pylint: disable=unused-import def default_context(): @@ -88,7 +90,8 @@ def get_etol(etol=None): def random_arrays(*shapes): """Generate some random numpy arrays.""" - arrays = [np.random.randn(*s).astype(default_dtype()) + arrays = [np.array(np.random.randn(), dtype=default_dtype()) + if len(s) == 0 else np.random.randn(*s).astype(default_dtype()) for s in shapes] if len(arrays) == 1: return arrays[0] @@ -407,16 +410,20 @@ def create_sparse_array_zd(shape, stype, density, data_init=None, density=density, shuffle_csr_indices=shuffle_csr_indices) -def rand_shape_2d(dim0=10, dim1=10): - return rnd.randint(1, dim0 + 1), rnd.randint(1, dim1 + 1) +def rand_shape_2d(dim0=10, dim1=10, allow_zero_size=False): + low = 0 if allow_zero_size else 1 + return rnd.randint(low, dim0 + 1), rnd.randint(low, dim1 + 1) -def rand_shape_3d(dim0=10, dim1=10, dim2=10): - return rnd.randint(1, dim0 + 1), rnd.randint(1, dim1 + 1), rnd.randint(1, dim2 + 1) +def rand_shape_3d(dim0=10, dim1=10, dim2=10, allow_zero_size=False): + low = 0 if allow_zero_size else 1 + return rnd.randint(low, dim0 + 1), rnd.randint(low, dim1 + 1), rnd.randint(low, dim2 + 1) -def rand_shape_nd(num_dim, dim=10): - return tuple(rnd.randint(1, dim+1, size=num_dim)) + +def rand_shape_nd(num_dim, dim=10, allow_zero_size=False): + low = 0 if allow_zero_size else 1 + return tuple(rnd.randint(low, dim+1, size=num_dim)) def rand_coord_2d(x_low, x_high, y_low, y_high): @@ -828,7 +835,7 @@ def as_stype(var, stype, dtype): continue stype = executor.arg_dict[k].stype old_value = v.copy() - for i in range(np.prod(v.shape)): + for i in range(int(np.prod(v.shape))): # inplace update v.ravel()[i] += eps/2.0 executor.arg_dict[k][:] = as_stype(v, stype, dtype=dtype) @@ -940,7 +947,12 @@ def random_projection(shape): input_shape = {k: v.shape for k, v in location.items()} _, out_shape, _ = sym.infer_shape(**input_shape) proj = mx.sym.Variable("__random_proj") + is_np_sym = bool(isinstance(sym, np_symbol)) + if is_np_sym: # convert to np symbol for using element-wise multiplication + proj = proj.as_np_ndarray() out = sym * proj + if is_np_sym: # convert to classic symbol so that make_loss can be used + out = out.as_nd_ndarray() out = mx.sym.make_loss(out) location = dict(list(location.items()) + diff --git a/python/mxnet/util.py b/python/mxnet/util.py index 5bc1dc809c88..d4e95e0c0c9c 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -20,6 +20,9 @@ import os import sys import functools +import itertools +import inspect +import threading from .base import _LIB, check_call @@ -76,14 +79,24 @@ def set_np_shape(active): >>> print(mx.is_np_shape()) True """ + if active: + import logging + logging.info('NumPy-shape semantics has been activated in your code. ' + 'This is required for creating and manipulating scalar and zero-size ' + 'tensors, which were not supported in MXNet before, as in the official ' + 'NumPy library. 
Please DO NOT manually deactivate this semantics while '
+                     'using `mxnet.numpy` and `mxnet.numpy_extension` modules.')
+    elif is_np_array():
+        raise ValueError('Deactivating NumPy shape semantics while NumPy array semantics is still'
+                         ' active is not allowed. Please consider calling `npx.reset_np()` to'
+                         ' deactivate both of them.')
     prev = ctypes.c_int()
     check_call(_LIB.MXSetIsNumpyShape(ctypes.c_int(active), ctypes.byref(prev)))
     return bool(prev.value)
 
 
 def is_np_shape():
-    """
-    Checks whether the NumPy shape semantics is currently turned on.
+    """Checks whether the NumPy shape semantics is currently turned on.
     In NumPy shape semantics, `()` represents the shape of scalar tensors,
     and tuples with `0` elements, for example, `(0,)`, `(1, 0, 2)`, represent
     the shapes of zero-size tensors. This is turned off by default for keeping
@@ -213,39 +226,379 @@
     return _NumpyShapeScope(active)
 
 
+def wraps_safely(wrapped, assigned=functools.WRAPPER_ASSIGNMENTS):
+    """A safe version of `functools.wraps` for Python 2 that skips wrapping
+    those attributes that do not exist on the wrapped function."""
+    if sys.version_info[0] > 2:
+        return functools.wraps(wrapped)
+    else:
+        return functools.wraps(wrapped,
+                               assigned=itertools.ifilter(
+                                   functools.partial(hasattr, wrapped), assigned))
+
+
 def use_np_shape(func):
-    """Wraps a function with an activated NumPy-shape scope. This ensures
-    that the execution of the function is guaranteed with the support of
-    scalar and zero-size tensors as in NumPy.
+    """A decorator wrapping a function or class with activated NumPy-shape semantics.
+    When `func` is a function, this ensures that the execution of the function is scoped
+    with NumPy shape semantics, such as the support for zero-dim and zero-size tensors.
+    When `func` is a class, it ensures that all the methods, static functions, and
+    properties of the class are executed with the NumPy shape semantics.
+
+    Example::
+
+        import mxnet as mx
+
+        @mx.use_np_shape
+        def scalar_one():
+            return mx.nd.ones(())
+        print(scalar_one())
+
+        @mx.use_np_shape
+        class ScalarTensor(object):
+            def __init__(self, val=None):
+                if val is None:
+                    val = ScalarTensor.random().value
+                self._scalar = mx.nd.ones(()) * val
+
+            def __repr__(self):
+                print("Is __repr__ in np_shape semantics? {}!".format(str(mx.is_np_shape())))
+                return str(self._scalar.asnumpy())
+
+            @staticmethod
+            def random():
+                val = mx.nd.random.uniform().asnumpy().item()
+                return ScalarTensor(val)
+
+            @property
+            def value(self):
+                print("Is value property in np_shape semantics? {}!".format(str(mx.is_np_shape())))
+                return self._scalar.asnumpy().item()
+
+
+        print("Is global scope of np_shape activated? {}!".format(str(mx.is_np_shape())))
+        scalar_tensor = ScalarTensor()
+        print(scalar_tensor)
+
+    Parameters
+    ----------
+    func : a user-provided callable function or class to be scoped by the NumPy-shape semantics.
+
+    Returns
+    -------
+    Function or class
+        A function or class wrapped in the NumPy-shape scope.
+ """ + + if inspect.isclass(func): + for name, method in inspect.getmembers( + func, + predicate= + lambda f: inspect.isfunction(f) or inspect.ismethod(f) or isinstance(f, property)): + if isinstance(method, property): + setattr(func, name, property(use_np_shape(method.__get__), + method.__set__, + method.__delattr__, + method.__doc__)) + else: + setattr(func, name, use_np_shape(method)) + return func + elif callable(func): + @wraps_safely(func) + def _with_np_shape(*args, **kwargs): + with np_shape(active=True): + return func(*args, **kwargs) + return _with_np_shape + else: + raise TypeError('use_np_shape can only decorate classes and callable objects, ' + 'while received a {}'.format(str(type(func)))) + + +def _sanity_check_params(func_name, unsupported_params, param_dict): + for param_name in unsupported_params: + if param_name in param_dict: + raise NotImplementedError("function {} does not support parameter {}" + .format(func_name, param_name)) + + +def set_module(module): + """Decorator for overriding __module__ on a function or class. + + Example usage:: + + @set_module('mxnet.numpy') + def example(): + pass + + assert example.__module__ == 'numpy' + """ + def decorator(func): + if module is not None: + func.__module__ = module + return func + return decorator + + +class _NumpyArrayScope(object): + """Scope for managing NumPy array creation. This is often used + with `is_np_array=True` in initializer to enforce array creation + as type `mxnet.numpy.ndarray`, instead of `mx.nd.NDArray` in Gluon. + + Do not use this class directly. Use `np_array(active)` instead. + """ + _current = threading.local() + + def __init__(self, is_np_array): # pylint: disable=redefined-outer-name + self._old_scope = None + self._is_np_array = is_np_array + + def __enter__(self): + if not hasattr(_NumpyArrayScope._current, "value"): + _NumpyArrayScope._current.value = _NumpyArrayScope(False) + self._old_scope = _NumpyArrayScope._current.value + _NumpyArrayScope._current.value = self + return self + + def __exit__(self, ptype, value, trace): + assert self._old_scope + _NumpyArrayScope._current.value = self._old_scope + + +def np_array(active=True): + """Returns an activated/deactivated NumPy-array scope to be used in 'with' statement + and captures code that needs the NumPy-array semantics. + + Currently, this is used in Gluon to enforce array creation in `Block`s as type + `mxnet.numpy.ndarray`, instead of `mx.nd.NDArray`. + + It is recommended to use the decorator `use_np_array` to decorate the classes + that need this semantics, instead of using this function in a `with` statement + unless you know exactly what has been scoped by this semantics. Please note that this is designed as an infrastructure for the incoming MXNet-NumPy operators. Legacy operators registered in the modules `mx.nd` and `mx.sym` are not guaranteed to behave like their counterparts in NumPy even within this scope. + Parameters + ---------- + active : bool + Indicates whether to activate NumPy-array semantics. + + Returns + ------- + _NumpyShapeScope + A scope object for wrapping the code w/ or w/o NumPy-shape semantics. + """ + return _NumpyArrayScope(active) + + +def is_np_array(): + """Checks whether the NumPy-array semantics is currently turned on. + This is currently used in Gluon for checking whether an array of type `mxnet.numpy.ndarray` + or `mx.nd.NDArray` should be created. 
+
+    Normally, users are not recommended to use this API directly unless they know
+    exactly what is going on under the hood.
+
+    Please note that this is designed as an infrastructure for the incoming
+    MXNet-NumPy operators. Legacy operators registered in the modules
+    `mx.nd` and `mx.sym` are not guaranteed to behave like their counterparts
+    in NumPy within this semantics.
+
+    Returns
+    -------
+    A bool value indicating whether the NumPy-array semantics is currently on.
+    """
+    return _NumpyArrayScope._current.value._is_np_array if hasattr(
+        _NumpyArrayScope._current, "value") else False
+
+
+def use_np_array(func):
+    """A decorator wrapping Gluon `Block`s and all their methods, properties, and static
+    functions with the semantics of NumPy-array, which means that wherever ndarrays are
+    created, `mxnet.numpy.ndarray`s are created instead of legacy ndarrays of type
+    `mx.nd.NDArray`. For example, at the time when a parameter is created in a `Block`,
+    an `mxnet.numpy.ndarray` is created if the block is decorated with this decorator.
+
+    Example::
+
+        import mxnet as mx
+        from mxnet import gluon, np
+
+
+        class TestHybridBlock1(gluon.HybridBlock):
+            def __init__(self):
+                super(TestHybridBlock1, self).__init__()
+                self.w = self.params.get('w', shape=(2, 2))
+
+            def hybrid_forward(self, F, x, w):
+                return F.dot(x, w)
+
+
+        x = mx.nd.ones((2, 2))
+        net1 = TestHybridBlock1()
+        net1.initialize()
+        out = net1.forward(x)
+        for _, v in net1.collect_params().items():
+            assert type(v.data()) is mx.nd.NDArray
+        assert type(out) is mx.nd.NDArray
+
+
+        @np.use_np_array
+        class TestHybridBlock2(gluon.HybridBlock):
+            def __init__(self):
+                super(TestHybridBlock2, self).__init__()
+                self.w = self.params.get('w', shape=(2, 2))
+
+            def hybrid_forward(self, F, x, w):
+                return F.np.dot(x, w)
+
+
+        x = np.ones((2, 2))
+        net2 = TestHybridBlock2()
+        net2.initialize()
+        out = net2.forward(x)
+        for _, v in net2.collect_params().items():
+            print(type(v.data()))
+            assert type(v.data()) is np.ndarray
+        assert type(out) is np.ndarray
 
     Parameters
     ----------
-    func : a user-provided callable function to be scoped by the NumPy-shape semantics.
+    func : a user-provided callable function or class to be scoped by the NumPy-array semantics.
 
     Returns
     -------
-    Function
-        A function for wrapping the user functions in the NumPy-shape semantics.
+    Function or class
+        A function or class wrapped in the NumPy-array scope.
+    """
+    if inspect.isclass(func):
+        for name, method in inspect.getmembers(
+                func,
+                predicate=
+                lambda f: inspect.isfunction(f) or inspect.ismethod(f) or isinstance(f, property)):
+            if isinstance(method, property):
+                setattr(func, name, property(use_np_array(method.__get__),
+                                             method.__set__,
+                                             method.__delattr__,
+                                             method.__doc__))
+            else:
+                setattr(func, name, use_np_array(method))
+        return func
+    elif callable(func):
+        @wraps_safely(func)
+        def _with_np_array(*args, **kwargs):
+            with np_array(active=True):
+                return func(*args, **kwargs)
+        return _with_np_array
+    else:
+        raise TypeError('use_np_array can only decorate classes and callable objects, '
+                        'while received a {}'.format(str(type(func))))
 
-    Examples
-    --------
-    >>> import mxnet as mx
-    >>> @mx.use_np_shape
-    ... def scalar_one():
-    ...     return mx.nd.ones(())
-    ...
-    >>> print(scalar_one())
+def use_np(func):
+    """A convenience decorator for wrapping user-provided functions and classes in the scope of
+    both NumPy-shape and NumPy-array semantics, which means that (1) empty tuples `()` and tuples
+    with zeros, such as `(0, 1)`, `(1, 0, 2)`, will be treated as scalar tensors' shapes and
+    zero-size tensors' shapes in shape inference functions of operators, instead of as unknown
+    in legacy mode; (2) ndarrays of type `mxnet.numpy.ndarray` should be created instead of
+    `mx.nd.NDArray`.
+
+    Example::
+
+        import mxnet as mx
+        from mxnet import gluon, np
+
+
+        class TestHybridBlock1(gluon.HybridBlock):
+            def __init__(self):
+                super(TestHybridBlock1, self).__init__()
+                self.w = self.params.get('w', shape=(2, 2))
+
+            def hybrid_forward(self, F, x, w):
+                return F.dot(x, w) + F.ones((1,))
+
+
+        x = mx.nd.ones((2, 2))
+        net1 = TestHybridBlock1()
+        net1.initialize()
+        out = net1.forward(x)
+        for _, v in net1.collect_params().items():
+            assert type(v.data()) is mx.nd.NDArray
+        assert type(out) is mx.nd.NDArray
+
+
+        @np.use_np
+        class TestHybridBlock2(gluon.HybridBlock):
+            def __init__(self):
+                super(TestHybridBlock2, self).__init__()
+                self.w = self.params.get('w', shape=(2, 2))
+
+            def hybrid_forward(self, F, x, w):
+                return F.np.dot(x, w) + F.np.ones(())
+
+
+        x = np.ones((2, 2))
+        net2 = TestHybridBlock2()
+        net2.initialize()
+        out = net2.forward(x)
+        for _, v in net2.collect_params().items():
+            print(type(v.data()))
+            assert type(v.data()) is np.ndarray
+        assert type(out) is np.ndarray
+
+    Parameters
+    ----------
+    func : a user-provided callable function or class to be scoped by the
+        NumPy-shape and NumPy-array semantics.
+
+    Returns
+    -------
+    Function or class
+        A function or class wrapped in the NumPy-shape and NumPy-array scope.
+    """
-    @functools.wraps(func)
-    def _with_np_shape(*args, **kwargs):
-        with np_shape(active=True):
-            return func(*args, **kwargs)
+    return use_np_shape(use_np_array(func))
+
+
+def _set_np_array(active):
+    """Turns on/off NumPy array semantics for the current thread, in which `mxnet.numpy.ndarray`
+    is expected to be created, instead of the legacy `mx.nd.NDArray`.
+
+    Parameters
+    ----------
+    active : bool
+        A boolean value indicating whether the NumPy-array semantics should be turned on or off.
+
+    Returns
+    -------
+    A bool value indicating the previous state of NumPy array semantics.
+    """
+    if active:
+        import logging
+        logging.info('NumPy array semantics has been activated in your code. This allows you'
+                     ' to use operators from MXNet NumPy and NumPy Extension modules as well'
+                     ' as MXNet NumPy `ndarray`s.')
+    cur_state = is_np_array()
+    _NumpyArrayScope._current.value = _NumpyArrayScope(active)
+    return cur_state
+
+
+def set_np(shape=True, array=True):
+    """Sets NumPy shape and array semantics at the same time.
+    It is required to keep NumPy shape semantics active when activating NumPy array semantics.
+    Deactivating NumPy shape semantics while NumPy array semantics is still active is not allowed.
+
+    Parameters
+    ----------
+    shape : bool
+        A boolean value indicating whether the NumPy-shape semantics should be turned on or off.
+    array : bool
+        A boolean value indicating whether the NumPy-array semantics should be turned on or off.
+ """ + if not shape and array: + raise ValueError('NumPy Shape semantics is required in using NumPy array semantics.') + _set_np_array(array) + set_np_shape(shape) + - return _with_np_shape +def reset_np(): + """Deactivate NumPy shape and array semantics at the same time.""" + set_np(shape=False, array=False) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index ffe6d8dcdbdc..13f22191955d 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1604,3 +1604,12 @@ int MXStorageEmptyCache(int dev_type, int dev_id) { Storage::Get()->ReleaseAll(ctx); API_END(); } + +int MXShallowCopyNDArray(NDArrayHandle src_handle, NDArrayHandle* out) { + NDArray* ret = nullptr; + API_BEGIN(); + NDArray* src_array = static_cast(src_handle); + ret = new NDArray(*src_array); + *out = ret; + API_END_HANDLE_ERROR(delete ret); +} diff --git a/src/c_api/c_api_common.h b/src/c_api/c_api_common.h index 013ecab93da8..233acc85f36b 100644 --- a/src/c_api/c_api_common.h +++ b/src/c_api/c_api_common.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 4c6229ee29b0..930b03c4d366 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -855,11 +855,20 @@ int MXGenAtomicSymbolFromSymbol(SymbolHandle sym_handle, SymbolHandle *ret_sym_h API_BEGIN(); nnvm::Symbol *source = static_cast(sym_handle); CHECK_EQ(source->outputs.size(), 1U) - << "Generating atomic symbol from other symbol only works for nongrouped symbol."; - const auto& node = source->outputs[0]; + << "Generating atomic symbol from other symbol only works for nongrouped symbol."; + const auto &node = source->outputs[0]; const auto *op = node.node->op(); const auto attrs = source->ListAttrs(nnvm::Symbol::ListAttrOption::kShallow); *s = nnvm::Symbol::CreateFunctor(op, attrs); *ret_sym_handle = s; API_END_HANDLE_ERROR(delete s); } + +int MXShallowCopySymbol(SymbolHandle src, SymbolHandle* out) { + nnvm::Symbol* out_sym = new nnvm::Symbol; + API_BEGIN(); + nnvm::Symbol* src_sym = static_cast(src); + *out_sym = *src_sym; + *out = out_sym; + API_END_HANDLE_ERROR(delete out_sym); +} diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 067bb2ea750f..562e71775774 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -884,7 +884,6 @@ inline std::multimap AllocateMemory( } CHECK_EQ(stypes[i], kDefaultStorage); if (mem_plan[i].root == i) { - CHECK_GT(mem_plan[i].size, 0); auto iter = pool.lower_bound(mem_plan[i].size); if (iter != pool.end()) { *arrays[i] = iter->second.AsArray(shapes[i], dtypes[i]); diff --git a/src/io/image_io.cc b/src/io/image_io.cc index c0357998f31c..db9ac7682287 100644 --- a/src/io/image_io.cc +++ b/src/io/image_io.cc @@ -357,6 +357,7 @@ inline void copyMakeBorder(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(_cvimdecode) +.add_alias("_npi_cvimdecode") .describe("Decode image with OpenCV. \n" "Note: return image in RGB by default, " "instead of OpenCV's default BGR.") @@ -368,6 +369,7 @@ NNVM_REGISTER_OP(_cvimdecode) .add_arguments(ImdecodeParam::__FIELDS__()); NNVM_REGISTER_OP(_cvimread) +.add_alias("_npi_cvimread") .describe("Read and decode image with OpenCV. \n" "Note: return image in RGB by default, " "instead of OpenCV's default BGR.") @@ -378,6 +380,7 @@ NNVM_REGISTER_OP(_cvimread) .add_arguments(ImreadParam::__FIELDS__()); NNVM_REGISTER_OP(_cvimresize) +.add_alias("_npi_cvimresize") .describe("Resize image with OpenCV. 
\n") .set_num_inputs(1) .set_num_outputs(1) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index bee8bef37b44..d8cb9317342e 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -312,7 +312,7 @@ NDArray NDArray::AtWithRecord(index_t idx) { CHECK(storage_type() == kDefaultStorage) << "Storage type " << storage_type() << " doesn't support At()"; NDArray ret = this->SliceWithRecord(idx, idx+1); - if (shape_.ndim() > 1) { + if (shape_.ndim() > 1 || Imperative::Get()->is_np_shape()) { return ret.ReshapeWithRecord(mxnet::TShape(shape_.data()+1, shape_.data()+shape_.ndim())); } else { return ret; @@ -1205,7 +1205,10 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority, bool is_op << "from.shape = " << from.shape() << " to.shape=" << to.shape(); CHECK(!mxnet::op::shape_is_none(from.shape())) << "source operands have undefined shape"; - if (from.shape().Size() == 0U) return; + // zero-size array, no need to copy + if (from.shape().Size() == 0U) { + return; + } // important: callback must always capture by value const Context from_ctx = from.ctx(); const int a = from_ctx.dev_mask(); @@ -1725,7 +1728,7 @@ bool NDArray::Load(dmlc::Stream *strm) { CHECK(!Imperative::Get()->is_np_shape()) << "ndarray was not saved in np shape semantics, but being loaded in np shape semantics." " Please turn off np shape semantics in Python using `with np_shape(False)`" - " to scope of the code of loading the ndarray."; + " to scope the code of loading the ndarray."; } if (magic != NDARRAY_V2_MAGIC && magic != NDARRAY_V3_MAGIC) { return LegacyLoad(strm, magic); @@ -1865,6 +1868,10 @@ void NDArray::SyncCopyFromCPU(const void *data, size_t size) const { mxnet::TShape dshape = this->shape(); CHECK_EQ(dshape.Size(), size) << "Memory size do not match"; + // zero-size array, no need to copy + if (size == 0U) { + return; + } TBlob src((void*)data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*) if (this->ctx().dev_mask() == cpu::kDevMask) { @@ -1996,6 +2003,10 @@ void NDArray::SyncCopyToCPU(void *data, size_t size) const { mxnet::TShape dshape = this->shape(); CHECK_EQ(dshape.Size(), size) << "Memory size do not match"; + // zero-size array, no need to copy + if (size == 0U) { + return; + } TBlob dst(data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*) if (this->ctx().dev_mask() == cpu::kDevMask) { diff --git a/src/operator/contrib/multibox_detection.cc b/src/operator/contrib/multibox_detection.cc index 65fe5f1208bb..adb254853d2f 100644 --- a/src/operator/contrib/multibox_detection.cc +++ b/src/operator/contrib/multibox_detection.cc @@ -221,5 +221,9 @@ MXNET_REGISTER_OP_PROPERTY(_contrib_MultiBoxDetection, MultiBoxDetectionProp) .add_argument("loc_pred", "NDArray-or-Symbol", "Location regression predictions.") .add_argument("anchor", "NDArray-or-Symbol", "Multibox prior anchor boxes") .add_arguments(MultiBoxDetectionParam::__FIELDS__()); + +NNVM_REGISTER_OP(_contrib_MultiBoxDetection) +.add_alias("_npx_multibox_detection"); + } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/multibox_prior.cc b/src/operator/contrib/multibox_prior.cc index 2ad173a2dd93..66fd2c11517a 100644 --- a/src/operator/contrib/multibox_prior.cc +++ b/src/operator/contrib/multibox_prior.cc @@ -100,5 +100,8 @@ MXNET_REGISTER_OP_PROPERTY(_contrib_MultiBoxPrior, MultiBoxPriorProp) .add_arguments(MultiBoxPriorParam::__FIELDS__()) .describe("Generate prior(anchor) boxes from data, sizes and ratios."); +NNVM_REGISTER_OP(_contrib_MultiBoxPrior) +.add_alias("_npx_multibox_prior"); + 
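Following the `_npx_` naming convention used throughout this patch, these contrib detection ops should surface in `mxnet.numpy_extension` with the prefix stripped. A hedged sketch of the intended call, with argument names assumed from the legacy `contrib.MultiBoxPrior` interface::

    import mxnet as mx
    from mxnet import np, npx

    npx.set_np()                       # npx operators require numpy semantics
    data = np.zeros((1, 3, 256, 256))
    anchors = npx.multibox_prior(data, sizes=[0.5], ratios=[1.0])
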
 } // namespace op
 } // namespace mxnet
diff --git a/src/operator/contrib/multibox_target.cc b/src/operator/contrib/multibox_target.cc
index a1808c5a7c81..feab3977f82c 100644
--- a/src/operator/contrib/multibox_target.cc
+++ b/src/operator/contrib/multibox_target.cc
@@ -307,5 +307,9 @@ MXNET_REGISTER_OP_PROPERTY(_contrib_MultiBoxTarget, MultiBoxTargetProp)
 .add_argument("label", "NDArray-or-Symbol", "Object detection labels.")
 .add_argument("cls_pred", "NDArray-or-Symbol", "Class predictions.")
 .add_arguments(MultiBoxTargetParam::__FIELDS__());
+
+NNVM_REGISTER_OP(_contrib_MultiBoxTarget)
+.add_alias("_npx_multibox_target");
+
 } // namespace op
 } // namespace mxnet
diff --git a/src/operator/image/crop.cc b/src/operator/image/crop.cc
index 52d2f11a464b..6067f89d7033 100644
--- a/src/operator/image/crop.cc
+++ b/src/operator/image/crop.cc
@@ -35,6 +35,7 @@ namespace image {
 DMLC_REGISTER_PARAMETER(CropParam);
 
 NNVM_REGISTER_OP(_image_crop)
+.add_alias("_npx__image_crop")
 .describe(R"code(Crop an image NDArray of shape (H x W x C) or (N x H x W x C)
to the given size.
Example:
diff --git a/src/operator/image/image_random.cc b/src/operator/image/image_random.cc
index 34f4cb4d395c..0c4603ecc475 100644
--- a/src/operator/image/image_random.cc
+++ b/src/operator/image/image_random.cc
@@ -39,6 +39,7 @@ DMLC_REGISTER_PARAMETER(RandomLightingParam);
 DMLC_REGISTER_PARAMETER(RandomColorJitterParam);
 
 NNVM_REGISTER_OP(_image_to_tensor)
+.add_alias("_npx__image_to_tensor")
 .describe(R"code(Converts an image NDArray of shape (H x W x C) or (N x H x W x C)
with values in the range [0, 255] to a tensor NDArray of shape (C x H x W) or (N x C x H x W)
with values in the range [0, 1]
@@ -102,6 +103,7 @@
 .add_argument("data", "NDArray-or-Symbol", "Input ndarray");
 
 NNVM_REGISTER_OP(_image_normalize)
+.add_alias("_npx__image_normalize")
 .describe(R"code(Normalize a tensor of shape (C x H x W) or (N x C x H x W) with mean and
 standard deviation.
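The double-underscore aliases (`_npx__image_*`) suggest these ops land in an `image` sub-namespace of `mxnet.numpy_extension`, matching the new `numpy_extension/image.py` modules added earlier in this patch. A sketch of the expected round trip, with the exact exposure assumed from the alias names::

    import mxnet as mx
    from mxnet import np, npx

    npx.set_np()
    img = np.ones((256, 256, 3))      # H x W x C image with values in [0, 255]
    chw = npx.image.to_tensor(img)    # C x H x W tensor with values in [0, 1]
    print(chw.shape)                  # (3, 256, 256)
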
@@ -189,28 +191,34 @@ NNVM_REGISTER_OP(_backward_image_normalize) .set_attr("FCompute", NormalizeOpBackward); MXNET_REGISTER_IMAGE_AUG_OP(_image_flip_left_right) +.add_alias("_npx__image_flip_left_right") .describe(R"code()code" ADD_FILELINE) .set_attr("FCompute", FlipLeftRight); MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_flip_left_right) +.add_alias("_npx__image_random_flip_left_right") .describe(R"code()code" ADD_FILELINE) .set_attr("FCompute", RandomFlipLeftRight); MXNET_REGISTER_IMAGE_AUG_OP(_image_flip_top_bottom) +.add_alias("_npx__image_flip_top_bottom") .describe(R"code()code" ADD_FILELINE) .set_attr("FCompute", FlipTopBottom); MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_flip_top_bottom) +.add_alias("_npx__image_random_flip_top_bottom") .describe(R"code()code" ADD_FILELINE) .set_attr("FCompute", RandomFlipTopBottom); MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_brightness) +.add_alias("_npx__image_random_brightness") .describe(R"code()code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", RandomBrightness) .add_arguments(RandomEnhanceParam::__FIELDS__()); MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_contrast) +.add_alias("_npx__image_random_contrast") .describe(R"code()code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", RandomContrast) @@ -218,6 +226,7 @@ MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_contrast) MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_saturation) +.add_alias("_npx__image_random_saturation") .describe(R"code()code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", RandomSaturation) @@ -225,6 +234,7 @@ MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_saturation) MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_hue) +.add_alias("_npx__image_random_hue") .describe(R"code()code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", RandomHue) @@ -232,6 +242,7 @@ MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_hue) MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_color_jitter) +.add_alias("_npx__image_random_color_jitter") .describe(R"code()code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", RandomColorJitter) @@ -239,6 +250,7 @@ MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_color_jitter) MXNET_REGISTER_IMAGE_AUG_OP(_image_adjust_lighting) +.add_alias("_npx__image_adjust_lighting") .describe(R"code(Adjust the lighting level of the input. Follow the AlexNet style.)code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", AdjustLighting) @@ -246,6 +258,7 @@ MXNET_REGISTER_IMAGE_AUG_OP(_image_adjust_lighting) MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_lighting) +.add_alias("_npx__image_random_lighting") .describe(R"code(Randomly add PCA noise. 
Follow the AlexNet style.)code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", RandomLighting) diff --git a/src/operator/image/resize.cc b/src/operator/image/resize.cc index d93769faa8b3..d2397ea72685 100644 --- a/src/operator/image/resize.cc +++ b/src/operator/image/resize.cc @@ -34,6 +34,7 @@ namespace image { DMLC_REGISTER_PARAMETER(ResizeParam); NNVM_REGISTER_OP(_image_resize) +.add_alias("_npx__image_resize") .describe(R"code(Resize an image NDArray of shape (H x W x C) or (N x H x W x C) to the given size Example: diff --git a/src/operator/leaky_relu.cc b/src/operator/leaky_relu.cc index 214e41a84611..c25833b799d0 100644 --- a/src/operator/leaky_relu.cc +++ b/src/operator/leaky_relu.cc @@ -71,6 +71,7 @@ The following modified ReLU Activation functions are supported: .add_arguments(LeakyReLUParam::__FIELDS__()); NNVM_REGISTER_OP(LeakyReLU) +.add_alias("_npx_leaky_relu") .set_attr("FSetInputVarAttrOnCompose", [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) { if (index == 1 && var->attrs.dict.find("__init__") == var->attrs.dict.end()) { diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc index 5b6cece4a92e..5abb6670c9b0 100644 --- a/src/operator/nn/activation.cc +++ b/src/operator/nn/activation.cc @@ -154,6 +154,7 @@ inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs, MXNET_OPERATOR_REGISTER_UNARY(Activation) +.add_alias("_npx_activation") .describe(R"code(Applies an activation function element-wise to the input. The following activation functions are supported: diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index 2564609c6b90..6382d46d272d 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -520,6 +520,7 @@ std::vector BatchNormGrad(const nnvm::NodePtr& n, } NNVM_REGISTER_OP(BatchNorm) +.add_alias("_npx_batch_norm") .describe(R"code(Batch normalization. Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc index 8fb229889332..80469b5385eb 100644 --- a/src/operator/nn/concat.cc +++ b/src/operator/nn/concat.cc @@ -32,9 +32,9 @@ namespace mxnet { namespace op { -static bool ConcatShape(const nnvm::NodeAttrs& attrs, - mxnet::ShapeVector *in_shape, - mxnet::ShapeVector *out_shape) { +bool ConcatShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_shape, + mxnet::ShapeVector *out_shape) { using namespace mshadow; const ConcatParam& param_ = nnvm::get(attrs.parsed); CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); @@ -138,9 +138,9 @@ static bool RNNParamConcatShape(const nnvm::NodeAttrs& attrs, return shape_is_known(dshape); } -static bool ConcatType(const nnvm::NodeAttrs& attrs, - std::vector *in_type, - std::vector *out_type) { +bool ConcatType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, + std::vector *out_type) { const ConcatParam& param_ = nnvm::get(attrs.parsed); int dtype = -1; @@ -403,6 +403,7 @@ NNVM_REGISTER_OP(_backward_Concat) // which handles the case where the first one or two inputs may have // unknown shape that can be inferred from output shape. 
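With aliases such as `_npx_leaky_relu`, `_npx_activation`, and `_npx_batch_norm` registered above, the coarse-grained neural-network ops become callable on numpy ndarrays through `mxnet.numpy_extension`. A minimal sketch, assuming the stripped-prefix exposure holds::

    import mxnet as mx
    from mxnet import np, npx

    npx.set_np()
    x = np.ones((2, 4))
    y = npx.activation(x, act_type='relu')   # dispatched via the _npx_activation alias
    print(y.shape)                           # (2, 4)
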
NNVM_REGISTER_OP(_rnn_param_concat) +.add_alias("_npi_rnn_param_concat") #if MXNET_USE_MKLDNN == 1 .set_attr("FResourceRequest", [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index 536e9a731171..32ed93e4a463 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -397,6 +397,7 @@ struct ConvolutionGrad { }; NNVM_REGISTER_OP(Convolution) +.add_alias("_npx_convolution") .describe(R"code(Compute *N*-D convolution on *(N+2)*-D input. In the 2-D convolution, given input data with shape *(batch_size, diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index 09b255d009e0..9f461f4e9de3 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -408,6 +408,7 @@ struct DeconvolutionGrad { DMLC_REGISTER_PARAMETER(DeconvolutionParam); NNVM_REGISTER_OP(Deconvolution) +.add_alias("_npx_deconvolution") .describe("Computes 1D or 2D transposed convolution (aka fractionally strided convolution) of the " "input tensor. This operation can be seen as the gradient of Convolution operation with " "respect to its input. Convolution usually reduces the size of the input. Transposed " diff --git a/src/operator/nn/dropout.cc b/src/operator/nn/dropout.cc index d2fe3a5651a4..c4ad4250853e 100644 --- a/src/operator/nn/dropout.cc +++ b/src/operator/nn/dropout.cc @@ -63,6 +63,7 @@ struct DropoutGrad { DMLC_REGISTER_PARAMETER(DropoutParam); NNVM_REGISTER_OP(Dropout) +.add_alias("_npx_dropout") .describe(R"(Applies dropout operation to input array. - During training, each element of the input is set to zero with probability p. diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index 27f6595aee9e..06ad6d034398 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -244,6 +244,7 @@ DMLC_REGISTER_PARAMETER(FullyConnectedParam); NNVM_REGISTER_OP(FullyConnected) MXNET_ADD_SPARSE_OP_ALIAS(FullyConnected) +.add_alias("_npx_fully_connected") .describe(R"code(Applies a linear transformation: :math:`Y = XW^T + b`. If ``flatten`` is set to be true, then the shapes are: diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc index e95f47255d7a..0b53d5091194 100644 --- a/src/operator/nn/layer_norm.cc +++ b/src/operator/nn/layer_norm.cc @@ -127,6 +127,7 @@ void LayerNormGradCompute(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(LayerNorm) +.add_alias("_npx_layer_norm") .describe(R"code(Layer normalization. Normalizes the channels of the input tensor by mean and variance, and applies a scale ``gamma`` as diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 870557756128..8a3e90da3e71 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -364,7 +364,8 @@ inline static bool BackwardPoolingStorageType(const nnvm::NodeAttrs &attrs, DMLC_REGISTER_PARAMETER(PoolingParam); NNVM_REGISTER_OP(Pooling) - .describe(R"code(Performs pooling on the input. +.add_alias("_npx_pooling") +.describe(R"code(Performs pooling on the input. The shapes for 1-D pooling are diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc index e44bbbb6b8f6..8f1b2e06c371 100644 --- a/src/operator/nn/softmax.cc +++ b/src/operator/nn/softmax.cc @@ -68,6 +68,7 @@ inline static bool SoftmaxStorageType(const nnvm::NodeAttrs& attrs, #endif NNVM_REGISTER_OP(softmax) +.add_alias("_npx_softmax") .describe(R"code(Applies the softmax function. 
The resulting array contains elements in the range (0,1) and the elements along the given axis sum up to 1. @@ -182,6 +183,7 @@ NNVM_REGISTER_OP(_backward_softmin) mxnet_op::softmax_bwd, true>); NNVM_REGISTER_OP(log_softmax) +.add_alias("_npx_log_softmax") .describe(R"code(Computes the log softmax of the input. This is equivalent to computing softmax followed by log. diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cc b/src/operator/numpy/np_elemwise_broadcast_op.cc new file mode 100644 index 000000000000..c36423dff9fd --- /dev/null +++ b/src/operator/numpy/np_elemwise_broadcast_op.cc @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_elemwise_binary_op.cc + * \brief CPU Implementation of basic functions for elementwise numpy binary broadcast operator. + */ + +#include "../tensor/elemwise_binary_broadcast_op.h" +#include "../tensor/elemwise_binary_scalar_op.h" + +namespace mxnet { +namespace op { + +bool NumpyBinaryScalarType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + return in_attrs->at(0) != -1; +} + +#define MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(name) \ + NNVM_REGISTER_OP(name) \ + .set_num_inputs(1) \ + .set_num_outputs(1) \ + .set_attr_parser([](NodeAttrs* attrs) { \ + attrs->parsed = std::stod(attrs->dict["scalar"]); \ + }) \ + .set_attr("FInferShape", ElemwiseShape<1, 1>) \ + .set_attr("FInferType", NumpyBinaryScalarType) \ + .set_attr("FInplaceOption", \ + [](const NodeAttrs& attrs){ \ + return std::vector >{{0, 0}}; \ + }) \ + .add_argument("data", "NDArray-or-Symbol", "source input") \ + .add_argument("scalar", "float", "scalar input") + + +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_add) +.describe(R"code(Add arguments element-wise with broadcasting if necessary. + +Example:: + + x = [[ 1., 1., 1.], + [ 1., 1., 1.]] + + y = [[ 0.], + [ 1.]] + + add(x, y) = [[ 1., 1., 1.], + [ 2., 2., 2.]] + +)code" ADD_FILELINE) +.set_attr("FCompute", BinaryBroadcastCompute) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_broadcast_add"}); + +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_subtract) +.describe(R"code(Subtract arguments element-wise with broadcasting if necessary. 
+
+Example::
+
+   x = [[ 1.,  1.,  1.],
+        [ 1.,  1.,  1.]]
+
+   y = [[ 0.],
+        [ 1.]]
+
+   subtract(x, y) = [[ 1.,  1.,  1.],
+                     [ 0.,  0.,  0.]]
+
+)code" ADD_FILELINE)
+.set_attr("FCompute", BinaryBroadcastCompute)
+.set_attr("FGradient", ElemwiseGradUseNone{"_backward_broadcast_sub"});
+
+MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_multiply)
+.describe(R"code(Multiply arguments with broadcasting if necessary.
+
+Example::
+
+   x = [[ 1.,  1.,  1.],
+        [ 1.,  1.,  1.]]
+
+   y = [[ 0.],
+        [ 1.]]
+
+   multiply(x, y) = [[ 0.,  0.,  0.],
+                     [ 1.,  1.,  1.]]
+
+)code" ADD_FILELINE)
+.set_attr("FCompute", BinaryBroadcastCompute)
+.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_mul"});
+
+MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_mod)
+.describe(R"code(Return element-wise remainder of division.
+It is equivalent to the Python modulus operator ``x1 % x2`` and has the same sign as the divisor x2.
+
+Example::
+
+   x = [[ 8.,  8.,  8.],
+        [ 8.,  8.,  8.]]
+
+   y = [[ 2.],
+        [ 3.]]
+
+   mod(x, y) = [[ 0.,  0.,  0.],
+                [ 2.,  2.,  2.]]
+
+)code" ADD_FILELINE)
+.set_attr("FCompute", BinaryBroadcastCompute)
+.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_mod"});
+
+MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_power)
+.describe(R"code(First array elements raised to powers from second array, element-wise.
+
+Raise each base in x1 to the positionally-corresponding power in x2. x1 and x2 must be
+broadcastable to the same shape.
+
+Example::
+
+   x = [[ 2.,  2.,  2.],
+        [ 2.,  2.,  2.]]
+
+   y = [[ 1.],
+        [ 2.]]
+
+   power(x, y) = [[ 2.,  2.,  2.],
+                  [ 4.,  4.,  4.]]
+
+)code" ADD_FILELINE)
+.set_attr("FCompute", BinaryBroadcastCompute)
+.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_power"});
+
+MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_add_scalar)
+.set_attr("FCompute", BinaryScalarOp::Compute)
+.set_attr("FGradient", ElemwiseGradUseNone{"_copy"});
+
+MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_subtract_scalar)
+.set_attr("FCompute", BinaryScalarOp::Compute)
+.set_attr("FGradient", ElemwiseGradUseNone{"_copy"});
+
+MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rsubtract_scalar)
+.set_attr("FCompute", BinaryScalarOp::Compute)
+.set_attr("FGradient", ElemwiseGradUseNone{"negative"});
+
+MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_multiply_scalar)
+.set_attr("FCompute", BinaryScalarOp::Compute)
+.set_attr("FGradient", ElemwiseGradUseNone{"_backward_mul_scalar"});
+
+MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_mod_scalar)
+.set_attr("FCompute", BinaryScalarOp::Compute)
+.set_attr("FGradient", ElemwiseGradUseIn{"_backward_mod_scalar"});
+
+MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rmod_scalar)
+.set_attr("FCompute", BinaryScalarOp::Compute)
+.set_attr("FGradient", ElemwiseGradUseIn{"_backward_rmod_scalar"});
+
+MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_power_scalar)
+.set_attr("FCompute", BinaryScalarOp::Compute)
+.set_attr("FGradient", ElemwiseGradUseIn{"_backward_power_scalar"});
+
+MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rpower_scalar)
+.set_attr("FCompute", BinaryScalarOp::Compute)
+.set_attr("FGradient", ElemwiseGradUseOut{"_backward_rpower_scalar"});
+
+} // namespace op
+} // namespace mxnet
diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cu b/src/operator/numpy/np_elemwise_broadcast_op.cu
new file mode 100644
index 000000000000..c858b3a4987a
--- /dev/null
+++ b/src/operator/numpy/np_elemwise_broadcast_op.cu
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_elemwise_broadcast_op.cu + * \brief GPU Implementation of basic functions for elementwise binary broadcast operator. + */ +#include "../tensor/elemwise_binary_broadcast_op.h" +#include "../tensor/elemwise_binary_scalar_op.h" + +namespace mxnet { +namespace op { +NNVM_REGISTER_OP(_npi_add) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_npi_subtract) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_npi_multiply) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_npi_mod) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_npi_power) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_npi_maximum) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_npi_minimum) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_npi_add_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_npi_subtract_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_npi_rsubtract_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_npi_multiply_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_npi_mod_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_npi_rmod_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_npi_power_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_npi_rpower_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_npi_maximum_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_npi_minimum_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_init_op.cc b/src/operator/numpy/np_init_op.cc new file mode 100644 index 000000000000..83a44c8ae280 --- /dev/null +++ b/src/operator/numpy/np_init_op.cc @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_init_op.cc + * \brief CPU Implementation of numpy init op + */ +#include "../tensor/init_op.h" +#include "../tensor/elemwise_unary_op.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_npi_zeros) +.describe("Return a new array of given shape, type, and context, filled with zeros.") +.set_num_inputs(0) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", InitShape) +.set_attr("FInferType", InitType) +.set_attr("FInferStorageType", InitStorageType) +.set_attr("FCompute", FillCompute) +.add_arguments(InitOpParam::__FIELDS__()); + +NNVM_REGISTER_OP(_npi_ones) +.describe("Return a new array of given shape, type, and context, filled with ones.") +.set_num_inputs(0) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", InitShape) +.set_attr("FInferType", InitType) +.set_attr("FCompute", FillCompute) +.add_arguments(InitOpParam::__FIELDS__()); + +NNVM_REGISTER_OP(_np_zeros_like) +.describe(R"code(Return an array of zeros with the same shape and type as a given array. + +Examples:: + + x = [[ 1., 1., 1.], + [ 1., 1., 1.]] + + zeros_like(x) = [[ 0., 0., 0.], + [ 0., 0., 0.]] + +)code") +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FIgnoreInputs", + [](const NodeAttrs& attrs) { + return std::vector(1, 0); + }) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"a"}; + }) +.set_attr("FCompute", FillCompute) +.set_attr("FGradient", MakeZeroGradNodes) +.add_argument("a", "NDArray-or-Symbol", + "The shape and data-type of a define these same attributes of the returned array."); + +NNVM_REGISTER_OP(_np_ones_like) +.describe(R"code(Return an array of ones with the same shape and type as a given array. + +Examples:: + + x = [[ 0., 0., 0.], + [ 0., 0., 0.]] + + ones_like(x) = [[ 1., 1., 1.], + [ 1., 1., 1.]] + +)code") +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FIgnoreInputs", + [](const NodeAttrs& attrs) { + return std::vector(1, 0); + }) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"a"}; + }) +.set_attr("FCompute", FillCompute) +.set_attr("FGradient", MakeZeroGradNodes) +.add_argument("a", "NDArray-or-Symbol", + "The shape and data-type of a define these same attributes of the returned array."); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_init_op.cu b/src/operator/numpy/np_init_op.cu new file mode 100644 index 000000000000..2eb8ed6d83b7 --- /dev/null +++ b/src/operator/numpy/np_init_op.cu @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_init_op.cu + * \brief GPU Implementation of numpy init op + */ + +#include "../tensor/init_op.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_npi_zeros) +.set_attr("FCompute", FillCompute); + +NNVM_REGISTER_OP(_npi_ones) +.set_attr("FCompute", FillCompute); + +NNVM_REGISTER_OP(_np_zeros_like) +.set_attr("FCompute", FillCompute); + +NNVM_REGISTER_OP(_np_ones_like) +.set_attr("FCompute", FillCompute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_true_divide.cc b/src/operator/numpy/np_true_divide.cc new file mode 100644 index 000000000000..429762778700 --- /dev/null +++ b/src/operator/numpy/np_true_divide.cc @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_true_divide.cc + * \brief CPU Implementation of true_divide operator. + */ +#include "../tensor/elemwise_binary_broadcast_op.h" +#include "../tensor/elemwise_binary_scalar_op.h" + +namespace mxnet { +namespace op { + +template +bool TrueDivideType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), static_cast(num_inputs)); + CHECK_EQ(out_attrs->size(), 1U); + for (const int dtype : *in_attrs) { + if (dtype == -1) return false; + } + if (num_inputs == 2) { + const int lhs_dtype = in_attrs->at(0); + const int rhs_dtype = in_attrs->at(1); + CHECK_EQ(lhs_dtype, rhs_dtype) + << "_true_divide currently only supports same dtype for dividend and divisor"; + } + auto is_float = [](const int dtype) { + return dtype == mshadow::kFloat32 || dtype == mshadow::kFloat64 || dtype == mshadow::kFloat16; + }; + + for (const int dtype : *in_attrs) { + CHECK(is_float(dtype)) << "_true_divide currently only supports float dtype"; + } + TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + return true; +} + +NNVM_REGISTER_OP(_npi_true_divide) +.describe(R"code( +Returns a true division of the inputs, element-wise. + +It currently only supports dtype float16, float32, and float64. 
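+
+.. note:: Integer inputs are currently rejected by the ``is_float`` check in
+ ``TrueDivideType`` above, instead of being promoted to a float dtype the way
+ ``numpy.true_divide`` promotes them.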
+ +Example:: + + x = [[ 6., 6., 6.], + [ 6., 6., 6.]] + + y = [[ 2.], + [ 3.]] + + _true_divide(x, y) = [[ 3., 3., 3.], + [ 2., 2., 2.]] + +)code" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"lhs", "rhs"}; + }) +.set_attr("FInferShape", BinaryBroadcastShape) +.set_attr("FInferType", TrueDivideType<2>) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}, {1, 0}}; + }) +.set_attr("FCompute", BinaryBroadcastCompute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_div"}) +.add_argument("lhs", "NDArray-or-Symbol", "Dividend array") +.add_argument("rhs", "NDArray-or-Symbol", "Divisor array"); + +NNVM_REGISTER_OP(_npi_true_divide_scalar) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser([](NodeAttrs* attrs) { + attrs->parsed = std::stod(attrs->dict["scalar"]); + }) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr("FInferType", TrueDivideType<1>) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 0}}; + }) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_div_scalar"}) +.add_argument("data", "NDArray-or-Symbol", "source input") +.add_argument("scalar", "float", "scalar input"); + +NNVM_REGISTER_OP(_npi_rtrue_divide_scalar) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser([](NodeAttrs* attrs) { + attrs->parsed = std::stod(attrs->dict["scalar"]); + }) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr("FInferType", TrueDivideType<1>) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 0}}; + }) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_rdiv_scalar"}) +.add_argument("data", "NDArray-or-Symbol", "source input") +.add_argument("scalar", "float", "scalar input"); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_true_divide.cu b/src/operator/numpy/np_true_divide.cu new file mode 100644 index 000000000000..be10c44f92a1 --- /dev/null +++ b/src/operator/numpy/np_true_divide.cu @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_true_divide.cu + * \brief GPU Implementation of true_divide operator. 
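+ * \note Registration-only translation unit: it binds the same broadcast and
+ * scalar kernels declared in the headers included below, instantiated for the GPU.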
+ */ +#include "../tensor/elemwise_binary_broadcast_op.h" +#include "../tensor/elemwise_binary_scalar_op.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_npi_true_divide) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_npi_true_divide_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_npi_rtrue_divide_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/quantization/quantized_concat.cc b/src/operator/quantization/quantized_concat.cc index f7a810b1e404..5835701497d9 100644 --- a/src/operator/quantization/quantized_concat.cc +++ b/src/operator/quantization/quantized_concat.cc @@ -28,8 +28,8 @@ namespace mxnet { namespace op { -static bool ConcatShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_shape, - mxnet::ShapeVector* out_shape) { +static bool QuantizedConcatShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_shape, + mxnet::ShapeVector* out_shape) { const ConcatParam& param_ = nnvm::get(attrs.parsed); CHECK_EQ(in_shape->size(), static_cast(param_.num_args * 3)); CHECK_EQ(out_shape->size(), 3U); @@ -74,8 +74,8 @@ static bool ConcatShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_sha return shape_is_known(dshape); } -static bool ConcatType(const nnvm::NodeAttrs& attrs, std::vector* in_type, - std::vector* out_type) { +static bool QuantizedConcatType(const nnvm::NodeAttrs& attrs, std::vector* in_type, + std::vector* out_type) { const ConcatParam& param_ = nnvm::get(attrs.parsed); CHECK_EQ(in_type->size(), static_cast(param_.num_args * 3)); CHECK_EQ(out_type->size(), 3U); @@ -130,8 +130,8 @@ If any input holds int8, then the output will be int8. Otherwise output will be // TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, // will be reverted after the improvement of CachedOP is done. .set_attr("FGradient", MakeZeroGradNodes) -.set_attr("FInferType", ConcatType) -.set_attr("FInferShape", ConcatShape) +.set_attr("FInferType", QuantizedConcatType) +.set_attr("FInferShape", QuantizedConcatShape) .set_attr("key_var_num_args", "num_args") .add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate") .add_arguments(ConcatParam::__FIELDS__()); diff --git a/src/operator/random/sample_op.cc b/src/operator/random/sample_op.cc index 56a162be5da4..543146257ddf 100644 --- a/src/operator/random/sample_op.cc +++ b/src/operator/random/sample_op.cc @@ -81,6 +81,7 @@ DMLC_REGISTER_PARAMETER(SampleGenNegBinomialLikeParam); MXNET_OPERATOR_REGISTER_SAMPLE(_random_uniform, SampleUniformParam) .add_alias("uniform") .add_alias("random_uniform") +.add_alias("_npi_random_uniform") .describe(R"code(Draw random samples from a uniform distribution. .. note:: The existing alias ``uniform`` is deprecated. @@ -99,6 +100,7 @@ Example:: MXNET_OPERATOR_REGISTER_SAMPLE(_random_normal, SampleNormalParam) .add_alias("normal") .add_alias("random_normal") +.add_alias("_npi_random_normal") .describe(R"code(Draw random samples from a normal (Gaussian) distribution. .. note:: The existing alias ``normal`` is deprecated. diff --git a/src/operator/random/shuffle_op.cc b/src/operator/random/shuffle_op.cc index 70315716dea2..86797c136bab 100644 --- a/src/operator/random/shuffle_op.cc +++ b/src/operator/random/shuffle_op.cc @@ -122,6 +122,7 @@ void ShuffleForwardCPU(const nnvm::NodeAttrs& attrs, NNVM_REGISTER_OP(_shuffle) .add_alias("shuffle") +.add_alias("_np__random_shuffle") .describe(R"code(Randomly shuffle the elements. 
This shuffles the array along the first axis. diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 6a0dbd7a4e23..244e39335a91 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -634,6 +634,7 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, #endif NNVM_REGISTER_OP(RNN) +.add_alias("_npx_rnn") .describe(R"code(Applies recurrent layers to input data. Currently, vanilla RNN, LSTM and GRU are implemented, with both multi-layer and bidirectional support. diff --git a/src/operator/roi_pooling.cc b/src/operator/roi_pooling.cc index 8862d0db1401..c72b203292fe 100644 --- a/src/operator/roi_pooling.cc +++ b/src/operator/roi_pooling.cc @@ -300,5 +300,9 @@ Example:: "corners of designated region of interest. `batch_index` indicates the index of corresponding " "image in the input array") .add_arguments(ROIPoolingParam::__FIELDS__()); + +NNVM_REGISTER_OP(ROIPooling) +.add_alias("_npx_roi_pooling"); + } // namespace op } // namespace mxnet diff --git a/src/operator/sequence_mask.cc b/src/operator/sequence_mask.cc index f4f81a801e70..d7731026ce21 100644 --- a/src/operator/sequence_mask.cc +++ b/src/operator/sequence_mask.cc @@ -191,5 +191,8 @@ Example:: "vector of sequence lengths of the form [batch_size]") .add_arguments(SequenceMaskParam::__FIELDS__()); +NNVM_REGISTER_OP(SequenceMask) +.add_alias("_npx_sequence_mask"); + } // namespace op } // namespace mxnet diff --git a/src/operator/swapaxis-inl.h b/src/operator/swapaxis-inl.h index b17a81f75bc6..fd9872db6ec8 100644 --- a/src/operator/swapaxis-inl.h +++ b/src/operator/swapaxis-inl.h @@ -47,7 +47,7 @@ enum SwapAxisOpOutputs {kOut}; struct SwapAxisParam : public dmlc::Parameter { // use int for enumeration - uint32_t dim1, dim2; + int dim1, dim2; DMLC_DECLARE_PARAMETER(SwapAxisParam) { DMLC_DECLARE_FIELD(dim1) .set_default(0) @@ -106,8 +106,6 @@ class SwapAxisOp : public Operator { const std::vector &req) { using namespace mshadow; using namespace mshadow::expr; - int dim1 = param_.dim1; - int dim2 = param_.dim2; TBlob data_in = in_data[swapaxisenum::kData]; TBlob data_out = out_data[swapaxisenum::kData]; @@ -115,10 +113,27 @@ class SwapAxisOp : public Operator { mxnet::TShape shape_in = data_in.shape_; mxnet::TShape shape_out = data_out.shape_; + int axis1 = param_.dim1; + if (axis1 < 0) { + axis1 += shape_in.ndim(); + } + CHECK(axis1 >= 0 && axis1 < shape_in.ndim()) + << "axis1: axis " << param_.dim1 << " is out of bounds for array of ndim " + << shape_in.ndim(); + + int axis2 = param_.dim2; + if (axis2 < 0) { + axis2 += shape_in.ndim(); + } + CHECK(axis2 >= 0 && axis2 < shape_in.ndim()) + << "axis2: axis " << param_.dim2 << " is out of bounds for array of ndim " + << shape_in.ndim(); + + if (shape_in.Size() == 0U) return; Shape<5> inter_shape; - Reshape2Five(&inter_shape, shape_in, dim1, dim2); + Reshape2Five(&inter_shape, shape_in, axis1, axis2); Tensor inter_data_in = data_in.get_with_shape(inter_shape, s); @@ -187,13 +202,28 @@ class SwapAxisProp : public OperatorProperty { CHECK_EQ(in_shape->size(), 1U); mxnet::TShape &shape0 = (*in_shape)[swapaxisenum::kData]; + if (!ndim_is_known(shape0)) return false; + int axis1 = param_.dim1; + if (axis1 < 0) { + axis1 += shape0.ndim(); + } + CHECK(axis1 >= 0 && axis1 < shape0.ndim()) + << "axis1: axis " << param_.dim1 << " is out of bounds for array of ndim " << shape0.ndim(); + + int axis2 = param_.dim2; + if (axis2 < 0) { + axis2 += shape0.ndim(); + } + CHECK(axis2 >= 0 && axis2 < shape0.ndim()) + << "axis2: axis " << param_.dim2 << " is out of bounds for array of ndim 
" << shape0.ndim(); + out_shape->clear(); out_shape->push_back(shape0); mxnet::TShape &shape1 = (*out_shape)[swapaxisenum::kOut]; - std::swap(shape1[param_.dim1], shape1[param_.dim2]); + std::swap(shape1[axis1], shape1[axis2]); - return true; + return shape_is_known(*out_shape); } bool InferType(std::vector *in_type, diff --git a/src/operator/swapaxis.cc b/src/operator/swapaxis.cc index 45bcca4db9ae..32b26cc14f0c 100644 --- a/src/operator/swapaxis.cc +++ b/src/operator/swapaxis.cc @@ -69,6 +69,6 @@ Examples:: [ 3, 7]]] )code" ADD_FILELINE); -NNVM_REGISTER_OP(SwapAxis).add_alias("swapaxes"); +NNVM_REGISTER_OP(SwapAxis).add_alias("swapaxes").add_alias("_npi_swapaxes"); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/broadcast_reduce_op.h b/src/operator/tensor/broadcast_reduce_op.h index c7c49937730c..27edf0195a1c 100644 --- a/src/operator/tensor/broadcast_reduce_op.h +++ b/src/operator/tensor/broadcast_reduce_op.h @@ -168,15 +168,27 @@ struct BroadcastLikeParam : public dmlc::Parameter { } }; -inline int CheckAxis(int axis, int ndim) { - CHECK(axis < ndim && axis >= -ndim) - << "axis " << axis << " exceeds the input dimension of " << ndim; - return (axis + ndim)%ndim; +/* + * Check whether the axis is within the legal range. + */ +inline int CheckAxis(const int axis, const int ndim) { + if (ndim == 0) { + CHECK(axis == 0 || axis == -1) << "axis " << axis + << " is out of bounds for 0-dimension tensor"; + return 0; + } else { + CHECK(axis < ndim && axis >= -ndim) + << "axis " << axis << " exceeds the input dimension of " << ndim; + return (axis + ndim) % ndim; + } } inline mxnet::TShape AxisShapeCompact(mxnet::TShape shape, int *axis, bool allow_2d) { int ndim = shape.ndim(); - index_t leading = 1, trailing = 1, M = shape[*axis]; + index_t leading = 1, trailing = 1, M = 1; + if (shape.ndim() > *axis) { + M = shape[*axis]; + } for (int i = 0; i < *axis; ++i) leading *= shape[i]; for (int i = *axis + 1; i < ndim; ++i) trailing *= shape[i]; if (allow_2d && trailing == 1) { @@ -553,14 +565,37 @@ void SearchAxisCompute(const nnvm::NodeAttrs& attrs, using namespace mshadow::expr; const ReduceAxisParam& param = nnvm::get(attrs.parsed); Stream *s = ctx.get_stream(); - if (!param.axis) LOG(FATAL) << "Global reduction not supported yet"; + int axis = inputs[0].ndim(); + TBlob input = inputs[0]; + if (param.axis.has_value()) { + axis = param.axis.value(); + } else { + // If global reduction, reshape the input tensor into 2D shape (1, inputs[0].shape_.Size()) + // and search on axis = 1. 
+ mxnet::TShape shape_2d(2, 1); + shape_2d[1] = input.shape_.Size(); + input = TBlob(input.dptr_, shape_2d, input.dev_mask(), input.type_flag_, input.dev_id()); + axis = 1; + } - int axis = CheckAxis(param.axis.value(), inputs[0].shape_.ndim()); - mxnet::TShape shape = AxisShapeCompact(inputs[0].shape_, &axis, false); + axis = CheckAxis(axis, input.shape_.ndim()); + if (inputs[0].shape_.ndim() != 0) { + if (param.axis.has_value()) { + // cannot do argmax in an empty dimension + CHECK_NE(inputs[0].shape_[axis], 0) + << "searching input tensor of shape " << inputs[0].shape_ + << " along axis = " << axis << " of zero dim-size is not allowed"; + } else { + // cannot do argmax on an empty array + CHECK_NE(inputs[0].shape_.Size(), 0U) << "attempt to search an empty sequence"; + } + } + if (input.shape_.Size() == 0U) return; // zero-size tensor + mxnet::TShape shape = AxisShapeCompact(input.shape_, &axis, false); MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { Tensor out = outputs[0].get_with_shape( Shape2(shape[0], shape[2]), s); - Tensor in = inputs[0].get_with_shape( + Tensor in = input.get_with_shape( shape.get<3>(), s); CHECK(req[0] != kAddTo) << "AddTo is not supported"; ASSIGN_DISPATCH(out, req[0], (reduce_with_axis(in, 1))); @@ -865,8 +900,8 @@ struct reduce_axes_backward_broadcast { OType *out, DType *igrad, OType *ograd, - mshadow::Shape<5> in_shape, - mshadow::Shape<5> out_shape, + mshadow::Shape in_shape, + mshadow::Shape out_shape, const uint32_t ndim) { size_t in_stride = 1; size_t out_stride = 1; @@ -902,9 +937,9 @@ void ReduceAxesBackwardUseInOutImpl(const OpContext& ctx, MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, OType, { - mshadow::Shape<5> in_shape; - mshadow::Shape<5> out_shape; - for (int i = 0; i < 5; ++i) { + mshadow::Shape in_shape; + mshadow::Shape out_shape; + for (int i = 0; i < MXNET_SPECIAL_MAX_NDIM; ++i) { if (i < dst_shape.ndim()) { in_shape[i] = src_shape[i]; out_shape[i] = dst_shape[i]; @@ -914,36 +949,36 @@ void ReduceAxesBackwardUseInOutImpl(const OpContext& ctx, } } if (dst_shape.ndim() == 2) { - Tensor igrad = - outputs[0].get_with_shape(src_shape.get<2>(), s); - Tensor ograd = - inputs[0].get_with_shape(dst_shape.get<2>(), s); - Tensor data = - inputs[1].get_with_shape(src_shape.get<2>(), s); - Tensor out = - inputs[2].get_with_shape(dst_shape.get<2>(), s); + Tensor igrad = + outputs[0].get_with_shape(src_shape.get<2>(), s); + Tensor ograd = + inputs[0].get_with_shape(dst_shape.get<2>(), s); + Tensor data = + inputs[1].get_with_shape(src_shape.get<2>(), s); + Tensor out = + inputs[2].get_with_shape(dst_shape.get<2>(), s); MXNET_REQ_TYPE_SWITCH(req[0], Req, { Kernel, xpu>::Launch( s, outputs[0].shape_.Size(), data.dptr_, out.dptr_, igrad.dptr_, ograd.dptr_, in_shape, out_shape, src_shape.ndim()); }); - if (normalize) igrad /= scalar(src_shape.Size()/dst_shape.Size()); + if (normalize) igrad /= scalar(src_shape.Size()/dst_shape.Size()); } else { const int ndim = MXNET_SPECIAL_MAX_NDIM; - Tensor igrad = - outputs[0].get_with_shape(src_shape.get(), s); - Tensor ograd = - inputs[0].get_with_shape(dst_shape.get(), s); - Tensor data = - inputs[1].get_with_shape(src_shape.get(), s); - Tensor out = - inputs[2].get_with_shape(dst_shape.get(), s); + Tensor igrad = + outputs[0].get_with_shape(src_shape.get(), s); + Tensor ograd = + inputs[0].get_with_shape(dst_shape.get(), s); + Tensor data = + inputs[1].get_with_shape(src_shape.get(), s); + Tensor out = + inputs[2].get_with_shape(dst_shape.get(), s); 
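+ // reduce_axes_backward_broadcast (launched below) re-expands ograd and out from the
+ // reduced dst_shape over the full src_shape; its shape scratch arrays now use
+ // MXNET_SPECIAL_MAX_NDIM instead of the previous hard-coded five dimensions.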
MXNET_REQ_TYPE_SWITCH(req[0], Req, { Kernel, xpu>::Launch( s, outputs[0].shape_.Size(), data.dptr_, out.dptr_, igrad.dptr_, ograd.dptr_, in_shape, out_shape, src_shape.ndim()); }); - if (normalize) igrad /= scalar(src_shape.Size()/dst_shape.Size()); + if (normalize) igrad /= scalar(src_shape.Size()/dst_shape.Size()); } }); }); @@ -968,6 +1003,34 @@ void ReduceAxesBackwardUseInOut(const nnvm::NodeAttrs& attrs, ReduceAxesBackwardUseInOutImpl(ctx, small, inputs, req, outputs); } +template +struct broadcast_kernel { + template + MSHADOW_XINLINE static void Map(index_t i, + IType *input, + OType *output, + mshadow::Shape in_shape, + mshadow::Shape out_shape, + const OpReqType req, + const uint32_t ndim) { + size_t in_stride = 1; + size_t out_stride = 1; + index_t idx = i; + index_t in_idx = i; + for (int iter = ndim - 1; iter >= 0; --iter) { + size_t dim_idx = idx % out_shape[iter]; + in_idx -= dim_idx * out_stride; + if (in_shape[iter] != 1) { + in_idx += dim_idx * in_stride; + } + idx /= out_shape[iter]; + in_stride *= in_shape[iter]; + out_stride *= out_shape[iter]; + } + KERNEL_ASSIGN(output[i], req, OP::Map(input[in_idx])); + } +}; + template inline void BroadcastComputeImpl(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -977,24 +1040,40 @@ inline void BroadcastComputeImpl(const nnvm::NodeAttrs& attrs, const mxnet::TShape& small) { using namespace mshadow; using namespace mshadow::expr; + using namespace mxnet_op; mxnet::TShape src_shape, dst_shape; BroadcastReduceShapeCompact(outputs[0].shape_, small, &dst_shape, &src_shape); Stream *s = ctx.get_stream(); - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - if (dst_shape.ndim() == 2) { - Tensor out = - outputs[0].get_with_shape(dst_shape.get<2>(), s); - Tensor data = - inputs[0].get_with_shape(src_shape.get<2>(), s); - ASSIGN_DISPATCH(out, req[0], broadcast_to(data, dst_shape)); - } else { - const int ndim = MXNET_SPECIAL_MAX_NDIM; - Tensor out = - outputs[0].get_with_shape(dst_shape.get(), s); - Tensor data = - inputs[0].get_with_shape(src_shape.get(), s); - ASSIGN_DISPATCH(out, req[0], broadcast_to(data, dst_shape)); - } + MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, IType, { + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, OType, { + mshadow::Shape in_shape; + mshadow::Shape out_shape; + for (int i = 0; i < MXNET_SPECIAL_MAX_NDIM; ++i) { + if (i < dst_shape.ndim()) { + in_shape[i] = src_shape[i]; + out_shape[i] = dst_shape[i]; + } else { + in_shape[i] = 1; + out_shape[i] = 1; + } + } + if (dst_shape.ndim() == 2) { + Tensor out = + outputs[0].get_with_shape(dst_shape.get<2>(), s); + Tensor data = + inputs[0].get_with_shape(src_shape.get<2>(), s); + Kernel, xpu>::Launch( + s, out.shape_.Size(), data.dptr_, out.dptr_, in_shape, out_shape, req[0], 2); + } else { + const int ndim = MXNET_SPECIAL_MAX_NDIM; + Tensor out = + outputs[0].get_with_shape(dst_shape.get(), s); + Tensor data = + inputs[0].get_with_shape(src_shape.get(), s); + Kernel, xpu>::Launch( + s, out.shape_.Size(), data.dptr_, out.dptr_, in_shape, out_shape, req[0], ndim); + } + }); }); } @@ -1215,8 +1294,8 @@ struct norm_backward_broadcast { DType *igrad, OType *ograd, DType *data, - mshadow::Shape<5> in_shape, - mshadow::Shape<5> out_shape, + mshadow::Shape in_shape, + mshadow::Shape out_shape, const uint32_t ndim) { size_t in_stride = 1; size_t out_stride = 1; @@ -1258,9 +1337,9 @@ void LpNormGradCompute(const nnvm::NodeAttrs& attrs, mxnet::TShape src_shape, dst_shape; BroadcastReduceShapeCompact(outputs[0].shape_, small, &src_shape, &dst_shape); Stream *s = 
ctx.get_stream(); - mshadow::Shape<5> in_shape; - mshadow::Shape<5> out_shape; - for (int i = 0; i < 5; ++i) { + mshadow::Shape in_shape; + mshadow::Shape out_shape; + for (int i = 0; i < MXNET_SPECIAL_MAX_NDIM; ++i) { if (i < dst_shape.ndim()) { in_shape[i] = src_shape[i]; out_shape[i] = dst_shape[i]; diff --git a/src/operator/tensor/broadcast_reduce_op_index.cc b/src/operator/tensor/broadcast_reduce_op_index.cc index 56af3887c763..52082f759e7a 100644 --- a/src/operator/tensor/broadcast_reduce_op_index.cc +++ b/src/operator/tensor/broadcast_reduce_op_index.cc @@ -110,6 +110,7 @@ Examples:: NNVM_REGISTER_OP(pick) .add_alias("choose_element_0index") +.add_alias("_npx_pick") .describe(R"code(Picks elements from an input array according to the input indices along the given axis. Given an input array of shape ``(d0, d1)`` and indices of shape ``(i0,)``, the result will be diff --git a/src/operator/tensor/dot.cc b/src/operator/tensor/dot.cc index 7d7b6c06c846..11a056146e1d 100644 --- a/src/operator/tensor/dot.cc +++ b/src/operator/tensor/dot.cc @@ -111,6 +111,7 @@ NNVM_REGISTER_OP(_backward_dot) .add_arguments(DotParam::__FIELDS__()); NNVM_REGISTER_OP(batch_dot) +.add_alias("_npx_batch_dot") .describe(R"doc(Batchwise dot product. ``batch_dot`` is used to compute dot product of ``x`` and ``y`` when ``x`` and diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index f84767dd4b2f..8a81bbc1c475 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -292,6 +292,7 @@ void BinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { + if (outputs[0].shape_.Size() == 0U) return; mxnet::TShape new_lshape, new_rshape, new_oshape; int ndim = BinaryBroadcastShapeCompact(inputs[0].shape_, inputs[1].shape_, outputs[0].shape_, &new_lshape, &new_rshape, &new_oshape); diff --git a/src/operator/tensor/elemwise_binary_broadcast_op_logic.cc b/src/operator/tensor/elemwise_binary_broadcast_op_logic.cc index cd433e00a770..e3c2e0e898d9 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op_logic.cc +++ b/src/operator/tensor/elemwise_binary_broadcast_op_logic.cc @@ -30,6 +30,7 @@ namespace mxnet { namespace op { MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(broadcast_equal) +.add_alias("_npi_equal") .describe(R"code(Returns the result of element-wise **equal to** (==) comparison operation with broadcasting. Example:: @@ -48,6 +49,7 @@ Example:: .set_attr("FGradient", MakeZeroGradNodes); MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(broadcast_not_equal) +.add_alias("_npi_not_equal") .describe(R"code(Returns the result of element-wise **not equal to** (!=) comparison operation with broadcasting. Example:: @@ -66,6 +68,7 @@ Example:: .set_attr("FGradient", MakeZeroGradNodes); MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(broadcast_greater) +.add_alias("_npi_greater") .describe(R"code(Returns the result of element-wise **greater than** (>) comparison operation with broadcasting. Example:: @@ -84,6 +87,7 @@ Example:: .set_attr("FGradient", MakeZeroGradNodes); MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(broadcast_greater_equal) +.add_alias("_npi_greater_equal") .describe(R"code(Returns the result of element-wise **greater than or equal to** (>=) comparison operation with broadcasting. 
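An aside on the new broadcast_kernel added to broadcast_reduce_op.h above: it maps each output element index back to a source element by peeling coordinates off the last axis first, and axes of size 1 in the input contribute nothing to the source offset. The same walk in plain Python (an illustrative sketch only; the real kernel runs on shapes padded to MXNET_SPECIAL_MAX_NDIM)::

    def broadcast_src_index(i, in_shape, out_shape):
        # walk dims last-to-first, peeling output coordinates off i
        in_stride, out_stride, in_idx, idx = 1, 1, i, i
        for d in range(len(out_shape) - 1, -1, -1):
            dim_idx = idx % out_shape[d]
            in_idx -= dim_idx * out_stride   # drop the output-stride term
            if in_shape[d] != 1:             # broadcast axes stay at element 0
                in_idx += dim_idx * in_stride
            idx //= out_shape[d]
            in_stride *= in_shape[d]
            out_stride *= out_shape[d]
        return in_idx

    # input of shape (2, 1) broadcast to (2, 3): output element 4 reads input element 1
    assert broadcast_src_index(4, (2, 1), (2, 3)) == 1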
Example:: @@ -102,6 +106,7 @@ Example:: .set_attr("FGradient", MakeZeroGradNodes); MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(broadcast_lesser) +.add_alias("_npi_less") .describe(R"code(Returns the result of element-wise **lesser than** (<) comparison operation with broadcasting. Example:: @@ -120,6 +125,7 @@ Example:: .set_attr("FGradient", MakeZeroGradNodes); MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(broadcast_lesser_equal) +.add_alias("_npi_less_equal") .describe(R"code(Returns the result of element-wise **lesser than or equal to** (<=) comparison operation with broadcasting. Example:: diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h index 2fe3fd9919cf..9c1d8b17fdea 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -487,9 +487,11 @@ class ElemwiseBinaryOp : public OpBase { MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { const size_t size = (minthree(outputs[0].Size(), inputs[0].Size(), inputs[1].Size()) + DataType::kLanes - 1) / DataType::kLanes; - Kernel, xpu>::Launch(s, size, - outputs[0].dptr(), - inputs[0].dptr(), inputs[1].dptr()); + if (size != 0) { + Kernel, xpu>::Launch(s, size, + outputs[0].dptr(), + inputs[0].dptr(), inputs[1].dptr()); + } }); }); } @@ -510,9 +512,11 @@ class ElemwiseBinaryOp : public OpBase { MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, { const size_t size = (minthree(outputs[0].Size(), inputs[0].Size(), inputs[1].Size()) + DataType::kLanes - 1) / DataType::kLanes; - Kernel, xpu>::Launch(s, size, - outputs[0].dptr(), - inputs[0].dptr(), inputs[1].dptr()); + if (size != 0) { + Kernel, xpu>::Launch(s, size, + outputs[0].dptr(), + inputs[0].dptr(), inputs[1].dptr()); + } }); }); } diff --git a/src/operator/tensor/elemwise_binary_scalar_op_extended.cc b/src/operator/tensor/elemwise_binary_scalar_op_extended.cc index f027665a549b..3a687c2aa062 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_extended.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op_extended.cc @@ -84,7 +84,8 @@ MXNET_OPERATOR_REGISTER_BINARY(_backward_hypot_scalar) cpu, mshadow_op::hypot_grad_left>); NNVM_REGISTER_OP(smooth_l1) - .describe(R"code(Calculate Smooth L1 Loss(lhs, scalar) by summing +.add_alias("_npx_smooth_l1") +.describe(R"code(Calculate Smooth L1 Loss(lhs, scalar) by summing .. 
math:: diff --git a/src/operator/tensor/elemwise_binary_scalar_op_logic.cc b/src/operator/tensor/elemwise_binary_scalar_op_logic.cc index 17e76153ebb2..87ba394c99b2 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_logic.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op_logic.cc @@ -71,26 +71,32 @@ static bool BinaryScalarLogicStorageType(const nnvm::NodeAttrs& attrs, MXNET_OPERATOR_REGISTER_BINARY_SCALAR_LOGIC(_equal_scalar, mshadow_op::eq) +.add_alias("_npi_equal_scalar") .set_attr("FGradient", MakeZeroGradNodes) .add_alias("_EqualScalar"); MXNET_OPERATOR_REGISTER_BINARY_SCALAR_LOGIC(_not_equal_scalar, mshadow_op::ne) +.add_alias("_npi_not_equal_scalar") .set_attr("FGradient", MakeZeroGradNodes) .add_alias("_NotEqualScalar"); MXNET_OPERATOR_REGISTER_BINARY_SCALAR_LOGIC(_greater_scalar, mshadow_op::gt) +.add_alias("_npi_greater_scalar") .set_attr("FGradient", MakeZeroGradNodes) .add_alias("_GreaterScalar"); MXNET_OPERATOR_REGISTER_BINARY_SCALAR_LOGIC(_greater_equal_scalar, mshadow_op::ge) +.add_alias("_npi_greater_equal_scalar") .set_attr("FGradient", MakeZeroGradNodes) .add_alias("_GreaterEqualScalar"); MXNET_OPERATOR_REGISTER_BINARY_SCALAR_LOGIC(_lesser_scalar, mshadow_op::lt) +.add_alias("_npi_less_scalar") .set_attr("FGradient", MakeZeroGradNodes) .add_alias("_LesserScalar"); MXNET_OPERATOR_REGISTER_BINARY_SCALAR_LOGIC(_lesser_equal_scalar, mshadow_op::le) +.add_alias("_npi_less_equal_scalar") .set_attr("FGradient", MakeZeroGradNodes) .add_alias("_LesserEqualScalar"); diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 458106e02671..87964ac246f0 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -243,8 +243,10 @@ class UnaryOp : public OpBase { mshadow::Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - mxnet_op::Kernel, xpu>::Launch( - s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr()); + if (inputs[0].Size() != 0) { + mxnet_op::Kernel, xpu>::Launch( + s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr()); + } }); }); } diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index f2b8dd6b1314..729c9f5ce6e5 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ -483,6 +483,7 @@ Negative indices are supported, and `None` can be used for either `lhs_end` or ` - lhs shape = (30, 12), rhs shape = (4, 2, 2, 3), lhs_begin=-1, lhs_end=None, rhs_begin=1, rhs_end=None, output shape = (30, 2, 2, 3) )code" ADD_FILELINE) +.add_alias("_npx_reshape_like") .set_num_inputs(2) .set_attr_parser(ParamParser) .set_attr("FListInputNames", @@ -621,6 +622,7 @@ Example:: DMLC_REGISTER_PARAMETER(CastParam); NNVM_REGISTER_OP(Cast) .add_alias("cast") +.add_alias("_npx_cast") .describe(R"code(Casts all elements of the input to a new type. .. note:: ``Cast`` is deprecated. Use ``cast`` instead. 
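A note on the alias additions running through these hunks: ``_npi_*`` aliases surface an operator in the new ``mx.np`` namespace and ``_npx_*`` aliases in ``mx.npx``, both reusing the legacy kernels unchanged. A hypothetical session under the frontends added by this PR (the generated keyword signature of ``npx.cast`` is an assumption)::

    from mxnet import np, npx

    x = np.ones((2, 3))               # float32 by default
    y = npx.cast(x, dtype='float16')  # routed to the legacy Cast kernel via _npx_cast
    print(y.dtype)                    # float16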
@@ -1195,6 +1197,7 @@ MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_expm1, unary_bwd); diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(ret.type_flag_, DType, { switch (axes.ndim()) { - case 0: + case 0: { + Tensor in = src.get_with_shape(mshadow::Shape1(1), s); + Tensor out = ret.get_with_shape(mshadow::Shape1(1), s); + Copy(out, in, s); break; + } case 1: { Tensor in = src.get(s); Tensor out = ret.get(s); @@ -1781,9 +1787,6 @@ inline bool TileOpShape(const nnvm::NodeAttrs& attrs, SHAPE_ASSIGN_CHECK(*out_attrs, 0, ishape); return true; } - for (int i = 0; i < reps.ndim(); ++i) { - CHECK_GT(reps[i], 0) << "invalid reps=" << i << ", dim size must be greater than zero"; - } mxnet::TShape oshape(std::max(ishape.ndim(), reps.ndim()), -1); int i1 = ishape.ndim() - 1; int i2 = reps.ndim() - 1; @@ -1796,6 +1799,11 @@ inline bool TileOpShape(const nnvm::NodeAttrs& attrs, oshape[i] = reps[i2--]; } } + // If reps contains 0s, oshape is a zero-size shape. + // Need to distinguish between np_shape mode and legacy mode. + if (!Imperative::Get()->is_np_shape()) { + common::ConvertToNumpyShape(&oshape); + } SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); return shape_is_known(oshape); } @@ -1814,7 +1822,7 @@ inline bool TileOpType(const nnvm::NodeAttrs& attrs, /*! * \brief Reshape the input and output tensors for - * using broadcast_to to achieve the funcitonality + * using broadcast_to to achieve the functionality * of operator tile. * \return a pair of mxnet::TShape's, first is the reshaped * input shape, second is the reshaped output shape. @@ -1822,7 +1830,7 @@ inline bool TileOpType(const nnvm::NodeAttrs& attrs, inline std::pair ReshapeInputOutputForTileOp( const mxnet::TShape& ishape, const mxnet::Tuple& reps) { - if (ishape.ndim() == 0 || reps.ndim() == 0) { + if (reps.ndim() == 0) { return std::make_pair(ishape, ishape); } @@ -2177,7 +2185,7 @@ inline size_t SqueezeShapeHelper(mxnet::TShape* shape) { CHECK(shape != nullptr); size_t count = 0; for (int i = 0; i < shape->ndim(); ++i) { - if ((*shape)[i] == 0) { + if ((*shape)[i] == -1) { ++count; } else { std::swap((*shape)[i], (*shape)[i-count]); @@ -2210,12 +2218,12 @@ inline bool SqueezeShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(dshape[axes[i]], 1) << "cannot select an axis to squeeze out which has size=" << dshape[axes[i]] << " not equal to one"; - CHECK_NE(oshape[axes[i]], 0) << "duplicate value in axis"; - oshape[axes[i]] = 0; + CHECK_NE(oshape[axes[i]], -1) << "duplicate value in axis"; + oshape[axes[i]] = -1; } } else { for (int i = 0; i < oshape.ndim(); ++i) { - if (oshape[i] == 1) oshape[i] = 0; + if (oshape[i] == 1) oshape[i] = -1; } } size_t oshape_size = SqueezeShapeHelper(&oshape); @@ -2631,10 +2639,14 @@ inline bool SplitOpShape(const nnvm::NodeAttrs& attrs, for (int i = 0; i < num_outputs; ++i) { int start = indices[i]; int end = (i < num_outputs - 1) ?
indices[i + 1] : ishape[real_axis]; - CHECK(start < end) - << "start " << start << " is not less than end " << end << "for subarray " << i; - CHECK(end <= ishape[real_axis]) - << "end " << end << " is no less than the size of the axis " << ishape[real_axis]; + if (ishape[real_axis] == 0U) { + end = start; + } else { + CHECK(start < end) + << "start " << start << " is not less than end " << end << " for subarray " << i; + CHECK(end <= ishape[real_axis]) + << "end " << end << " exceeds the size of the axis " << ishape[real_axis]; + } dshape[real_axis] = (end - start); if (param.squeeze_axis) { CHECK_EQ(end - start, 1U) << "expected axis size of 1 but got " << end - start; diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc index c2bcb29193a7..bff76bc6bbb0 100644 --- a/src/operator/tensor/matrix_op.cc +++ b/src/operator/tensor/matrix_op.cc @@ -263,6 +263,7 @@ static inline bool FlattenStorageType(const nnvm::NodeAttrs& attrs, NNVM_REGISTER_OP(Flatten) .add_alias("flatten") +.add_alias("_npx_batch_flatten") .describe(R"code(Flattens the input array into a 2-D array by collapsing the higher dimensions. .. note:: `Flatten` is deprecated. Use `flatten` instead. @@ -408,6 +409,7 @@ Examples:: NNVM_REGISTER_OP(expand_dims) +.add_alias("_npi_expand_dims") .describe(R"code(Inserts a new axis of size 1 into the array shape For example, given ``x`` with shape ``(2,3,4)``, then ``expand_dims(x, axis=1)`` @@ -696,6 +698,7 @@ NNVM_REGISTER_OP(_backward_slice_like) NNVM_REGISTER_OP(clip) MXNET_ADD_SPARSE_OP_ALIAS(clip) +.add_alias("_npi_clip") .describe(R"code(Clips (limits) the values in an array. Given an interval, values outside the interval are clipped to the interval edges. @@ -770,6 +773,7 @@ NNVM_REGISTER_OP(_backward_clip) .set_attr("FCompute", ClipGrad_); NNVM_REGISTER_OP(repeat) +.add_alias("_np_repeat") .describe(R"code(Repeats elements of an array. By default, ``repeat`` flattens the input array into 1-D and then repeats the @@ -820,6 +824,7 @@ NNVM_REGISTER_OP(_backward_repeat) }); NNVM_REGISTER_OP(tile) +.add_alias("_npi_tile") .describe(R"code(Repeats the whole array multiple times. If ``reps`` has length *d*, and input array has dimension of *n*. There are @@ -1121,6 +1126,7 @@ Example:: .add_arguments(DepthToSpaceParam::__FIELDS__()); NNVM_REGISTER_OP(_split_v2) +.add_alias("_npi_split") .describe(R"code(Splits an array along a particular axis into multiple sub-arrays. Example:: diff --git a/src/operator/tensor/ordering_op.cc b/src/operator/tensor/ordering_op.cc index e2f014d1ad41..f693601a8822 100644 --- a/src/operator/tensor/ordering_op.cc +++ b/src/operator/tensor/ordering_op.cc @@ -34,6 +34,7 @@ DMLC_REGISTER_PARAMETER(SortParam); DMLC_REGISTER_PARAMETER(ArgSortParam); NNVM_REGISTER_OP(topk) +.add_alias("_npx_topk") .describe(R"code(Returns the top *k* elements in an input array along the given axis. The returned elements will be sorted.
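Everything from here on is test coverage for the new frontend. The tests lean on the ``use_np`` decorator, which switches on both numpy-shape semantics (zero-size and zero-dim arrays) and numpy-array semantics for the decorated scope; a minimal sketch of the idea (``use_np`` comes from ``mxnet.util``; the tests import it via ``mxnet.test_utils``)::

    from mxnet import np
    from mxnet.util import use_np

    @use_np                      # numpy-shape + numpy-array semantics inside
    def zero_dim_round_trip():
        x = np.zeros(())         # a 0-dim array needs numpy-shape semantics
        return x.shape

    assert zero_dim_round_trip() == ()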
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index e4ec98f9f1bd..84fbdc8b49a5 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -35,6 +35,7 @@ from common import setup_module, with_seed, teardown, assert_raises_cudnn_not_satisfied from common import run_in_spawned_process from test_operator import * +from test_numpy_ndarray import * from test_optimizer import * from test_random import * from test_exc_handling import * diff --git a/tests/python/unittest/test_numpy_ndarray.py b/tests/python/unittest/test_numpy_ndarray.py new file mode 100644 index 000000000000..6024ac9b4acd --- /dev/null +++ b/tests/python/unittest/test_numpy_ndarray.py @@ -0,0 +1,672 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: skip-file +from __future__ import absolute_import +from __future__ import division +import os +import numpy as _np +import mxnet as mx +from mxnet import np, npx, autograd +from mxnet.gluon import HybridBlock +from mxnet.test_utils import same, assert_almost_equal, rand_shape_nd, rand_ndarray, retry, assert_exception, use_np +from common import with_seed, TemporaryDirectory + + +@with_seed() +@use_np +def test_array_creation(): + dtypes = [_np.int8, _np.int32, _np.float16, _np.float32, _np.float64, None] + objects = [ + [], + (), + [[1, 2], [3, 4]], + _np.random.uniform(size=rand_shape_nd(3)), + _np.random.uniform(size=(3, 0, 4)) + ] + for dtype in dtypes: + for src in objects: + mx_arr = np.array(src, dtype=dtype) + assert mx_arr.context == mx.current_context() + if isinstance(src, mx.nd.NDArray): + np_arr = _np.array(src.asnumpy(), dtype=dtype if dtype is not None else _np.float32) + else: + np_arr = _np.array(src, dtype=dtype if dtype is not None else _np.float32) + assert mx_arr.dtype == np_arr.dtype + assert same(mx_arr.asnumpy(), np_arr) + + +@with_seed() +@use_np +def test_zeros(): + # test np.zeros in Gluon + class TestZeros(HybridBlock): + def __init__(self, shape, dtype=None): + super(TestZeros, self).__init__() + self._shape = shape + self._dtype = dtype + + def hybrid_forward(self, F, x, *args, **kwargs): + return x + F.np.zeros(self._shape, self._dtype) + + class TestZerosOutputType(HybridBlock): + def hybrid_forward(self, F, x, *args, **kwargs): + return x, F.np.zeros(shape=()) + + # test np.zeros in imperative + def check_zero_array_creation(shape, dtype): + np_out = _np.zeros(shape=shape, dtype=dtype) + mx_out = np.zeros(shape=shape, dtype=dtype) + assert same(mx_out.asnumpy(), np_out) + if dtype is None: + assert mx_out.dtype == _np.float32 + assert np_out.dtype == _np.float64 + + shapes = [(0,), (2, 0, 2), (0, 0, 0, 0), ()] + shapes += [rand_shape_nd(ndim, allow_zero_size=True) for ndim in range(5)] + dtypes = [_np.int8, _np.int32, _np.float16,
_np.float32, _np.float64, None] + for shape in shapes: + for dtype in dtypes: + check_zero_array_creation(shape, dtype) + x = np.array(_np.random.uniform(size=shape), dtype=dtype) + if dtype is None: + x = x.astype('float32') + for hybridize in [True, False]: + test_zeros = TestZeros(shape, dtype) + test_zeros_output_type = TestZerosOutputType() + if hybridize: + test_zeros.hybridize() + test_zeros_output_type.hybridize() + y = test_zeros(x) + assert type(y) == np.ndarray + assert same(x.asnumpy(), y.asnumpy()) + y = test_zeros_output_type(x) + assert type(y[1]) == np.ndarray + + +@with_seed() +@use_np +def test_ones(): + # test np.ones in Gluon + class TestOnes(HybridBlock): + def __init__(self, shape, dtype=None): + super(TestOnes, self).__init__() + self._shape = shape + self._dtype = dtype + + def hybrid_forward(self, F, x, *args, **kwargs): + return x * F.np.ones(self._shape, self._dtype) + + class TestOnesOutputType(HybridBlock): + def hybrid_forward(self, F, x, *args, **kwargs): + return x, F.np.ones(shape=()) + + # test np.ones in imperative + def check_ones_array_creation(shape, dtype): + np_out = _np.ones(shape=shape, dtype=dtype) + mx_out = np.ones(shape=shape, dtype=dtype) + assert same(mx_out.asnumpy(), np_out) + if dtype is None: + assert mx_out.dtype == _np.float32 + assert np_out.dtype == _np.float64 + + shapes = [(0,), (2, 0, 2), (0, 0, 0, 0), ()] + shapes += [rand_shape_nd(ndim, allow_zero_size=True) for ndim in range(5)] + dtypes = [_np.int8, _np.int32, _np.float16, _np.float32, _np.float64, None] + for shape in shapes: + for dtype in dtypes: + check_ones_array_creation(shape, dtype) + x = mx.nd.array(_np.random.uniform(size=shape), dtype=dtype).as_np_ndarray() + if dtype is None: + x = x.astype('float32') + for hybridize in [True, False]: + test_ones = TestOnes(shape, dtype) + test_ones_output_type = TestOnesOutputType() + if hybridize: + test_ones.hybridize() + test_ones_output_type.hybridize() + y = test_ones(x) + assert type(y) == np.ndarray + assert same(x.asnumpy(), y.asnumpy()) + y = test_ones_output_type(x) + assert type(y[1]) == np.ndarray + + +@with_seed() +def test_ndarray_binary_element_wise_ops(): + np_op_map = { + '+': _np.add, + '*': _np.multiply, + '-': _np.subtract, + '/': _np.divide, + 'mod': _np.mod, + 'pow': _np.power, + '==': _np.equal, + '>': _np.greater, + '>=': _np.greater_equal, + '<': _np.less, + '<=': _np.less_equal + } + + def get_np_ret(x1, x2, op): + return np_op_map[op](x1, x2) + + @use_np + class TestBinaryElementWiseOp(HybridBlock): + def __init__(self, op, scalar=None, reverse=False): + super(TestBinaryElementWiseOp, self).__init__() + self._op = op + self._scalar = scalar + self._reverse = reverse # if false, scalar is the right operand.
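+ # e.g. TestBinaryElementWiseOp('-', scalar=3.0, reverse=True) computes 3.0 - x,
+ # while reverse=False computes x - 3.0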
+ + def hybrid_forward(self, F, x, *args): + if self._op == '+': + if self._scalar is not None: + return x + self._scalar if not self._reverse else self._scalar + x + else: + return x + args[0] if not self._reverse else args[0] + x + elif self._op == '*': + if self._scalar is not None: + return x * self._scalar if not self._reverse else self._scalar * x + else: + return x * args[0] if not self._reverse else args[0] * x + elif self._op == '-': + if self._scalar is not None: + return x - self._scalar if not self._reverse else self._scalar - x + else: + return x - args[0] if not self._reverse else args[0] - x + elif self._op == '/': + if self._scalar is not None: + return x / self._scalar if not self._reverse else self._scalar / x + else: + return x / args[0] if not self._reverse else args[0] / x + elif self._op == 'mod': + if self._scalar is not None: + return x % self._scalar if not self._reverse else self._scalar % x + else: + return x % args[0] if not self._reverse else args[0] % x + elif self._op == 'pow': + if self._scalar is not None: + return x ** self._scalar if not self._reverse else self._scalar ** x + else: + return x ** args[0] if not self._reverse else args[0] ** x + elif self._op == '>': + if self._scalar is not None: + return x > self._scalar if not self._reverse else self._scalar > x + else: + return x > args[0] + elif self._op == '>=': + if self._scalar is not None: + return x >= self._scalar if not self._reverse else self._scalar >= x + else: + return x >= args[0] + elif self._op == '<': + if self._scalar is not None: + return x < self._scalar if not self._reverse else self._scalar < x + else: + return x < args[0] + elif self._op == '<=': + if self._scalar is not None: + return x <= self._scalar if not self._reverse else self._scalar <= x + else: + return x <= args[0] + elif self._op == '==': + if self._scalar is not None: + return x == self._scalar if not self._reverse else self._scalar == x + else: + return x == args[0] + else: + print(self._op) + assert False + + @use_np + def check_binary_op_result(shape1, shape2, op, dtype=None): + if shape1 is None: + mx_input1 = abs(_np.random.uniform()) + 1 + np_input1 = mx_input1 + else: + mx_input1 = rand_ndarray(shape1, dtype=dtype).abs() + 1 + np_input1 = mx_input1.asnumpy() + if shape2 is None: + mx_input2 = abs(_np.random.uniform()) + 1 + np_input2 = mx_input2 + else: + mx_input2 = rand_ndarray(shape2, dtype=dtype).abs() + 1 + np_input2 = mx_input2.asnumpy() + + scalar = None + reverse = False + if isinstance(mx_input1, mx.nd.NDArray) and not isinstance(mx_input2, mx.nd.NDArray): + scalar = mx_input2 + reverse = False + elif isinstance(mx_input2, mx.nd.NDArray) and not isinstance(mx_input1, mx.nd.NDArray): + scalar = mx_input1 + reverse = True + + np_out = get_np_ret(np_input1, np_input2, op) + for hybridize in [True, False]: + if scalar is None: + get_mx_ret_np = TestBinaryElementWiseOp(op) + get_mx_ret_classic = TestBinaryElementWiseOp(op) + if hybridize: + get_mx_ret_np.hybridize() + get_mx_ret_classic.hybridize() + mx_out = get_mx_ret_np(mx_input1.as_np_ndarray(), mx_input2.as_np_ndarray()) + assert type(mx_out) == np.ndarray + assert np_out.shape == mx_out.shape + assert_almost_equal(mx_out.asnumpy(), np_out, atol=1e-6, rtol=1e-5) + else: + get_mx_ret = TestBinaryElementWiseOp(op, scalar=scalar, reverse=reverse) + if hybridize: + get_mx_ret.hybridize() + if reverse: + mx_out = get_mx_ret(mx_input2.as_np_ndarray()) + assert type(mx_out) == np.ndarray + else: + mx_out = get_mx_ret(mx_input1.as_np_ndarray()) + assert 
type(mx_out) == np.ndarray + assert np_out.shape == mx_out.shape + assert_almost_equal(mx_out.asnumpy(), np_out, atol=1e-6, rtol=1e-5) + + dtypes = [_np.float32, _np.float64, None] + ops = np_op_map.keys() + for dtype in dtypes: + for op in ops: + check_binary_op_result((3, 4), (3, 4), op, dtype) + check_binary_op_result(None, (3, 4), op, dtype) + check_binary_op_result((3, 4), None, op, dtype) + check_binary_op_result((1, 4), (3, 1), op, dtype) + check_binary_op_result(None, (3, 1), op, dtype) + check_binary_op_result((1, 4), None, op, dtype) + check_binary_op_result((1, 4), (3, 5, 4), op, dtype) + check_binary_op_result((), (3, 5, 4), op, dtype) + check_binary_op_result((), None, op, dtype) + check_binary_op_result(None, (), op, dtype) + check_binary_op_result((0, 2), (1, 1), op, dtype) + check_binary_op_result((0, 2), None, op, dtype) + check_binary_op_result(None, (0, 2), op, dtype) + + +@with_seed() +def test_hybrid_block_multiple_outputs(): + @use_np + class TestAllNumpyOutputs(HybridBlock): + def hybrid_forward(self, F, x, *args, **kwargs): + return F.np.add(x, x), F.np.multiply(x, x) + + class TestAllClassicOutputs(HybridBlock): + def hybrid_forward(self, F, x, *args, **kwargs): + return x.as_nd_ndarray() + x.as_nd_ndarray(), x.as_nd_ndarray() * x.as_nd_ndarray() + + data_np = np.ones((2, 3)) + for block, expected_out_type in [(TestAllClassicOutputs, mx.nd.NDArray), + (TestAllNumpyOutputs, np.ndarray)]: + net = block() + for hybridize in [True, False]: + if hybridize: + net.hybridize() + out1, out2 = net(data_np) + assert type(out1) is expected_out_type + assert type(out2) is expected_out_type + + @use_np + class TestMixedTypeOutputsFailure(HybridBlock): + def hybrid_forward(self, F, x, *args, **kwargs): + return x.as_nd_ndarray() + x.as_nd_ndarray(), F.np.multiply(x, x) + + net = TestMixedTypeOutputsFailure() + assert_exception(net, TypeError, data_np) + net.hybridize() + assert_exception(net, TypeError, data_np) + + +@with_seed() +@use_np +def test_grad_ndarray_type(): + data = np.array(2, dtype=_np.float32) + data.attach_grad() + assert type(data.grad) == np.ndarray + assert type(data.detach()) == np.ndarray + + +@with_seed() +def test_np_ndarray_astype(): + mx_data = np.array([2, 3, 4, 5], dtype=_np.int32) + np_data = mx_data.asnumpy() + + def check_astype_equal(dtype, copy, expect_zero_copy=False): + mx_ret = mx_data.astype(dtype=dtype, copy=copy) + assert type(mx_ret) is np.ndarray + np_ret = np_data.astype(dtype=dtype, copy=copy) + assert mx_ret.dtype == np_ret.dtype + assert same(mx_ret.asnumpy(), np_ret) + if expect_zero_copy: + assert id(mx_ret) == id(mx_data) + assert id(np_ret) == id(np_data) + + for dtype in [_np.int8, _np.uint8, _np.int32, _np.float16, _np.float32, _np.float64]: + for copy in [True, False]: + check_astype_equal(dtype, copy, copy is False and mx_data.dtype == dtype) + + +@with_seed() +def test_np_ndarray_copy(): + mx_data = np.array([2, 3, 4, 5], dtype=_np.int32) + assert_exception(mx_data.copy, NotImplementedError, order='F') + mx_ret = mx_data.copy() + np_ret = mx_data.asnumpy().copy() + assert same(mx_ret.asnumpy(), np_ret) + + +@with_seed() +@use_np +def test_np_ndarray_indexing(): + def test_getitem(np_array, index): + """`is_scalar` indicates whether we should expect a scalar for the result. 
+
+
+@with_seed()
+@use_np
+def test_np_ndarray_indexing():
+    def test_getitem(np_array, index):
+        """Index an mxnet np.ndarray with `index` and compare the result
+        against indexing the equivalent numpy array with the same index."""
+        np_index = index
+        if isinstance(index, np.ndarray):
+            np_index = index.asnumpy()
+        if isinstance(index, tuple):
+            np_index = []
+            for idx in index:
+                if isinstance(idx, np.ndarray):
+                    np_index.append(idx.asnumpy())
+                else:
+                    np_index.append(idx)
+            np_index = tuple(np_index)
+
+        np_indexed_array = np_array[np_index]
+        mx_array = np.array(np_array, dtype=np_array.dtype)
+        mx_indexed_array = mx_array[index].asnumpy()
+        assert same(np_indexed_array, mx_indexed_array), 'Failed with index=%s' % str(index)
+
+    def test_setitem(np_array, index):
+        def assert_same(np_array, np_index, mx_array, mx_index, mx_value, np_value=None):
+            if np_value is not None:
+                np_array[np_index] = np_value
+            elif isinstance(mx_value, np.ndarray):
+                np_array[np_index] = mx_value.asnumpy()
+            else:
+                np_array[np_index] = mx_value
+            mx_array[mx_index] = mx_value
+            assert same(np_array, mx_array.asnumpy())
+
+        np_index = index
+        if isinstance(index, np.ndarray):
+            np_index = index.asnumpy()
+        if isinstance(index, tuple):
+            np_index = []
+            for idx in index:
+                if isinstance(idx, np.ndarray):
+                    np_index.append(idx.asnumpy())
+                else:
+                    np_index.append(idx)
+            np_index = tuple(np_index)
+
+        mx_array = np.array(np_array, dtype=np_array.dtype)
+        np_array = mx_array.asnumpy()
+        indexed_array_shape = np_array[np_index].shape
+        np_indexed_array = _np.random.randint(low=-10000, high=0, size=indexed_array_shape)
+        # test value is a numpy array without broadcast
+        assert_same(np_array, np_index, mx_array, index, np_indexed_array)
+        # test value is a numeric type
+        assert_same(np_array, np_index, mx_array, index, _np.random.randint(low=-10000, high=0))
+        if len(indexed_array_shape) > 1:
+            # test ndarray with broadcast
+            assert_same(np_array, np_index, mx_array, index,
+                        _np.random.uniform(low=-10000, high=0, size=(indexed_array_shape[-1],)))
+            # test numpy array with broadcast
+            assert_same(np_array, np_index, mx_array, index,
+                        _np.random.randint(low=-10000, high=0, size=(indexed_array_shape[-1],)))
+            # test list with broadcast
+            assert_same(np_array, np_index, mx_array, index,
+                        [_np.random.randint(low=-10000, high=0)] * indexed_array_shape[-1])
+
+    def test_getitem_autograd(np_array, index):
+        x = np.array(np_array, dtype=np_array.dtype)
+        x.attach_grad()
+        with autograd.record():
+            y = x[index]
+        y.backward()
+        value = np.ones_like(y)
+        x_grad = np.zeros_like(x)
+        x_grad[index] = value
+        assert same(x_grad.asnumpy(), x.grad.asnumpy())
+
+    def test_setitem_autograd(np_array, index):
+        x = np.array(np_array, dtype=np_array.dtype)
+        out_shape = x[index].shape
+        y = np.array(_np.random.uniform(size=out_shape))
+        y.attach_grad()
+        try:
+            with autograd.record():
+                x[index] = y
+                assert False  # should not reach here
+        except mx.base.MXNetError as err:
+            assert str(err).find('Inplace operations (+=, -=, x[:]=, etc) are not supported when recording with') != -1
+
+    def np_int(index, int_type=_np.int32):
+        def convert(num):
+            if num is None:
+                return num
+            else:
+                return int_type(num)
+
+        if isinstance(index, slice):
+            return slice(convert(index.start), convert(index.stop), convert(index.step))
+        elif isinstance(index, tuple):  # tuple of slices and integers
+            ret = []
+            for elem in index:
+                if isinstance(elem, slice):
+                    ret.append(slice(convert(elem.start), convert(elem.stop), convert(elem.step)))
+                else:
+                    ret.append(convert(elem))
+            return tuple(ret)
+        else:
+            assert False
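+
+    # For example (illustrative), np_int maps plain Python indices to their
+    # numpy-typed equivalents so both flavors exercise the same indexing code:
+    #
+    #     np_int(slice(1, 5), _np.int64)  # -> slice(_np.int64(1), _np.int64(5), None)
+    #     np_int((1, slice(2)))           # -> (_np.int32(1), slice(None, _np.int32(2), None))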
+
+    shape = (8, 16, 9, 9)
+    np_array = _np.arange(_np.prod(shape), dtype='int32').reshape(shape)
+    index_list = [
+        (),
+        0,
+        _np.int32(0),
+        _np.int64(0),
+        5,
+        _np.int32(5),
+        _np.int64(5),
+        -1,
+        _np.int32(-1),
+        _np.int64(-1),
+        slice(5),
+        np_int(slice(5), _np.int32),
+        np_int(slice(5), _np.int64),
+        slice(1, 5),
+        np_int(slice(1, 5), _np.int32),
+        np_int(slice(1, 5), _np.int64),
+        slice(1, 5, 2),
+        np_int(slice(1, 5, 2), _np.int32),
+        np_int(slice(1, 5, 2), _np.int64),
+        slice(7, 0, -1),
+        np_int(slice(7, 0, -1)),
+        np_int(slice(7, 0, -1), _np.int64),
+        slice(None, 6),
+        np_int(slice(None, 6)),
+        np_int(slice(None, 6), _np.int64),
+        slice(None, 6, 3),
+        np_int(slice(None, 6, 3)),
+        np_int(slice(None, 6, 3), _np.int64),
+        slice(1, None),
+        np_int(slice(1, None)),
+        np_int(slice(1, None), _np.int64),
+        slice(1, None, 3),
+        np_int(slice(1, None, 3)),
+        np_int(slice(1, None, 3), _np.int64),
+        slice(None, None, 2),
+        np_int(slice(None, None, 2)),
+        np_int(slice(None, None, 2), _np.int64),
+        slice(None, None, -1),
+        np_int(slice(None, None, -1)),
+        np_int(slice(None, None, -1), _np.int64),
+        slice(None, None, -2),
+        np_int(slice(None, None, -2), _np.int32),
+        np_int(slice(None, None, -2), _np.int64),
+        (slice(None), slice(None), 1, 8),
+        (slice(None), slice(None), -1, 8),
+        (slice(None), slice(None), 1, -8),
+        (slice(None), slice(None), -1, -8),
+        np_int((slice(None), slice(None), 1, 8)),
+        np_int((slice(None), slice(None), 1, 8), _np.int64),
+        (slice(None), slice(None), 1, 8),
+        np_int((slice(None), slice(None), -1, -8)),
+        np_int((slice(None), slice(None), -1, -8), _np.int64),
+        (slice(None), 2, slice(1, 5), 1),
+        np_int((slice(None), 2, slice(1, 5), 1)),
+        np_int((slice(None), 2, slice(1, 5), 1), _np.int64),
+        (1, 2, 3),
+        np_int((1, 2, 3)),
+        np_int((1, 2, 3), _np.int64),
+        (-1, -2, -3),
+        np_int((-1, -2, -3)),
+        np_int((-1, -2, -3), _np.int64),
+        (1, 2, 3, 4),
+        np_int((1, 2, 3, 4)),
+        np_int((1, 2, 3, 4), _np.int64),
+        (-4, -3, -2, -1),
+        np_int((-4, -3, -2, -1)),
+        np_int((-4, -3, -2, -1), _np.int64),
+        (slice(None, None, -1), 2, slice(1, 5), 1),
+        np_int((slice(None, None, -1), 2, slice(1, 5), 1)),
+        np_int((slice(None, None, -1), 2, slice(1, 5), 1), _np.int64),
+        (slice(None, None, -1), 2, slice(1, 7, 2), 1),
+        np_int((slice(None, None, -1), 2, slice(1, 7, 2), 1)),
+        np_int((slice(None, None, -1), 2, slice(1, 7, 2), 1), _np.int64),
+        (slice(1, 8, 2), slice(14, 2, -2), slice(3, 8), slice(0, 7, 3)),
+        np_int((slice(1, 8, 2), slice(14, 2, -2), slice(3, 8), slice(0, 7, 3))),
+        np_int((slice(1, 8, 2), slice(14, 2, -2), slice(3, 8), slice(0, 7, 3)), _np.int64),
+        (slice(1, 8, 2), 1, slice(3, 8), 2),
+        np_int((slice(1, 8, 2), 1, slice(3, 8), 2)),
+        np_int((slice(1, 8, 2), 1, slice(3, 8), 2), _np.int64),
+        [1],
+        [1, 2],
+        [2, 1, 3],
+        [7, 5, 0, 3, 6, 2, 1],
+        _np.array([6, 3], dtype=_np.int32),
+        _np.array([[3, 4], [0, 6]], dtype=_np.int32),
+        _np.array([[7, 3], [2, 6], [0, 5], [4, 1]], dtype=_np.int32),
+        _np.array([[7, 3], [2, 6], [0, 5], [4, 1]], dtype=_np.int64),
+        _np.array([[2], [0], [1]], dtype=_np.int32),
+        _np.array([[2], [0], [1]], dtype=_np.int64),
+        np.array([4, 7], dtype=_np.int32),
+        np.array([4, 7], dtype=_np.int64),
+        np.array([[3, 6], [2, 1]], dtype=_np.int32),
+        np.array([[3, 6], [2, 1]], dtype=_np.int64),
+        np.array([[7, 3], [2, 6], [0, 5], [4, 1]], dtype=_np.int32),
+        np.array([[7, 3], [2, 6], [0, 5], [4, 1]], dtype=_np.int64),
+        (1, [2, 3]),
+        (1, [2, 3], _np.array([[3], [0]], dtype=_np.int32)),
+        (1, [2, 3]),
+        (1, [2, 3], _np.array([[3], [0]], dtype=_np.int64)),
+        (1, [2], _np.array([[5], [3]], dtype=_np.int32), slice(None)),
+        (1, [2], _np.array([[5], [3]], dtype=_np.int64), slice(None)),
+        (1, [2, 3], _np.array([[6], [0]], dtype=_np.int32), slice(2, 5)),
+        (1, [2, 3], _np.array([[6], [0]], dtype=_np.int64), slice(2, 5)),
+        (1, [2, 3], _np.array([[4], [7]], dtype=_np.int32), slice(2, 5, 2)),
+        (1, [2, 3], _np.array([[4], [7]], dtype=_np.int64), slice(2, 5, 2)),
+        (1, [2], _np.array([[3]], dtype=_np.int32), slice(None, None, -1)),
+        (1, [2], _np.array([[3]], dtype=_np.int64), slice(None, None, -1)),
+        (1, [2], _np.array([[3]], dtype=_np.int32), np.array([[5, 7], [2, 4]], dtype=_np.int64)),
+        (1, [2], np.array([[4]], dtype=_np.int32), np.array([[1, 3], [5, 7]], dtype='int64')),
+        [0],
+        [0, 1],
+        [1, 2, 3],
+        [2, 0, 5, 6],
+        ([1, 1], [2, 3]),
+        ([1], [4], [5]),
+        ([1], [4], [5], [6]),
+        ([[1]], [[2]]),
+        ([[1]], [[2]], [[3]], [[4]]),
+        (slice(0, 2), [[1], [6]], slice(0, 2), slice(0, 5, 2)),
+        ([[[[1]]]], [[1]], slice(0, 3), [1, 5]),
+        ([[[[1]]]], 3, slice(0, 3), [1, 3]),
+        ([[[[1]]]], 3, slice(0, 3), 0),
+        ([[[[1]]]], [[2], [12]], slice(0, 3), slice(None)),
+        ([1, 2], slice(3, 5), [2, 3], [3, 4]),
+        ([1, 2], slice(3, 5), (2, 3), [3, 4]),
+        range(4),
+        range(3, 0, -1),
+        (range(4,), [1]),
+        # slice(0, 0) is excluded since producing a zero-size output tensor is not supported yet
+    ]
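+    # Illustrative contrast between the two indexing families listed above,
+    # using the (8, 16, 9, 9) array:
+    #
+    #     np_array[0, 1]    # basic indexing: a (9, 9) result
+    #     np_array[[0, 1]]  # advanced indexing: gathers into a (2, 16, 9, 9) copy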
+    for index in index_list:
+        test_getitem(np_array, index)
+        test_setitem(np_array, index)
+        test_getitem_autograd(np_array, index)
+        if not isinstance(index, tuple) or len(index) != 0:
+            # When index = (), x[()] = y is equivalent to y.copyto(x), which is
+            # supported under autograd recording, so the expected-failure check
+            # in test_setitem_autograd is skipped for that case.
+            test_setitem_autograd(np_array, index)
+
+
+@with_seed()
+@use_np
+def test_np_save_load_ndarrays():
+    shapes = [(2, 0, 1), (0,), (), (), (0, 4), (), (3, 0, 0, 0), (2, 1), (0, 5, 0), (4, 5, 6), (0, 0, 0)]
+    array_list = [_np.random.randint(0, 10, size=shape) for shape in shapes]
+    array_list = [np.array(arr, dtype=arr.dtype) for arr in array_list]
+    # test save/load a single ndarray
+    for i, arr in enumerate(array_list):
+        with TemporaryDirectory() as work_dir:
+            fname = os.path.join(work_dir, 'dataset.npy')
+            npx.save(fname, arr)
+            arr_loaded = npx.load(fname)
+            assert isinstance(arr_loaded, list)
+            assert len(arr_loaded) == 1
+            assert _np.array_equal(arr_loaded[0].asnumpy(), array_list[i].asnumpy())
+
+    # test save/load a list of ndarrays
+    with TemporaryDirectory() as work_dir:
+        fname = os.path.join(work_dir, 'dataset.npy')
+        npx.save(fname, array_list)
+        array_list_loaded = npx.load(fname)
+        assert isinstance(array_list_loaded, list)
+        assert len(array_list) == len(array_list_loaded)
+        assert all(isinstance(arr, np.ndarray) for arr in array_list_loaded)
+        for a1, a2 in zip(array_list, array_list_loaded):
+            assert _np.array_equal(a1.asnumpy(), a2.asnumpy())
+
+    # test save/load a dict of str->ndarray
+    arr_dict = {}
+    keys = [str(i) for i in range(len(array_list))]
+    for k, v in zip(keys, array_list):
+        arr_dict[k] = v
+    with TemporaryDirectory() as work_dir:
+        fname = os.path.join(work_dir, 'dataset.npy')
+        npx.save(fname, arr_dict)
+        arr_dict_loaded = npx.load(fname)
+        assert isinstance(arr_dict_loaded, dict)
+        assert len(arr_dict_loaded) == len(arr_dict)
+        for k, v in arr_dict_loaded.items():
+            assert k in arr_dict
+            assert _np.array_equal(v.asnumpy(), arr_dict[k].asnumpy())
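+
+# Minimal round-trip sketch (illustrative) of the npx save/load API exercised
+# above; npx.load returns a list for array/list inputs and a dict for dict
+# inputs:
+#
+#     a = np.array([1, 2, 3])
+#     npx.save('arrays.npy', {'a': a})
+#     loaded = npx.load('arrays.npy')  # -> dict of str -> np.ndarray
+#     assert _np.array_equal(loaded['a'].asnumpy(), a.asnumpy())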
+
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule()
diff --git a/tests/python/unittest/test_thread_local.py b/tests/python/unittest/test_thread_local.py
index b553299ab4d7..ee56ba780a95 100644
--- a/tests/python/unittest/test_thread_local.py
+++ b/tests/python/unittest/test_thread_local.py
@@ -23,6 +23,7 @@
 from mxnet.attribute import AttrScope
 from mxnet.name import NameManager
 from mxnet.test_utils import set_default_context
+from mxnet.util import _NumpyArrayScope
 
 def test_context():
     ctx_list = []
@@ -163,6 +164,41 @@ def f():
     thread.join()
     assert status[0], "Failed to execute a symbolic graph within a thread"
 
+
+def test_np_array_scope():
+    np_array_scope_list = []
+    _NumpyArrayScope._current = _NumpyArrayScope(False)
+    np_array_scope_list.append(_NumpyArrayScope._current)
+
+    def f():
+        _NumpyArrayScope._current = _NumpyArrayScope(True)
+        np_array_scope_list.append(_NumpyArrayScope._current)
+
+    thread = threading.Thread(target=f)
+    thread.start()
+    thread.join()
+    assert len(np_array_scope_list) == 2
+    assert not np_array_scope_list[0]._is_np_array
+    assert np_array_scope_list[1]._is_np_array
+
+    event = threading.Event()
+    status = [False]
+
+    def g():
+        with mx.np_array(False):
+            event.wait()
+            if not mx.is_np_array():
+                status[0] = True
+
+    thread = threading.Thread(target=g)
+    thread.start()
+    _NumpyArrayScope._current = _NumpyArrayScope(True)
+    event.set()
+    thread.join()
+    event.clear()
+    assert status[0], "Spawned thread didn't set status correctly"
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()
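+
+# Behavior sketch (illustrative) for the scope verified above: _NumpyArrayScope
+# is thread-local state and mx.np_array is its public context manager, so
+# activating numpy semantics in one thread does not leak into another:
+#
+#     with mx.np_array(True):
+#         assert mx.is_np_array()   # active in this thread only
+#     assert not mx.is_np_array()   # previous (default) scope restored on exit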