diff --git a/3rdparty/mshadow/mshadow/base.h b/3rdparty/mshadow/mshadow/base.h
index 6095230e14b3..28843ecbbc66 100644
--- a/3rdparty/mshadow/mshadow/base.h
+++ b/3rdparty/mshadow/mshadow/base.h
@@ -468,6 +468,26 @@ struct DataType<bool> {
   static const int kFlag = kBool;
   static const int kLanes = 1;
 };
+template<>
+struct DataType<int16_t> {
+  static const int kFlag = kInt16;
+  static const int kLanes = 1;
+};
+template<>
+struct DataType<uint16_t> {
+  static const int kFlag = kUint16;
+  static const int kLanes = 1;
+};
+template<>
+struct DataType<uint32_t> {
+  static const int kFlag = kUint32;
+  static const int kLanes = 1;
+};
+template<>
+struct DataType<uint64_t> {
+  static const int kFlag = kUint64;
+  static const int kLanes = 1;
+};
 /*! \brief type enum value for default real type */
 const int default_type_flag = DataType<default_real_t>::kFlag;
@@ -1113,6 +1133,26 @@ struct minimum {
       {__VA_ARGS__} \
     } \
     break; \
+  case mshadow::kBool: \
+    LOG(FATAL) << "This operation does not " \
+                  "support bool type"; \
+    break; \
+  case mshadow::kInt16: \
+    LOG(FATAL) << "This operation does not " \
+                  "support int16 type"; \
+    break; \
+  case mshadow::kUint16: \
+    LOG(FATAL) << "This operation does not " \
+                  "support uint16 type"; \
+    break; \
+  case mshadow::kUint32: \
+    LOG(FATAL) << "This operation does not " \
+                  "support uint32 type"; \
+    break; \
+  case mshadow::kUint64: \
+    LOG(FATAL) << "This operation does not " \
+                  "support uint64 type"; \
+    break; \
   default: \
     LOG(FATAL) << "Unknown type enum " << type; \
   }
@@ -1161,6 +1201,26 @@ struct minimum {
       {__VA_ARGS__} \
     } \
     break; \
+  case mshadow::kBool: \
+    LOG(FATAL) << "This operation does not " \
+                  "support bool type"; \
+    break; \
+  case mshadow::kInt16: \
+    LOG(FATAL) << "This operation does not " \
+                  "support int16 type"; \
+    break; \
+  case mshadow::kUint16: \
+    LOG(FATAL) << "This operation does not " \
+                  "support uint16 type"; \
+    break; \
+  case mshadow::kUint32: \
+    LOG(FATAL) << "This operation does not " \
+                  "support uint32 type"; \
+    break; \
+  case mshadow::kUint64: \
+    LOG(FATAL) << "This operation does not " \
+                  "support uint64 type"; \
+    break; \
   default: \
     LOG(FATAL) << "Unknown type enum " << type; \
   }
@@ -1221,6 +1281,26 @@ struct minimum {
     LOG(FATAL) << "This operation only support " \
                   "floating point types, not int64";\
     break; \
+  case mshadow::kBool: \
+    LOG(FATAL) << "This operation only support " \
+                  "floating point types, not bool"; \
+    break; \
+  case mshadow::kInt16: \
+    LOG(FATAL) << "This operation only support " \
+                  "floating point types, not int16";\
+    break; \
+  case mshadow::kUint16: \
+    LOG(FATAL) << "This operation only support " \
+                  "floating point types not uint16";\
+    break; \
+  case mshadow::kUint32: \
+    LOG(FATAL) << "This operation only support " \
+                  "floating point types not uint32";\
+    break; \
+  case mshadow::kUint64: \
+    LOG(FATAL) << "This operation only support " \
+                  "floating point types not uint64";\
+    break; \
   default: \
     LOG(FATAL) << "Unknown type enum " << type; \
   }
@@ -1272,6 +1352,26 @@ struct minimum {
     LOG(FATAL) << "This operation only support " \
                   "floating point types, not int64";\
     break; \
+  case mshadow::kBool: \
+    LOG(FATAL) << "This operation only support " \
+                  "floating point types, not bool"; \
+    break; \
+  case mshadow::kInt16: \
+    LOG(FATAL) << "This operation only support " \
+                  "floating point types, not int16";\
+    break; \
+  case mshadow::kUint16: \
+    LOG(FATAL) << "This operation only support " \
+                  "floating point types not uint16";\
+    break; \
+  case mshadow::kUint32: \
+    LOG(FATAL) << "This operation only support " \
+                  "floating
point types not uint32";\ + break; \ + case mshadow::kUint64: \ + LOG(FATAL) << "This operation only support " \ + "floating point types not uint64";\ + break; \ default: \ LOG(FATAL) << "Unknown type enum " << type$; \ } @@ -1315,6 +1415,26 @@ struct minimum { LOG(FATAL) << "This operation only support " \ "floating point types, not int64";\ break; \ + case mshadow::kBool: \ + LOG(FATAL) << "This operation only support " \ + "floating point types, not bool"; \ + break; \ + case mshadow::kInt16: \ + LOG(FATAL) << "This operation only support " \ + "floating point types, not int16";\ + break; \ + case mshadow::kUint16: \ + LOG(FATAL) << "This operation only support " \ + "floating point types not uint16";\ + break; \ + case mshadow::kUint32: \ + LOG(FATAL) << "This operation only support " \ + "floating point types not uint32";\ + break; \ + case mshadow::kUint64: \ + LOG(FATAL) << "This operation only support " \ + "floating point types not uint64";\ + break; \ default: \ LOG(FATAL) << "Unknown type enum " << type$; \ } @@ -1351,7 +1471,7 @@ struct minimum { /*! * \brief Only supports int64 index type for aux_data - * in NDArray class fow now. + * in NDArray class for now. */ #define MSHADOW_IDX_TYPE_SWITCH(type, DType, ...) \ switch (type) { \ @@ -1421,6 +1541,184 @@ struct minimum { {__VA_ARGS__} \ } \ break; \ + case mshadow::kInt16: \ + LOG(FATAL) << "This operation does not " \ + "support int16 type"; \ + break; \ + case mshadow::kUint16: \ + LOG(FATAL) << "This operation does not " \ + "support uint16 type"; \ + break; \ + case mshadow::kUint32: \ + LOG(FATAL) << "This operation does not " \ + "support uint32 type"; \ + break; \ + case mshadow::kUint64: \ + LOG(FATAL) << "This operation does not " \ + "support uint64 type"; \ + break; \ + default: \ + LOG(FATAL) << "Unknown type enum " << type; \ + } + +#define MSHADOW_TYPE_SWITCH_EXT(type, DType, ...) \ + switch (type) { \ + case mshadow::kFloat32: \ + { \ + typedef float DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kFloat64: \ + { \ + typedef double DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kFloat16: \ + { \ + typedef mshadow::half::half_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kBfloat16: \ + { \ + typedef mshadow::bfloat::bf16_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kUint8: \ + { \ + typedef uint8_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kInt8: \ + { \ + typedef int8_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kInt32: \ + { \ + typedef int32_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kInt64: \ + { \ + typedef int64_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kInt16: \ + { \ + typedef int16_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kUint16: \ + { \ + typedef uint16_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kUint32: \ + { \ + typedef uint32_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kUint64: \ + { \ + typedef uint64_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + default: \ + LOG(FATAL) << "Unknown type enum " << type; \ + } + +#define MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(type, DType, ...) 
\ + switch (type) { \ + case mshadow::kFloat32: \ + { \ + typedef float DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kFloat64: \ + { \ + typedef double DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kFloat16: \ + { \ + typedef mshadow::half::half_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kBfloat16: \ + { \ + typedef mshadow::bfloat::bf16_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kUint8: \ + { \ + typedef uint8_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kInt8: \ + { \ + typedef int8_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kInt32: \ + { \ + typedef int32_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kInt64: \ + { \ + typedef int64_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kBool: \ + { \ + typedef bool DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kInt16: \ + { \ + typedef int16_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kUint16: \ + { \ + typedef uint16_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kUint32: \ + { \ + typedef uint32_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kUint64: \ + { \ + typedef uint64_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ default: \ LOG(FATAL) << "Unknown type enum " << type; \ } @@ -1428,7 +1726,7 @@ struct minimum { /*! \brief get data type size from type enum */ inline size_t mshadow_sizeof(int type) { int size = 0; - MSHADOW_TYPE_SWITCH_WITH_BOOL(type, DType, size = sizeof(DType);); + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(type, DType, size = sizeof(DType);); return size; } @@ -1451,6 +1749,14 @@ inline std::string dtype_string(const int dtype) { return "long long"; case mshadow::kBool: return "bool"; + case mshadow::kInt16: + return "short"; + case mshadow::kUint16: + return "unsigned short"; + case mshadow::kUint32: + return "unsigned int"; + case mshadow::kUint64: + return "unsigned long long"; default: LOG(FATAL) << "Unknown type enum " << dtype; } diff --git a/ci/docker/install/requirements b/ci/docker/install/requirements index 5d051b5787fb..e5f930d5ac86 100644 --- a/ci/docker/install/requirements +++ b/ci/docker/install/requirements @@ -52,3 +52,6 @@ decorator==4.4.0 boto3==1.9.229 h5py==2.10.0 Pillow<6 + +# Array API Standardization requirements +hypothesis==6.14.0 diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index e2702dd285e1..a623bfdcd564 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -806,6 +806,22 @@ unittest_ubuntu_python3_cpu_onednn() { pytest --durations=50 --cov-report xml:tests_mkl.xml --verbose tests/python/mkl } +unittest_array_api_standardization() { + set -ex + python3 -m pip install -e /work/mxnet/python --user + cd .. 
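Note: the `unittest_array_api_standardization` CI runner continues just below with the clone of data-apis/array-api-tests and the pytest invocation of its type-promotion suite against `mxnet.numpy`. As a rough, hand-written sketch of the kind of check that suite performs (not part of the patch; the helper name `check_bool_result_dtype` is invented here, and it assumes the dtype extensions in this patch are in place):

```python
import mxnet as mx
import numpy as onp

def check_bool_result_dtype(dtype):
    """Comparisons should return bool arrays regardless of the input dtype."""
    a = mx.np.ones((3,), dtype=dtype)
    b = mx.np.zeros((3,), dtype=dtype)
    out = mx.np.equal(a, b)            # exercises the extended logic-op type switches
    assert out.dtype == onp.bool_, (dtype, out.dtype)

for dt in ['int16', 'uint16', 'uint32', 'uint64']:
    check_bool_result_dtype(dt)
print('bool type promotion OK for the extended integer dtypes')
```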
+ git clone https://github.com/data-apis/array-api-tests.git + pushd /work/array-api-tests + export ARRAY_API_TESTS_MODULE=mxnet.numpy pytest + # OverflowError: Python int too large to convert to C long + # when cython is enabled + export MXNET_ENABLE_CYTHON=0 + export DMLC_LOG_STACK_TRACE_DEPTH=100 + python3 -m pytest --durations=50 --cov-report xml:tests_api.xml --verbose \ + array_api_tests/test_type_promotion.py::test_elementwise_function_two_arg_bool_type_promotion + popd +} + unittest_ubuntu_python3_gpu() { set -ex export PYTHONPATH=./python/ diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index 0f63ce0e7be3..34c56551c486 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -60,6 +60,12 @@ def python3_ut_onednn(docker_container_name) { } } +def python3_ut_array_api(docker_container_name) { + timeout(time: max_time, unit: 'MINUTES') { + utils.docker_run(docker_container_name, 'unittest_array_api_standardization', false) + } +} + // GPU test has two parts. 1) run unittest on GPU, 2) compare the results on // both CPU and GPU // Python 3 @@ -665,6 +671,18 @@ def test_unix_python3_cpu(lib_name) { }] } +def test_unix_python3_array_api(lib_name) { + return ['Python3: Array-API': { + node(NODE_LINUX_CPU) { + ws('workspace/ut-python3-cpu') { + utils.unpack_and_init(lib_name, mx_lib, true) + python3_ut_array_api('ubuntu_cpu') + utils.publish_test_coverage() + } + } + }] +} + def test_unix_python3_mkl_cpu(lib_name) { return ['Python3: MKL-CPU': { node(NODE_LINUX_CPU) { diff --git a/ci/jenkins/Jenkinsfile_unix_cpu b/ci/jenkins/Jenkinsfile_unix_cpu index 9e189b34bb50..9681270d8905 100644 --- a/ci/jenkins/Jenkinsfile_unix_cpu +++ b/ci/jenkins/Jenkinsfile_unix_cpu @@ -46,6 +46,7 @@ core_logic: { utils.parallel_stage('Tests', [ custom_steps.test_unix_python3_cpu('cpu'), custom_steps.test_unix_python3_onnx_cpu('cpu'), + custom_steps.test_unix_python3_array_api('cpu'), custom_steps.test_unix_python3_mkl_cpu('cpu_mkl'), custom_steps.test_unix_python3_onednn_cpu('onednn_cpu'), custom_steps.test_unix_python3_onednn_mkl_cpu('onednn_mkl_cpu'), diff --git a/contrib/tvmop/utils.py b/contrib/tvmop/utils.py index 9c31eb304dce..293279919f1d 100644 --- a/contrib/tvmop/utils.py +++ b/contrib/tvmop/utils.py @@ -18,7 +18,8 @@ # coding: utf-8 import tvm -AllTypes = ["float32", "float64", "float16", "uint8", "int8", "int32", "int64"] +AllTypes = ["float32", "float64", "float16", "uint8", "uint16", + "uint32", "uint64", "int8", "int16", "int32", "int64"] RealTypes = ["float32", "float64", "float16"] diff --git a/include/mxnet/runtime/packed_func.h b/include/mxnet/runtime/packed_func.h index f498a692c1dc..964417638317 100644 --- a/include/mxnet/runtime/packed_func.h +++ b/include/mxnet/runtime/packed_func.h @@ -893,6 +893,14 @@ inline int String2MXNetTypeWithBool(const std::string& s) { return mshadow::kInt64; } else if (s == "bool") { return mshadow::kBool; + } else if (s == "int16") { + return mshadow::kInt16; + } else if (s == "uint16") { + return mshadow::kUint16; + } else if (s == "uint32") { + return mshadow::kUint32; + } else if (s == "uint64") { + return mshadow::kUint64; } else { LOG(FATAL) << "unknown type " << s; } @@ -915,6 +923,14 @@ inline int String2MXNetType(const std::string& s) { return mshadow::kInt32; } else if (s == "int64") { return mshadow::kInt64; + } else if (s == "int16") { + return mshadow::kInt16; + } else if (s == "uint16") { + return mshadow::kUint16; + } else if (s == "uint32") { + return mshadow::kUint32; + } else if (s == 
"uint64") { + return mshadow::kUint64; } else { LOG(FATAL) << "unknown type " << s; } diff --git a/include/mxnet/tensor_blob.h b/include/mxnet/tensor_blob.h index aeb4a7d0f543..6169672a587c 100755 --- a/include/mxnet/tensor_blob.h +++ b/include/mxnet/tensor_blob.h @@ -386,6 +386,10 @@ class TBlob { case mshadow::kInt8: return DLDataType{kDLInt, 8, 1}; case mshadow::kInt64: return DLDataType{kDLInt, 64, 1}; case mshadow::kBool: return DLDataType{kDLUInt, 1, 1}; + case mshadow::kInt16: return DLDataType{kDLInt, 16, 1}; + case mshadow::kUint16: return DLDataType{kDLUInt, 16, 1}; + case mshadow::kUint32: return DLDataType{kDLUInt, 32, 1}; + case mshadow::kUint64: return DLDataType{kDLUInt, 64, 1}; default: { LOG(FATAL) << "Unknown type_flag=" << type_flag; return DLDataType(); @@ -413,11 +417,15 @@ class TBlob { switch (dldata_type.bits) { case 1: return mshadow::kBool; case 8: return mshadow::kUint8; + case 16: return mshadow::kUint16; + case 32: return mshadow::kUint32; + case 64: return mshadow::kUint64; } break; case kDLInt: switch (dldata_type.bits) { case 8: return mshadow::kInt8; + case 16: return mshadow::kInt16; case 32: return mshadow::kInt32; case 64: return mshadow::kInt64; } diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index fa255f0ccac4..a7465865b707 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -509,9 +509,11 @@ def empty_like(prototype, dtype=None, order='C', subok=False, shape=None): # pyl array([[4.9e-324, 9.9e-324, 1.5e-323], # uninitialized [2.0e-323, 2.5e-323, 3.0e-323]]) """ - dtype_list = {None:'None', _np.int8:'int8', _np.uint8:'uint8', _np.int32:'int32', - _np.int64:'int64', _np.float16:'float16', _np.float32:'float32', - _np.float64:'float64', _np.bool_:'bool_', bool:'bool', int:'int64', float:'float64'} + dtype_list = {_np.float16: 'float16', _np.float32: 'float32', _np.float64: 'float64', + float: 'float64', _np.int8: 'int8', _np.int16: 'int16', _np.int32: 'int32', + _np.int64: 'int64', int:'int64', _np.uint8: 'uint8', _np.uint16: 'uint16', + _np.uint32: 'uint32', _np.uint64: 'uint64', _np.bool: 'bool', + _np.bool_: 'bool_', bool: 'bool', None: 'None'} if order != 'C': raise NotImplementedError("Only support C-order at this moment") if subok: diff --git a/python/mxnet/numpy/utils.py b/python/mxnet/numpy/utils.py index 186b3a034bbc..15b83c7f2b73 100644 --- a/python/mxnet/numpy/utils.py +++ b/python/mxnet/numpy/utils.py @@ -22,9 +22,12 @@ import numpy as onp __all__ = ['float16', 'float32', 'float64', 'uint8', 'int32', 'int8', 'int64', + 'int16', 'uint16', 'uint32', 'uint64', 'bool', 'bool_', 'pi', 'inf', 'nan', 'PZERO', 'NZERO', 'newaxis', 'finfo', 'e', 'NINF', 'PINF', 'NAN', 'NaN', - '_STR_2_DTYPE_'] + '_STR_2_DTYPE_', '_DTYPE_2_STR_'] + +py_bool = bool float16 = onp.float16 float32 = onp.float32 @@ -35,6 +38,10 @@ int64 = onp.int64 bool_ = onp.bool_ bool = onp.bool +int16 = onp.int16 +uint16 = onp.uint16 +uint32 = onp.uint32 +uint64 = onp.uint64 pi = onp.pi inf = onp.inf @@ -50,10 +57,16 @@ newaxis = None finfo = onp.finfo -_STR_2_DTYPE_ = {'float16': float16, 'float32': float32, 'float64':float64, 'float': float64, - 'uint8': uint8, 'int8': int8, 'int32': int32, 'int64': int64, 'int': int64, +_STR_2_DTYPE_ = {'float16': float16, 'float32': float32, 'float64': float64, 'float': float64, + 'int8': int8, 'int16': int16, 'int32': int32, 'int64': int64, 'int': int64, + 'uint8': uint8, 'uint16': uint16, 'uint32': uint32, 'uint64': uint64, 'bool': bool, 'bool_': bool_, 'None': None} 
+_DTYPE_2_STR_ = {float16: 'float16', float32: 'float32', float64: 'float64', float: 'float64', + int8: 'int8', int16: 'int16', int32: 'int32', int64: 'int64', int:'int64', + uint8: 'uint8', uint16: 'uint16', uint32: 'uint32', uint64: 'uint64', + bool: 'bool', bool_: 'bool_', py_bool: 'bool', None: 'None'} + _ONP_OP_MODULES = [onp, onp.linalg, onp.random, onp.fft] diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 95c2f5d999eb..1b96ad88923e 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -339,7 +339,7 @@ NDArray NDArray::Slice(index_t begin, index_t end) const { CHECK_EQ(storage_type(), kDefaultStorage); NDArray ret = this->Detach(); size_t length = shape_.ProdShape(1, shape_.ndim()); - MSHADOW_TYPE_SWITCH_WITH_BOOL( + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL( ret.dtype(), DType, { ret.byte_offset_ += begin * length * sizeof(DType); }); ret.reuse_ = false; ret.shape_[0] = end - begin; diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc index ed121899436a..e0a445814314 100644 --- a/src/ndarray/ndarray_function.cc +++ b/src/ndarray/ndarray_function.cc @@ -36,7 +36,7 @@ template<> void Copy(const TBlob &from, TBlob *to, Context from_ctx, Context to_ctx, RunContext ctx) { - MSHADOW_TYPE_SWITCH_WITH_BOOL(to->type_flag_, DType, { + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(to->type_flag_, DType, { if (to->type_flag_ == from.type_flag_) { if (!features::is_enabled(features::INT64_TENSOR_SIZE)) { CHECK_LT(from.Size(), (int64_t{1} << 31) - 1) << @@ -48,7 +48,7 @@ void Copy(const TBlob &from, TBlob *to, << " bytes, to: " << to->Size() * sizeof(DType) << " bytes."; common::ParallelCopy(to->dptr(), from.dptr(), size); } else { - MSHADOW_TYPE_SWITCH_WITH_BOOL(from.type_flag_, SrcDType, { + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(from.type_flag_, SrcDType, { to->FlatTo1D() = mshadow::expr::tcast(from.FlatTo1D()); }) diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index 2251ff81ea04..78b834db853c 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -650,6 +650,37 @@ struct AccType { .add_enum("bool", mshadow::kBool) +#define MXNET_ADD_ALL_TYPES_EXT \ + .add_enum("float32", mshadow::kFloat32) \ + .add_enum("float64", mshadow::kFloat64) \ + .add_enum("float16", mshadow::kFloat16) \ + .add_enum("bfloat16", mshadow::kBfloat16) \ + .add_enum("uint8", mshadow::kUint8) \ + .add_enum("int8", mshadow::kInt8) \ + .add_enum("int32", mshadow::kInt32) \ + .add_enum("int64", mshadow::kInt64) \ + .add_enum("int16", mshadow::kInt16) \ + .add_enum("uint16", mshadow::kUint16) \ + .add_enum("uint32", mshadow::kUint32) \ + .add_enum("uint64", mshadow::kUint64) + + +#define MXNET_ADD_ALL_TYPES_EXT_WITH_BOOL \ + .add_enum("float32", mshadow::kFloat32) \ + .add_enum("float64", mshadow::kFloat64) \ + .add_enum("float16", mshadow::kFloat16) \ + .add_enum("bfloat16", mshadow::kBfloat16) \ + .add_enum("uint8", mshadow::kUint8) \ + .add_enum("int8", mshadow::kInt8) \ + .add_enum("int32", mshadow::kInt32) \ + .add_enum("int64", mshadow::kInt64) \ + .add_enum("bool", mshadow::kBool) \ + .add_enum("int16", mshadow::kInt16) \ + .add_enum("uint16", mshadow::kUint16) \ + .add_enum("uint32", mshadow::kUint32) \ + .add_enum("uint64", mshadow::kUint64) + + /* \brief Compute flattened index given coordinates and shape. 
*/ template MSHADOW_XINLINE index_t ravel(const Shape& coord, const Shape& shape) { @@ -768,11 +799,11 @@ template MSHADOW_CINLINE void copy(mshadow::Stream *s, const TBlob& to, const TBlob& from) { CHECK_EQ(from.Size(), to.Size()); CHECK_EQ(from.dev_mask(), to.dev_mask()); - MSHADOW_TYPE_SWITCH_WITH_BOOL(to.type_flag_, DType, { + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(to.type_flag_, DType, { if (to.type_flag_ == from.type_flag_) { mshadow::Copy(to.FlatTo1D(s), from.FlatTo1D(s), s); } else { - MSHADOW_TYPE_SWITCH_WITH_BOOL(from.type_flag_, SrcDType, { + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(from.type_flag_, SrcDType, { to.FlatTo1D(s) = mshadow::expr::tcast(from.FlatTo1D(s)); }) } diff --git a/src/operator/numpy/np_elemwise_broadcast_logic_op.cc b/src/operator/numpy/np_elemwise_broadcast_logic_op.cc index 58ce40bb746d..6d3eb7a9a9b9 100644 --- a/src/operator/numpy/np_elemwise_broadcast_logic_op.cc +++ b/src/operator/numpy/np_elemwise_broadcast_logic_op.cc @@ -161,8 +161,8 @@ struct GetBinaryBroadcastCompute { } else { if (req[0] == kNullOp) return; mshadow::Stream *s = ctx.get_stream(); - MSHADOW_TYPE_SWITCH_WITH_BOOL(lhs.type_flag_, DType, { - MSHADOW_TYPE_SWITCH_WITH_BOOL(rhs.type_flag_, EType, { + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(lhs.type_flag_, DType, { + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(rhs.type_flag_, EType, { BROADCAST_NDIM_SWITCH(ndim, NDim, { mshadow::Shape oshape = new_oshape.get(); mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); diff --git a/src/operator/numpy/np_init_op.h b/src/operator/numpy/np_init_op.h index 016c8892af98..f12e7f4f091e 100644 --- a/src/operator/numpy/np_init_op.h +++ b/src/operator/numpy/np_init_op.h @@ -62,7 +62,7 @@ struct NumpyEyeParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(dtype) .set_default(-1) .add_enum("None", -1) - MXNET_ADD_ALL_TYPES + MXNET_ADD_ALL_TYPES_EXT_WITH_BOOL .describe("Data-type of the returned array."); } void SetAttrDict(std::unordered_map* dict) { diff --git a/src/operator/operator_tune.cc b/src/operator/operator_tune.cc index b09778722ffd..c240fbc52ce1 100644 --- a/src/operator/operator_tune.cc +++ b/src/operator/operator_tune.cc @@ -61,6 +61,10 @@ IMPLEMENT_OPERATOR_TUNE_STATICS_FOR_TYPE(uint8_t); IMPLEMENT_OPERATOR_TUNE_STATICS_FOR_TYPE(int32_t); IMPLEMENT_OPERATOR_TUNE_STATICS_FOR_TYPE(int64_t); IMPLEMENT_OPERATOR_TUNE_STATICS_FOR_TYPE(bool); +IMPLEMENT_OPERATOR_TUNE_STATICS_FOR_TYPE(int16_t); +IMPLEMENT_OPERATOR_TUNE_STATICS_FOR_TYPE(uint16_t); +IMPLEMENT_OPERATOR_TUNE_STATICS_FOR_TYPE(uint32_t); +IMPLEMENT_OPERATOR_TUNE_STATICS_FOR_TYPE(uint64_t); /*! * \brief Init variable used to facilitate registering a tunable operator during @@ -85,7 +89,11 @@ struct static_init_var { __macro$(__VA_ARGS__, uint8_t); \ __macro$(__VA_ARGS__, int8_t); \ __macro$(__VA_ARGS__, int32_t); \ - __macro$(__VA_ARGS__, int64_t); + __macro$(__VA_ARGS__, int64_t); \ + __macro$(__VA_ARGS__, int16_t); \ + __macro$(__VA_ARGS__, uint16_t); \ + __macro$(__VA_ARGS__, uint32_t); \ + __macro$(__VA_ARGS__, uint64_t) #define MSHADOW_MACRO_FOREACH_TYPE_WITH_BOOL(__macro$, ...) 
\ __macro$(__VA_ARGS__, float); \ @@ -96,7 +104,11 @@ struct static_init_var { __macro$(__VA_ARGS__, int8_t); \ __macro$(__VA_ARGS__, int32_t); \ __macro$(__VA_ARGS__, int64_t); \ - __macro$(__VA_ARGS__, bool) + __macro$(__VA_ARGS__, bool); \ + __macro$(__VA_ARGS__, int16_t); \ + __macro$(__VA_ARGS__, uint16_t); \ + __macro$(__VA_ARGS__, uint32_t); \ + __macro$(__VA_ARGS__, uint64_t) #define IMPLEMENT_WORKLOAD_VALUE_FOR_TYPE(__op$, __typ$) \ namespace mxnet_op { \ diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index b1700c7a3882..62aa3eb6aa69 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -329,8 +329,8 @@ void BinaryBroadcastComputeLogic(const nnvm::NodeAttrs& attrs, } else { if (req[0] == kNullOp) return; mshadow::Stream *s = ctx.get_stream(); - MSHADOW_TYPE_SWITCH_WITH_BOOL(lhs.type_flag_, DType, { - MSHADOW_TYPE_SWITCH_WITH_BOOL(rhs.type_flag_, EType, { + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(lhs.type_flag_, DType, { + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(rhs.type_flag_, EType, { BROADCAST_NDIM_SWITCH(ndim, NDim, { mshadow::Shape oshape = new_oshape.get(); mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h index a49d2968551c..69813f1f6dc4 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -599,8 +599,8 @@ template CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1U); MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - MSHADOW_TYPE_SWITCH_WITH_BOOL(inputs[0].type_flag_, DType, { - MSHADOW_TYPE_SWITCH_WITH_BOOL(inputs[1].type_flag_, EType, { + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(inputs[0].type_flag_, DType, { + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(inputs[1].type_flag_, EType, { const size_t size = (minthree(outputs[0].Size(), inputs[0].Size(), inputs[1].Size()) + DataType::kLanes - 1) / DataType::kLanes; if (size != 0) { diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 4d34c51edc3b..839d0ad67e7e 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -467,7 +467,7 @@ struct CastParam : public dmlc::Parameter { int dtype; DMLC_DECLARE_PARAMETER(CastParam) { DMLC_DECLARE_FIELD(dtype) - MXNET_ADD_ALL_TYPES_WITH_BOOL + MXNET_ADD_ALL_TYPES_EXT_WITH_BOOL .describe("Output data type."); } }; @@ -491,9 +491,9 @@ void CastCompute(const nnvm::NodeAttrs& attrs, using namespace mshadow; using namespace mshadow::expr; Stream *s = ctx.get_stream(); - MSHADOW_TYPE_SWITCH_WITH_BOOL(outputs[0].type_flag_, DstDType, { + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(outputs[0].type_flag_, DstDType, { Tensor out = outputs[0].FlatTo1D(s); - MSHADOW_TYPE_SWITCH_WITH_BOOL(inputs[0].type_flag_, SrcDType, { + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(inputs[0].type_flag_, SrcDType, { Tensor data = inputs[0].FlatTo1D(s); if ((outputs[0].type_flag_ != inputs[0].type_flag_ || req[0] != kWriteInplace) && outputs[0].Size() != 0) { diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h index c065c0877346..bc7b629287ed 100644 --- a/src/operator/tensor/init_op.h +++ b/src/operator/tensor/init_op.h @@ -62,7 +62,7 @@ struct InitOpParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(dtype) .set_default(-1) .add_enum("None", -1) - MXNET_ADD_ALL_TYPES_WITH_BOOL + MXNET_ADD_ALL_TYPES_EXT_WITH_BOOL .describe("Target data type."); } void 
SetAttrDict(std::unordered_map* dict) { @@ -106,7 +106,7 @@ struct FullLikeOpParam : public dmlc::Parameter { "Only used for imperative calls."); DMLC_DECLARE_FIELD(dtype) .set_default(dmlc::optional()) - MXNET_ADD_ALL_TYPES_WITH_BOOL + MXNET_ADD_ALL_TYPES_EXT_WITH_BOOL .describe("Target data type."); } void SetAttrDict(std::unordered_map* dict) { @@ -303,7 +303,7 @@ struct InitOpWithScalarParam : dmlc::Parameter { DMLC_DECLARE_FIELD(dtype) .set_default(-1) .add_enum("None", -1) - MXNET_ADD_ALL_TYPES_WITH_BOOL + MXNET_ADD_ALL_TYPES_EXT_WITH_BOOL .describe("Target data type."); DMLC_DECLARE_FIELD(value) .describe("Value with which to fill newly created tensor"); @@ -357,7 +357,7 @@ struct LinspaceParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(dtype) .set_default(-1) .add_enum("None", -1) - MXNET_ADD_ALL_TYPES + MXNET_ADD_ALL_TYPES_EXT_WITH_BOOL .describe("Target data type."); } void SetAttrDict(std::unordered_map* dict) { @@ -481,12 +481,12 @@ void Fill(mshadow::Stream *s, const TBlob& b, const OpReqType req, ValueTyp if (val == 0) { if (req != kAddTo) { if (b.dev_mask() == cpu::kDevMask && size < 50000) { - MSHADOW_TYPE_SWITCH_WITH_BOOL(b.type_flag_, DType, { + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(b.type_flag_, DType, { memset(b.dptr_, 0, size * sizeof(DType)); }); } else { // Optimize common use-case of filling with ones - MSHADOW_TYPE_SWITCH_WITH_BOOL(b.type_flag_, DType, { + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(b.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req, Req, { mxnet_op::Kernel, Req>, xpu>::Launch( s, b.Size(), b.dptr()); @@ -496,7 +496,7 @@ void Fill(mshadow::Stream *s, const TBlob& b, const OpReqType req, ValueTyp } } else if (is_integer && val == 1) { // Optimize common use-case of filling with ones - MSHADOW_TYPE_SWITCH_WITH_BOOL(b.type_flag_, DType, { + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(b.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req, Req, { mxnet_op::Kernel, xpu>::Launch( s, b.Size(), b.dptr()); @@ -504,7 +504,7 @@ void Fill(mshadow::Stream *s, const TBlob& b, const OpReqType req, ValueTyp }); } else { // Generic fill kernel from variable - MSHADOW_TYPE_SWITCH_WITH_BOOL(b.type_flag_, DType, { + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(b.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req, Req, { mxnet_op::Kernel, xpu>::Launch( s, b.Size(), b.dptr(), static_cast(val)); @@ -648,7 +648,7 @@ inline void EyeFillImpl(const TBlob& out_data, const nnvm::dim_t nnz = k > 0 ? std::min(cnnz, N) : std::min(rnnz, num_cols); mshadow::Stream *s = ctx.get_stream(); - MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MSHADOW_TYPE_SWITCH_EXT(out_data.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { Fill(s, out_data, req[0], static_cast(0)); if (nnz > 0) { @@ -692,7 +692,7 @@ void RangeCompute(const nnvm::NodeAttrs& attrs, using namespace mxnet_op; Stream *s = ctx.get_stream(); const ParamType& param = nnvm::get(attrs.parsed); - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + MSHADOW_TYPE_SWITCH_EXT(outputs[0].type_flag_, DType, { // Force unsigned params to take two's complement form on ARM to ensure consistency with x86 // results. Casting negative floats to unsigned types is undefined in the CPP standard. auto step = std::is_signed() ? 
param.step : static_cast(param.step); @@ -759,7 +759,7 @@ void LinspaceCompute(const nnvm::NodeAttrs& attrs, using namespace mxnet_op; Stream *s = ctx.get_stream(); const LinspaceParam& param = nnvm::get(attrs.parsed); - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(outputs[0].type_flag_, DType, { index_t step_num = param.endpoint ? param.num - 1 : param.num; double step = step_num > 0 ? (param.stop - param.start) / step_num : 0.0f; Kernel::Launch(s, diff --git a/src/operator/tvmop/op_module.cc b/src/operator/tvmop/op_module.cc index 352e885cd11f..05125726a283 100644 --- a/src/operator/tvmop/op_module.cc +++ b/src/operator/tvmop/op_module.cc @@ -71,6 +71,18 @@ PackedFunc GetFunction(const std::shared_ptr &module, case mshadow::kUint8: func_name << "uint8"; break; + case mshadow::kUint16: + func_name << "uint16"; + break; + case mshadow::kUint32: + func_name << "uint32"; + break; + case mshadow::kUint64: + func_name << "uint64"; + break; + case mshadow::kInt16: + func_name << "int16"; + break; case mshadow::kInt32: func_name << "int32"; break;
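Taken together, the extended mshadow type switches, the `MXNET_ADD_ALL_TYPES_EXT*` parameter enums, and the new entries in the Python dtype tables make int16/uint16/uint32/uint64 usable end to end from `mxnet.numpy`. A short illustrative sketch of the user-visible behaviour, assuming the patch is applied (shapes and values below are arbitrary):

```python
import mxnet as mx
import numpy as onp

x = mx.np.ones((2, 3), dtype='int16')   # zeros/ones go through the extended InitOp switch
y = x.astype('uint32')                  # Cast now accepts the new source/target dtypes
row = y[0]                              # NDArray::Slice uses the EXT_WITH_BOOL switch
print(x.dtype, y.dtype, row.shape)      # int16 uint32 (3,)
assert onp.array_equal(y.asnumpy(), onp.ones((2, 3), dtype=onp.uint32))
```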