diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py
index 07676c6382cc..905bcf8217b9 100644
--- a/python/mxnet/ndarray/numpy/_op.py
+++ b/python/mxnet/ndarray/numpy/_op.py
@@ -561,16 +561,15 @@ def insert(arr, obj, values, axis=None):
     ----------
     arr : ndarray
         Input array.
-    obj : int, slice or ndarray of ints
+    obj : int, slice or ndarray of int64
         Object that defines the index or indices before which `values` is
         inserted.
         Support for multiple insertions when `obj` is a single scalar or a
         sequence with one element (only support int32 and int64 element).
     values : ndarray
         Values to insert into `arr`.
-        The type of `values` should equal to the type of `arr`.
-        `values` should be shaped so that ``arr[...,obj,...] = values``
-        is legal.
+        If the type of values is different from that of arr, values is converted
+        to the type of arr.
     axis : int, optional
         Axis along which to insert `values`.  If `axis` is None then `arr`
         is flattened first.
@@ -584,9 +583,10 @@ def insert(arr, obj, values, axis=None):
 
     Notes
     -----
-    Note that for higher dimensional inserts `obj=0` behaves very different
+    - Note that for higher dimensional inserts `obj=0` behaves very different
     from `obj=[0]` just like `arr[:,0,:] = values` is different from
     `arr[:,[0],:] = values`.
+    - If `obj` is an ndarray, its dtype must be int64.
 
     Examples
     --------
@@ -604,7 +604,7 @@ def insert(arr, obj, values, axis=None):
 
     Difference between sequence and scalars:
 
-    >>> np.insert(a, np.array([1], dtype=np.int32), np.array([[1],[2],[3]]), axis=1)
+    >>> np.insert(a, np.array([1], dtype=np.int64), np.array([[1],[2],[3]]), axis=1)
     array([[1., 1., 1.],
            [2., 2., 2.],
            [3., 3., 3.]])
@@ -622,15 +622,27 @@ def insert(arr, obj, values, axis=None):
     >>> np.insert(b, slice(2, 4), np.array([5, 6]))
     array([1., 1., 5., 2., 6., 2., 3., 3.])
 
-    >>> np.insert(b, np.array([2, 2], dtype=np.int32), np.array([7.13, False]))
-    array([1.  , 1.  , 7.13, 0.  , 2.  , 2.  , 3.  , 3.  ])
+    # type casting
+    >>> np.insert(b.astype(np.int32), np.array([2, 2],dtype='int64'), np.array([7.13, False]))
+    array([1, 1, 7, 0, 2, 2, 3, 3], dtype=int32)
 
     >>> x = np.arange(8).reshape(2, 4)
-    >>> idx = np.array([1, 3], dtype=np.int32)
+    >>> idx = np.array([1, 3], dtype=np.int64)
     >>> np.insert(x, idx, np.array([999]), axis=1)
     array([[  0., 999.,   1.,   2., 999.,   3.],
            [  4., 999.,   5.,   6., 999.,   7.]])
     """
+    if isinstance(values, numeric_types):  # scalar `values` is forwarded as the `val` attribute
+        if isinstance(obj, slice):
+            start = obj.start
+            stop = obj.stop
+            step = 1 if obj.step is None else obj.step
+            return _npi.insert(arr, val=values, start=start, stop=stop, step=step, axis=axis)
+        elif isinstance(obj, integer_types):
+            return _npi.insert(arr, val=values, int_ind=obj, axis=axis)
+        elif isinstance(obj, NDArray):
+            return _npi.insert(arr, obj, val=values, axis=axis)
+
     if not isinstance(arr, NDArray):
         raise TypeError("'arr' can not support type {}".format(str(type(arr))))
     if not isinstance(values, NDArray):
diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py
index 7e9d3934f1d7..5d8572e4c343 100644
--- a/python/mxnet/numpy/multiarray.py
+++ b/python/mxnet/numpy/multiarray.py
@@ -7426,16 +7426,15 @@ def insert(arr, obj, values, axis=None):
     ----------
     arr : ndarray
         Input array.
-    obj : int, slice or ndarray of ints
+    obj : int, slice or ndarray of int64
         Object that defines the index or indices before which `values` is
         inserted.
         Support for multiple insertions when `obj` is a single scalar or a
         sequence with one element (only support int32 and int64 element).
     values : ndarray
         Values to insert into `arr`.
-        The type of `values` should equal to the type of `arr`.
-        `values` should be shaped so that ``arr[...,obj,...] = values``
-        is legal.
+        If the type of values is different from that of arr, values is converted
+        to the type of arr.
     axis : int, optional
         Axis along which to insert `values`.  If `axis` is None then `arr`
         is flattened first.
@@ -7449,9 +7448,10 @@ def insert(arr, obj, values, axis=None):
 
     Notes
     -----
-    Note that for higher dimensional inserts `obj=0` behaves very different
+    - Note that for higher dimensional inserts `obj=0` behaves very different
     from `obj=[0]` just like `arr[:,0,:] = values` is different from
     `arr[:,[0],:] = values`.
+    - If `obj` is an ndarray, its dtype must be int64.
 
     Examples
     --------
@@ -7469,7 +7469,7 @@ def insert(arr, obj, values, axis=None):
 
     Difference between sequence and scalars:
 
-    >>> np.insert(a, np.array([1], dtype=np.int32), np.array([[1],[2],[3]]), axis=1)
+    >>> np.insert(a, np.array([1], dtype=np.int64), np.array([[1],[2],[3]]), axis=1)
     array([[1., 1., 1.],
            [2., 2., 2.],
            [3., 3., 3.]])
@@ -7487,11 +7487,12 @@ def insert(arr, obj, values, axis=None):
     >>> np.insert(b, slice(2, 4), np.array([5, 6]))
     array([1., 1., 5., 2., 6., 2., 3., 3.])
 
-    >>> np.insert(b, np.array([2, 2], dtype=np.int32), np.array([7.13, False]))
-    array([1.  , 1.  , 7.13, 0.  , 2.  , 2.  , 3.  , 3.  ])
+    # type casting
+    >>> np.insert(b.astype(np.int32), np.array([2, 2],dtype='int64'), np.array([7.13, False]))
+    array([1, 1, 7, 0, 2, 2, 3, 3], dtype=int32)
 
     >>> x = np.arange(8).reshape(2, 4)
-    >>> idx = np.array([1, 3], dtype=np.int32)
+    >>> idx = np.array([1, 3], dtype=np.int64)
     >>> np.insert(x, idx, np.array([999]), axis=1)
     array([[  0., 999.,   1.,   2., 999.,   3.],
            [  4., 999.,   5.,   6., 999.,   7.]])
diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py
index 905dfffffff5..979c58754460 100644
--- a/python/mxnet/symbol/numpy/_symbol.py
+++ b/python/mxnet/symbol/numpy/_symbol.py
@@ -2871,16 +2871,15 @@ def insert(arr, obj, values, axis=None):
     ----------
     arr : _Symbol
         Input array.
-    obj : int, slice or _Symbol of ints
+    obj : int, slice or _Symbol of int64
         Object that defines the index or indices before which `values` is
         inserted.
         Support for multiple insertions when `obj` is a single scalar or a
         sequence with one element (only support int32 and int64 element).
     values : _Symbol
         Values to insert into `arr`.
-        The type of `values` should equal to the type of `arr`.
-        `values` should be shaped so that ``arr[...,obj,...] = values``
-        is legal.
+        If the type of values is different from that of arr, values is converted
+        to the type of arr.
     axis : int, optional
         Axis along which to insert `values`.  If `axis` is None then `arr`
         is flattened first.
@@ -2894,10 +2893,21 @@ def insert(arr, obj, values, axis=None):
 
     Notes
     -----
-    Note that for higher dimensional inserts `obj=0` behaves very different
+    - Note that for higher dimensional inserts `obj=0` behaves very different
     from `obj=[0]` just like `arr[:,0,:] = values` is different from
     `arr[:,[0],:] = values`.
-    """
+    - If `obj` is a _Symbol, its dtype must be int64.
+    """
+    if isinstance(values, numeric_types):  # scalar `values` is forwarded as the `val` attribute
+        if isinstance(obj, slice):
+            start = obj.start
+            stop = obj.stop
+            step = 1 if obj.step is None else obj.step
+            return _npi.insert(arr, val=values, start=start, stop=stop, step=step, axis=axis)
+        elif isinstance(obj, integer_types):
+            return _npi.insert(arr, val=values, int_ind=obj, axis=axis)
+        elif isinstance(obj, Symbol):
+            return _npi.insert(arr, obj, val=values, axis=axis)
     if not isinstance(arr, ndarray): # pylint: disable= undefined-variable
         raise TypeError("'arr' can not support type {}".format(str(type(arr))))
     if not isinstance(values, ndarray): # pylint: disable= undefined-variable
diff --git a/src/operator/numpy/np_insert_op-inl.h b/src/operator/numpy/np_insert_op-inl.h
index 9679bbc2c5b0..29439e5110aa 100644
--- a/src/operator/numpy/np_insert_op-inl.h
+++ b/src/operator/numpy/np_insert_op-inl.h
@@ -29,6 +29,7 @@
 #include <memory>
 #include "../../common/utils.h"
 #include "../tensor/sort_op.h"
+#include "../tensor/init_op.h"
 #include "../operator_common.h"
 #include "../mxnet_op.h"
 
@@ -36,12 +37,16 @@ namespace mxnet {
 namespace op {
 
 struct NumpyInsertParam : public dmlc::Parameter<NumpyInsertParam> {
+  dmlc::optional<double> val;
   dmlc::optional<int> start;
   dmlc::optional<int> stop;
   dmlc::optional<int> step;
   dmlc::optional<int> int_ind;
   dmlc::optional<int> axis;
   DMLC_DECLARE_PARAMETER(NumpyInsertParam) {
+    DMLC_DECLARE_FIELD(val)
+    .set_default(dmlc::optional<double>())
+    .describe("A scaler to be inserted into 'array'");
     DMLC_DECLARE_FIELD(start)
     .set_default(dmlc::optional<int>())
     .describe("If 'obj' is slice, 'start' is one of it's arguments.");
@@ -61,30 +66,28 @@ struct NumpyInsertParam : public dmlc::Parameter<NumpyInsertParam> {
   }
 };
 
-namespace insert_ {
-  // insert 'values' to 'arr' according to 'obj'
-  enum InsertOpInputs {kArr, kValues, kObj};
-  enum InsertOpOutputs {kOut};
-}  // namespace insert_
-
-template<int req, int ndim>
+/*!
+ * \brief insert when obj is a scalar or a 'slice' with only one element.
+ * \tparam ndim - 'in_arr', 'in_val' and 'out_data' must already share the same ndim.
+ * \param out_data - output: insert 'value' to 'arr' according to 'index'.
+ * \param in_arr - input: 'arr', original array.
+ * \param index - input (first Map only): the only element of 'obj'; the insert position.
+ * \param in_obj - input (second Map only): the insert position; its ndim may be 0.
+ * \param in_val - input: 'value', inserted into 'arr' according to 'index'.
+ * \param N - (second Map only) arr.shape_[axis], used to wrap a negative index.
+ * \param numnew - extra dim size in 'out_data' compared with 'arr' in 'axis'.
+ * \param axis - insert 'value' to 'arr' in 'axis'.
+ * \param moveaxis - If 'obj' is a scalar, moveaxis is true;
+                     If 'obj' is a slice with one element, moveaxis is false.
+ * \note Difference between the two Map overloads:
+         The first one uses a scalar index;
+         The second one uses a sequence of indices which has only one index.
+ */
+template<int ndim>
 struct InsertSingleIndexForward {
-  /*!
-   * \brief insert when obj is 'scaler' or a 'slice' with only one element.
-   * \tparam xpu - cpu or gpu.
-   * \param out_data - output: insert 'value' to 'arr' according to 'index'.
-   * \param in_arr - input: 'arr', original array.
-   * \param index - input: it's the only element in 'obj' indicats insert position.
-   * \param in_val - input: 'value', insert to 'arr' according to 'index'.
-   * \param numnew - extra dim size in 'out_data' compared with 'arr' in 'axis'.
-   * \param axis - insert 'value' to 'arr' in 'axis'.
-   * \tparam ndim - both 'in_arr', 'in_val' and 'out_data' have same ndim before call this.
-   * \param moveaxis - If 'obj' is a scaler, moveaxis is true;
-                       If 'obj' is a slice with one element, moveaxis is false.
-   */
-  template<typename DType>
+  template<typename DType, typename VType>
   MSHADOW_XINLINE static void Map(int i, DType* out_data,
-                                  const DType* in_val, const DType* in_arr,
+                                  const VType* in_val, const DType* in_arr,
                                   const mshadow::Shape<ndim> outshape,
                                   const mshadow::Shape<ndim> valshape,
                                   const int index, const int numnew,
@@ -92,14 +95,15 @@ struct InsertSingleIndexForward {
                                   const mshadow::Shape<ndim> old_val_stride,
                                   const mshadow::Shape<ndim> arr_stride,
                                   const mshadow::Shape<ndim> out_stride,
-                                  const int axis,
-                                  bool moveaxis) {
+                                  const int axis, bool moveaxis, const int req) {
     // i is the global flattened index in the output
-    mshadow::Shape<ndim> out_idx = mxnet_op::unravel(i, outshape);  // i -> position in output's shape
+    // out_idx: i -> position in output's shape
+    mshadow::Shape<ndim> out_idx = mxnet_op::unravel(i, outshape);
     int64_t dest_idx;
     if (out_idx[axis] >= index && out_idx[axis] < index + numnew) {  // from 'value'
       int idx_val = out_idx[axis] - index;
-      mshadow::Shape<ndim> val_idx(out_idx);  // i -> position in values's shape
+      // val_idx: i -> position in values's shape
+      mshadow::Shape<ndim> val_idx(out_idx);
       val_idx[axis] = idx_val;
       for (int j = ndim - 1; j >= 0; --j) {
         if (valshape[j] == 1) {  // broadcast
@@ -118,53 +122,41 @@ struct InsertSingleIndexForward {
       } else {
         dest_idx = mxnet_op::dot(val_stride, val_idx);
       }
-      KERNEL_ASSIGN(out_data[i], req, in_val[dest_idx]);
+      KERNEL_ASSIGN(out_data[i], req, static_cast<DType>(in_val[dest_idx]));
     } else {  // from 'arr'
       int idx_arr = (out_idx[axis] < index) ?
                      out_idx[axis] : out_idx[axis] - numnew;
-      mshadow::Shape<ndim> arr_idx(out_idx);  // i -> position in arr's shape
+      // arr_idx: i -> position in arr's shape
+      mshadow::Shape<ndim> arr_idx(out_idx);
       arr_idx[axis] = idx_arr;
       dest_idx = mxnet_op::dot(arr_stride, arr_idx);
-
       KERNEL_ASSIGN(out_data[i], req, in_arr[dest_idx]);
     }
   }
-  /*!
-   * \brief insert when obj is 'tensor' with only one element.
-   * \tparam xpu - cpu or gpu.
-   * \param out_data - output: insert 'value' to 'arr' according to 'index'.
-   * \param in_arr - input: 'arr', original array.
-   * \param in_obj - input: It indicats insert position, ndim may equals to 0.
-   * \param in_val - input: 'value', insert to 'arr' according to 'index'.
-   * \param N - arr.shape_[axis]
-   * \param numnew - extra dim size in 'out_data' compared with 'arr' in 'axis'.
-   * \param axis - insert 'value' to 'arr' in 'axis'.
-   * \tparam ndim - both 'in_arr', 'in_val' and 'out_data' have same ndim before call this.
-   * \param moveaxis - If 'obj' is a tensor with ndim == 0, regard it as a scaler and moveaxis is true;
-                       If 'obj' is a tensor with  ndim > 0 but has only one element, moveaxis is false.
-   */
-  template<typename DType, typename IType>
+
+  template<typename DType, typename VType>
   MSHADOW_XINLINE static void Map(int i, DType* out_data,
-                                  const DType* in_val, const DType* in_arr,
+                                  const VType* in_val, const DType* in_arr,
                                   const mshadow::Shape<ndim> outshape,
                                   const mshadow::Shape<ndim> valshape,
-                                  const int N, const IType* in_obj, const int numnew,
+                                  const int N, const int64_t* in_obj, const int numnew,
                                   const mshadow::Shape<ndim> val_stride,
                                   const mshadow::Shape<ndim> old_val_stride,
                                   const mshadow::Shape<ndim> arr_stride,
                                   const mshadow::Shape<ndim> out_stride,
-                                  const int axis,
-                                  bool moveaxis) {
+                                  const int axis, bool moveaxis, const int req) {
     // i is the global flattened index in the output
-    mshadow::Shape<ndim> out_idx= mxnet_op::unravel(i, outshape);  // i -> position in output's shape
+    // out_idx: i -> position in output's shape
+    mshadow::Shape<ndim> out_idx = mxnet_op::unravel(i, outshape);
     int64_t dest_idx;
-    IType index = in_obj[0];
+    int64_t index = in_obj[0];
     if (static_cast<int64_t>(index) < 0) {
-      index += static_cast<IType>(N);
+      index += static_cast<int64_t>(N);
     }
     if (out_idx[axis] >= index && out_idx[axis] < index + numnew) {  // from 'value'
       int idx_val = out_idx[axis] - index;
-      mshadow::Shape<ndim> val_idx(out_idx);  // i -> position in values's shape
+      // val_idx: i -> position in values's shape
+      mshadow::Shape<ndim> val_idx(out_idx);
       val_idx[axis] = idx_val;
       for (int j = ndim - 1; j >= 0; --j) {
         if (valshape[j] == 1) {  // broadcast
@@ -183,12 +175,12 @@ struct InsertSingleIndexForward {
       } else {
         dest_idx = mxnet_op::dot(val_stride, val_idx);
       }
-      KERNEL_ASSIGN(out_data[i], req, in_val[dest_idx]);
+      KERNEL_ASSIGN(out_data[i], req, static_cast<DType>(in_val[dest_idx]));
     } else {  // from 'arr'
       int idx_arr = (out_idx[axis] < index) ? out_idx[axis] : out_idx[axis] - numnew;
-      mshadow::Shape<ndim> arr_idx(out_idx);  // i -> position in arr's shape
+      // arr_idx: i -> position in arr's shape
+      mshadow::Shape<ndim> arr_idx(out_idx);
       arr_idx[axis] = idx_arr;
-      //dest_idx = 0;
       dest_idx = mxnet_op::dot(arr_stride, arr_idx);
       KERNEL_ASSIGN(out_data[i], req, in_arr[dest_idx]);
     }
@@ -215,23 +207,25 @@ inline mshadow::Shape<ndim> GetKernelShape(const mxnet::TShape& shape) {
   return k_shape;
 }
 
-template<int req, int ndim>
-struct InsertSeqForward {
-  /*!
-   * \brief insert when obj is 'tensor' or 'slice' with more than one element.
-   * \tparam xpu - cpu or gpu.
-   * \param out_data - output: insert 'value' to 'arr' according to 'index'.
-   * \param in_arr - input: 'arr', original array.
-   * \param in_obj - input: It indicats insert position, ndim may equals to 0.
-   * \param in_val - input: 'value', insert to 'arr' according to 'index'.
-   * \param is_insert - if is_insert[out_idx[axis]] is true, it's from 'values', else from 'arr'.
-   * \param origin_idx - indicate the original position in 'arr' or 'values' in 'axis'. 
-   * \param axis - insert 'value' to 'arr' in 'axis'.
-   * \tparam ndim - both 'in_arr', 'in_val' and 'out_data' have same ndim before call this.
-   */
-  template<typename DType>
+/*!
+ * \brief insert when obj is 'tensor' or 'slice' with more than one element.
+ * \tparam ndim - 'in_arr', 'in_val' and 'out_data' must already share the same ndim.
+ * \param out_data - output: insert 'value' to 'arr' according to 'index'.
+ * \param in_arr - input: 'arr', original array.
+ * \param in_obj - input: indicates the insert position; its ndim may be 0.
+ * \param in_val - input: 'value', inserted into 'arr' according to 'index'.
+ * \param is_insert - if is_insert[out_idx[axis]] is true, it's from 'values', else from 'arr'.
+ * \param origin_idx - indicates the original position in 'arr' or 'values' in 'axis'.
+ * \param axis - insert 'value' to 'arr' in 'axis'.
+ * \note Difference between the two Map overloads:
+         The first one inserts a block of data, param 'in_val' is a tensor;
+         The second one inserts only a single value, param 'in_val' is a scalar.
+ */
+template<int ndim>
+struct InsertSeqIndicesForward {
+  template<typename DType, typename VType>
   MSHADOW_XINLINE static void Map(int i, DType* out_data,
-                                  const DType* in_val, const DType* in_arr,
+                                  const VType* in_val, const DType* in_arr,
                                   const mshadow::Shape<ndim> outshape,
                                   const mshadow::Shape<ndim> valshape,
                                   const int* is_insert,
@@ -239,25 +233,57 @@ struct InsertSeqForward {
                                   const mshadow::Shape<ndim> val_stride,
                                   const mshadow::Shape<ndim> arr_stride,
                                   const mshadow::Shape<ndim> out_stride,
-                                  const int axis) {
+                                  const int axis, const int req) {
     // i is the global flattened index in the output
-    mshadow::Shape<ndim> out_idx = mxnet_op::unravel(i, outshape);  // i -> position in output's shape
+    // out_idx: i -> position in output's shape
+    mshadow::Shape<ndim> out_idx = mxnet_op::unravel(i, outshape);
     int64_t dest_idx;
-    if (is_insert[out_idx[axis]]) {  // from 'values'
+    if (is_insert[out_idx[axis]]) {
+      // the data of output[i] is from 'values'
       int idx_val = origin_idx[out_idx[axis]];
-      mshadow::Shape<ndim> insert_idx(out_idx);  // i -> position in insert's shape
+      // insert_idx: i -> position in insert's shape
+      mshadow::Shape<ndim> insert_idx(out_idx);
       insert_idx[axis] = idx_val;
-      mshadow::Shape<ndim> val_idx(insert_idx);  // i -> position in values's shape
+      // val_idx: i -> position in values's shape
+      mshadow::Shape<ndim> val_idx(insert_idx);
       for (int j = ndim - 1; j >= 0; --j) {  // broadcast
         if (valshape[j] == 1) {
           val_idx[j] = 0;
         }
       }
       dest_idx = mxnet_op::dot(val_idx, val_stride);
-      KERNEL_ASSIGN(out_data[i], req, in_val[dest_idx]);
-    } else {  // from 'arr'
+      KERNEL_ASSIGN(out_data[i], req, static_cast<DType>(in_val[dest_idx]));
+    } else {
+      // the data of output[i] is from 'arr'
       int idx_arr = origin_idx[out_idx[axis]];
-      mshadow::Shape<ndim> arr_idx(out_idx);  // i -> position in arr's shape
+      // arr_idx: i -> position in arr's shape
+      mshadow::Shape<ndim> arr_idx(out_idx);
+      arr_idx[axis] = idx_arr;
+      dest_idx = mxnet_op::dot(arr_idx, arr_stride);
+      KERNEL_ASSIGN(out_data[i], req, in_arr[dest_idx]);
+    }
+  }
+
+  template<typename DType, typename VType>
+  MSHADOW_XINLINE static void Map(int i, DType* out_data,
+                                  const VType in_val, const DType* in_arr,
+                                  const mshadow::Shape<ndim> outshape,
+                                  const int* is_insert,
+                                  const int* origin_idx,
+                                  const mshadow::Shape<ndim> arr_stride,
+                                  const mshadow::Shape<ndim> out_stride,
+                                  const int axis, const int req) {
+    // i is the global flattened index in the output
+    // out_idx: i -> position in output's shape
+    mshadow::Shape<ndim> out_idx = mxnet_op::unravel(i, outshape);
+    int64_t dest_idx;
+    if (is_insert[out_idx[axis]]) {
+      KERNEL_ASSIGN(out_data[i], req, static_cast<DType>(in_val));
+    } else {
+      // the data of output[i] is from 'arr'
+      int idx_arr = origin_idx[out_idx[axis]];
+      // arr_idx: i -> position in arr's shape
+      mshadow::Shape<ndim> arr_idx(out_idx);
       arr_idx[axis] = idx_arr;
       dest_idx = mxnet_op::dot(arr_idx, arr_stride);
       KERNEL_ASSIGN(out_data[i], req, in_arr[dest_idx]);
@@ -266,23 +292,21 @@ struct InsertSeqForward {
 };
 
 struct SliceToIndices {
-  template<typename IType>
-  MSHADOW_XINLINE static void Map(int i, IType* indices, int N,
+  MSHADOW_XINLINE static void Map(int i, int64_t* indices, int N,
                                   int start, int step) {
     indices[i] = start + i * step;
     if (indices[i] < 0) {
-      indices[i] += static_cast<IType>(N);
+      indices[i] += static_cast<int64_t>(N);
     }
   }
 };
 
 struct ObjToIndices {
-  template<typename IType>
-  MSHADOW_XINLINE static void Map(int i, IType* indices,
-                                  int N, const IType* obj) {
+  MSHADOW_XINLINE static void Map(int i, int64_t* indices,
+                                  int N, const int64_t* obj) {
     indices[i] = obj[i];
     if (indices[i] < 0) {
-      indices[i] += static_cast<IType>(N);
+      indices[i] += static_cast<int64_t>(N);
     }
   }
 };
@@ -294,22 +318,19 @@ struct AssignId {
 };
 
 struct IndicesModify {
-  template<typename IType>
-  MSHADOW_XINLINE static void Map(int i, IType* indices, const int* order) {
+  MSHADOW_XINLINE static void Map(int i, int64_t* indices, const int* order) {
     indices[order[i]] += i;
   }
 };
 
 struct SetIsInsert {
-  template<typename IType>
-  MSHADOW_XINLINE static void Map(int i, IType* indices, int* is_insert) {
+  MSHADOW_XINLINE static void Map(int i, int64_t* indices, int* is_insert) {
     is_insert[static_cast<int>(indices[i])] = 1;
   }
 };
 
 struct SetOriginValuesIdx {
-  template<typename IType>
-  MSHADOW_XINLINE static void Map(int i, const IType* indices, int* origin_idx) {
+  MSHADOW_XINLINE static void Map(int i, const int64_t* indices, int* origin_idx) {
     origin_idx[static_cast<int>(indices[i])] = i;
   }
 };
@@ -329,6 +350,47 @@ struct SetOriginArrIdx {
   }
 };
 
+/*!
+ * \brief Equivalent to Python's slice.indices(range), plus the resulting element count.
+ * \param pstart - slice.start (may be empty)
+ * \param pstop - slice.stop (may be empty)
+ * \param pstep - slice.step (treated as 1 when empty; must not be 0)
+ * \param range - length of the axis the slice is applied to
+ * \param[out] start,stop,step - normalized values of slice.indices(range)
+ * \param[out] tot - number of indices in the slice; always written (0 when empty)
+ */
+inline void SliceIndices(const dmlc::optional<int>& pstart,
+                         const dmlc::optional<int>& pstop,
+                         const dmlc::optional<int>& pstep,
+                         const int range,
+                         int* start, int* stop, int* step,
+                         size_t* tot) {
+  *tot = 0;  // stays 0 when the slice direction yields no indices
+  *step = pstep.has_value() ? pstep.value() : 1;
+  CHECK_NE(*step, 0) << "'step' can not equal to 0.";
+  if (pstop.has_value()) {
+    *stop = pstop.value();
+    *stop += (*stop < 0) ? range : 0;
+    *stop = (*stop < 0) ? ((*step < 0) ? -1 : 0) : *stop;
+    *stop = (*stop >= range) ? ((*step < 0) ? range - 1 : range) : *stop;
+  } else {
+    *stop = (*step > 0) ? range : -1;
+  }
+  if (pstart.has_value()) {
+    *start = pstart.value();
+    *start += (*start < 0) ? range : 0;
+    *start = (*start < 0) ? ((*step < 0) ? -1 : 0) : *start;
+    *start = (*start >= range) ? ((*step < 0) ? range - 1 : range) : *start;
+  } else {
+    *start = (*step > 0) ? 0 : range - 1;
+  }
+  if (*step > 0 && *stop >= *start) {
+    *tot = static_cast<size_t>((*stop - *start + *step - 1) / *step);
+  } else if (*step < 0 && *stop <= *start) {
+    *tot = static_cast<size_t>((*stop - *start + *step + 1) / *step);
+  }
+}
+
 template<typename xpu>
 void NumpyInsertCompute(const nnvm::NodeAttrs& attrs,
                         const OpContext& ctx,
@@ -339,27 +401,42 @@ void NumpyInsertCompute(const nnvm::NodeAttrs& attrs,
   using namespace mxnet_op;
 
   const NumpyInsertParam& param = nnvm::get<NumpyInsertParam>(attrs.parsed);
-  CHECK_EQ(inputs.size(),
-           (param.step.has_value() || param.int_ind.has_value()) ? 2U : 3U);
-  CHECK_EQ(outputs.size(), 1U);
-  CHECK_EQ(req.size(), 1U);
+  int input_count = param.val.has_value() ? 1 : 2;
+  int insize = (param.step.has_value() || param.int_ind.has_value()) ?
+               input_count : input_count + 1;
+  bool obj_is_tensor = (param.val.has_value() && insize == 2) ||
+                       (!param.val.has_value() && insize == 3);
+  CHECK_EQ(inputs.size(), insize);
+  CHECK_EQ(outputs.size(), 1);
+  CHECK_EQ(req.size(), 1);
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  int ndim = inputs[insert_::kArr].shape_.ndim();
+  const int arr_pos = 0;
+  const int val_pos = param.val.has_value() ? 0 : 1;
+  const int obj_pos = val_pos + 1;
+  const int out_pos = 0;
+  int ndim = inputs[arr_pos].shape_.ndim();
   int axis = param.axis.has_value() ? param.axis.value() : 0;
   TBlob arr;
-  TBlob values = inputs[insert_::kValues];
+  TBlob values = param.val.has_value() ?
+                 TBlob(nullptr, mxnet::TShape(0, 1), xpu::kDevMask, outputs[out_pos].type_flag_) :
+                 inputs[val_pos];
   if (!param.axis.has_value()) {
-    arr = inputs[insert_::kArr].reshape(Shape1(inputs[insert_::kArr].shape_.Size()));
+    arr = inputs[arr_pos].reshape(Shape1(inputs[arr_pos].shape_.Size()));
     ndim = 1;
   } else if (ndim == 0) {
-    arr = inputs[insert_::kArr];
-    CHECK_EQ(inputs[insert_::kValues].shape_.ndim(), 0)
-      << "'arr' is a 0-d array, 'values' can not assign to it. "
-      << "alueError: assignment to 0-d array.";
-    mxnet_op::copy(s, outputs[insert_::kOut], inputs[insert_::kValues]);
+    if (param.val.has_value()) {  // scalar 'values': no values tensor among the inputs
+      MSHADOW_TYPE_SWITCH(outputs[out_pos].type_flag_, DType, {
+        Fill(s, outputs[out_pos], req[0], static_cast<DType>(param.val.value()));
+      });
+    } else {
+      CHECK_EQ(inputs[val_pos].shape_.ndim(), 0)
+        << "'arr' is a 0-d array, 'values' can not assign to it. "
+        << "alueError: assignment to 0-d array.";
+      mxnet_op::copy(s, outputs[out_pos], inputs[val_pos]);
+    }
     return;
   } else {
-    arr = inputs[insert_::kArr];
+    arr = inputs[arr_pos];
     CHECK(axis >= -1 * arr.shape_.ndim() && axis < arr.shape_.ndim())
       << "Axis should be in the range of [-r, r-1] where r is the rank of input tensor";
     axis += (axis < 0) ? arr.shape_.ndim() : 0;
@@ -367,41 +444,18 @@ void NumpyInsertCompute(const nnvm::NodeAttrs& attrs,
 
   int N = arr.shape_[axis];
   size_t indices_len = 0;  // indices amount
-  int start = 0, stop = 0, step = 0;  // arguments when 'obj' is 'slice'
+  int start = 0, stop = 0, step = 0;  // arguments from 'obj' when it's 'slice'
 
   // get and check indices from slice or sequence of ints
-  if (inputs.size() == 3U) {  // indices from 'tensor'
-    indices_len = inputs[insert_::kObj].shape_.Size();
+  if (obj_is_tensor) {  // indices from 'tensor'
+    indices_len = inputs[obj_pos].shape_.Size();
   } else if (param.step.has_value()) {  // indices from 'slice'
-    step = param.step.value();
-    CHECK_NE(step, 0) << "'step' can not equal to 0.";
-    if (param.stop.has_value()) {
-      stop = param.stop.value();
-      stop += (stop < 0) ? N : 0;
-      stop = (stop < 0) ? ((step < 0) ? -1 : 0) : stop;
-      stop = (stop >= N) ? ((step < 0) ? N - 1 : N) : stop;
-    } else {
-      stop = (step > 0) ? N : -1;
-    }
-    if (param.start.has_value()) {
-      start = param.start.value();
-      start += (start < 0) ? N : 0;
-      start = (start < 0) ? ((step < 0) ? -1 : 0) : start;
-      start = (start >= N) ? ((step < 0) ? N - 1 : N) : start;
-    } else {
-      start = (step > 0) ? 0 : N - 1;
-    }
-    int seq_cnt = 0;
-    if (step > 0 && stop >= start) {
-      seq_cnt = (stop - start + step - 1) / step;
-    } else if (step < 0 && stop <= start) {
-      seq_cnt = (stop - start + step + 1) / step;
-    }
-    indices_len = static_cast<size_t>(seq_cnt);
+    SliceIndices(param.start, param.stop, param.step,
+                 N, &start, &stop, &step, &indices_len);
   }
 
-  int numnew = 0;  // output.shape[axis] - arr.shape[axis]
-  int index = 0;  // modified index
+  int numnew = 0;  // numnew = output.shape[axis] - arr.shape[axis]
+  int index = 0;  // save modified index, because index may be negative integer
   mxnet::TShape val_newshape(arr.shape_.ndim(), -1);
   // modify values's ndim to arr's ndim, for broadcast easily later
   // e.g. value shape: (2,) arr shape: (3, 2) => value shape: (1, 2)
@@ -421,7 +475,7 @@ void NumpyInsertCompute(const nnvm::NodeAttrs& attrs,
   // get numnew
   mxnet::TShape old_valshape(values.shape_);
   if (param.int_ind.has_value() ||
-    (inputs.size() == 3U && inputs[insert_::kObj].shape_.ndim() == 0)) {  // scaler
+    (obj_is_tensor && inputs[obj_pos].shape_.ndim() == 0)) {  // scaler
     if (param.int_ind.has_value()) {
       index = param.int_ind.value();
       CHECK(index >= -1 * N && index <= N)
@@ -461,7 +515,7 @@ void NumpyInsertCompute(const nnvm::NodeAttrs& attrs,
     numnew = static_cast<int>(indices_len);
   }
 
-  const mxnet::TShape& outshape = outputs[insert_::kOut].shape_;
+  const mxnet::TShape& outshape = outputs[out_pos].shape_;
   MXNET_NDIM_SWITCH(outshape.ndim(), ndim, {
     mshadow::Shape<ndim> arr_strides = mxnet_op::calc_stride(arr.shape_.get<ndim>());
     mshadow::Shape<ndim> val_strides = mxnet_op::calc_stride(values.shape_.get<ndim>());
@@ -469,107 +523,110 @@ void NumpyInsertCompute(const nnvm::NodeAttrs& attrs,
     mshadow::Shape<ndim> out_strides = mxnet_op::calc_stride(outshape.get<ndim>());
     mshadow::Shape<ndim> k_outshape = outshape.get<ndim>();
     mshadow::Shape<ndim> k_valshape = values.shape_.get<ndim>();
-    MXNET_ASSIGN_REQ_SWITCH(req[insert_::kOut], req_type, {
-      MSHADOW_TYPE_SWITCH(outputs[insert_::kOut].type_flag_, DType, {
-        MSHADOW_TYPE_SWITCH((inputs.size() == 3U) ?
-                            inputs[insert_::kObj].type_flag_ :
-                            mshadow::DataType<int64_t>::kFlag, IType, {
-          if (param.int_ind.has_value()) {
-            Kernel<InsertSingleIndexForward<req_type, ndim>, xpu>::Launch(
-              s, outshape.Size(),
-              outputs[insert_::kOut].dptr<DType>(),
-              values.dptr<DType>(), arr.dptr<DType>(),
-              k_outshape, k_valshape, index, numnew,
-              val_strides, old_val_strides, arr_strides,
-              out_strides,
-              axis, true);
-          } else if (inputs.size() == 3U && inputs[insert_::kObj].shape_.ndim() == 0) {
-            Kernel<InsertSingleIndexForward<req_type, ndim>, xpu>::Launch(
-              s, outshape.Size(),
-              outputs[insert_::kOut].dptr<DType>(),
-              values.dptr<DType>(), arr.dptr<DType>(),
-              k_outshape, k_valshape, N,
-              inputs[insert_::kObj].dptr<IType>(), numnew,
-              val_strides, old_val_strides, arr_strides,
-              out_strides,
-              axis, true);
-          } else if (indices_len == 1) {
-            if (param.step.has_value()) {
-              Kernel<InsertSingleIndexForward<req_type, ndim>, xpu>::Launch(
-                s, outshape.Size(),
-                outputs[insert_::kOut].dptr<DType>(),
-                values.dptr<DType>(), arr.dptr<DType>(),
-                k_outshape, k_valshape, start, numnew,
-                val_strides, old_val_strides, arr_strides, out_strides,
-                axis, false);
-            } else {
-              Kernel<InsertSingleIndexForward<req_type, ndim>, xpu>::Launch(
-                s, outshape.Size(),
-                outputs[insert_::kOut].dptr<DType>(),
-                values.dptr<DType>(), arr.dptr<DType>(),
-                k_outshape, k_valshape,
-                N, inputs[insert_::kObj].dptr<IType>(), numnew,
-                val_strides, old_val_strides,
-                arr_strides, out_strides,
-                axis, false);
-            }
+    int vtype = param.val.has_value() ?
+                mshadow::DataType<double>::kFlag :
+                inputs[val_pos].type_flag_;
+    MSHADOW_TYPE_SWITCH(outputs[out_pos].type_flag_, DType, {
+      MSHADOW_TYPE_SWITCH(vtype, VType, {
+        if ((param.int_ind.has_value() ||
+            (obj_is_tensor && inputs[obj_pos].shape_.ndim() == 0) ||
+            (indices_len == 1)) &&
+            param.val.has_value()) {
+          // Single-index insert where 'values' is passed as a numeric parameter
+          values = TBlob(ctx.requested[0].get_space_typed<xpu, 1, VType>(Shape1(1), s));
+          Fill(s, values, kWriteTo, param.val.value());
+        }
+        if (param.int_ind.has_value()) {
+          // 'obj' is integer, need to moveaxis
+          Kernel<InsertSingleIndexForward<ndim>, xpu>::Launch(
+            s, outshape.Size(), outputs[out_pos].dptr<DType>(),
+            values.dptr<VType>(), arr.dptr<DType>(),
+            k_outshape, k_valshape, index, numnew,
+            val_strides, old_val_strides, arr_strides, out_strides,
+            axis, true, req[out_pos]);
+        } else if (obj_is_tensor && inputs[obj_pos].shape_.ndim() == 0) {
+          // 'obj' is a 0-dim tensor; moveaxis is needed here as well
+            Kernel<InsertSingleIndexForward<ndim>, xpu>::Launch(
+              s, outshape.Size(), outputs[out_pos].dptr<DType>(),
+              values.dptr<VType>(), arr.dptr<DType>(),
+              k_outshape, k_valshape, N, inputs[obj_pos].dptr<int64_t>(), numnew,
+              val_strides, old_val_strides, arr_strides, out_strides,
+              axis, true, req[out_pos]);
+        } else if (indices_len == 1) {
+          if (param.step.has_value()) {
+            Kernel<InsertSingleIndexForward<ndim>, xpu>::Launch(
+              s, outshape.Size(), outputs[out_pos].dptr<DType>(),
+              values.dptr<VType>(), arr.dptr<DType>(),
+              k_outshape, k_valshape, start, numnew,
+              val_strides, old_val_strides, arr_strides, out_strides,
+              axis, false, req[out_pos]);
           } else {
-            // broadcast check
-            for (int i = outshape.ndim() - 1; i >= 0; --i) {
-              int sz = outshape[i];
-              if (i == axis) {
-                sz = numnew;
-              }
-              CHECK((values.shape_[i] == 1) || (values.shape_[i] == sz));
-            }
-            size_t temp_storage_bytes, temp_mem_size;
-            temp_storage_bytes = SortByKeyWorkspaceSize<IType, int, xpu>(indices_len, false, true);
-            temp_mem_size = indices_len * sizeof(IType) * 2 +
-                            indices_len * sizeof(int) +
-                            outshape[axis] * sizeof(int) * 2 +
-                            temp_storage_bytes;
-            Tensor<xpu, 1, char> temp_mem =
-              ctx.requested[0].get_space_typed<xpu, 1, char>(Shape1(temp_mem_size), s);
-            IType* indices_ptr = reinterpret_cast<IType*>(temp_mem.dptr_);
-            IType* sorted_indices_ptr =
-              reinterpret_cast<IType*>(temp_mem.dptr_ + indices_len * sizeof(IType));
-            int* order_ptr = reinterpret_cast<int*>(temp_mem.dptr_ + indices_len * sizeof(IType) * 2);
-            int* is_insert = reinterpret_cast<int*>(temp_mem.dptr_ + indices_len * sizeof(IType) * 2 +
-                                                    indices_len * sizeof(int));
-            int* origin_idx = reinterpret_cast<int*>(temp_mem.dptr_ +  indices_len * sizeof(IType) * 2 +
-                                                    indices_len * sizeof(int) + outshape[axis] * sizeof(int));
-            Tensor<xpu, 1, char> temp_storage(temp_mem.dptr_ +  indices_len * sizeof(IType) * 2 +
-                                              indices_len * sizeof(int) + outshape[axis] * sizeof(int) * 2,
-                                              Shape1(temp_storage_bytes), s);
-            Tensor<xpu, 1, IType> indices(indices_ptr, Shape1(indices_len), s);
-            Tensor<xpu, 1, IType> sorted_indices(sorted_indices_ptr, Shape1(indices_len), s);
-            Tensor<xpu, 1, int> order(order_ptr, Shape1(indices_len), s);
-            int num_bits = common::ilog2ui(static_cast<unsigned int>(indices_len) - 1);
-            if (param.step.has_value()) {
-              Kernel<SliceToIndices, xpu>::Launch(s, indices_len, indices_ptr, N, start, step);
-            } else {
-              Kernel<ObjToIndices, xpu>::Launch(s, indices_len, indices_ptr, N,
-                                                inputs[insert_::kObj].dptr<IType>());
+            Kernel<InsertSingleIndexForward<ndim>, xpu>::Launch(
+              s, outshape.Size(), outputs[out_pos].dptr<DType>(),
+              values.dptr<VType>(), arr.dptr<DType>(),
+              k_outshape, k_valshape, N, inputs[obj_pos].dptr<int64_t>(), numnew,
+              val_strides, old_val_strides, arr_strides, out_strides,
+              axis, false, req[out_pos]);
+          }
+        } else {
+          // broadcast check
+          for (int i = outshape.ndim() - 1; i >= 0; --i) {
+            int sz = outshape[i];
+            if (i == axis) {
+              sz = numnew;
             }
-            Kernel<AssignId, xpu>::Launch(s, indices_len, order_ptr);
-            mxnet::op::SortByKey(indices, order, true, &temp_storage, 0, num_bits, &sorted_indices);
-            Kernel<IndicesModify, xpu>::Launch(s, indices_len, indices_ptr, order_ptr);
-
-            mxnet_op::Kernel<mxnet_op::set_zero, xpu>::Launch(s, outshape[axis], is_insert);
-            Kernel<SetIsInsert, xpu>::Launch(s, indices_len, indices_ptr, is_insert);
+            CHECK((values.shape_[i] == 1) || (values.shape_[i] == sz));
+          }
+          size_t temp_storage_bytes, temp_mem_size;
+          temp_storage_bytes = SortByKeyWorkspaceSize<int64_t, int, xpu>(indices_len, false, true);
+          temp_mem_size = indices_len * sizeof(int64_t) * 2 +
+                          indices_len * sizeof(int) +
+                          outshape[axis] * sizeof(int) * 2 +
+                          temp_storage_bytes;
+          Tensor<xpu, 1, char> temp_mem =
+            ctx.requested[0].get_space_typed<xpu, 1, char>(Shape1(temp_mem_size), s);
+          int64_t* indices_ptr = reinterpret_cast<int64_t*>(temp_mem.dptr_);
+          int64_t* sorted_indices_ptr = reinterpret_cast<int64_t*>(indices_ptr + indices_len);
+          int* order_ptr = reinterpret_cast<int*>(sorted_indices_ptr + indices_len);
+          int* is_insert = reinterpret_cast<int*>(order_ptr + indices_len);
+          int* origin_idx = reinterpret_cast<int*>(is_insert + outshape[axis]);
+          Tensor<xpu, 1, char> temp_storage(reinterpret_cast<char*>(origin_idx + outshape[axis]),
+                                            Shape1(temp_storage_bytes), s);
+          Tensor<xpu, 1, int64_t> indices(indices_ptr, Shape1(indices_len), s);
+          Tensor<xpu, 1, int64_t> sorted_indices(sorted_indices_ptr, Shape1(indices_len), s);
+          Tensor<xpu, 1, int> order(order_ptr, Shape1(indices_len), s);
+          int num_bits = common::ilog2ui(static_cast<unsigned int>(indices_len) - 1);
+          if (param.step.has_value()) {
+            Kernel<SliceToIndices, xpu>::Launch(s, indices_len, indices_ptr, N, start, step);
+          } else {
+            Kernel<ObjToIndices, xpu>::Launch(s, indices_len, indices_ptr, N,
+                                              inputs[obj_pos].dptr<int64_t>());
+          }
+          Kernel<AssignId, xpu>::Launch(s, indices_len, order_ptr);
+          mxnet::op::SortByKey(indices, order, true, &temp_storage, 0, num_bits, &sorted_indices);
+          Kernel<IndicesModify, xpu>::Launch(s, indices_len, indices_ptr, order_ptr);
 
-            Kernel<SetOriginValuesIdx, xpu>::Launch(s, indices_len, indices_ptr, origin_idx);
-            Kernel<SetOriginArrIdx, xpu>::Launch(s, outshape[axis], is_insert, origin_idx);
+          mxnet_op::Kernel<mxnet_op::set_zero, xpu>::Launch(s, outshape[axis], is_insert);
+          Kernel<SetIsInsert, xpu>::Launch(s, indices_len, indices_ptr, is_insert);
 
-            Kernel<InsertSeqForward<req_type, ndim>, xpu>::Launch(
+          Kernel<SetOriginValuesIdx, xpu>::Launch(s, indices_len, indices_ptr, origin_idx);
+          Kernel<SetOriginArrIdx, xpu>::Launch(s, outshape[axis], is_insert, origin_idx);
+          if (param.val.has_value()) {
+            Kernel<InsertSeqIndicesForward<ndim>, xpu>::Launch(
               s, outshape.Size(),
-              outputs[insert_::kOut].dptr<DType>(),
-              values.dptr<DType>(), arr.dptr<DType>(),
+              outputs[out_pos].dptr<DType>(),
+              param.val.value(), arr.dptr<DType>(),
+              k_outshape, is_insert, origin_idx,
+              arr_strides, out_strides, axis, req[out_pos]);
+          } else {
+            Kernel<InsertSeqIndicesForward<ndim>, xpu>::Launch(
+              s, outshape.Size(),
+              outputs[out_pos].dptr<DType>(),
+              values.dptr<VType>(), arr.dptr<DType>(),
               k_outshape, k_valshape, is_insert, origin_idx,
-              val_strides, arr_strides, out_strides,
-              axis);
+              val_strides, arr_strides, out_strides, axis, req[out_pos]);
           }
-        });
+        }
       });
     });
   });
diff --git a/src/operator/numpy/np_insert_op.cc b/src/operator/numpy/np_insert_op.cc
index e825ab40577b..2fef5f881f5d 100644
--- a/src/operator/numpy/np_insert_op.cc
+++ b/src/operator/numpy/np_insert_op.cc
@@ -35,32 +35,41 @@ bool NumpyInsertType(const nnvm::NodeAttrs& attrs,
                      std::vector<int> *in_type,
                      std::vector<int> *out_type) {
   const NumpyInsertParam& param = nnvm::get<NumpyInsertParam>(attrs.parsed);
-  int insize = (param.step.has_value() || param.int_ind.has_value()) ? 2 : 3;
+  int input_count = param.val.has_value() ? 1 : 2;
+  int insize = (param.step.has_value() || param.int_ind.has_value()) ?
+               input_count : input_count + 1;
+  bool obj_is_tensor = !param.step.has_value() && !param.int_ind.has_value();
   CHECK_EQ(in_type->size(), insize);
   CHECK_EQ(out_type->size(), 1U);
-  if (insize == 3) {
-    CHECK_NE((*in_type)[2], -1) << "Index type must be set for insert operator\n";
-    CHECK(((*in_type)[2] == mshadow::DataType<int64_t>::kFlag) ||
-          ((*in_type)[2] == mshadow::DataType<int32_t>::kFlag))
-      << "Index type only support int32 or int64.\n";
+  if (obj_is_tensor) {
+    int obj_pos = input_count;
+    CHECK_NE((*in_type)[obj_pos], -1) << "Index type must be set for insert operator\n";
+    CHECK_EQ((*in_type)[obj_pos], mshadow::DataType<int64_t>::kFlag)
+      << "Index type only support int64.\n";
   }
-  TYPE_ASSIGN_CHECK(*out_type, 0, (*in_type)[0]);
-  TYPE_ASSIGN_CHECK(*out_type, 0, (*in_type)[1]);
+  TYPE_ASSIGN_CHECK(*out_type, 0, (*in_type)[0]);  // output type equals the type of input arr
   TYPE_ASSIGN_CHECK(*in_type, 0, (*out_type)[0]);
   return (*in_type)[0] != -1;
 }
 
 bool NumpyInsertShape(const nnvm::NodeAttrs& attrs,
-                            mxnet::ShapeVector *in_shape,
-                            mxnet::ShapeVector *out_shape) {
+                      mxnet::ShapeVector *in_shape,
+                      mxnet::ShapeVector *out_shape) {
   using namespace mshadow;
   const NumpyInsertParam& param = nnvm::get<NumpyInsertParam>(attrs.parsed);
-  CHECK_EQ(in_shape->size(),
-    (param.step.has_value() || param.int_ind.has_value()) ? 2U : 3U);
-  mxnet::TShape &arrshape = (*in_shape)[insert_::kArr];
-  mxnet::TShape &valshape = (*in_shape)[insert_::kValues];
-  mxnet::TShape &objShape = (*in_shape)[insert_::kObj];
-  if (in_shape->size() == 3U) {
+  int input_count = param.val.has_value() ? 1 : 2;
+  int insize = (param.step.has_value() || param.int_ind.has_value()) ?
+               input_count : input_count + 1;
+  bool obj_is_tensor = !param.step.has_value() && !param.int_ind.has_value();
+  const int arr_pos = 0;
+  const int val_pos = param.val.has_value() ? 0 : 1;
+  const int obj_pos = val_pos + 1;
+  CHECK_EQ(in_shape->size(), insize);
+  mxnet::TShape scale_shape(0, 1);
+  mxnet::TShape &arrshape = (*in_shape)[arr_pos];
+  mxnet::TShape &valshape = param.val.has_value() ? scale_shape : (*in_shape)[val_pos];
+  mxnet::TShape &objShape = obj_is_tensor ? (*in_shape)[obj_pos] : scale_shape;
+  if (obj_is_tensor) {
     CHECK_LE(objShape.ndim(), 1)
       << "index array argument obj to insert must be one dimensional or scale.\n";
   }
@@ -73,11 +82,15 @@ bool NumpyInsertShape(const nnvm::NodeAttrs& attrs,
     arrshape = Shape1(arrshape.Size());
     ndim = 1;
   } else if (ndim == 0) {
-    CHECK_EQ(valshape.ndim(), 0)
-      << "'arr' is a 0-d array, 'values' can not assign to it. "
-      << "alueError: assignment to 0-d array.";
-    out_shape->push_back(valshape);
-    return shape_is_known(valshape);
+    if (param.val.has_value()) {
+      out_shape->push_back(scale_shape);
+    } else {
+      CHECK_EQ(valshape.ndim(), 0)
+        << "'arr' is a 0-d array, 'values' can not assign to it. "
+        << "alueError: assignment to 0-d array.";
+      out_shape->push_back(valshape);
+    }
+    return shape_is_known(out_shape[0]);
   } else {
     CHECK(axis >= -1 * arrshape.ndim() && axis < arrshape.ndim())
       << "Axis should be in the range of [-r, r-1] where r is the rank of input tensor";
@@ -86,7 +99,7 @@ bool NumpyInsertShape(const nnvm::NodeAttrs& attrs,
 
   int seq_cnt = -1;
   int N = arrshape[axis];
-  if (in_shape->size() == 3U) {
+  if (obj_is_tensor) {
     seq_cnt = objShape.Size();
   } else if (param.step.has_value()) {
     int step = param.step.value();
@@ -132,9 +145,9 @@ bool NumpyInsertShape(const nnvm::NodeAttrs& attrs,
   valshape.assign(val_newshape.begin(), val_newshape.end());
 
   if (param.int_ind.has_value() ||
-      (in_shape->size() == 3U && objShape.ndim() == 0)) {
+      (obj_is_tensor && objShape.ndim() == 0)) {
     // because of moveaxis(values, 0, axis)
-    numnew =  valshape[0];
+    numnew = valshape[0];
   } else if (seq_cnt == 1) {
     numnew = valshape[axis];
   } else {
@@ -151,15 +164,22 @@ NNVM_REGISTER_OP(_npi_insert)
 .set_attr_parser(ParamParser<NumpyInsertParam>)
 .set_num_inputs([](const NodeAttrs& attrs) {
     const NumpyInsertParam& params = nnvm::get<NumpyInsertParam>(attrs.parsed);
-    return (params.step.has_value() || params.int_ind.has_value()) ? 2U : 3U;
+    int input_count = params.val.has_value() ? 1 : 2;
+    return (params.step.has_value() || params.int_ind.has_value()) ? input_count : input_count + 1;
 })
 .set_num_outputs(1)
 .set_attr<nnvm::FListInputNames>("FListInputNames",
   [](const NodeAttrs& attrs) {
     const NumpyInsertParam& params = nnvm::get<NumpyInsertParam>(attrs.parsed);
-    return (params.step.has_value() || params.int_ind.has_value()) ?
-            std::vector<std::string>{"arr", "values"} :
-            std::vector<std::string>{"arr", "values", "obj"};
+    if (params.val.has_value()) {
+      return (params.step.has_value() || params.int_ind.has_value()) ?
+             std::vector<std::string>{"arr"} :
+             std::vector<std::string>{"arr", "obj"};
+    } else {
+      return (params.step.has_value() || params.int_ind.has_value()) ?
+             std::vector<std::string>{"arr", "values"} :
+             std::vector<std::string>{"arr", "values", "obj"};
+    }
 })
 .set_attr<mxnet::FInferShape>("FInferShape", NumpyInsertShape)
 .set_attr<nnvm::FInferType>("FInferType", NumpyInsertType)
diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py
index cbe8761c23c0..e2fd2635cc59 100644
--- a/tests/python/unittest/test_numpy_interoperability.py
+++ b/tests/python/unittest/test_numpy_interoperability.py
@@ -961,13 +961,12 @@ def _add_workload_inner():
 
 def _add_workload_insert():
     a = np.arange(10)
-    for dt in [np.int32, np.int64]:
-        OpArgMngr.add_workload('insert', a, 0, np.array([0]))
-        OpArgMngr.add_workload('insert', a, np.array([], dtype=dt), np.array([]))
-        OpArgMngr.add_workload('insert', a, np.array([0, 1], dtype=dt), np.array([1, 2]))
-        OpArgMngr.add_workload('insert', a, slice(1, 2), np.array([1, 2]))
-        OpArgMngr.add_workload('insert', a, slice(1, -2, -1), np.array([]))
-        OpArgMngr.add_workload('insert', np.array([0, 1, 2]), np.array([1, 1, 1], dtype=dt), np.array([3, 4, 5]))
+    OpArgMngr.add_workload('insert', a, 0, np.array([0]))
+    OpArgMngr.add_workload('insert', a, np.array([], dtype=np.int64), np.array([]))
+    OpArgMngr.add_workload('insert', a, np.array([0, 1], dtype=np.int64), np.array([1, 2]))
+    OpArgMngr.add_workload('insert', a, slice(1, 2), np.array([1, 2]))
+    OpArgMngr.add_workload('insert', a, slice(1, -2, -1), np.array([]))
+    OpArgMngr.add_workload('insert', np.array([0, 1, 2]), np.array([1, 1, 1], dtype=np.int64), np.array([3, 4, 5]))
     OpArgMngr.add_workload('insert', np.array(1), 0, np.array([0]))
 
 
diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index 468c819b2763..0cc0daf12a83 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -2313,23 +2313,24 @@ def GetNdim(tp):
         for ed in [-5, -3, -1, 0, 1, 3, 5, None]:
             for stp in [-1, 1, 2, None]:
                 config.append(tuple([A, slice(st, ed, stp), F, 1]))
+    dtypes = ['int32', 'float16', 'float32', 'float64', None]
     
     for arr_shape, obj, val_shape, axis in config:
-        for objtype in ['int32', 'int64']:
+        for atype, btype in itertools.product(dtypes, dtypes):
             if type(obj) == list:
-                obj_mxnp = np.array(obj, dtype=objtype)
-                obj_onp = _np.array(obj, dtype=objtype)
+                obj_mxnp = np.array(obj, dtype='int64')
+                obj_onp = _np.array(obj)
             elif type(obj) == slice:
                 obj_mxnp = obj
                 obj_onp = obj
-            else:
-                obj_mxnp = (_np.int32(obj) if objtype == 'int32' else _np.int64(obj)) 
-                obj_onp = (_np.int32(obj) if objtype == 'int32' else _np.int64(obj)) 
+            else:  # integer
+                obj_mxnp = obj
+                obj_onp = obj
             test_insert = TestInsert(obj=obj_mxnp, axis=axis)
 
-            a = mx.nd.random.uniform(-1.0, 1.0, shape=arr_shape).as_np_ndarray()
+            a = mx.nd.random.uniform(-1.0, 1.0, shape=arr_shape).as_np_ndarray().astype(atype)
             a.attach_grad()
-            b = mx.nd.random.uniform(-1.0, 1.0, shape=val_shape).as_np_ndarray()
+            b = mx.nd.random.uniform(-1.0, 1.0, shape=val_shape).as_np_ndarray().astype(btype)
             b.attach_grad()
             expected_ret = _np.insert(a.asnumpy(), obj_onp, b.asnumpy(), axis=axis)