diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py
index ee3d6e53db59..114e9a810934 100644
--- a/python/mxnet/numpy/multiarray.py
+++ b/python/mxnet/numpy/multiarray.py
@@ -7111,6 +7111,7 @@ def insert(arr, obj, values, axis=None):
     """
     return _mx_nd_np.insert(arr, obj, values, axis=axis)
 
+
 @set_module('mxnet.numpy')
 def nonzero(a):
     """
diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py
index 19af9c2853be..5e45bd3cfe75 100644
--- a/python/mxnet/symbol/numpy/_symbol.py
+++ b/python/mxnet/symbol/numpy/_symbol.py
@@ -2494,14 +2494,14 @@ def insert(arr, obj, values, axis=None):
 
     Parameters
    ----------
-    arr : ndarray
+    arr : _Symbol
        Input array.
-    obj : int, slice or ndarray of ints
+    obj : int, slice or _Symbol of ints
        Object that defines the index or indices before which `values` is inserted.
        Support for multiple insertions when `obj` is a single scalar or a sequence
        with one element (only support int32 and int64 element).
-    values : ndarray
+    values : _Symbol
        Values to insert into `arr`.
        The type of `values` should equal to the type of `arr`.
        `values` should be shaped so that ``arr[...,obj,...] = values``
@@ -2512,7 +2512,7 @@ def insert(arr, obj, values, axis=None):
 
     Returns
    -------
-    out : ndarray
+    out : _Symbol
        A copy of `arr` with `values` inserted.  Note that `insert`
        does not occur in-place: a new array is returned. If
        `axis` is None, `out` is a flattened array.
@@ -2522,49 +2522,6 @@ def insert(arr, obj, values, axis=None):
     Note that for higher dimensional inserts `obj=0` behaves very different
     from `obj=[0]` just like `arr[:,0,:] = values` is different from
     `arr[:,[0],:] = values`.
-
-    Examples
-    --------
-    >>> a = np.array([[1, 1], [2, 2], [3, 3]])
-    >>> a
-    array([[1., 1.],
-           [2., 2.],
-           [3., 3.]])
-    >>> np.insert(a, 1, np.array(5))
-    array([1., 5., 1., 2., 2., 3., 3.])
-    >>> np.insert(a, 1, np.array(5), axis=1)
-    array([[1., 5., 1.],
-           [2., 5., 2.],
-           [3., 5., 3.]])
-
-    Difference between sequence and scalars:
-
-    >>> np.insert(a, np.array([1], dtype=np.int32), np.array([[1],[2],[3]]), axis=1)
-    array([[1., 1., 1.],
-           [2., 2., 2.],
-           [3., 3., 3.]])
-    >>> np.insert(a, 1, np.array([1, 2, 3]), axis=1)
-    array([[1., 1., 1.],
-           [2., 2., 2.],
-           [3., 3., 3.]])
-
-    >>> b = a.flatten()
-    >>> b
-    array([1., 1., 2., 2., 3., 3.])
-    >>> np.insert(b, np.array([2, 2], dtype=np.int64), np.array([5, 6]))
-    array([1., 1., 5., 6., 2., 2., 3., 3.])
-
-    >>> np.insert(b, slice(2, 4), np.array([5, 6]))
-    array([1., 1., 5., 2., 6., 2., 3., 3.])
-
-    >>> np.insert(b, np.array([2, 2], dtype=np.int32), np.array([7.13, False]))
-    array([1.  , 1.  , 7.13, 0.  , 2.  , 2.  , 3.  , 3.  ])
-
-    >>> x = np.arange(8).reshape(2, 4)
-    >>> idx = np.array([1, 3], dtype=np.int32)
-    >>> np.insert(x, idx, np.array([999]), axis=1)
-    array([[  0., 999.,   1.,   2., 999.,   3.],
-           [  4., 999.,   5.,   6., 999.,   7.]])
     """
     if not isinstance(arr, ndarray): # pylint: disable= undefined-variable
         raise TypeError("'arr' can not support type {}".format(str(type(arr))))
diff --git a/src/operator/numpy/np_insert_op-inl.h b/src/operator/numpy/np_insert_op-inl.h
index 1e54cd760975..4fd17deca84c 100644
--- a/src/operator/numpy/np_insert_op-inl.h
+++ b/src/operator/numpy/np_insert_op-inl.h
@@ -61,12 +61,16 @@ struct NumpyInsertParam : public dmlc::Parameter<NumpyInsertParam> {
 };
 
 namespace insert_ {
-enum InsertOpInputs {kArr, kValues, kObj};
-enum InsertOpOutputs {kOut};
+  // insert 'values' to 'arr' according to 'obj'
+  enum InsertOpInputs {kArr, kValues, kObj};
+  enum InsertOpOutputs {kOut};
 }  // namespace insert_
 
 template<int req>
-struct InsertZeroNdimForward {
+struct InsertZeroNdimForward {
+  /*!
+   * \brief when axis is not None but arr.ndim == 0, output = values
+   */
   template<typename DType>
   MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data) {
     KERNEL_ASSIGN(out_data[i], req, in_data[i]);
@@ -75,6 +79,19 @@ struct InsertSingleIndexForward {
+  /*!
+   * \brief insert when obj is a 'scalar' or a 'slice' with only one element.
+   * \tparam xpu - cpu or gpu.
+   * \param out_data - output: insert 'value' to 'arr' according to 'index'.
+   * \param in_arr - input: 'arr', original array.
+   * \param index - input: the only element in 'obj', indicating the insert position.
+   * \param in_val - input: 'value', inserted into 'arr' according to 'index'.
+   * \param numnew - extra dim size in 'out_data' compared with 'arr' along 'axis'.
+   * \param axis - insert 'value' into 'arr' along 'axis'.
+   * \param ndim - 'in_arr', 'in_val' and 'out_data' all have the same ndim before this is called.
+   * \param moveaxis - If 'obj' is a scalar, moveaxis is true;
+                       if 'obj' is a slice with one element, moveaxis is false.
+ */ template MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_val, const DType* in_arr, @@ -85,60 +102,66 @@ struct InsertSingleIndexForward { const mshadow::Shape<10> old_val_stride, const mshadow::Shape<10> arr_stride, const mshadow::Shape<10> out_stride, - const int arr_ndim, const int val_ndim, - const int out_ndim, const int axis, + const int ndim, const int axis, bool moveaxis) { - const int64_t out_head = i / out_stride[axis]; - const int64_t out_mid = out_head % outshape[axis]; + // i is the global flattened index in the output mshadow::Shape<10> out_idx; // i -> position in output's shape - for (int j = 0; j < out_ndim; ++j) { + for (int j = 0; j < ndim; ++j) { const int64_t head = i / out_stride[j]; const int64_t mid = head % outshape[j]; out_idx[j] = mid; } int64_t dest_idx; - if (out_mid >= index && out_mid < index + numnew) { - int idx_val = out_mid - index; + if (out_idx[axis] >= index && out_idx[axis] < index + numnew) { // from 'value' + int idx_val = out_idx[axis] - index; mshadow::Shape<10> val_idx(out_idx); // i -> position in values's shape val_idx[axis] = idx_val; - for (int j = out_ndim - 1, k = val_ndim - 1; j >= 0 || k >= 0; --j, --k) { - if (j >= 0 && k >= 0) { - if (valshape[k] == 1) { - val_idx[k] = 0; - } - } else if (j >= 0) { - val_idx[j] = 1; - } else { - break; + for (int j = ndim - 1; j >= 0; --j) { + if (valshape[j] == 1) { // broadcast + val_idx[j] = 0; } } dest_idx = 0; - if (moveaxis) { - for (int _i = 0; _i < axis; ++_i) { - dest_idx += old_val_stride[_i + 1] * val_idx[_i]; + if (moveaxis) { // moveaxis(values, 0, axis) + for (int j = 0; j < axis; ++j) { + dest_idx += old_val_stride[j + 1] * val_idx[j]; } dest_idx += old_val_stride[0] * val_idx[axis]; - for (int _i = axis + 1; _i < val_ndim ; ++_i) { - dest_idx += old_val_stride[_i] *val_idx[_i]; + for (int j = axis + 1; j < ndim ; ++j) { + dest_idx += old_val_stride[j] *val_idx[j]; } } else { - for (int _i =0; _i < val_ndim; ++_i) { - dest_idx += val_stride[_i] * val_idx[_i]; + for (int j =0; j < ndim; ++j) { + dest_idx += val_stride[j] * val_idx[j]; } } KERNEL_ASSIGN(out_data[i], req, in_val[dest_idx]); - } else { - int idx_arr = (out_mid < index) ? out_mid : out_mid - numnew; + } else { // from 'arr' + int idx_arr = (out_idx[axis] < index) ? + out_idx[axis] : out_idx[axis] - numnew; mshadow::Shape<10> arr_idx(out_idx); // i -> position in arr's shape arr_idx[axis] = idx_arr; dest_idx = 0; - for (int _i =0; _i < arr_ndim; ++_i) { - dest_idx += arr_stride[_i] * arr_idx[_i]; + for (int j =0; j < ndim; ++j) { + dest_idx += arr_stride[j] * arr_idx[j]; } KERNEL_ASSIGN(out_data[i], req, in_arr[dest_idx]); } } - + /*! + * \brief insert when obj is 'tensor' with only one element. + * \tparam xpu - cpu or gpu. + * \param out_data - output: insert 'value' to 'arr' according to 'index'. + * \param in_arr - input: 'arr', original array. + * \param in_obj - input: It indicats insert position, ndim may equals to 0. + * \param in_val - input: 'value', insert to 'arr' according to 'index'. + * \param N - arr.shape_[axis] + * \param numnew - extra dim size in 'out_data' compared with 'arr' in 'axis'. + * \param axis - insert 'value' to 'arr' in 'axis'. + * \param ndim - both 'in_arr', 'in_val' and 'out_data' have same ndim before call this. + * \param moveaxis - If 'obj' is a tensor with ndim == 0, regard it as a scaler and moveaxis is true; + If 'obj' is a tensor with ndim > 0 but has only one element, moveaxis is false. 
+ */ template MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_val, const DType* in_arr, @@ -149,13 +172,11 @@ struct InsertSingleIndexForward { const mshadow::Shape<10> old_val_stride, const mshadow::Shape<10> arr_stride, const mshadow::Shape<10> out_stride, - const int arr_ndim, const int val_ndim, - const int out_ndim, const int axis, + const int ndim, const int axis, bool moveaxis) { - const int64_t out_head = i / out_stride[axis]; - const int64_t out_mid = out_head % outshape[axis]; + // i is the global flattened index in the output mshadow::Shape<10> out_idx; // i -> position in output's shape - for (int j = 0; j < out_ndim; ++j) { + for (int j = 0; j < ndim; ++j) { const int64_t head = i / out_stride[j]; const int64_t mid = head % outshape[j]; out_idx[j] = mid; @@ -163,53 +184,79 @@ struct InsertSingleIndexForward { int64_t dest_idx; IType index = in_obj[0]; if (static_cast(index) < 0) { - index += static_cast(N); + index += static_cast(N); } - if (out_mid >= index && out_mid < index + numnew) { - int idx_val = out_mid - index; - mshadow::Shape<10> val_idx(out_idx); + if (out_idx[axis] >= index && out_idx[axis] < index + numnew) { // from 'value' + int idx_val = out_idx[axis] - index; + mshadow::Shape<10> val_idx(out_idx); // i -> position in values's shape val_idx[axis] = idx_val; - for (int j = out_ndim - 1, k = val_ndim - 1; j >= 0 || k >= 0; --j, --k) { - if (j >= 0 && k >= 0) { - if (valshape[k] == 1) { - val_idx[k] = 0; - } - } else if (j >= 0) { - val_idx[j] = 1; - } else { - break; + for (int j = ndim - 1; j >= 0; --j) { + if (valshape[j] == 1) { // broadcast + val_idx[j] = 0; } } dest_idx = 0; - if (moveaxis) { - for (int _i = 0; _i < axis; ++_i) { - dest_idx += old_val_stride[_i + 1] * val_idx[_i]; + if (moveaxis) { // moveaxis(values, 0, axis) + for (int j = 0; j < axis; ++j) { + dest_idx += old_val_stride[j + 1] * val_idx[j]; } dest_idx += old_val_stride[0] * val_idx[axis]; - for (int _i = axis + 1; _i < val_ndim ; ++_i) { - dest_idx += old_val_stride[_i] *val_idx[_i]; + for (int j = axis + 1; j < ndim ; ++j) { + dest_idx += old_val_stride[j] *val_idx[j]; } } else { - for (int _i =0; _i < val_ndim; ++_i) { - dest_idx += val_stride[_i] * val_idx[_i]; + for (int j =0; j < ndim; ++j) { + dest_idx += val_stride[j] * val_idx[j]; } } KERNEL_ASSIGN(out_data[i], req, in_val[dest_idx]); - } else { - int idx_arr = (out_mid < index) ? out_mid : out_mid - numnew; + } else { // from 'arr' + int idx_arr = (out_idx[axis] < index) ? out_idx[axis] : out_idx[axis] - numnew; mshadow::Shape<10> arr_idx(out_idx); // i -> position in arr's shape arr_idx[axis] = idx_arr; dest_idx = 0; - for (int _i =0; _i < arr_ndim; ++_i) { - dest_idx += arr_stride[_i] * arr_idx[_i]; + for (int j =0; j < ndim; ++j) { + dest_idx += arr_stride[j] * arr_idx[j]; } KERNEL_ASSIGN(out_data[i], req, in_arr[dest_idx]); } } }; +template +inline mshadow::Shape GetStride(const mxnet::TShape& shape) { + mshadow::Shapestride; + size_t tmp = 1; + for (int i = shape.ndim() - 1; i >= 0; --i) { + stride[i] = tmp; + tmp *= shape[i]; + } + return stride; +} + +template +inline mshadow::Shape GetKernelShape(const mxnet::TShape& shape) { + mshadow::Shapek_shape; + for (int i = 0 ; i < shape.ndim() ; ++i) { + k_shape[i] = shape[i]; + } + return k_shape; +} + template struct InsertSeqForward { + /*! + * \brief insert when obj is 'tensor' or 'slice' with more than one element. + * \tparam xpu - cpu or gpu. + * \param out_data - output: insert 'value' to 'arr' according to 'index'. 
+ * \param in_arr - input: 'arr', original array. + * \param in_obj - input: It indicats insert position, ndim may equals to 0. + * \param in_val - input: 'value', insert to 'arr' according to 'index'. + * \param is_insert - if is_insert[out_idx[axis]] is true, it's from 'values', else from 'arr'. + * \param origin_idx - indicate the original position in 'arr' or 'values' in 'axis'. + * \param axis - insert 'value' to 'arr' in 'axis'. + * \param ndim - both 'in_arr', 'in_val' and 'out_data' have same ndim before call this. + */ template MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_val, const DType* in_arr, @@ -220,45 +267,37 @@ struct InsertSeqForward { const mshadow::Shape<10> val_stride, const mshadow::Shape<10> arr_stride, const mshadow::Shape<10> out_stride, - const int arr_ndim, const int val_ndim, - const int out_ndim, const int axis) { - const int64_t out_head = i / out_stride[axis]; - const int64_t out_mid = out_head % outshape[axis]; + const int ndim, const int axis) { + // i is the global flattened index in the output mshadow::Shape<10> out_idx; // i -> position in output's shape - for (int j = 0; j < out_ndim; ++j) { + for (int j = 0; j < ndim; ++j) { const int64_t head = i / out_stride[j]; const int64_t mid = head % outshape[j]; out_idx[j] = mid; } int64_t dest_idx; - if (is_insert[out_mid]) { - int idx_val = origin_idx[out_mid]; + if (is_insert[out_idx[axis]]) { // from 'values' + int idx_val = origin_idx[out_idx[axis]]; mshadow::Shape<10> insert_idx(out_idx); // i -> position in insert's shape insert_idx[axis] = idx_val; mshadow::Shape<10> val_idx(insert_idx); // i -> position in values's shape - for (int j = out_ndim - 1, k = val_ndim - 1; j >= 0 || k >= 0; --j, --k) { - if (j >= 0 && k >= 0) { - if (valshape[k] == 1) { - val_idx[k] = 0; - } - } else if (j >= 0) { + for (int j = ndim - 1; j >= 0; --j) { // broadcast + if (valshape[j] == 1) { val_idx[j] = 0; - } else { - break; } } dest_idx = 0; - for (int _i =0; _i < val_ndim; ++_i) { - dest_idx += val_stride[_i] * val_idx[_i]; + for (int j =0; j < ndim; ++j) { + dest_idx += val_stride[j] * val_idx[j]; } KERNEL_ASSIGN(out_data[i], req, in_val[dest_idx]); - } else { - int idx_arr = origin_idx[out_mid]; + } else { // from 'arr' + int idx_arr = origin_idx[out_idx[axis]]; mshadow::Shape<10> arr_idx(out_idx); // i -> position in arr's shape arr_idx[axis] = idx_arr; dest_idx = 0; - for (int _i =0; _i < arr_ndim; ++_i) { - dest_idx += arr_stride[_i] * arr_idx[_i]; + for (int j =0; j < ndim; ++j) { + dest_idx += arr_stride[j] * arr_idx[j]; } out_data[i] = in_arr[dest_idx]; KERNEL_ASSIGN(out_data[i], req, in_arr[dest_idx]); @@ -338,247 +377,210 @@ struct SetOriginArrIdx { template void NumpyInsertCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mshadow; - using namespace mxnet_op; - - const NumpyInsertParam& param = nnvm::get(attrs.parsed); - CHECK_EQ(inputs.size(), - (param.step.has_value() || param.int_ind.has_value()) ? 2U : 3U); - CHECK_EQ(outputs.size(), 1U); - CHECK_EQ(req.size(), 1U); - mshadow::Stream *s = ctx.get_stream(); - int ndim = inputs[insert_::kArr].shape_.ndim(); - int axis = param.axis.has_value() ? 
param.axis.value() : 0; - TBlob arr, values; - if (!param.axis.has_value()) { - arr = inputs[insert_::kArr].reshape(Shape1(inputs[insert_::kArr].shape_.Size())); - ndim = 1; - } else if (ndim == 0) { - arr = inputs[insert_::kArr]; - CHECK_EQ(inputs[insert_::kValues].shape_.ndim(), 0) - << "'arr' is a 0-d array, 'values' can not assign to it. " - << "alueError: assignment to 0-d array."; - MSHADOW_TYPE_SWITCH(outputs[insert_::kOut].type_flag_, DType, { - MXNET_ASSIGN_REQ_SWITCH(req[insert_::kOut], req_type, { - Kernel, xpu>::Launch( - s, outputs[insert_::kOut].shape_.Size(), - outputs[insert_::kOut].dptr(), inputs[insert_::kValues].dptr()); - }); - }); - return; + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mxnet_op; + + const NumpyInsertParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), + (param.step.has_value() || param.int_ind.has_value()) ? 2U : 3U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + mshadow::Stream *s = ctx.get_stream(); + int ndim = inputs[insert_::kArr].shape_.ndim(); + int axis = param.axis.has_value() ? param.axis.value() : 0; + TBlob arr; + TBlob values = inputs[insert_::kValues]; + if (!param.axis.has_value()) { + arr = inputs[insert_::kArr].reshape(Shape1(inputs[insert_::kArr].shape_.Size())); + ndim = 1; + } else if (ndim == 0) { + arr = inputs[insert_::kArr]; + CHECK_EQ(inputs[insert_::kValues].shape_.ndim(), 0) + << "'arr' is a 0-d array, 'values' can not assign to it. " + << "alueError: assignment to 0-d array."; + MSHADOW_TYPE_SWITCH(outputs[insert_::kOut].type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[insert_::kOut], req_type, { + Kernel, xpu>::Launch( // output = value + s, outputs[insert_::kOut].shape_.Size(), + outputs[insert_::kOut].dptr(), inputs[insert_::kValues].dptr()); + }); + }); + return; + } else { + arr = inputs[insert_::kArr]; + CHECK(axis >= -1 * arr.shape_.ndim() && axis < arr.shape_.ndim()) + << "Axis should be in the range of [-r, r-1] where r is the rank of input tensor"; + axis += (axis < 0) ? arr.shape_.ndim() : 0; + } + + int N = arr.shape_[axis]; + //mxnet::TShape newshape(arr.shape_); // output's shape + size_t indices_len = 0; // indices amount + int start = 0, stop = 0, step = 0; // arguments when 'obj' is 'slice' + + // get and check indices from slice or sequence of ints + if (inputs.size() == 3U) { // indices from 'tensor' + indices_len = inputs[insert_::kObj].shape_.Size(); + } else if (param.step.has_value()) { // indices from 'slice' + step = param.step.value(); + CHECK_NE(step, 0) << "'step' can not equal to 0."; + if (param.stop.has_value()) { + stop = param.stop.value(); + stop += (stop < 0) ? N : 0; + stop = (stop < 0) ? ((step < 0) ? -1 : 0) : stop; + stop = (stop >= N) ? ((step < 0) ? N - 1 : N) : stop; } else { - arr = inputs[insert_::kArr]; - CHECK(axis >= -1 * arr.shape_.ndim() && axis < arr.shape_.ndim()) - << "Axis should be in the range of [-r, r-1] where r is the rank of input tensor"; - axis += (axis < 0) ? arr.shape_.ndim() : 0; + stop = (step > 0) ? 
N : -1; } - - int N = arr.shape_[axis]; - mxnet::TShape newshape(arr.shape_); - size_t indices_len = 0; - int start = 0, stop = 0, step = 0; - - // get and check indices from slice or sequence of ints - if (inputs.size() == 3U) { - indices_len = inputs[insert_::kObj].shape_.Size(); - } else if (param.step.has_value()) { - step = param.step.value(); - CHECK_NE(step, 0) << "'step' can not equal to 0."; - if (param.stop.has_value()) { - stop = param.stop.value(); - stop += (stop < 0) ? N : 0; - stop = (stop < 0) ? ((step < 0) ? -1 : 0) : stop; - stop = (stop >= N) ? ((step < 0) ? N - 1 : N) : stop; - } else { - stop = (step > 0) ? N : -1; - } - if (param.start.has_value()) { - start = param.start.value(); - start += (start < 0) ? N : 0; - start = (start < 0) ? ((step < 0) ? -1 : 0) : start; - start = (start >= N) ? ((step < 0) ? N - 1 : N) : start; - } else { - start = (step > 0) ? 0 : N - 1; - } - int seq_cnt = 0; - if (step > 0 && stop >= start) { - seq_cnt = (stop - start + step - 1) / step; - } else if (step < 0 && stop <= start) { - seq_cnt = (stop - start + step + 1) / step; - } - indices_len = static_cast(seq_cnt); + if (param.start.has_value()) { + start = param.start.value(); + start += (start < 0) ? N : 0; + start = (start < 0) ? ((step < 0) ? -1 : 0) : start; + start = (start >= N) ? ((step < 0) ? N - 1 : N) : start; + } else { + start = (step > 0) ? 0 : N - 1; } - - int numnew, index = 0; - mxnet::TShape val_newshape(arr.shape_.ndim(), -1); - for (int i = inputs[insert_::kValues].shape_.ndim() - 1, j = arr.shape_.ndim() - 1; - i >= 0 || j >= 0; --i, --j) { - if (i >= 0 && j >= 0) { - val_newshape[j] = inputs[insert_::kValues].shape_[i]; - } else if (i >= 0) { - CHECK_EQ(inputs[insert_::kValues].shape_[i], 1) << "index exceed limits."; - } else { - val_newshape[j] = 1; - } + int seq_cnt = 0; + if (step > 0 && stop >= start) { + seq_cnt = (stop - start + step - 1) / step; + } else if (step < 0 && stop <= start) { + seq_cnt = (stop - start + step + 1) / step; } - values = inputs[insert_::kValues].reshape(val_newshape); + indices_len = static_cast(seq_cnt); + } - mxnet::TShape old_valshape(values.shape_); - if (param.int_ind.has_value() || - (inputs.size() == 3U && inputs[insert_::kObj].shape_.ndim() == 0)) { - if (param.int_ind.has_value()) { - index = param.int_ind.value(); - CHECK(index >= -1 * N && index <= N) - << "Index should be in the range of [-r, r-1] where r is the dim size in 'axis'"; - if (index < 0) { - index += N; - } - } - numnew = values.shape_[0]; - - // If 'obj' is a int, then, values = moveaxis(values, 0, axis) - mxnet::TShape axes(values.ndim(), -1); - mxnet::TShape val_newshape(values.ndim(), -1); - int axes_id = 0; - for (int i = 1; i <= axis; ++i) { - axes[axes_id++] = i; - } - axes[axes_id++] = 0; - for (int i = axis + 1; i < values.ndim(); ++i) { - axes[axes_id++] = i; - } - for (int i = 0; i < values.ndim(); ++i) { - val_newshape[i] = values.shape_[axes[i]]; - } - values.shape_.assign(val_newshape.begin(), val_newshape.end()); - newshape[axis] += numnew; - } else if (indices_len == 1) { - numnew = values.shape_[axis]; - newshape[axis] += numnew; - if (param.step.has_value()) { - index = start; - CHECK(index >= -1 * N && index <= N) - << "Index should be in the range of [-r, r-1] where r is the dim size in 'axis'"; - if (index < 0) { - index += N; - } - } + int numnew = 0; // output.shape[axis] - arr.shape[axis] + int index = 0; // modified index + mxnet::TShape val_newshape(arr.shape_.ndim(), -1); + // modify values's ndim to arr's ndim, for broadcast easily 
later + // e.g. value shape: (2,) arr shape: (3, 2) => value shape: (1, 2) + for (int i = values.shape_.ndim() - 1, j = arr.shape_.ndim() - 1; + i >= 0 || j >= 0; --i, --j) { + if (i >= 0 && j >= 0) { + val_newshape[j] = values.shape_[i]; + } else if (i >= 0) { + CHECK_EQ(values.shape_[i], 1) << "index exceed limits."; } else { - numnew = static_cast(indices_len); - newshape[axis] += numnew; + val_newshape[j] = 1; } + } + values.shape_.assign(val_newshape.begin(), val_newshape.end()); - const mxnet::TShape& outshape = outputs[insert_::kOut].shape_; - mshadow::Shape<10> arr_strides; - int stride = 1; - for (int i = arr.shape_.ndim() - 1; i >= 0; --i) { - arr_strides[i] = stride; - stride *= arr.shape_[i]; - } - mshadow::Shape<10> val_strides; - stride = 1; - for (int i = values.shape_.ndim() - 1; i >= 0; --i) { - val_strides[i] = stride; - stride *= values.shape_[i]; + // get numnew + mxnet::TShape old_valshape(values.shape_); + if (param.int_ind.has_value() || + (inputs.size() == 3U && inputs[insert_::kObj].shape_.ndim() == 0)) { // scaler + if (param.int_ind.has_value()) { + index = param.int_ind.value(); + CHECK(index >= -1 * N && index <= N) + << "Index should be in the range of [-r, r-1] where r is the dim size in 'axis'"; + if (index < 0) { + index += N; + } } - mshadow::Shape<10> old_val_strides; - stride = 1; - for (int i = old_valshape.ndim() - 1; i >= 0; --i) { - old_val_strides[i] = stride; - stride *= old_valshape[i]; + + // values = moveaxis(values, 0, axis), will change values's shape + numnew = values.shape_[0]; + mxnet::TShape axes(values.ndim(), -1); // moved axes + mxnet::TShape val_newshape(values.ndim(), -1); + int axes_id = 0; + for (int i = 1; i <= axis; ++i) { + axes[axes_id++] = i; } - mshadow::Shape<10> out_strides; - stride = 1; - for (int i = outshape.ndim() - 1; i >= 0; --i) { - out_strides[i] = stride; - stride *= outshape[i]; + axes[axes_id++] = 0; + for (int i = axis + 1; i < values.ndim(); ++i) { + axes[axes_id++] = i; } - mshadow::Shape<10> k_outshape; - for (int i = 0 ; i < outshape.ndim() ; ++i) { - k_outshape[i] = outshape[i]; + for (int i = 0; i < values.ndim(); ++i) { + val_newshape[i] = values.shape_[axes[i]]; } - mshadow::Shape<10> k_valshape; - for (int i = 0 ; i < values.shape_.ndim() ; ++i) { - k_valshape[i] = values.shape_[i]; + values.shape_.assign(val_newshape.begin(), val_newshape.end()); + } else if (indices_len == 1) { // tensor with only one element + numnew = values.shape_[axis]; + if (param.step.has_value()) { + index = start; + CHECK(index >= -1 * N && index <= N) + << "Index should be in the range of [-r, r-1] where r is the dim size in 'axis'"; + if (index < 0) { + index += N; + } } + } else { + numnew = static_cast(indices_len); + } - if (param.int_ind.has_value()) { - MSHADOW_TYPE_SWITCH(outputs[insert_::kOut].type_flag_, DType, { - MXNET_ASSIGN_REQ_SWITCH(req[insert_::kOut], req_type, { + const mxnet::TShape& outshape = outputs[insert_::kOut].shape_; + mshadow::Shape<10> arr_strides = GetStride<10>(arr.shape_); + mshadow::Shape<10> val_strides = GetStride<10>(values.shape_); + mshadow::Shape<10> old_val_strides = GetStride<10>(old_valshape); + mshadow::Shape<10> out_strides = GetStride<10>(outshape); + mshadow::Shape<10> k_outshape = GetKernelShape<10>(outshape); + for (int i = 0 ; i < outshape.ndim() ; ++i) { + k_outshape[i] = outshape[i]; + } + mshadow::Shape<10> k_valshape = GetKernelShape<10>(values.shape_); + for (int i = 0 ; i < values.shape_.ndim() ; ++i) { + k_valshape[i] = values.shape_[i]; + } + 
MXNET_ASSIGN_REQ_SWITCH(req[insert_::kOut], req_type, { + MSHADOW_TYPE_SWITCH(outputs[insert_::kOut].type_flag_, DType, { + MSHADOW_TYPE_SWITCH((inputs.size() == 3U) ? + inputs[insert_::kObj].type_flag_ : + mshadow::DataType::kFlag, IType, { + if (param.int_ind.has_value()) { Kernel, xpu>::Launch(s, outshape.Size(), - outputs[insert_::kOut].dptr(), - values.dptr(), arr.dptr(), - k_outshape, k_valshape, index, numnew, - val_strides, old_val_strides, arr_strides, - out_strides, arr.shape_.ndim(), - values.shape_.ndim(), outshape.ndim(), - axis, true); - }); - }); - } else if (inputs.size() == 3U && inputs[insert_::kObj].shape_.ndim() == 0) { - MSHADOW_TYPE_SWITCH(outputs[insert_::kOut].type_flag_, DType, { - MXNET_ASSIGN_REQ_SWITCH(req[insert_::kOut], req_type, { - MSHADOW_TYPE_SWITCH(inputs[insert_::kObj].type_flag_, IType, { - Kernel, xpu>::Launch(s, outshape.Size(), - outputs[insert_::kOut].dptr(), - values.dptr(), arr.dptr(), - k_outshape, k_valshape, N, - inputs[insert_::kObj].dptr(), numnew, - val_strides, old_val_strides, arr_strides, - out_strides, arr.shape_.ndim(), - values.shape_.ndim(), outshape.ndim(), - axis, true); - }); - }); - }); - } else if (indices_len == 1) { - MSHADOW_TYPE_SWITCH(outputs[insert_::kOut].type_flag_, DType, { - MXNET_ASSIGN_REQ_SWITCH(req[insert_::kOut], req_type, { + outputs[insert_::kOut].dptr(), + values.dptr(), arr.dptr(), + k_outshape, k_valshape, index, numnew, + val_strides, old_val_strides, arr_strides, + out_strides, outshape.ndim(), + axis, true); + } else if (inputs.size() == 3U && inputs[insert_::kObj].shape_.ndim() == 0) { + Kernel, xpu>::Launch(s, outshape.Size(), + outputs[insert_::kOut].dptr(), + values.dptr(), arr.dptr(), + k_outshape, k_valshape, N, + inputs[insert_::kObj].dptr(), numnew, + val_strides, old_val_strides, arr_strides, + out_strides, outshape.ndim(), + axis, true); + } else if (indices_len == 1) { if (param.step.has_value()) { Kernel, xpu>::Launch(s, outshape.Size(), outputs[insert_::kOut].dptr(), values.dptr(), arr.dptr(), k_outshape, k_valshape, start, numnew, val_strides, old_val_strides, arr_strides, out_strides, - arr.shape_.ndim(), values.shape_.ndim(), outshape.ndim(), axis, false); } else { - MSHADOW_TYPE_SWITCH(inputs[insert_::kObj].type_flag_, IType, { - Kernel, xpu>::Launch(s, outshape.Size(), - outputs[insert_::kOut].dptr(), - values.dptr(), arr.dptr(), - k_outshape, k_valshape, - N, inputs[insert_::kObj].dptr(), numnew, - val_strides, old_val_strides, - arr_strides, out_strides, - arr.shape_.ndim(), values.shape_.ndim(), - outshape.ndim(), axis, false); - }); + Kernel, xpu>::Launch(s, outshape.Size(), + outputs[insert_::kOut].dptr(), + values.dptr(), arr.dptr(), + k_outshape, k_valshape, + N, inputs[insert_::kObj].dptr(), numnew, + val_strides, old_val_strides, + arr_strides, out_strides, + outshape.ndim(), axis, false); } - }); - }); - } else { - // broadcast check - for (int i = outshape.ndim() - 1; i >= 0; --i) { + } else { + // broadcast check + for (int i = outshape.ndim() - 1; i >= 0; --i) { int sz = outshape[i]; if (i == axis) { - sz = numnew; + sz = numnew; } CHECK((values.shape_[i] == 1) || (values.shape_[i] == sz)); - } - size_t temp_storage_bytes, temp_mem_size; - MSHADOW_TYPE_SWITCH((inputs.size() == 3U) ? 
- inputs[insert_::kObj].type_flag_ : - mshadow::DataType::kFlag, IType, { + } + size_t temp_storage_bytes, temp_mem_size; temp_storage_bytes = SortByKeyWorkspaceSize(indices_len, false, true); temp_mem_size = indices_len * sizeof(IType) * 2 + - indices_len * sizeof(int) + - newshape[axis] * sizeof(int) * 2 + - temp_storage_bytes; + indices_len * sizeof(int) + + outshape[axis] * sizeof(int) * 2 + + temp_storage_bytes; Tensor temp_mem = ctx.requested[0].get_space_typed(Shape1(temp_mem_size), s); IType* indices_ptr = reinterpret_cast(temp_mem.dptr_); @@ -588,9 +590,9 @@ void NumpyInsertCompute(const nnvm::NodeAttrs& attrs, int* is_insert = reinterpret_cast(temp_mem.dptr_ + indices_len * sizeof(IType) * 2 + indices_len * sizeof(int)); int* origin_idx = reinterpret_cast(temp_mem.dptr_ + indices_len * sizeof(IType) * 2 - + indices_len * sizeof(int) + newshape[axis] * sizeof(int)); + + indices_len * sizeof(int) + outshape[axis] * sizeof(int)); Tensor temp_storage(temp_mem.dptr_ + indices_len * sizeof(IType) * 2 - + indices_len * sizeof(int) + newshape[axis] * sizeof(int) * 2, + + indices_len * sizeof(int) + outshape[axis] * sizeof(int) * 2, Shape1(temp_storage_bytes), s); Tensor indices(indices_ptr, Shape1(indices_len), s); Tensor sorted_indices(sorted_indices_ptr, Shape1(indices_len), s); @@ -598,38 +600,31 @@ void NumpyInsertCompute(const nnvm::NodeAttrs& attrs, int num_bits = common::ilog2ui(static_cast(indices_len) - 1); if (param.step.has_value()) { - Kernel::Launch(s, indices_len, - indices_ptr, N, - start, step); + Kernel::Launch(s, indices_len, indices_ptr, N, start, step); } else { - Kernel::Launch(s, indices_len, - indices_ptr, N, - inputs[insert_::kObj].dptr()); + Kernel::Launch(s, indices_len, indices_ptr, N, + inputs[insert_::kObj].dptr()); } - Kernel::Launch(s, indices_len, order_ptr); mxnet::op::SortByKey(indices, order, true, &temp_storage, 0, num_bits, &sorted_indices); Kernel::Launch(s, indices_len, indices_ptr, order_ptr); - Kernel::Launch(s, newshape[axis], is_insert); + Kernel::Launch(s, outshape[axis], is_insert); Kernel::Launch(s, indices_len, indices_ptr, is_insert); Kernel::Launch(s, indices_len, indices_ptr, origin_idx); - Kernel::Launch(s, newshape[axis], is_insert, origin_idx); - - MSHADOW_TYPE_SWITCH(outputs[insert_::kOut].type_flag_, DType, { - MXNET_ASSIGN_REQ_SWITCH(req[insert_::kOut], req_type, { - Kernel, xpu>::Launch(s, outshape.Size(), - outputs[insert_::kOut].dptr(), - values.dptr(), arr.dptr(), - k_outshape, k_valshape, is_insert, origin_idx, - val_strides, arr_strides, out_strides, - arr.shape_.ndim(), values.shape_.ndim(), - outshape.ndim(), axis); - }); - }); - }); - } + Kernel::Launch(s, outshape[axis], is_insert, origin_idx); + + Kernel, xpu>::Launch(s, outshape.Size(), + outputs[insert_::kOut].dptr(), + values.dptr(), arr.dptr(), + k_outshape, k_valshape, is_insert, origin_idx, + val_strides, arr_strides, out_strides, + outshape.ndim(), axis); + } + }); + }); + }); } } // namespace op diff --git a/src/operator/numpy/np_insert_op.cc b/src/operator/numpy/np_insert_op.cc index 1f4151883a3e..5fa23a510826 100644 --- a/src/operator/numpy/np_insert_op.cc +++ b/src/operator/numpy/np_insert_op.cc @@ -42,7 +42,7 @@ bool NumpyInsertType(const nnvm::NodeAttrs& attrs, CHECK_NE((*in_type)[2], -1) << "Index type must be set for insert operator\n"; CHECK(((*in_type)[2] == mshadow::DataType::kFlag) || ((*in_type)[2] == mshadow::DataType::kFlag)) - << "Index type only support int32 or int64.\n"; + << "Index type only support int32 or int64.\n"; } 
   TYPE_ASSIGN_CHECK(*out_type, 0, (*in_type)[0]);
   TYPE_ASSIGN_CHECK(*out_type, 0, (*in_type)[1]);
@@ -62,7 +62,7 @@ bool NumpyInsertShape(const nnvm::NodeAttrs& attrs,
   mxnet::TShape &objShape = (*in_shape)[insert_::kObj];
   if (in_shape->size() == 3U) {
     CHECK_LE(objShape.ndim(), 1)
-      << "index array argument obj to insert must be one dimensional or scale.\n";
+        << "index array argument obj to insert must be one dimensional or scalar.\n";
   }
 
   out_shape->clear();
@@ -117,7 +117,9 @@ bool NumpyInsertShape(const nnvm::NodeAttrs& attrs,
 
   mxnet::TShape newshape(arrshape);
   mxnet::TShape val_newshape(arrshape.ndim(), -1);
-  int numnew;
+  int numnew = 0;  // number of new columns inserted into 'arr' along 'axis'
+  // modify values's ndim to arr's ndim so it can be broadcast easily later
+  // e.g. value shape: (2,) arr shape: (3, 2) => value shape: (1, 2)
   for (int i = valshape.ndim() - 1, j = arrshape.ndim() - 1; i >= 0 || j >= 0; --i, --j) {
     if (i >= 0 && j >= 0) {
       val_newshape[j] = valshape[i];
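
The Examples section dropped from the `_symbol.py` docstring above (imperative examples presumably do not fit the deferred `_Symbol` frontend) still describes the behaviour the new kernels implement. For reference, a few of those cases run against the imperative `mxnet.numpy` frontend, with the expected outputs copied from the removed docstring text:

```python
from mxnet import np, npx
npx.set_np()  # enable the NumPy-compatible frontend

a = np.array([[1, 1], [2, 2], [3, 3]])
np.insert(a, 1, np.array(5))           # axis=None flattens: [1., 5., 1., 2., 2., 3., 3.]
np.insert(a, 1, np.array(5), axis=1)   # [[1., 5., 1.], [2., 5., 2.], [3., 5., 3.]]

b = a.flatten()
np.insert(b, np.array([2, 2], dtype=np.int64), np.array([5, 6]))  # [1., 1., 5., 6., 2., 2., 3., 3.]
np.insert(b, slice(2, 4), np.array([5, 6]))                       # [1., 1., 5., 2., 6., 2., 3., 3.]
```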
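On the kernel side, `GetStride`, `GetKernelShape` and the `out_idx` loops all rely on the same row-major index arithmetic: decompose the flat output index `i` digit by digit with the output strides, remap the `axis` coordinate to either `values` or `arr`, then rebuild a flat source offset with that tensor's strides, sending size-1 (broadcast) axes to 0. A minimal Python sketch of that arithmetic follows; the helper names are mine, not from the MXNet sources:

```python
def row_major_strides(shape):
    """What GetStride computes: stride[i] = prod(shape[i+1:])."""
    strides, acc = [0] * len(shape), 1
    for i in range(len(shape) - 1, -1, -1):
        strides[i] = acc
        acc *= shape[i]
    return strides

def decompose(i, shape, strides):
    """out_idx[j] = (i // stride[j]) % shape[j], as in the kernels' Map()."""
    return [(i // strides[j]) % shape[j] for j in range(len(shape))]

def flat_offset(idx, shape, strides):
    """Rebuild a flat offset; size-1 (broadcast) axes contribute 0."""
    return sum(strides[j] * (0 if shape[j] == 1 else idx[j]) for j in range(len(shape)))

# Round trip on a 3x3 output (e.g. a (3, 2) 'arr' with one column inserted).
out_shape = [3, 3]
out_strides = row_major_strides(out_shape)
for i in range(9):
    assert flat_offset(decompose(i, out_shape, out_strides), out_shape, out_strides) == i
```

The kernels do the same with fixed-size `mshadow::Shape<10>` buffers and an explicit `ndim` argument so the loop bounds are known inside the device kernel.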