diff --git a/src/operator/contrib/index_copy-inl.h b/src/operator/contrib/index_copy-inl.h
index d93bf47949a8..903dee13272b 100644
--- a/src/operator/contrib/index_copy-inl.h
+++ b/src/operator/contrib/index_copy-inl.h
@@ -37,108 +37,19 @@ namespace mxnet {
 namespace op {
 
-template<int req>
-struct index_copy_forward {
-  template<typename DType, typename IType>
-  MSHADOW_XINLINE static void Map(int i,
-                                  int dim,
-                                  IType* index,
-                                  DType* new_tensor,
-                                  DType* out_tensor) {
-    DType* out_ptr = out_tensor + static_cast<int>(index[i]) * dim;
-    DType* new_ptr = new_tensor + i * dim;
-    for (int idx = 0; idx < dim; ++idx) {
-      KERNEL_ASSIGN(out_ptr[idx], req, new_ptr[idx]);
-    }
-  }
-};
-
 template<typename xpu>
 void IndexCopyForward(const nnvm::NodeAttrs& attrs,
                       const OpContext& ctx,
                       const std::vector<TBlob>& inputs,
                       const std::vector<OpReqType>& req,
-                      const std::vector<TBlob>& outputs) {
-  CHECK_EQ(inputs.size(), 3U);
-  CHECK_EQ(outputs.size(), 1U);
-  CHECK_EQ(req.size(), 1U);
-  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  const TBlob& out = outputs[0];
-  const TBlob& original_tensor = inputs[0];
-  const TBlob& idx_vector = inputs[1];
-  const TBlob& copied_tensor = inputs[2];
-  int dim = inputs[2].Size() / inputs[1].Size();
-  // copy original tensor to output
-  mxnet_op::copy(s, out, original_tensor);
-  // index copy
-  MSHADOW_TYPE_SWITCH(out.type_flag_, DType, {
-    MSHADOW_TYPE_SWITCH(idx_vector.type_flag_, IType, {
-      MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
-        mxnet_op::Kernel<index_copy_forward<req_type>, xpu>::Launch(s,
-                        idx_vector.Size(), dim,
-                        idx_vector.dptr<IType>(),
-                        copied_tensor.dptr<DType>(),
-                        out.dptr<DType>());
-      });
-    });
-  });
-}
-
-struct index_copy_backward {
-  template<typename DType, typename IType>
-  MSHADOW_XINLINE static void Map(int i,
-                                  int dim,
-                                  int index_size,
-                                  int req1, int req2,
-                                  DType* out_grad,
-                                  IType* index,
-                                  DType* in_grad_1,
-                                  DType* in_grad_2) {
-    // Copy to in_grad_2
-    for (int p = 0; p < index_size; ++p) {
-      int idx = static_cast<int>(index[p]);
-      if (i >= idx*dim && i < (idx+1)*dim) {
-        int offset = i - idx*dim;
-        KERNEL_ASSIGN(in_grad_2[p*dim+offset], req2, out_grad[i]);
-        return;
-      }
-    }
-    // Copy to in_grad_1
-    KERNEL_ASSIGN(in_grad_1[i], req1, out_grad[i]);
-  }
-};
+                      const std::vector<TBlob>& outputs);
 
 template<typename xpu>
 void IndexCopyBackward(const nnvm::NodeAttrs& attrs,
                        const OpContext& ctx,
                        const std::vector<TBlob>& inputs,
                        const std::vector<OpReqType>& req,
-                       const std::vector<TBlob>& outputs) {
-  CHECK_EQ(inputs.size(), 4U);
-  CHECK_EQ(outputs.size(), 3U);
-  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  const TBlob& out_grad = inputs[0];
-  const TBlob& index = inputs[2];
-  const TBlob& in_grad_1 = outputs[0];
-  const TBlob& in_grad_2 = outputs[2];
-  int dim = inputs[3].Size() / inputs[2].Size();
-  int index_size = inputs[2].Size();
-  Fill<false>(s, outputs[0], req[0], 0);
-  Fill<false>(s, outputs[2], req[2], 0);
-  // index_copy_backward
-  MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, {
-    MSHADOW_TYPE_SWITCH(index.type_flag_, IType, {
-      mxnet_op::Kernel<index_copy_backward, xpu>::Launch(s,
-                                  out_grad.Size(),
-                                  dim, index_size,
-                                  req[0], req[2],
-                                  out_grad.dptr<DType>(),
-                                  index.dptr<IType>(),
-                                  in_grad_1.dptr<DType>(),
-                                  in_grad_2.dptr<DType>());
-    });
-  });
-}
+                       const std::vector<TBlob>& outputs);
 
 inline bool IndexCopyShape(const nnvm::NodeAttrs& attrs,
                            mxnet::ShapeVector *in_attrs,
diff --git a/src/operator/contrib/index_copy.cc b/src/operator/contrib/index_copy.cc
index bcf6c02d3d37..f272a8860d85 100644
--- a/src/operator/contrib/index_copy.cc
+++ b/src/operator/contrib/index_copy.cc
@@ -26,6 +26,122 @@ namespace mxnet {
 namespace op {
 
+struct index_copy_fwd_cpu {
+  template<typename DType, typename IType>
+  static void Map(int i,
+                  const DType* new_tensor,
+                  const IType* idx,
+                  DType* out_tensor,
+                  int dim_size) {
+    DType* out_ptr = out_tensor + static_cast<int>(idx[i]) * dim_size;
+    const DType* new_ptr = new_tensor + i * dim_size;
+    std::memcpy(out_ptr, new_ptr, sizeof(DType) * dim_size);
+  }
+};
+
+template<>
+void IndexCopyForward<cpu>(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx,
+                           const std::vector<TBlob>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  CHECK_EQ(inputs.size(), 3U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  CHECK(req[0] != kAddTo);
+  if (req[0] == kNullOp) return;
+  mshadow::Stream<cpu> *s = ctx.get_stream<cpu>();
+  const TBlob& out = outputs[0];
+  const TBlob& original_tensor = inputs[0];
+  const TBlob& idx_vector = inputs[1];
+  const TBlob& copied_tensor = inputs[2];
+  int dim_size = inputs[2].Size() / inputs[1].Size();
+  // copy original tensor to output
+  copy(s, out, original_tensor);
+  // index copy
+  MSHADOW_TYPE_SWITCH(out.type_flag_, DType, {
+    MSHADOW_TYPE_SWITCH(idx_vector.type_flag_, IType, {
+      Kernel<index_copy_fwd_cpu, cpu>::Launch(
+        s, idx_vector.Size(), copied_tensor.dptr<DType>(),
+        idx_vector.dptr<IType>(), out.dptr<DType>(), dim_size);
+    });
+  });
+}
+
+struct index_copy_bwd_cpu {
+  template<typename DType, typename IType>
+  static void Map(int i,
+                  const DType* out_tensor_grad,
+                  DType* orig_tensor_grad,
+                  DType* new_tensor_grad,
+                  const IType* idx,
+                  int dim_size,
+                  int idx_size,
+                  OpReqType orig_req,
+                  OpReqType new_req) {
+    const int index = idx[i];
+    DType* new_ptr = new_tensor_grad + i * dim_size;
+    DType* orig_ptr = orig_tensor_grad + index * dim_size;
+    const DType* src_ptr = out_tensor_grad + index * dim_size;
+    for (int iter = 0; iter < dim_size; ++iter) {
+      KERNEL_ASSIGN(new_ptr[iter], new_req, src_ptr[iter]);
+    }
+    if (orig_req == kAddTo) {
+      for (int iter = 0; iter < dim_size; ++iter) {
+        orig_ptr[iter] -= src_ptr[iter];
+      }
+    } else if (orig_req == kNullOp) {
+      return;
+    } else {
+      std::memset(orig_ptr, 0, sizeof(DType) * dim_size);
+    }
+  }
+};
+
+template<>
+void IndexCopyBackward<cpu>(const nnvm::NodeAttrs& attrs,
+                            const OpContext& ctx,
+                            const std::vector<TBlob>& inputs,
+                            const std::vector<OpReqType>& req,
+                            const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  CHECK_EQ(inputs.size(), 4U);
+  CHECK_EQ(outputs.size(), 3U);
+  Stream<cpu> *s = ctx.get_stream<cpu>();
+  const TBlob& out_grad = inputs[0];
+  const TBlob& index = inputs[2];
+  const TBlob& in_grad_1 = outputs[0];
+  const TBlob& in_grad_2 = outputs[2];
+  int dim_size = inputs[3].Size() / inputs[2].Size();
+  int index_size = inputs[2].Size();
+  OpReqType orig_req = req[0];
+  OpReqType new_req = req[2];
+  // index_copy_backward
+  MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, {
+    MSHADOW_TYPE_SWITCH(index.type_flag_, IType, {
+      switch (orig_req) {
+        case kNullOp:
+          break;
+        case kWriteTo:
+        case kWriteInplace:
+          copy(s, in_grad_1, out_grad);
+          break;
+        case kAddTo:
+          Kernel<op_with_req<mshadow_op::plus, kWriteTo>, cpu>::Launch(
+            s, out_grad.Size(), in_grad_1.dptr<DType>(),
+            out_grad.dptr<DType>(), in_grad_1.dptr<DType>());
+      }
+      Kernel<index_copy_bwd_cpu, cpu>::Launch(
+        s, index_size, out_grad.dptr<DType>(),
+        in_grad_1.dptr<DType>(), in_grad_2.dptr<DType>(),
+        index.dptr<IType>(), dim_size, index_size, orig_req, new_req);
+    });
+  });
+}
+
 static bool IndexCopyType(const nnvm::NodeAttrs& attrs,
                           std::vector<int> *in_attrs,
                           std::vector<int> *out_attrs) {
@@ -71,6 +187,10 @@ Examples::
 .set_attr<nnvm::FInferType>("FInferType", IndexCopyType)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_contrib_backward_index_copy"})
 .set_attr<FCompute>("FCompute", IndexCopyForward<cpu>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"old_tensor", "index_vector", "new_tensor"};
+  })
 .add_argument("old_tensor", "NDArray-or-Symbol", "Old tensor")
.add_argument("index_vector", "NDArray-or-Symbol", "Index vector") .add_argument("new_tensor", "NDArray-or-Symbol", "New tensor to be copied"); diff --git a/src/operator/contrib/index_copy.cu b/src/operator/contrib/index_copy.cu index dc416114b04d..53f2600aba06 100644 --- a/src/operator/contrib/index_copy.cu +++ b/src/operator/contrib/index_copy.cu @@ -18,7 +18,7 @@ */ /*! - * \file index_copy.cc + * \file index_copy.cu * \brief */ #include "./index_copy-inl.h" @@ -26,6 +26,114 @@ namespace mxnet { namespace op { +struct index_copy_fwd_gpu { + template + MSHADOW_XINLINE static void Map(int i, + const DType* new_tensor, + const IType* idx, + DType* out_tensor, + int dim_size) { + int index = static_cast(idx[i / dim_size]); + out_tensor[index * dim_size + i % dim_size] = new_tensor[i]; + } +}; + +template<> +void IndexCopyForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mxnet_op; + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + CHECK(req[0] != kAddTo); + if (req[0] == kNullOp) return; + mshadow::Stream *s = ctx.get_stream(); + const TBlob& out = outputs[0]; + const TBlob& original_tensor = inputs[0]; + const TBlob& idx_vector = inputs[1]; + const TBlob& copied_tensor = inputs[2]; + int dim_size = inputs[2].Size() / inputs[1].Size(); + // copy original tensor to output + copy(s, out, original_tensor); + // index copy + MSHADOW_TYPE_SWITCH(out.type_flag_, DType, { + MSHADOW_TYPE_SWITCH(idx_vector.type_flag_, IType, { + Kernel::Launch( + s, copied_tensor.Size(), copied_tensor.dptr(), + idx_vector.dptr(), out.dptr(), dim_size); + }); + }); +} + +struct index_copy_bwd_gpu { + template + MSHADOW_XINLINE static void Map(int i, + const DType* out_grad, + DType* orig_grad, + DType* new_grad, + const IType* idx, + int dim_size, + int idx_size, + OpReqType orig_req, + OpReqType new_req) { + int index = idx[i / dim_size]; + KERNEL_ASSIGN(new_grad[i], new_req, out_grad[index * dim_size + i % dim_size]); + if (orig_req == kAddTo) { + orig_grad[index * dim_size + i % dim_size] -= new_grad[i]; + } else if (orig_req == kNullOp) { + return; + } else { + orig_grad[index * dim_size + i % dim_size] = 0; + } + } +}; + +template<> +void IndexCopyBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mxnet_op; + CHECK_EQ(inputs.size(), 4U); + CHECK_EQ(outputs.size(), 3U); + Stream *s = ctx.get_stream(); + const TBlob& out_grad = inputs[0]; + const TBlob& index = inputs[2]; + const TBlob& in_grad_1 = outputs[0]; + const TBlob& in_grad_2 = outputs[2]; + int dim_size = inputs[3].Size() / inputs[2].Size(); + int index_size = inputs[2].Size(); + OpReqType orig_req = req[0]; + OpReqType new_req = req[2]; + // index_copy_backward + MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, { + MSHADOW_TYPE_SWITCH(index.type_flag_, IType, { + switch (orig_req) { + case kNullOp: + break; + case kWriteTo: + case kWriteInplace: + copy(s, in_grad_1, out_grad); + break; + case kAddTo: + Kernel, gpu>::Launch( + s, out_grad.Size(), in_grad_1.dptr(), + out_grad.dptr(), in_grad_1.dptr()); + } + Kernel::Launch( + s, in_grad_2.Size(), out_grad.dptr(), + in_grad_1.dptr(), in_grad_2.dptr(), + index.dptr(), dim_size, index_size, orig_req, new_req); + }); + }); +} + NNVM_REGISTER_OP(_contrib_index_copy) .set_attr("FCompute", 
IndexCopyForward);
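
For reference, a minimal standalone C++ sketch of the semantics the new kernels implement (illustrative names only, not part of the MXNet sources, assuming `dim_size = new_tensor.Size() / index_vector.Size()` as in the code above):

```cpp
#include <cstring>
#include <iostream>
#include <vector>

// Forward semantics of _contrib_index_copy: copy old_tensor to the output,
// then overwrite the rows named by index with the rows of new_tensor,
// mirroring the std::memcpy in index_copy_fwd_cpu.
std::vector<float> index_copy_forward(const std::vector<float>& old_tensor,
                                      const std::vector<int>& index,
                                      const std::vector<float>& new_tensor) {
  const int dim_size = static_cast<int>(new_tensor.size() / index.size());
  std::vector<float> out = old_tensor;  // corresponds to copy(s, out, original_tensor)
  for (std::size_t i = 0; i < index.size(); ++i) {
    // one Map(i, ...) call of index_copy_fwd_cpu
    std::memcpy(out.data() + index[i] * dim_size,
                new_tensor.data() + i * dim_size,
                sizeof(float) * dim_size);
  }
  return out;
}

int main() {
  std::vector<float> old_t = {1, 1, 2, 2, 3, 3};  // 3 rows of width 2
  std::vector<int>   idx   = {0, 2};              // rows to overwrite
  std::vector<float> new_t = {9, 9, 8, 8};        // 2 replacement rows
  for (float v : index_copy_forward(old_t, idx, new_t)) std::cout << v << ' ';
  std::cout << '\n';                              // prints: 9 9 2 2 8 8
}
```

The backward specializations invert this mapping: `in_grad_2` (the gradient of `new_tensor`) gathers the indexed rows of `out_grad`, while `in_grad_1` (the gradient of `old_tensor`) is `out_grad` with those rows zeroed; under `kAddTo` the kernels instead accumulate all of `out_grad` into `in_grad_1` and then subtract the indexed rows' contribution back out.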