diff --git a/python/mxnet/_numpy_op_doc.py b/python/mxnet/_numpy_op_doc.py
index 7679d21f0e59..0d0e3b64491b 100644
--- a/python/mxnet/_numpy_op_doc.py
+++ b/python/mxnet/_numpy_op_doc.py
@@ -1053,6 +1053,58 @@ def _np_diag(array, k=0):
     pass
 
 
+def _np_diagonal(a, offset=0, axis1=0, axis2=1):
+    """
+    If a is 2-D, returns the diagonal of a with the given offset, i.e., the collection of elements of
+    the form a[i, i+offset]. If a has more than two dimensions, then the axes specified by axis1 and
+    axis2 are used to determine the 2-D sub-array whose diagonal is returned. The shape of the
+    resulting array can be determined by removing axis1 and axis2 and appending an index to the
+    right equal to the size of the resulting diagonals.
+
+    Parameters
+    ----------
+    a : Symbol
+        Input data from which diagonal are taken.
+    offset: int, Optional
+        Offset of the diagonal from the main diagonal
+    axis1: int, Optional
+        Axis to be used as the first axis of the 2-D sub-arrays
+    axis2: int, Optional
+        Axis to be used as the second axis of the 2-D sub-arrays
+
+    Returns
+    -------
+    out : Symbol
+        Output result
+
+    Raises
+    -------
+    ValueError:  If the dimension of a is less than 2.
+
+    Examples
+    --------
+    >>> a = np.arange(4).reshape(2,2)
+    >>> a
+    array([[0, 1],
+        [2, 3]])
+    >>> np.diagonal(a)
+    array([0, 3])
+    >>> np.diagonal(a, 1)
+    array([1])
+
+    >>> a = np.arange(8).reshape(2,2,2)
+    >>>a
+    array([[[0, 1],
+            [2, 3]],
+            [[4, 5],
+            [6, 7]]])
+    >>> np.diagonal(a, 0, 0, 1)
+    array([[0, 6],
+            [1, 7]])
+    """
+    pass
+
+
 def _np_diagflat(array, k=0):
     """
     Create a two-dimensional array with the flattened input as a diagonal.
@@ -1086,4 +1138,4 @@ def _np_diagflat(array, k=0):
            [0, 0, 2],
            [0, 0, 0]])
     """
-    pass
\ No newline at end of file
+    pass
diff --git a/python/mxnet/numpy_dispatch_protocol.py b/python/mxnet/numpy_dispatch_protocol.py
index c8b11d85b000..1a238ec2c7c7 100644
--- a/python/mxnet/numpy_dispatch_protocol.py
+++ b/python/mxnet/numpy_dispatch_protocol.py
@@ -95,6 +95,7 @@ def _run_with_array_ufunc_proto(*args, **kwargs):
     'copy',
     'cumsum',
     'diag',
+    'diagonal',
     'diagflat',
     'dot',
     'expand_dims',
diff --git a/src/operator/numpy/np_matrix_op-inl.h b/src/operator/numpy/np_matrix_op-inl.h
index 37af908042c9..3abf5627f20f 100644
--- a/src/operator/numpy/np_matrix_op-inl.h
+++ b/src/operator/numpy/np_matrix_op-inl.h
@@ -1156,6 +1156,224 @@ void NumpyDiagOpBackward(const nnvm::NodeAttrs &attrs,
                              in_data.Size(), param.k, s, req[0]);
 }
 
+struct NumpyDiagonalParam : public dmlc::Parameter<NumpyDiagonalParam> {
+  int offset;
+  int32_t axis1;
+  int32_t axis2;
+  DMLC_DECLARE_PARAMETER(NumpyDiagonalParam) {
+    DMLC_DECLARE_FIELD(offset)
+      .set_default(0)
+      .describe("Diagonal in question. The default is 0. "
+                "Use k>0 for diagonals above the main diagonal, "
+                "and k<0 for diagonals below the main diagonal. "
+                "If input has shape (S0 S1) k must be between -S0 and S1");
+    DMLC_DECLARE_FIELD(axis1)
+      .set_default(0)
+      .describe("The first axis of the sub-arrays of interest. "
+                "Ignored when the input is a 1-D array.");
+    DMLC_DECLARE_FIELD(axis2)
+      .set_default(1)
+      .describe("The second axis of the sub-arrays of interest. "
+                "Ignored when the input is a 1-D array.");
+  }
+};
+
+inline mxnet::TShape NumpyDiagonalShapeImpl(const mxnet::TShape& ishape, const int k,
+                                            const int32_t axis1, const int32_t axis2) {
+  int32_t x1 = CheckAxis(axis1, ishape.ndim());
+  int32_t x2 = CheckAxis(axis2, ishape.ndim());
+
+  CHECK_NE(x1, x2) << "axis1 and axis2 cannot refer to the same axis " << x1;
+
+  auto h = ishape[x1];
+  auto w = ishape[x2];
+  if (k > 0) {
+    w -= k;
+  } else if (k < 0) {
+    h += k;
+  }
+  auto s = std::min(h, w);
+  if (s < 0) s = 0;
+  if (x1 > x2) std::swap(x1, x2);
+
+  int32_t n_dim = ishape.ndim() - 1;
+  mxnet::TShape oshape(n_dim, -1);
+
+  // remove axis1 and axis2 and append the new axis to the end
+  uint32_t idx = 0;
+  for (int i = 0; i <= n_dim; ++i) {
+    if (i != x1 && i != x2) {
+      oshape[idx++] = ishape[i];
+    }
+  }
+  oshape[n_dim - 1] = s;
+  return oshape;
+}
+
+inline bool NumpyDiagonalOpShape(const nnvm::NodeAttrs& attrs,
+                                 mxnet::ShapeVector* in_attrs,
+                                 mxnet::ShapeVector* out_attrs) {
+    CHECK_EQ(in_attrs->size(), 1U);
+    CHECK_EQ(out_attrs->size(), 1U);
+
+    const mxnet::TShape& ishape = (*in_attrs)[0];
+    CHECK_GE(ishape.ndim(), 2) << "Input array should be at least 2d";
+    if (!mxnet::ndim_is_known(ishape)) {
+      return false;
+    }
+
+    const NumpyDiagonalParam& param = nnvm::get<NumpyDiagonalParam>(attrs.parsed);
+    mxnet::TShape oshape = NumpyDiagonalShapeImpl(ishape, param.offset, param.axis1,
+                                         param.axis2);
+    if (shape_is_none(oshape)) {
+      LOG(FATAL) << "Diagonal does not exist.";
+    }
+    SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape);
+    return shape_is_known(out_attrs->at(0));
+}
+
+inline bool NumpyDiagonalOpType(const nnvm::NodeAttrs& attrs,
+                                std::vector<int> *in_attrs,
+                                std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[0]);
+  TYPE_ASSIGN_CHECK(*in_attrs, 0, (*out_attrs)[0]);
+  return (*out_attrs)[0] != -1;
+}
+
+template<int ndim, int req, bool back>
+struct diag_n {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* a,
+                                  mshadow::Shape<ndim> oshape,
+                                  mshadow::Shape<ndim> ishape,
+                                  index_t stride, index_t offset,
+                                  index_t base) {
+    using namespace mxnet_op;
+    index_t idx = i / base;
+    index_t j = ravel(unravel(idx, oshape), ishape) + offset + stride * (i - idx * base);
+    if (back) {
+      KERNEL_ASSIGN(out[j], req, a[i]);
+    } else {
+      KERNEL_ASSIGN(out[i], req, a[j]);
+    }
+  }
+};
+
+template<typename xpu, bool back>
+void NumpyDiagonalOpImpl(const TBlob& in_data,
+                         const TBlob& out_data,
+                         const mxnet::TShape& ishape,
+                         const mxnet::TShape& oshape,
+                         index_t dsize,
+                         const NumpyDiagonalParam& param,
+                         mxnet_op::Stream<xpu> *s,
+                         const std::vector<OpReqType>& req) {
+  using namespace mxnet_op;
+  using namespace mshadow;
+  uint32_t x1 = CheckAxis(param.axis1, ishape.ndim());
+  uint32_t x2 = CheckAxis(param.axis2, ishape.ndim());
+  uint32_t idim = ishape.ndim(), odim = oshape.ndim();
+  uint32_t minx = x1, maxx = x2;
+  if (minx > maxx) std::swap(minx, maxx);
+
+  index_t oleading = 1,
+          obody = 1,
+          otrailing = 1;
+  for (uint32_t i = 0; i < minx; ++i) {
+    oleading *= ishape[i];
+  }
+  for (uint32_t i = minx + 1; i < maxx; ++i) {
+    obody *= ishape[i];
+  }
+  for (uint32_t i = maxx + 1; i < idim; ++i) {
+    otrailing *= ishape[i];
+  }
+
+  index_t ileading = oleading,
+      ibody = obody * ishape[minx],
+      itrailing = otrailing * ishape[maxx];
+
+  index_t stride1 = itrailing * obody,
+      stride2 = otrailing;
+  // stride1 + stride2 is the stride for iterating over the diagonal
+
+  if (x1 == maxx) std::swap(stride1, stride2);
+  index_t offset;
+  int k = param.offset;
+  if (k > 0) {
+    offset = stride2 * k;
+  } else if (k < 0) {
+    offset = stride1 * -k;
+  } else {
+    offset = 0;
+  }  // the extra index offset introduced by k
+
+  MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, {
+    MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
+      if (back && req[0] != kAddTo && req[0] != kNullOp) {
+        out_data.FlatTo1D<xpu, DType>(s) = 0;
+      }
+      if (ileading == 1) {
+        Kernel<diag_n<2, req_type, back>, xpu>::Launch(s, dsize, out_data.dptr<DType>(),
+          in_data.dptr<DType>(), Shape2(obody, otrailing), Shape2(ibody, itrailing),
+          stride1 + stride2, offset, oshape[odim - 1]);
+      } else {
+        Kernel<diag_n<3, req_type, back>, xpu>::Launch(s, dsize, out_data.dptr<DType>(),
+          in_data.dptr<DType>(), Shape3(oleading, obody, otrailing),
+          Shape3(ileading, ibody, itrailing), stride1 + stride2, offset, oshape[odim - 1]);
+      }
+    });
+  });
+}
+
+template<typename xpu>
+void NumpyDiagonalOpForward(const nnvm::NodeAttrs& attrs,
+                            const OpContext& ctx,
+                            const std::vector<TBlob>& inputs,
+                            const std::vector<OpReqType>& req,
+                            const std::vector<TBlob>& outputs) {
+  using namespace mxnet_op;
+  using namespace mshadow;
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  CHECK_EQ(req[0], kWriteTo);
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  const TBlob& in_data = inputs[0];
+  const TBlob& out_data = outputs[0];
+  const mxnet::TShape& ishape = inputs[0].shape_;
+  const mxnet::TShape& oshape = outputs[0].shape_;
+  const NumpyDiagonalParam& param = nnvm::get<NumpyDiagonalParam>(attrs.parsed);
+
+  NumpyDiagonalOpImpl<xpu, false>(in_data, out_data, ishape, oshape,
+                                  out_data.Size(), param, s, req);
+}
+
+template<typename xpu>
+void NumpyDiagonalOpBackward(const nnvm::NodeAttrs& attrs,
+                             const OpContext& ctx,
+                             const std::vector<TBlob>& inputs,
+                             const std::vector<OpReqType>& req,
+                             const std::vector<TBlob>& outputs) {
+  using namespace mxnet_op;
+  using namespace mshadow;
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+
+  const TBlob& in_data = inputs[0];
+  const TBlob& out_data = outputs[0];
+  const mxnet::TShape& ishape = inputs[0].shape_;
+  const mxnet::TShape& oshape = outputs[0].shape_;
+  const NumpyDiagonalParam& param = nnvm::get<NumpyDiagonalParam>(attrs.parsed);
+
+  NumpyDiagonalOpImpl<xpu, true>(in_data, out_data, oshape, ishape,
+                                 in_data.Size(), param, s, req);
+}
+
 struct NumpyDiagflatParam : public dmlc::Parameter<NumpyDiagflatParam> {
   int k;
   DMLC_DECLARE_PARAMETER(NumpyDiagflatParam) {
diff --git a/src/operator/numpy/np_matrix_op.cc b/src/operator/numpy/np_matrix_op.cc
index 05b7e948e8a0..0a85887e2d4f 100644
--- a/src/operator/numpy/np_matrix_op.cc
+++ b/src/operator/numpy/np_matrix_op.cc
@@ -38,6 +38,7 @@ DMLC_REGISTER_PARAMETER(NumpyRot90Param);
 DMLC_REGISTER_PARAMETER(NumpyReshapeParam);
 DMLC_REGISTER_PARAMETER(NumpyXReshapeParam);
 DMLC_REGISTER_PARAMETER(NumpyDiagParam);
+DMLC_REGISTER_PARAMETER(NumpyDiagonalParam);
 DMLC_REGISTER_PARAMETER(NumpyDiagflatParam);
 
 
@@ -1332,6 +1333,28 @@ NNVM_REGISTER_OP(_backward_np_diag)
 .set_attr<nnvm::TIsBackward>("TIsBackward", true)
 .set_attr<FCompute>("FCompute<cpu>", NumpyDiagOpBackward<cpu>);
 
+NNVM_REGISTER_OP(_np_diagonal)
+.set_attr_parser(ParamParser<NumpyDiagonalParam>)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"data"};
+  })
+.set_attr<mxnet::FInferShape>("FInferShape", NumpyDiagonalOpShape)
+.set_attr<nnvm::FInferType>("FInferType", NumpyDiagonalOpType)
+.set_attr<FCompute>("FCompute<cpu>", NumpyDiagonalOpForward<cpu>)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_np_diagonal"})
+.add_argument("data", "NDArray-or-Symbol", "Input ndarray")
+.add_arguments(NumpyDiagonalParam::__FIELDS__());
+
+NNVM_REGISTER_OP(_backward_np_diagonal)
+.set_attr_parser(ParamParser<NumpyDiagonalParam>)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FCompute>("FCompute<cpu>", NumpyDiagonalOpBackward<cpu>);
+
 NNVM_REGISTER_OP(_np_diagflat)
 .set_attr_parser(ParamParser<NumpyDiagflatParam>)
 .set_num_inputs(1)
diff --git a/src/operator/numpy/np_matrix_op.cu b/src/operator/numpy/np_matrix_op.cu
index 6f292ab95802..10ff0eac2c29 100644
--- a/src/operator/numpy/np_matrix_op.cu
+++ b/src/operator/numpy/np_matrix_op.cu
@@ -124,6 +124,12 @@ NNVM_REGISTER_OP(_np_diag)
 NNVM_REGISTER_OP(_backward_np_diag)
 .set_attr<FCompute>("FCompute<gpu>", NumpyDiagOpBackward<gpu>);
 
+NNVM_REGISTER_OP(_np_diagonal)
+.set_attr<FCompute>("FCompute<gpu>", NumpyDiagonalOpForward<gpu>);
+
+NNVM_REGISTER_OP(_backward_np_diagonal)
+.set_attr<FCompute>("FCompute<gpu>", NumpyDiagonalOpBackward<gpu>);
+
 NNVM_REGISTER_OP(_np_diagflat)
 .set_attr<FCompute>("FCompute<gpu>", NumpyDiagflatOpForward<gpu>);
 
diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py
index d1047c80b7b4..4c4e8b90eca9 100644
--- a/tests/python/unittest/test_numpy_interoperability.py
+++ b/tests/python/unittest/test_numpy_interoperability.py
@@ -97,6 +97,23 @@ def get_mat(n):
     OpArgMngr.add_workload('diag', vals_f, k=-2)
 
 
+def _add_workload_diagonal():
+    A = np.arange(12).reshape((3, 4))
+    B = np.arange(8).reshape((2,2,2))
+
+    OpArgMngr.add_workload('diagonal', A)
+    OpArgMngr.add_workload('diagonal', A, offset=0)
+    OpArgMngr.add_workload('diagonal', A, offset=-1)
+    OpArgMngr.add_workload('diagonal', A, offset=1)
+    OpArgMngr.add_workload('diagonal', B, offset=0)
+    OpArgMngr.add_workload('diagonal', B, offset=1)
+    OpArgMngr.add_workload('diagonal', B, offset=-1)
+    OpArgMngr.add_workload('diagonal', B, 0, 1, 2)
+    OpArgMngr.add_workload('diagonal', B, 0, 0, 1)
+    OpArgMngr.add_workload('diagonal', B, offset=1, axis1=0, axis2=2)
+    OpArgMngr.add_workload('diagonal', B, 0, 2, 1)
+
+
 def _add_workload_concatenate(array_pool):
     OpArgMngr.add_workload('concatenate', [array_pool['4x1'], array_pool['4x1']])
     OpArgMngr.add_workload('concatenate', [array_pool['4x1'], array_pool['4x1']], axis=1)
@@ -1364,6 +1381,7 @@ def _prepare_workloads():
     _add_workload_ravel()
     _add_workload_unravel_index()
     _add_workload_diag()
+    _add_workload_diagonal()
     _add_workload_diagflat()
     _add_workload_dot()
     _add_workload_expand_dims()
diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index 20482e8241da..078e37fc4146 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -5158,6 +5158,85 @@ def hybrid_forward(self, F, a):
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
 
 
+
+@with_seed()
+@use_np
+def test_np_diagonal():
+    class TestDiagonal(HybridBlock):
+        def __init__(self, k=0, axis1=0, axis2=1):
+            super(TestDiagonal, self).__init__()
+            self._k = k
+            self._axis1 = axis1
+            self._axis2 = axis2
+
+        def hybrid_forward(self, F, a):
+            return F.np.diagonal(a, self._k, self._axis1, self._axis2)
+
+    configs = [
+        [(1, 5), (0, 1)], [(2, 2),(0, 1)],
+        [(2, 5), (0, 1)], [(5, 5), (0, 1)],
+        [(2, 2, 2), (0, 1)], [(2, 4, 4), (0, 2)],
+        [(3, 3, 3), (1, 2)], [(4, 8, 8), (1, 2)],
+        [(4, 4, 4, 4), (1, 2)], [(5, 6, 7, 8), (2, 3)],
+        [(6, 7, 8, 9, 10), (3, 4)]
+    ]
+    dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64]
+    offsets = [0, 2, 4, 6]
+    combination = itertools.product([False, True], configs, dtypes, offsets)
+    for hybridize, config, dtype, k in combination:
+        rtol = 1e-2 if dtype == np.float16 else 1e-3
+        atol = 1e-4 if dtype == np.float16 else 1e-5
+        shape = config[0]
+        axis = config[1]
+        axis1 = axis[0]
+        axis2 = axis[1]
+        x = np.random.uniform(-5.0, 5.0, size=shape).astype(dtype)
+        x.attach_grad()
+        test_diagonal = TestDiagonal(k, axis1, axis2)
+        if hybridize:
+            test_diagonal.hybridize()
+        np_out = _np.diagonal(x.asnumpy(), offset=k, axis1=axis[0], axis2=axis[1])
+        with mx.autograd.record():
+            mx_out = test_diagonal(x)
+        assert mx_out.shape == np_out.shape
+        assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
+
+        # check backward function
+        mx_out.backward()
+        size_out = np_out.size
+        shape_out = np_out.shape
+        ndim = len(shape)
+        h = shape[axis1]
+        w = shape[axis2]
+        np_backward_slice = _np.zeros((h, w))
+        np_backward = _np.zeros(shape)
+        if k > 0:
+            w -= k
+        else:
+            h += k
+        s = min(w, h)
+        if s > 0:
+            if k >= 0:
+                for i in range(s):
+                    np_backward_slice[0+i][k+i] = 1
+            else:
+                for i in range(s):
+                    np_backward_slice[-k+i][0+i] = 1
+            ileading = int(size_out/s)
+            array_temp = _np.array([np_backward_slice for i in range(ileading)])
+            array_temp = array_temp.reshape(shape_out[:-1] + (shape[axis1], shape[axis2]))
+            axis_idx = [i for i in range(ndim-2)]
+            axis_idx[axis1:axis1] = [ndim - 2]
+            axis_idx[axis2:axis2] = [ndim - 1] 
+            np_backward = _np.transpose(array_temp, tuple(axis_idx))
+        assert_almost_equal(x.grad.asnumpy(), np_backward, rtol=rtol, atol=atol)
+
+        # Test imperative once again
+        mx_out = np.diagonal(x, k, axis[0], axis[1])
+        np_out = _np.diagonal(x.asnumpy(), offset=k, axis1=axis[0], axis2=axis[1])
+        assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
+
+
 @with_seed()
 @use_np
 def test_np_nan_to_num():