From ce4e902010b10bde1f3b31828d705b2a361ec9a8 Mon Sep 17 00:00:00 2001
From: Jason Yu <yuzhijingcheng1996@hotmail.com>
Date: Sat, 1 Sep 2018 18:14:00 +0800
Subject: [PATCH 01/11] Support for N-d arrays added to diag op.

---
 CONTRIBUTORS.md                   |   1 +
 src/operator/tensor/diag_op-inl.h | 190 ++++++++++++++++++++++--------
 2 files changed, 139 insertions(+), 52 deletions(-)
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 8d8aeaca73e4..32f2b22164f3 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -178,3 +178,4 @@ List of Contributors
 * [Aaron Markham](https://github.com/aaronmarkham)
 * [Sam Skalicky](https://github.com/samskalicky)
 * [Per Goncalves da Silva](https://github.com/perdasilva)
+* [Zhijingcheng Yu](https://github.com/jasonyu1996)
diff --git a/src/operator/tensor/diag_op-inl.h b/src/operator/tensor/diag_op-inl.h
index 3bc240f206b4..f1697024ad6a 100644
--- a/src/operator/tensor/diag_op-inl.h
+++ b/src/operator/tensor/diag_op-inl.h
@@ -21,7 +21,7 @@
 * Copyright (c) 2015 by Contributors
 * \file diag_op-inl.h
 * \brief CPU Implementation of the diag op
-* \author Istvan Fehervari
+* \author Istvan Fehervari, Zhijingcheng Yu
 */
 
 #ifndef MXNET_OPERATOR_TENSOR_DIAG_OP_INL_H_
@@ -33,12 +33,15 @@
 #include "../mxnet_op.h"
 #include "../operator_common.h"
 #include "../elemwise_op_common.h"
+#include "./broadcast_reduce_op.h"
 
 namespace mxnet {
 namespace op {
 
 struct DiagParam : public dmlc::Parameter<DiagParam> {
   dmlc::optional<int> k;
+  dmlc::optional<int> axis1;
+  dmlc::optional<int> axis2;
   DMLC_DECLARE_PARAMETER(DiagParam) {
     DMLC_DECLARE_FIELD(k)
     .set_default(dmlc::optional<int>(0))
@@ -46,17 +49,24 @@ struct DiagParam : public dmlc::Parameter<DiagParam> {
               "Use k>0 for diagonals above the main diagonal, "
               "and k<0 for diagonals below the main diagonal. "
               "If input has shape (S0 S1) k must be between -S0 and S1");
+    DMLC_DECLARE_FIELD(axis1)
+    .set_default(dmlc::optional<int>(0))
+    .describe("The first axis of the submatrix in question. Ignored when the input is a 1-D array.");
+    DMLC_DECLARE_FIELD(axis2)
+    .set_default(dmlc::optional<int>(1))
+    .describe("The second axis of the submatrix in question. Ignored when the input is a 1-D array.");
   }
 };
 
-inline TShape DiagShapeImpl(const TShape& ishape, const nnvm::dim_t k) {
+inline TShape DiagShapeImpl(const TShape& ishape, const nnvm::dim_t k, 
+                            const nnvm::dim_t axis1, const nnvm::dim_t axis2) {
   if (ishape.ndim() == 1) {
     auto s = ishape[0] + std::abs(k);
     return TShape({s, s});
   }
 
-  auto h = ishape[0];
-  auto w = ishape[1];
+  auto h = ishape[axis1];
+  auto w = ishape[axis2];
 
   if (k > 0) {
     w -= k;
@@ -69,7 +79,28 @@ inline TShape DiagShapeImpl(const TShape& ishape, const nnvm::dim_t k) {
     s = 0;
   }
 
-  return TShape({s});
+  auto x1 = CheckAxis(axis1, ishape.ndim());
+  auto x2 = CheckAxis(axis2, ishape.ndim());
+
+  CHECK_NE(x1, x2) << "axis1 and axis2 cannot refer the the same axis " << x1;
+
+  if (x1 > x2) {
+    std::swap(x1, x2);
+  }
+
+  int n_dim = static_cast<int>(ishape.ndim()) - 1;
+  TShape oshape(n_dim);
+
+  // remove axis1 and axis2 and append the new axis to the end
+  for (int i = 0; i < x1; i ++)
+    oshape[i] = ishape[i];
+  for (int i = x1; i < x2; i ++)
+    oshape[i] = ishape[i + 1];
+  for (int i = x2; i < n_dim; i ++)
+    oshape[i] = ishape[i + 2];
+  oshape[n_dim - 1] = s;
+
+  return oshape;
 }
 
 inline bool DiagOpShape(const nnvm::NodeAttrs& attrs,
@@ -80,11 +111,13 @@ inline bool DiagOpShape(const nnvm::NodeAttrs& attrs,
 
     const TShape& ishape = (*in_attrs)[0];
     if (ishape.ndim() == 0) return false;
-    if (ishape.ndim() > 2) LOG(FATAL) << "Input must be 1- or 2-d.";
 
     const DiagParam& param = nnvm::get<DiagParam>(attrs.parsed);
 
-    TShape oshape = DiagShapeImpl(ishape, param.k.value());
+    TShape oshape = DiagShapeImpl(ishape,
+                                  param.k.value(), 
+                                  param.axis1.value(),
+                                  param.axis2.value());
     if (shape_is_none(oshape)) {
       LOG(FATAL) << "Diagonal does not exist.";
     }
@@ -104,26 +137,26 @@ inline bool DiagOpType(const nnvm::NodeAttrs& attrs,
   return (*out_attrs)[0] != -1;
 }
 
-template<int req>
+template<int ndim, int req, bool back>
 struct diag {
   template<typename DType>
   MSHADOW_XINLINE static void Map(int i, DType* out, const DType* a,
-                                  mshadow::Shape<2> ishape, int k) {
+                                  mshadow::Shape<ndim> oshape,
+                                  mshadow::Shape<ndim> ishape,
+                                  int stride, int offset,
+                                  int base) {
     using namespace mxnet_op;
-    int j = 0;
-    if (k > 0) {
-      j = ravel(mshadow::Shape2(i, i + k), ishape);
-    } else if (k < 0) {
-      j = ravel(mshadow::Shape2(i - k, i), ishape);
-    } else {
-      j = ravel(mshadow::Shape2(i, i), ishape);
+    int idx = i / base;
+    int j = ravel(unravel(idx, oshape), ishape) + offset + stride * (i - idx * base);
+    if (back){
+      KERNEL_ASSIGN(out[j], req, a[i]);
+    } else{
+      KERNEL_ASSIGN(out[i], req, a[j]);
     }
-
-    KERNEL_ASSIGN(out[i], req, a[j]);
   }
 };
 
-template<int req>
+template<int req, bool back>
 struct diag_gen {
   template<typename DType>
   MSHADOW_XINLINE static void Map(int i, DType* out, const DType* a,
@@ -133,13 +166,92 @@ struct diag_gen {
     auto j = unravel(i, oshape);
     if (j[1] == (j[0] + k)) {
       auto l = j[0] < j[1] ? j[0] : j[1];
-      KERNEL_ASSIGN(out[i], req, a[l]);
-    } else {
+      if (back){
+        KERNEL_ASSIGN(out[l], req, a[i]);
+      } else{
+        KERNEL_ASSIGN(out[i], req, a[l]);
+      }
+    } else if (!back) {
       KERNEL_ASSIGN(out[i], req, static_cast<DType>(0));
     }
   }
 };
 
+template<typename xpu, bool back>
+void DiagOpProcess(const TBlob& in_data, 
+                   const TBlob& out_data,
+                   const TShape& ishape,
+                   const TShape& oshape,
+                   int dsize,
+                   const DiagParam& param,
+                   mxnet_op::Stream<xpu> *s,
+                   const std::vector<OpReqType>& req){
+  using namespace mxnet_op;
+  using namespace mshadow;
+  if (ishape.ndim() > 1) {
+    // input : (leading + i, body + i, trailing)
+    int x1 = CheckAxis(param.axis1.value(), ishape.ndim());
+    int x2 = CheckAxis(param.axis2.value(), ishape.ndim());
+
+    int idim = ishape.ndim(), odim = oshape.ndim();
+
+    int minx = x1, maxx = x2;
+    if (minx > maxx)
+      std::swap(minx, maxx);
+
+    int oleading = 1, obody = 1, otrailing = 1;
+    for (int i = 0; i < minx; i ++)
+      oleading *= ishape[i];
+    for (int i = minx + 1; i < maxx; i ++)
+      obody *= ishape[i];
+    for (int i = maxx + 1; i < idim; i ++)
+      otrailing *= ishape[i];
+
+
+    int ileading = oleading, 
+        ibody = obody * ishape[minx],
+        itrailing = otrailing * ishape[maxx];
+    
+    int stride1 = itrailing * obody,
+        stride2 = otrailing;
+
+    if (x1 == maxx)
+      std::swap(stride1, stride2);
+    int offset, k = param.k.value();
+    if (k > 0) {
+      offset = stride2 * k;
+    } else if(k < 0) {
+      offset = stride1 * -k;
+    } else {
+      offset = 0;
+    }
+
+    MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, {
+      MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
+        if(ileading == 1){
+          Kernel<diag<2, req_type, back>, xpu>::Launch(s, dsize, out_data.dptr<DType>(),
+                              in_data.dptr<DType>(), Shape2(obody, otrailing),
+                              Shape2(ibody, itrailing),
+                              stride1 + stride2, offset, oshape[odim - 1]);
+        } else{
+          Kernel<diag<3, req_type, back>, xpu>::Launch(s, dsize, out_data.dptr<DType>(),
+                              in_data.dptr<DType>(), Shape3(oleading, obody, otrailing),
+                              Shape3(ileading, ibody, itrailing),
+                              stride1 + stride2, offset, oshape[odim - 1]);
+        }
+      });
+    });
+  } else {
+    MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, {
+      MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
+        Kernel<diag_gen<req_type, back>, xpu>::Launch(s, dsize, out_data.dptr<DType>(),
+                            in_data.dptr<DType>(), Shape2(oshape[0], oshape[1]), 
+                            param.k.value());
+      });
+    });
+  }
+}
+
 template<typename xpu>
 void DiagOpForward(const nnvm::NodeAttrs& attrs,
                    const OpContext& ctx,
@@ -159,21 +271,7 @@ void DiagOpForward(const nnvm::NodeAttrs& attrs,
   const TShape& oshape = outputs[0].shape_;
   const DiagParam& param = nnvm::get<DiagParam>(attrs.parsed);
 
-  if (ishape.ndim() == 2) {
-    MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, {
-      MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
-        Kernel<diag<req_type>, xpu>::Launch(s, out_data.Size(), out_data.dptr<DType>(),
-                            in_data.dptr<DType>(), Shape2(ishape[0], ishape[1]), param.k.value());
-      });
-    });
-  } else {
-    MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, {
-      MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
-        Kernel<diag_gen<req_type>, xpu>::Launch(s, out_data.Size(), out_data.dptr<DType>(),
-                            in_data.dptr<DType>(), Shape2(oshape[0], oshape[1]), param.k.value());
-      });
-    });
-  }
+  DiagOpProcess<xpu, false>(in_data, out_data, ishape, oshape, out_data.Size(), param, s, req);
 }
 
 template<typename xpu>
@@ -194,24 +292,12 @@ void DiagOpBackward(const nnvm::NodeAttrs& attrs,
   const TShape& oshape = outputs[0].shape_;
   const DiagParam& param = nnvm::get<DiagParam>(attrs.parsed);
 
-  if (oshape.ndim() == 2) {
-    MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, {
-      MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
-        Kernel<diag_gen<req_type>, xpu>::Launch(s, out_data.Size(), out_data.dptr<DType>(),
-                            in_data.dptr<DType>(), Shape2(oshape[0], oshape[1]), param.k.value());
-      });
-    });
-  } else {
-    MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, {
-      MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
-        Kernel<diag<req_type>, xpu>::Launch(s, out_data.Size(), out_data.dptr<DType>(),
-                            in_data.dptr<DType>(), Shape2(ishape[0], ishape[1]), param.k.value());
-      });
-    });
-  }
+
+  DiagOpProcess<xpu, true>(in_data, out_data, oshape, ishape, in_data.Size(), param, s, req);
 }
 
-}  // namespace op
+
+} // namespace op
 }  // namespace mxnet
 
 #endif  // MXNET_OPERATOR_TENSOR_DIAG_OP_INL_H_

From 98adc98a80a32b5db2bc06f95c3b0958257178d4 Mon Sep 17 00:00:00 2001
From: Jason Yu <yuzhijingcheng1996@hotmail.com>
Date: Sat, 1 Sep 2018 19:43:22 +0800
Subject: [PATCH 02/11] Doc for diag updated. Sanity fix. Unit test for N-d
 diag op added.

---
 src/operator/tensor/diag_op-inl.h      | 49 ++++++++++++++------------
 src/operator/tensor/diag_op.cc         | 10 ++++--
 tests/python/unittest/test_operator.py | 44 +++++++++++++++++++++--
 3 files changed, 76 insertions(+), 27 deletions(-)

diff --git a/src/operator/tensor/diag_op-inl.h b/src/operator/tensor/diag_op-inl.h
index f1697024ad6a..7a3bde4f2bdd 100644
--- a/src/operator/tensor/diag_op-inl.h
+++ b/src/operator/tensor/diag_op-inl.h
@@ -30,6 +30,7 @@
 #include <dmlc/parameter.h>
 #include <vector>
 #include <algorithm>
+#include <utility>
 #include "../mxnet_op.h"
 #include "../operator_common.h"
 #include "../elemwise_op_common.h"
@@ -51,14 +52,16 @@ struct DiagParam : public dmlc::Parameter<DiagParam> {
               "If input has shape (S0 S1) k must be between -S0 and S1");
     DMLC_DECLARE_FIELD(axis1)
     .set_default(dmlc::optional<int>(0))
-    .describe("The first axis of the submatrix in question. Ignored when the input is a 1-D array.");
+    .describe("The first axis of the sub-arrays of interest. "
+              "Ignored when the input is a 1-D array.");
     DMLC_DECLARE_FIELD(axis2)
     .set_default(dmlc::optional<int>(1))
-    .describe("The second axis of the submatrix in question. Ignored when the input is a 1-D array.");
+    .describe("The second axis of the sub-arrays of interest. "
+              "Ignored when the input is a 1-D array.");
   }
 };
 
-inline TShape DiagShapeImpl(const TShape& ishape, const nnvm::dim_t k, 
+inline TShape DiagShapeImpl(const TShape& ishape, const nnvm::dim_t k,
                             const nnvm::dim_t axis1, const nnvm::dim_t axis2) {
   if (ishape.ndim() == 1) {
     auto s = ishape[0] + std::abs(k);
@@ -94,10 +97,10 @@ inline TShape DiagShapeImpl(const TShape& ishape, const nnvm::dim_t k,
   // remove axis1 and axis2 and append the new axis to the end
   for (int i = 0; i < x1; i ++)
     oshape[i] = ishape[i];
-  for (int i = x1; i < x2; i ++)
-    oshape[i] = ishape[i + 1];
-  for (int i = x2; i < n_dim; i ++)
-    oshape[i] = ishape[i + 2];
+  for (int i = x1 + 1; i < x2; i ++)
+    oshape[i - 1] = ishape[i];
+  for (int i = x2 + 1; i <= n_dim; i ++)
+    oshape[i - 2] = ishape[i];
   oshape[n_dim - 1] = s;
 
   return oshape;
@@ -115,7 +118,7 @@ inline bool DiagOpShape(const nnvm::NodeAttrs& attrs,
     const DiagParam& param = nnvm::get<DiagParam>(attrs.parsed);
 
     TShape oshape = DiagShapeImpl(ishape,
-                                  param.k.value(), 
+                                  param.k.value(),
                                   param.axis1.value(),
                                   param.axis2.value());
     if (shape_is_none(oshape)) {
@@ -148,9 +151,9 @@ struct diag {
     using namespace mxnet_op;
     int idx = i / base;
     int j = ravel(unravel(idx, oshape), ishape) + offset + stride * (i - idx * base);
-    if (back){
+    if (back) {
       KERNEL_ASSIGN(out[j], req, a[i]);
-    } else{
+    } else {
       KERNEL_ASSIGN(out[i], req, a[j]);
     }
   }
@@ -166,9 +169,9 @@ struct diag_gen {
     auto j = unravel(i, oshape);
     if (j[1] == (j[0] + k)) {
       auto l = j[0] < j[1] ? j[0] : j[1];
-      if (back){
+      if (back) {
         KERNEL_ASSIGN(out[l], req, a[i]);
-      } else{
+      } else {
         KERNEL_ASSIGN(out[i], req, a[l]);
       }
     } else if (!back) {
@@ -178,14 +181,14 @@ struct diag_gen {
 };
 
 template<typename xpu, bool back>
-void DiagOpProcess(const TBlob& in_data, 
+void DiagOpProcess(const TBlob& in_data,
                    const TBlob& out_data,
                    const TShape& ishape,
                    const TShape& oshape,
                    int dsize,
                    const DiagParam& param,
                    mxnet_op::Stream<xpu> *s,
-                   const std::vector<OpReqType>& req){
+                   const std::vector<OpReqType>& req) {
   using namespace mxnet_op;
   using namespace mshadow;
   if (ishape.ndim() > 1) {
@@ -208,19 +211,20 @@ void DiagOpProcess(const TBlob& in_data,
       otrailing *= ishape[i];
 
 
-    int ileading = oleading, 
+    int ileading = oleading,
         ibody = obody * ishape[minx],
         itrailing = otrailing * ishape[maxx];
-    
+
     int stride1 = itrailing * obody,
         stride2 = otrailing;
 
-    if (x1 == maxx)
+    if (x1 == maxx) {
       std::swap(stride1, stride2);
+    }
     int offset, k = param.k.value();
     if (k > 0) {
       offset = stride2 * k;
-    } else if(k < 0) {
+    } else if (k < 0) {
       offset = stride1 * -k;
     } else {
       offset = 0;
@@ -228,12 +232,13 @@ void DiagOpProcess(const TBlob& in_data,
 
     MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, {
       MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
-        if(ileading == 1){
+        if (back && req[0] != kAddTo && req[0] != kNullOp) out_data.FlatTo1D<xpu, DType>(s) = 0;
+        if (ileading == 1) {
           Kernel<diag<2, req_type, back>, xpu>::Launch(s, dsize, out_data.dptr<DType>(),
                               in_data.dptr<DType>(), Shape2(obody, otrailing),
                               Shape2(ibody, itrailing),
                               stride1 + stride2, offset, oshape[odim - 1]);
-        } else{
+        } else {
           Kernel<diag<3, req_type, back>, xpu>::Launch(s, dsize, out_data.dptr<DType>(),
                               in_data.dptr<DType>(), Shape3(oleading, obody, otrailing),
                               Shape3(ileading, ibody, itrailing),
@@ -245,7 +250,7 @@ void DiagOpProcess(const TBlob& in_data,
     MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, {
       MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
         Kernel<diag_gen<req_type, back>, xpu>::Launch(s, dsize, out_data.dptr<DType>(),
-                            in_data.dptr<DType>(), Shape2(oshape[0], oshape[1]), 
+                            in_data.dptr<DType>(), Shape2(oshape[0], oshape[1]),
                             param.k.value());
       });
     });
@@ -297,7 +302,7 @@ void DiagOpBackward(const nnvm::NodeAttrs& attrs,
 }
 
 
-} // namespace op
+}  // namespace op
 }  // namespace mxnet
 
 #endif  // MXNET_OPERATOR_TENSOR_DIAG_OP_INL_H_
diff --git a/src/operator/tensor/diag_op.cc b/src/operator/tensor/diag_op.cc
index 1ad3b8adc028..82b8ac83c13b 100644
--- a/src/operator/tensor/diag_op.cc
+++ b/src/operator/tensor/diag_op.cc
@@ -36,9 +36,13 @@ NNVM_REGISTER_OP(diag)
 
 ``diag``'s behavior depends on the input array dimensions:
 
-- 1-D arrays: constructs a 2-D array with the input as its diagonal, all other elements are zero
-- 2-D arrays: returns elements in the diagonal as a new 1-D array
-- N-D arrays: not supported yet
+- 1-D arrays: constructs a 2-D array with the input as its diagonal, all other elements are zero.
+- N-D arrays: extracts the diagonals of the sub-arrays with axes specified by ``axis1`` and ``axis2``.
+  The output shape would be decided by removing the axes numbered ``axis1`` and ``axis2`` from the
+  input shape and appending to the result a new axis with the size of the diagonals in question.
+
+  For example, when the input shape is `(2, 3, 4, 5)`, ``axis1`` and ``axis2`` are 0 and 2
+  respectively and ``k`` is 0, the resulting shape would be `(3, 5, 2)`.
 
 Examples::
 
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index eeaeab8d1667..a950d5ec2e53 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -4493,7 +4493,7 @@ def test_invalid_shape():
         x = mx.sym.Variable('x')
         y = mx.sym.Variable('y')
         where_sym = mx.sym.where(condition, x, y)
-	
+
         assert_exception(lambda: where_sym.eval(x=mx.nd.array([[2,3],[4,5],[6,7]]),
                                                 y=mx.nd.array([[8,9],[10,11],[12,13]]),
                                                 condition=mx.nd.array([1,0])), MXNetError)
@@ -5077,7 +5077,7 @@ def _validate_sample_location(input_rois, input_offset, spatial_scale, pooled_w,
                     trans_x = input_offset[roi_idx, class_id * 2, part_h, part_w] * trans_std
                     trans_y = input_offset[roi_idx, class_id * 2 + 1, part_h, part_w] * trans_std
                     bin_h_start, bin_w_start = ph * bin_size_h + roi_start_h, pw * bin_size_w + roi_start_w
-                    
+
                     need_check = True
                     while need_check:
                         pass_check = True
@@ -6901,6 +6901,46 @@ def test_diag():
     diag_sym = mx.sym.diag(data=data, k=-1)
     check_numeric_gradient(diag_sym, [a_np])
 
+    # Test 4d input
+    x1 = np.random.randint(2,9)
+    x2 = np.random.randint(2,9)
+    x3 = np.random.randint(2,9)
+    x4 = np.random.randint(2,9)
+    a_np = np.random.random((x1, x2, x3, x4)).astype(np.float32)
+    a = mx.nd.array(a_np).astype('float32')
+
+    # k = 0, axis1=0, axis2=1
+    r = mx.nd.diag(data=a, k=0, axis1=0, axis2=1)
+    print(a_np.shape)
+    print(r.shape)
+    print(np.diagonal(a_np, offset=0, axis1=0, axis2=1).shape)
+    assert_almost_equal(r.asnumpy(), np.diagonal(a_np, offset=0, axis1=0, axis2=1))
+
+    # k = 1, axis1=1, axis2=0
+    r = mx.nd.diag(data=a, k=1, axis1=1, axis2=0)
+    assert_almost_equal(r.asnumpy(), np.diagonal(a_np, offset=1, axis1=1, axis2=0))
+
+    # k = -1 axis1=1, axis3=3
+    r = mx.nd.diag(data=a, k=-1, axis1=1, axis2=3)
+    assert_almost_equal(r.asnumpy(), np.diagonal(a_np, offset=-1, axis1=1, axis2=3))
+
+    # Test 4d backward, k=0, axis1=3, axis2=0
+    data = mx.sym.Variable('data')
+    diag_sym = mx.sym.diag(data=data, k=0, axis1=3, axis2=0)
+    check_numeric_gradient(diag_sym, [a_np])
+
+    # Test 4d backward, k=1, axis1=1, axis2=2
+    data = mx.sym.Variable('data')
+    diag_sym = mx.sym.diag(data=data, k=1, axis1=1, axis2=2)
+    check_numeric_gradient(diag_sym, [a_np])
+
+    # Test 4d backward, k=-1, axis1=2, axis2=0
+    data = mx.sym.Variable('data')
+    diag_sym = mx.sym.diag(data=data, k=-1, axis1=2, axis2=0)
+    check_numeric_gradient(diag_sym, [a_np])
+
+
+
 @with_seed()
 def test_depthtospace():
     def f(x, blocksize):

From b25080a62185d59373da0f3a2f8ac2f710a35891 Mon Sep 17 00:00:00 2001
From: Jason Yu <yuzhijingcheng1996@hotmail.com>
Date: Sat, 1 Sep 2018 19:57:34 +0800
Subject: [PATCH 03/11] Unwanted print in diag test removed.

---
 src/operator/tensor/diag_op.cc         | 2 +-
 tests/python/unittest/test_operator.py | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/operator/tensor/diag_op.cc b/src/operator/tensor/diag_op.cc
index 82b8ac83c13b..c638619ad411 100644
--- a/src/operator/tensor/diag_op.cc
+++ b/src/operator/tensor/diag_op.cc
@@ -21,7 +21,7 @@
 * Copyright (c) 2015 by Contributors
 * \file diag_op.cc
 * \brief
-* \author Istvan Fehervari
+* \author Istvan Fehervari, Zhijingcheng Yu
 */
 
 #include "./diag_op-inl.h"
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index a950d5ec2e53..10ddf10ecc11 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -6911,9 +6911,6 @@ def test_diag():
 
     # k = 0, axis1=0, axis2=1
     r = mx.nd.diag(data=a, k=0, axis1=0, axis2=1)
-    print(a_np.shape)
-    print(r.shape)
-    print(np.diagonal(a_np, offset=0, axis1=0, axis2=1).shape)
     assert_almost_equal(r.asnumpy(), np.diagonal(a_np, offset=0, axis1=0, axis2=1))
 
     # k = 1, axis1=1, axis2=0

From 1004a49219cb6c783b664b47f18df5b330033ce2 Mon Sep 17 00:00:00 2001
From: Jason Yu <yuzhijingcheng1996@hotmail.com>
Date: Thu, 6 Sep 2018 10:32:18 +0800
Subject: [PATCH 04/11] Bad negative axis support in diag fixed.

---
 src/operator/tensor/diag_op-inl.h      | 24 +++++++++++-------------
 tests/python/unittest/test_operator.py | 17 ++++++++++++-----
 2 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/src/operator/tensor/diag_op-inl.h b/src/operator/tensor/diag_op-inl.h
index 7a3bde4f2bdd..8f1ad976384b 100644
--- a/src/operator/tensor/diag_op-inl.h
+++ b/src/operator/tensor/diag_op-inl.h
@@ -68,8 +68,13 @@ inline TShape DiagShapeImpl(const TShape& ishape, const nnvm::dim_t k,
     return TShape({s, s});
   }
 
-  auto h = ishape[axis1];
-  auto w = ishape[axis2];
+  int x1 = CheckAxis(axis1, ishape.ndim());
+  int x2 = CheckAxis(axis2, ishape.ndim());
+
+  CHECK_NE(x1, x2) << "axis1 and axis2 cannot refer to the the same axis " << x1;
+
+  auto h = ishape[x1];
+  auto w = ishape[x2];
 
   if (k > 0) {
     w -= k;
@@ -82,11 +87,6 @@ inline TShape DiagShapeImpl(const TShape& ishape, const nnvm::dim_t k,
     s = 0;
   }
 
-  auto x1 = CheckAxis(axis1, ishape.ndim());
-  auto x2 = CheckAxis(axis2, ishape.ndim());
-
-  CHECK_NE(x1, x2) << "axis1 and axis2 cannot refer the the same axis " << x1;
-
   if (x1 > x2) {
     std::swap(x1, x2);
   }
@@ -95,12 +95,10 @@ inline TShape DiagShapeImpl(const TShape& ishape, const nnvm::dim_t k,
   TShape oshape(n_dim);
 
   // remove axis1 and axis2 and append the new axis to the end
-  for (int i = 0; i < x1; i ++)
-    oshape[i] = ishape[i];
-  for (int i = x1 + 1; i < x2; i ++)
-    oshape[i - 1] = ishape[i];
-  for (int i = x2 + 1; i <= n_dim; i ++)
-    oshape[i - 2] = ishape[i];
+  int idx = 0;
+  for (int i = 0; i <= n_dim; i ++)
+    if (i != x1 && i != x2)
+      oshape[idx ++] = ishape[i];
   oshape[n_dim - 1] = s;
 
   return oshape;
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 10ddf10ecc11..d2dbc0ca1c4b 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -6902,10 +6902,10 @@ def test_diag():
     check_numeric_gradient(diag_sym, [a_np])
 
     # Test 4d input
-    x1 = np.random.randint(2,9)
-    x2 = np.random.randint(2,9)
-    x3 = np.random.randint(2,9)
-    x4 = np.random.randint(2,9)
+    x1 = np.random.randint(3,9)
+    x2 = np.random.randint(3,9)
+    x3 = np.random.randint(3,9)
+    x4 = np.random.randint(3,9)
     a_np = np.random.random((x1, x2, x3, x4)).astype(np.float32)
     a = mx.nd.array(a_np).astype('float32')
 
@@ -6921,6 +6921,10 @@ def test_diag():
     r = mx.nd.diag(data=a, k=-1, axis1=1, axis2=3)
     assert_almost_equal(r.asnumpy(), np.diagonal(a_np, offset=-1, axis1=1, axis2=3))
 
+    # k = 2, axis1=-2, axis2=0
+    r = mx.nd.diag(data=a, k=2, axis1=-2, axis2=0)
+    assert_almost_equal(r.asnumpy(), np.diagonal(a_np, offset=2, axis1=-2, axis2=0))
+
     # Test 4d backward, k=0, axis1=3, axis2=0
     data = mx.sym.Variable('data')
     diag_sym = mx.sym.diag(data=data, k=0, axis1=3, axis2=0)
@@ -6936,7 +6940,10 @@ def test_diag():
     diag_sym = mx.sym.diag(data=data, k=-1, axis1=2, axis2=0)
     check_numeric_gradient(diag_sym, [a_np])
 
-
+    # Test 4d backward, k=-2, axis1=1, axis2=-1
+    data = mx.sym.Variable('data')
+    diag_sym = mx.sym.diag(data=data, k=-2, axis1=1, axis2=-1)
+    check_numeric_gradient(diag_sym, [a_np])
 
 @with_seed()
 def test_depthtospace():

From 128f89e9b532bf6e77f59d1491f6a4be26cc7491 Mon Sep 17 00:00:00 2001
From: Jason Yu <yuzhijingcheng1996@hotmail.com>
Date: Thu, 6 Sep 2018 10:32:18 +0800
Subject: [PATCH 05/11] Bad negative axis support in diag fixed.

---
 src/operator/tensor/diag_op-inl.h      | 24 +++++++++++-------------
 tests/python/unittest/test_operator.py | 17 ++++++++++++-----
 2 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/src/operator/tensor/diag_op-inl.h b/src/operator/tensor/diag_op-inl.h
index 7a3bde4f2bdd..8f1ad976384b 100644
--- a/src/operator/tensor/diag_op-inl.h
+++ b/src/operator/tensor/diag_op-inl.h
@@ -68,8 +68,13 @@ inline TShape DiagShapeImpl(const TShape& ishape, const nnvm::dim_t k,
     return TShape({s, s});
   }
 
-  auto h = ishape[axis1];
-  auto w = ishape[axis2];
+  int x1 = CheckAxis(axis1, ishape.ndim());
+  int x2 = CheckAxis(axis2, ishape.ndim());
+
+  CHECK_NE(x1, x2) << "axis1 and axis2 cannot refer to the the same axis " << x1;
+
+  auto h = ishape[x1];
+  auto w = ishape[x2];
 
   if (k > 0) {
     w -= k;
@@ -82,11 +87,6 @@ inline TShape DiagShapeImpl(const TShape& ishape, const nnvm::dim_t k,
     s = 0;
   }
 
-  auto x1 = CheckAxis(axis1, ishape.ndim());
-  auto x2 = CheckAxis(axis2, ishape.ndim());
-
-  CHECK_NE(x1, x2) << "axis1 and axis2 cannot refer the the same axis " << x1;
-
   if (x1 > x2) {
     std::swap(x1, x2);
   }
@@ -95,12 +95,10 @@ inline TShape DiagShapeImpl(const TShape& ishape, const nnvm::dim_t k,
   TShape oshape(n_dim);
 
   // remove axis1 and axis2 and append the new axis to the end
-  for (int i = 0; i < x1; i ++)
-    oshape[i] = ishape[i];
-  for (int i = x1 + 1; i < x2; i ++)
-    oshape[i - 1] = ishape[i];
-  for (int i = x2 + 1; i <= n_dim; i ++)
-    oshape[i - 2] = ishape[i];
+  int idx = 0;
+  for (int i = 0; i <= n_dim; i ++)
+    if (i != x1 && i != x2)
+      oshape[idx ++] = ishape[i];
   oshape[n_dim - 1] = s;
 
   return oshape;
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 10ddf10ecc11..d2dbc0ca1c4b 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -6902,10 +6902,10 @@ def test_diag():
     check_numeric_gradient(diag_sym, [a_np])
 
     # Test 4d input
-    x1 = np.random.randint(2,9)
-    x2 = np.random.randint(2,9)
-    x3 = np.random.randint(2,9)
-    x4 = np.random.randint(2,9)
+    x1 = np.random.randint(3,9)
+    x2 = np.random.randint(3,9)
+    x3 = np.random.randint(3,9)
+    x4 = np.random.randint(3,9)
     a_np = np.random.random((x1, x2, x3, x4)).astype(np.float32)
     a = mx.nd.array(a_np).astype('float32')
 
@@ -6921,6 +6921,10 @@ def test_diag():
     r = mx.nd.diag(data=a, k=-1, axis1=1, axis2=3)
     assert_almost_equal(r.asnumpy(), np.diagonal(a_np, offset=-1, axis1=1, axis2=3))
 
+    # k = 2, axis1=-2, axis2=0
+    r = mx.nd.diag(data=a, k=2, axis1=-2, axis2=0)
+    assert_almost_equal(r.asnumpy(), np.diagonal(a_np, offset=2, axis1=-2, axis2=0))
+
     # Test 4d backward, k=0, axis1=3, axis2=0
     data = mx.sym.Variable('data')
     diag_sym = mx.sym.diag(data=data, k=0, axis1=3, axis2=0)
@@ -6936,7 +6940,10 @@ def test_diag():
     diag_sym = mx.sym.diag(data=data, k=-1, axis1=2, axis2=0)
     check_numeric_gradient(diag_sym, [a_np])
 
-
+    # Test 4d backward, k=-2, axis1=1, axis2=-1
+    data = mx.sym.Variable('data')
+    diag_sym = mx.sym.diag(data=data, k=-2, axis1=1, axis2=-1)
+    check_numeric_gradient(diag_sym, [a_np])
 
 @with_seed()
 def test_depthtospace():

From 785d95d290364701189f7fbba47fa6885b054b51 Mon Sep 17 00:00:00 2001
From: Jason Yu <yuzhijingcheng1996@hotmail.com>
Date: Wed, 12 Sep 2018 10:20:51 +0800
Subject: [PATCH 06/11] Index overflow in diag fixed. Exemplars for Nd diag
 added.

---
 src/operator/tensor/diag_op-inl.h | 59 +++++++++++++++++++------------
 src/operator/tensor/diag_op.cc    | 15 ++++++++
 2 files changed, 51 insertions(+), 23 deletions(-)

diff --git a/src/operator/tensor/diag_op-inl.h b/src/operator/tensor/diag_op-inl.h
index 8f1ad976384b..9c650d046619 100644
--- a/src/operator/tensor/diag_op-inl.h
+++ b/src/operator/tensor/diag_op-inl.h
@@ -96,9 +96,12 @@ inline TShape DiagShapeImpl(const TShape& ishape, const nnvm::dim_t k,
 
   // remove axis1 and axis2 and append the new axis to the end
   int idx = 0;
-  for (int i = 0; i <= n_dim; i ++)
-    if (i != x1 && i != x2)
-      oshape[idx ++] = ishape[i];
+  for (int i = 0; i <= n_dim; ++i) {
+    if (i != x1 && i != x2) {
+      oshape[idx++] = ishape[i];
+    }
+  }
+
   oshape[n_dim - 1] = s;
 
   return oshape;
@@ -111,7 +114,9 @@ inline bool DiagOpShape(const nnvm::NodeAttrs& attrs,
     CHECK_EQ(out_attrs->size(), 1U);
 
     const TShape& ishape = (*in_attrs)[0];
-    if (ishape.ndim() == 0) return false;
+    if (ishape.ndim() == 0) {
+      return false;
+    }
 
     const DiagParam& param = nnvm::get<DiagParam>(attrs.parsed);
 
@@ -141,14 +146,14 @@ inline bool DiagOpType(const nnvm::NodeAttrs& attrs,
 template<int ndim, int req, bool back>
 struct diag {
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType* out, const DType* a,
+  MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* a,
                                   mshadow::Shape<ndim> oshape,
                                   mshadow::Shape<ndim> ishape,
-                                  int stride, int offset,
-                                  int base) {
+                                  index_t stride, index_t offset,
+                                  index_t base) {
     using namespace mxnet_op;
-    int idx = i / base;
-    int j = ravel(unravel(idx, oshape), ishape) + offset + stride * (i - idx * base);
+    index_t idx = i / base;
+    index_t j = ravel(unravel(idx, oshape), ishape) + offset + stride * (i - idx * base);
     if (back) {
       KERNEL_ASSIGN(out[j], req, a[i]);
     } else {
@@ -160,7 +165,7 @@ struct diag {
 template<int req, bool back>
 struct diag_gen {
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType* out, const DType* a,
+  MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* a,
                                   mshadow::Shape<2> oshape, int k) {
     using namespace mxnet_op;
 
@@ -183,7 +188,7 @@ void DiagOpProcess(const TBlob& in_data,
                    const TBlob& out_data,
                    const TShape& ishape,
                    const TShape& oshape,
-                   int dsize,
+                   index_t dsize,
                    const DiagParam& param,
                    mxnet_op::Stream<xpu> *s,
                    const std::vector<OpReqType>& req) {
@@ -197,29 +202,36 @@ void DiagOpProcess(const TBlob& in_data,
     int idim = ishape.ndim(), odim = oshape.ndim();
 
     int minx = x1, maxx = x2;
-    if (minx > maxx)
+    if (minx > maxx) {
       std::swap(minx, maxx);
+    }
+
+    index_t oleading = 1,
+           obody = 1,
+           otrailing = 1;
 
-    int oleading = 1, obody = 1, otrailing = 1;
-    for (int i = 0; i < minx; i ++)
+    for (int i = 0; i < minx; ++i) {
       oleading *= ishape[i];
-    for (int i = minx + 1; i < maxx; i ++)
+    }
+    for (int i = minx + 1; i < maxx; ++i) {
       obody *= ishape[i];
-    for (int i = maxx + 1; i < idim; i ++)
+    }
+    for (int i = maxx + 1; i < idim; ++i) {
       otrailing *= ishape[i];
+    }
 
-
-    int ileading = oleading,
+    index_t ileading = oleading,
         ibody = obody * ishape[minx],
         itrailing = otrailing * ishape[maxx];
 
-    int stride1 = itrailing * obody,
+    index_t stride1 = itrailing * obody,
         stride2 = otrailing;
 
     if (x1 == maxx) {
       std::swap(stride1, stride2);
     }
-    int offset, k = param.k.value();
+    index_t offset;
+    int k = param.k.value();
     if (k > 0) {
       offset = stride2 * k;
     } else if (k < 0) {
@@ -230,7 +242,9 @@ void DiagOpProcess(const TBlob& in_data,
 
     MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, {
       MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
-        if (back && req[0] != kAddTo && req[0] != kNullOp) out_data.FlatTo1D<xpu, DType>(s) = 0;
+        if (back && req[0] != kAddTo && req[0] != kNullOp) {
+          out_data.FlatTo1D<xpu, DType>(s) = 0;
+        }
         if (ileading == 1) {
           Kernel<diag<2, req_type, back>, xpu>::Launch(s, dsize, out_data.dptr<DType>(),
                               in_data.dptr<DType>(), Shape2(obody, otrailing),
@@ -249,7 +263,7 @@ void DiagOpProcess(const TBlob& in_data,
       MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
         Kernel<diag_gen<req_type, back>, xpu>::Launch(s, dsize, out_data.dptr<DType>(),
                             in_data.dptr<DType>(), Shape2(oshape[0], oshape[1]),
-                            param.k.value());
+                            k);
       });
     });
   }
@@ -295,7 +309,6 @@ void DiagOpBackward(const nnvm::NodeAttrs& attrs,
   const TShape& oshape = outputs[0].shape_;
   const DiagParam& param = nnvm::get<DiagParam>(attrs.parsed);
 
-
   DiagOpProcess<xpu, true>(in_data, out_data, oshape, ishape, in_data.Size(), param, s, req);
 }
 
diff --git a/src/operator/tensor/diag_op.cc b/src/operator/tensor/diag_op.cc
index c638619ad411..cd5be9d0fd5c 100644
--- a/src/operator/tensor/diag_op.cc
+++ b/src/operator/tensor/diag_op.cc
@@ -69,6 +69,21 @@ Examples::
                    [1, 0, 0],
                    [0, 2, 0]]
 
+  x = [[[1, 2],
+        [3, 4]],
+
+       [[5, 6],
+        [7, 8]]]
+
+  diag(x) = [[1, 7],
+             [2, 8]]
+
+  diag(x, k=1) = [[3],
+                  [4]]
+
+  diag(x, axis1=-2, axis2=-1) = [[1, 4],
+                                 [5, 8]]
+
 )code" ADD_FILELINE)
 .set_attr_parser(ParamParser<DiagParam>)
 .set_num_inputs(1)

From e672aede240d339f6de494d106a17d1ecbf564ff Mon Sep 17 00:00:00 2001
From: Jason Yu <yuzhijingcheng1996@hotmail.com>
Date: Wed, 12 Sep 2018 13:53:00 +0800
Subject: [PATCH 07/11] A bug in diag op fixed.

---
 src/operator/tensor/diag_op-inl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/operator/tensor/diag_op-inl.h b/src/operator/tensor/diag_op-inl.h
index 9c650d046619..3f314acdbc16 100644
--- a/src/operator/tensor/diag_op-inl.h
+++ b/src/operator/tensor/diag_op-inl.h
@@ -263,7 +263,7 @@ void DiagOpProcess(const TBlob& in_data,
       MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
         Kernel<diag_gen<req_type, back>, xpu>::Launch(s, dsize, out_data.dptr<DType>(),
                             in_data.dptr<DType>(), Shape2(oshape[0], oshape[1]),
-                            k);
+                            param.k.value());
       });
     });
   }

From 0c305925020162d04a78d558f54ca48ca1d52bb2 Mon Sep 17 00:00:00 2001
From: Jason Yu <yuzhijingcheng1996@hotmail.com>
Date: Thu, 13 Sep 2018 10:05:16 +0800
Subject: [PATCH 08/11] Types of axis number and dim size changed to dim_t in
 diag.

---
 src/operator/tensor/diag_op-inl.h | 40 +++++++++++++++----------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/operator/tensor/diag_op-inl.h b/src/operator/tensor/diag_op-inl.h
index 3f314acdbc16..6a0f95b4c243 100644
--- a/src/operator/tensor/diag_op-inl.h
+++ b/src/operator/tensor/diag_op-inl.h
@@ -40,22 +40,22 @@ namespace mxnet {
 namespace op {
 
 struct DiagParam : public dmlc::Parameter<DiagParam> {
-  dmlc::optional<int> k;
-  dmlc::optional<int> axis1;
-  dmlc::optional<int> axis2;
+  dmlc::optional<nnvm::dim_t> k;
+  dmlc::optional<nnvm::dim_t> axis1;
+  dmlc::optional<nnvm::dim_t> axis2;
   DMLC_DECLARE_PARAMETER(DiagParam) {
     DMLC_DECLARE_FIELD(k)
-    .set_default(dmlc::optional<int>(0))
+    .set_default(dmlc::optional<nnvm::dim_t>(0))
     .describe("Diagonal in question. The default is 0. "
               "Use k>0 for diagonals above the main diagonal, "
               "and k<0 for diagonals below the main diagonal. "
               "If input has shape (S0 S1) k must be between -S0 and S1");
     DMLC_DECLARE_FIELD(axis1)
-    .set_default(dmlc::optional<int>(0))
+    .set_default(dmlc::optional<nnvm::dim_t>(0))
     .describe("The first axis of the sub-arrays of interest. "
               "Ignored when the input is a 1-D array.");
     DMLC_DECLARE_FIELD(axis2)
-    .set_default(dmlc::optional<int>(1))
+    .set_default(dmlc::optional<nnvm::dim_t>(1))
     .describe("The second axis of the sub-arrays of interest. "
               "Ignored when the input is a 1-D array.");
   }
@@ -68,8 +68,8 @@ inline TShape DiagShapeImpl(const TShape& ishape, const nnvm::dim_t k,
     return TShape({s, s});
   }
 
-  int x1 = CheckAxis(axis1, ishape.ndim());
-  int x2 = CheckAxis(axis2, ishape.ndim());
+  nnvm::dim_t x1 = CheckAxis(axis1, ishape.ndim());
+  nnvm::dim_t x2 = CheckAxis(axis2, ishape.ndim());
 
   CHECK_NE(x1, x2) << "axis1 and axis2 cannot refer to the the same axis " << x1;
 
@@ -91,12 +91,12 @@ inline TShape DiagShapeImpl(const TShape& ishape, const nnvm::dim_t k,
     std::swap(x1, x2);
   }
 
-  int n_dim = static_cast<int>(ishape.ndim()) - 1;
+  nnvm::dim_t n_dim = ishape.ndim() - 1;
   TShape oshape(n_dim);
 
   // remove axis1 and axis2 and append the new axis to the end
-  int idx = 0;
-  for (int i = 0; i <= n_dim; ++i) {
+  nnvm::dim_t idx = 0;
+  for (nnvm::dim_t i = 0; i <= n_dim; ++i) {
     if (i != x1 && i != x2) {
       oshape[idx++] = ishape[i];
     }
@@ -166,7 +166,7 @@ template<int req, bool back>
 struct diag_gen {
   template<typename DType>
   MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* a,
-                                  mshadow::Shape<2> oshape, int k) {
+                                  mshadow::Shape<2> oshape, nnvm::dim_t k) {
     using namespace mxnet_op;
 
     auto j = unravel(i, oshape);
@@ -196,12 +196,12 @@ void DiagOpProcess(const TBlob& in_data,
   using namespace mshadow;
   if (ishape.ndim() > 1) {
     // input : (leading + i, body + i, trailing)
-    int x1 = CheckAxis(param.axis1.value(), ishape.ndim());
-    int x2 = CheckAxis(param.axis2.value(), ishape.ndim());
+    nnvm::dim_t x1 = CheckAxis(param.axis1.value(), ishape.ndim());
+    nnvm::dim_t x2 = CheckAxis(param.axis2.value(), ishape.ndim());
 
-    int idim = ishape.ndim(), odim = oshape.ndim();
+    nnvm::dim_t idim = ishape.ndim(), odim = oshape.ndim();
 
-    int minx = x1, maxx = x2;
+    nnvm::dim_t minx = x1, maxx = x2;
     if (minx > maxx) {
       std::swap(minx, maxx);
     }
@@ -210,13 +210,13 @@ void DiagOpProcess(const TBlob& in_data,
            obody = 1,
            otrailing = 1;
 
-    for (int i = 0; i < minx; ++i) {
+    for (nnvm::dim_t i = 0; i < minx; ++i) {
       oleading *= ishape[i];
     }
-    for (int i = minx + 1; i < maxx; ++i) {
+    for (nnvm::dim_t i = minx + 1; i < maxx; ++i) {
       obody *= ishape[i];
     }
-    for (int i = maxx + 1; i < idim; ++i) {
+    for (nnvm::dim_t i = maxx + 1; i < idim; ++i) {
       otrailing *= ishape[i];
     }
 
@@ -231,7 +231,7 @@ void DiagOpProcess(const TBlob& in_data,
       std::swap(stride1, stride2);
     }
     index_t offset;
-    int k = param.k.value();
+    nnvm::dim_t k = param.k.value();
     if (k > 0) {
       offset = stride2 * k;
     } else if (k < 0) {

From 11a9043b6dc63c13b0c583af2fb37ba27fcb043e Mon Sep 17 00:00:00 2001
From: Jason Yu <yuzhijingcheng1996@hotmail.com>
Date: Thu, 13 Sep 2018 13:55:02 +0800
Subject: [PATCH 09/11] A bug in params of diag fixed.

---
 src/operator/tensor/diag_op-inl.h | 42 +++++++++++++++----------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/operator/tensor/diag_op-inl.h b/src/operator/tensor/diag_op-inl.h
index 6a0f95b4c243..a7f7a8f2901b 100644
--- a/src/operator/tensor/diag_op-inl.h
+++ b/src/operator/tensor/diag_op-inl.h
@@ -40,24 +40,24 @@ namespace mxnet {
 namespace op {
 
 struct DiagParam : public dmlc::Parameter<DiagParam> {
-  dmlc::optional<nnvm::dim_t> k;
-  dmlc::optional<nnvm::dim_t> axis1;
-  dmlc::optional<nnvm::dim_t> axis2;
+  nnvm::dim_t k;
+  nnvm::dim_t axis1;
+  nnvm::dim_t axis2;
   DMLC_DECLARE_PARAMETER(DiagParam) {
     DMLC_DECLARE_FIELD(k)
-    .set_default(dmlc::optional<nnvm::dim_t>(0))
-    .describe("Diagonal in question. The default is 0. "
-              "Use k>0 for diagonals above the main diagonal, "
-              "and k<0 for diagonals below the main diagonal. "
-              "If input has shape (S0 S1) k must be between -S0 and S1");
+      .set_default(0)
+      .describe("Diagonal in question. The default is 0. "
+                "Use k>0 for diagonals above the main diagonal, "
+                "and k<0 for diagonals below the main diagonal. "
+                "If input has shape (S0 S1) k must be between -S0 and S1");
     DMLC_DECLARE_FIELD(axis1)
-    .set_default(dmlc::optional<nnvm::dim_t>(0))
-    .describe("The first axis of the sub-arrays of interest. "
-              "Ignored when the input is a 1-D array.");
+      .set_default(0)
+      .describe("The first axis of the sub-arrays of interest. "
+                "Ignored when the input is a 1-D array.");
     DMLC_DECLARE_FIELD(axis2)
-    .set_default(dmlc::optional<nnvm::dim_t>(1))
-    .describe("The second axis of the sub-arrays of interest. "
-              "Ignored when the input is a 1-D array.");
+      .set_default(1)
+      .describe("The second axis of the sub-arrays of interest. "
+                "Ignored when the input is a 1-D array.");
   }
 };
 
@@ -121,9 +121,9 @@ inline bool DiagOpShape(const nnvm::NodeAttrs& attrs,
     const DiagParam& param = nnvm::get<DiagParam>(attrs.parsed);
 
     TShape oshape = DiagShapeImpl(ishape,
-                                  param.k.value(),
-                                  param.axis1.value(),
-                                  param.axis2.value());
+                                  param.k,
+                                  param.axis1,
+                                  param.axis2);
     if (shape_is_none(oshape)) {
       LOG(FATAL) << "Diagonal does not exist.";
     }
@@ -196,8 +196,8 @@ void DiagOpProcess(const TBlob& in_data,
   using namespace mshadow;
   if (ishape.ndim() > 1) {
     // input : (leading + i, body + i, trailing)
-    nnvm::dim_t x1 = CheckAxis(param.axis1.value(), ishape.ndim());
-    nnvm::dim_t x2 = CheckAxis(param.axis2.value(), ishape.ndim());
+    nnvm::dim_t x1 = CheckAxis(param.axis1, ishape.ndim());
+    nnvm::dim_t x2 = CheckAxis(param.axis2, ishape.ndim());
 
     nnvm::dim_t idim = ishape.ndim(), odim = oshape.ndim();
 
@@ -231,7 +231,7 @@ void DiagOpProcess(const TBlob& in_data,
       std::swap(stride1, stride2);
     }
     index_t offset;
-    nnvm::dim_t k = param.k.value();
+    nnvm::dim_t k = param.k;
     if (k > 0) {
       offset = stride2 * k;
     } else if (k < 0) {
@@ -263,7 +263,7 @@ void DiagOpProcess(const TBlob& in_data,
       MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
         Kernel<diag_gen<req_type, back>, xpu>::Launch(s, dsize, out_data.dptr<DType>(),
                             in_data.dptr<DType>(), Shape2(oshape[0], oshape[1]),
-                            param.k.value());
+                            param.k);
       });
     });
   }

From 5597d1013d5aa39f6ed6229f072e62372101341f Mon Sep 17 00:00:00 2001
From: Jason Yu <yuzhijingcheng1996@hotmail.com>
Date: Thu, 13 Sep 2018 21:42:49 +0800
Subject: [PATCH 10/11] Poisonous dim_t removed from diag parameters.

---
 src/operator/tensor/diag_op-inl.h | 38 +++++++++++++++----------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/src/operator/tensor/diag_op-inl.h b/src/operator/tensor/diag_op-inl.h
index a7f7a8f2901b..192ff6379496 100644
--- a/src/operator/tensor/diag_op-inl.h
+++ b/src/operator/tensor/diag_op-inl.h
@@ -40,9 +40,9 @@ namespace mxnet {
 namespace op {
 
 struct DiagParam : public dmlc::Parameter<DiagParam> {
-  nnvm::dim_t k;
-  nnvm::dim_t axis1;
-  nnvm::dim_t axis2;
+  int k;
+  int32_t axis1;
+  int32_t axis2;
   DMLC_DECLARE_PARAMETER(DiagParam) {
     DMLC_DECLARE_FIELD(k)
       .set_default(0)
@@ -61,15 +61,15 @@ struct DiagParam : public dmlc::Parameter<DiagParam> {
   }
 };
 
-inline TShape DiagShapeImpl(const TShape& ishape, const nnvm::dim_t k,
-                            const nnvm::dim_t axis1, const nnvm::dim_t axis2) {
+inline TShape DiagShapeImpl(const TShape& ishape, const int k,
+                            const int32_t axis1, const int32_t axis2) {
   if (ishape.ndim() == 1) {
     auto s = ishape[0] + std::abs(k);
     return TShape({s, s});
   }
 
-  nnvm::dim_t x1 = CheckAxis(axis1, ishape.ndim());
-  nnvm::dim_t x2 = CheckAxis(axis2, ishape.ndim());
+  int32_t x1 = CheckAxis(axis1, ishape.ndim());
+  int32_t x2 = CheckAxis(axis2, ishape.ndim());
 
   CHECK_NE(x1, x2) << "axis1 and axis2 cannot refer to the the same axis " << x1;
 
@@ -91,12 +91,12 @@ inline TShape DiagShapeImpl(const TShape& ishape, const nnvm::dim_t k,
     std::swap(x1, x2);
   }
 
-  nnvm::dim_t n_dim = ishape.ndim() - 1;
+  int32_t n_dim = static_cast<int32_t>(ishape.ndim()) - 1;
   TShape oshape(n_dim);
 
   // remove axis1 and axis2 and append the new axis to the end
-  nnvm::dim_t idx = 0;
-  for (nnvm::dim_t i = 0; i <= n_dim; ++i) {
+  uint32_t idx = 0;
+  for (int32_t i = 0; i <= n_dim; ++i) {
     if (i != x1 && i != x2) {
       oshape[idx++] = ishape[i];
     }
@@ -166,7 +166,7 @@ template<int req, bool back>
 struct diag_gen {
   template<typename DType>
   MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* a,
-                                  mshadow::Shape<2> oshape, nnvm::dim_t k) {
+                                  mshadow::Shape<2> oshape, int k) {
     using namespace mxnet_op;
 
     auto j = unravel(i, oshape);
@@ -196,12 +196,12 @@ void DiagOpProcess(const TBlob& in_data,
   using namespace mshadow;
   if (ishape.ndim() > 1) {
     // input : (leading + i, body + i, trailing)
-    nnvm::dim_t x1 = CheckAxis(param.axis1, ishape.ndim());
-    nnvm::dim_t x2 = CheckAxis(param.axis2, ishape.ndim());
+    uint32_t x1 = CheckAxis(param.axis1, ishape.ndim());
+    uint32_t x2 = CheckAxis(param.axis2, ishape.ndim());
 
-    nnvm::dim_t idim = ishape.ndim(), odim = oshape.ndim();
+    uint32_t idim = ishape.ndim(), odim = oshape.ndim();
 
-    nnvm::dim_t minx = x1, maxx = x2;
+    uint32_t minx = x1, maxx = x2;
     if (minx > maxx) {
       std::swap(minx, maxx);
     }
@@ -210,13 +210,13 @@ void DiagOpProcess(const TBlob& in_data,
            obody = 1,
            otrailing = 1;
 
-    for (nnvm::dim_t i = 0; i < minx; ++i) {
+    for (uint32_t i = 0; i < minx; ++i) {
       oleading *= ishape[i];
     }
-    for (nnvm::dim_t i = minx + 1; i < maxx; ++i) {
+    for (uint32_t i = minx + 1; i < maxx; ++i) {
       obody *= ishape[i];
     }
-    for (nnvm::dim_t i = maxx + 1; i < idim; ++i) {
+    for (uint32_t i = maxx + 1; i < idim; ++i) {
       otrailing *= ishape[i];
     }
 
@@ -231,7 +231,7 @@ void DiagOpProcess(const TBlob& in_data,
       std::swap(stride1, stride2);
     }
     index_t offset;
-    nnvm::dim_t k = param.k;
+    int k = param.k;
     if (k > 0) {
       offset = stride2 * k;
     } else if (k < 0) {

From 758050f962851c0dd9796849632dce4020abe603 Mon Sep 17 00:00:00 2001
From: Jason Yu <yuzhijingcheng1996@hotmail.com>
Date: Tue, 18 Sep 2018 16:11:43 +0800
Subject: [PATCH 11/11] Diag impl details documented in comments.

---
 src/operator/tensor/diag_op-inl.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/operator/tensor/diag_op-inl.h b/src/operator/tensor/diag_op-inl.h
index 192ff6379496..deab2569e489 100644
--- a/src/operator/tensor/diag_op-inl.h
+++ b/src/operator/tensor/diag_op-inl.h
@@ -206,6 +206,14 @@ void DiagOpProcess(const TBlob& in_data,
       std::swap(minx, maxx);
     }
 
+    // merges contiguous axes that are not separated
+    // by axis1 or axis2 since they can be directly
+    // mapped to the output and there is no need
+    // to distinguish them
+    // (After this the input will have no more than
+    // three axes, hence improving the rave and
+    // unravel efficiency)
+
     index_t oleading = 1,
            obody = 1,
            otrailing = 1;
@@ -226,10 +234,14 @@ void DiagOpProcess(const TBlob& in_data,
 
     index_t stride1 = itrailing * obody,
         stride2 = otrailing;
+    // stride1 + stride2 is the stride for
+    // iterating over the diagonal in question
 
     if (x1 == maxx) {
       std::swap(stride1, stride2);
     }
+
+    // the extra index offset introduced by k
     index_t offset;
     int k = param.k;
     if (k > 0) {