From 569d3381f017acd734c8483bb11312fd654d8f0e Mon Sep 17 00:00:00 2001
From: zhiboniu
Date: Wed, 29 Sep 2021 06:56:08 +0000
Subject: [PATCH] fill_diagonal op: fix border crossing caused by offset

---
 paddle/fluid/operators/fill_diagonal_op.cc         | 18 ++++++++---
 paddle/fluid/operators/fill_diagonal_op.cu         | 16 +++++++---
 .../unittests/test_tensor_fill_diagonal_.py        | 30 +++++++++++++++++++
 3 files changed, 56 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/operators/fill_diagonal_op.cc b/paddle/fluid/operators/fill_diagonal_op.cc
index db55c3e99693a..be3239d504844 100644
--- a/paddle/fluid/operators/fill_diagonal_op.cc
+++ b/paddle/fluid/operators/fill_diagonal_op.cc
@@ -108,8 +108,15 @@ class FillIDiagonalKernel : public framework::OpKernel<T> {
       size = std::min(size, out_dims[1] * out_dims[1]);
     }
 
-    for (int64_t i = offset; i < size; i += strides) {
-      out_data[i] = temp_var;
+    for (int64_t i = 0; i < size; i += strides) {
+      // Check that the position shifted by `offset` is still on the same
+      // line; the write must not cross line boundaries.
+      // out_dims[1] also works for tensors with dim > 2, because all of
+      // their dimensions are required to be equal.
+      if (i % out_dims[1] + offset >= 0 &&
+          i % out_dims[1] + offset < out_dims[1]) {
+        out_data[i + offset] = temp_var;
+      }
     }
   }
 };
@@ -176,8 +183,11 @@ class FillIDiagonalGradKernel : public framework::OpKernel<T> {
         wrapsize = size;
       }
 
-      for (int64_t i = offset; i < wrapsize; i += strides) {
-        data[i] = T(0);
+      for (int64_t i = 0; i < wrapsize; i += strides) {
+        if (i % dx_dims[1] + offset >= 0 &&
+            i % dx_dims[1] + offset < dx_dims[1]) {
+          data[i + offset] = T(0);
+        }
       }
     }
   }
diff --git a/paddle/fluid/operators/fill_diagonal_op.cu b/paddle/fluid/operators/fill_diagonal_op.cu
index 5047059fb364d..15eabd4216d0b 100644
--- a/paddle/fluid/operators/fill_diagonal_op.cu
+++ b/paddle/fluid/operators/fill_diagonal_op.cu
@@ -22,11 +22,19 @@ using CUDADeviceContext = paddle::platform::CUDADeviceContext;
 
 template <typename T>
 __global__ void fill_constant_kernel(const int64_t featuresize, T* in_data,
-                                     int64_t strides, int offset, T fillvar) {
+                                     int64_t strides, int offset, T fillvar,
+                                     int dims) {
   for (int64_t idx = blockIdx.x * featuresize + threadIdx.x;
        idx * strides + offset < (blockIdx.x + 1) * featuresize;
        idx += blockDim.x) {
-    in_data[idx * strides + offset] = fillvar;
+    // Check that the position shifted by `offset` is still on the same
+    // line; the write must not cross line boundaries.
+    // `dims` (= out_dims[1]) also works for tensors with dim > 2, because
+    // all of their dimensions are required to be equal.
+    if ((idx * strides) % dims + offset < dims &&
+        (idx * strides) % dims + offset >= 0) {
+      in_data[idx * strides + offset] = fillvar;
+    }
   }
 }
 
@@ -62,7 +70,7 @@ class FillIDiagonalCUDAKernel : public framework::OpKernel<T> {
 
     int64_t kBlockDim = std::min(int64_t(size / strides), kMaxBlockDim);
     fill_constant_kernel<<<1, kBlockDim, 0>>>(size, out_data, strides,
-                                              offset, temp_var);
+                                              offset, temp_var, out_dims[1]);
   }
 };
 
@@ -96,7 +104,7 @@ class FillIDiagonalGradCUDAKernel : public framework::OpKernel<T> {
 
     int64_t kBlockDim = std::min(int64_t(size), kMaxBlockDim);
     fill_constant_kernel<<<1, kBlockDim, 0>>>(wrapsize, in_data, strides,
-                                              offset, T(0));
+                                              offset, T(0), out_dims[1]);
   }
 };
 
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py
index 41a8a9750cb64..3beb6a537eca0 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py
@@ -50,6 +50,36 @@ def test_dim2_normal(self):
                 (y.grad.numpy().astype('float32') == expected_grad).all(),
                 True)
 
+    def test_offset(self):
+        expected_np = np.array(
+            [[2, 2, 1], [2, 2, 2], [2, 2, 2]]).astype('float32')
+        expected_grad = np.array(
+            [[1, 1, 0], [1, 1, 1], [1, 1, 1]]).astype('float32')
+
+        typelist = ['float32', 'float64', 'int32', 'int64']
+        places = [fluid.CPUPlace()]
+        if fluid.core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+
+        for idx, p in enumerate(places):
+            if idx == 0:
+                paddle.set_device('cpu')
+            else:
+                paddle.set_device('gpu')
+            for dtype in typelist:
+                x = paddle.ones((3, 3), dtype=dtype)
+                x.stop_gradient = False
+                y = x * 2
+                y.fill_diagonal_(1, offset=2, wrap=True)
+                loss = y.sum()
+                loss.backward()
+
+                self.assertEqual(
+                    (y.numpy().astype('float32') == expected_np).all(), True)
+                self.assertEqual(
+                    (y.grad.numpy().astype('float32') == expected_grad).all(),
+                    True)
+
     def test_bool(self):
         expected_np = np.array(
             [[False, True, True], [True, False, True], [True, True, False]])
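
Note for reviewers: the guard added above can be sketched in plain NumPy. This
is a minimal illustration, not Paddle's implementation; fill_diagonal_ref is a
hypothetical helper introduced only for this note. Like the kernels, it walks
the flattened tensor with stride ncols + 1 and writes only when
i % ncols + offset stays inside [0, ncols); it reproduces expected_np from
test_offset above.

import numpy as np


def fill_diagonal_ref(x, value, offset=0, wrap=False):
    """Hypothetical NumPy reference for Tensor.fill_diagonal_ (sketch only)."""
    out = x.copy()
    ncols = out.shape[1]
    flat = out.reshape(-1)  # a view over the copy, so writes land in `out`
    strides = ncols + 1     # step between consecutive diagonal elements
    # Without wrap, stop after the first ncols x ncols block, as the kernels do.
    size = flat.size if wrap else min(flat.size, ncols * ncols)
    for i in range(0, size, strides):
        # The patch's guard: the position shifted by `offset` must stay
        # within the same row; otherwise the write would cross a border.
        if 0 <= i % ncols + offset < ncols:
            flat[i + offset] = value
    return out


x = 2 * np.ones((3, 3), dtype='float32')
y = fill_diagonal_ref(x, 1, offset=2, wrap=True)
assert (y == np.array([[2, 2, 1], [2, 2, 2], [2, 2, 2]], 'float32')).all()

Without the guard, the step i = 4 (stride 4, offset 2) would write flat[6],
i.e. element (2, 0) of the 3 x 3 tensor, which is exactly the cross-border
write this patch prevents.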