Export resize and support batch size (apache#14014)

* add image resize operator and unit test * refactor the resize operator and address lint issues * address comment and add doc * assert size is more than 2 * add test case of 4D input * use ndarray datatype * add inline to Shape * add 4D input example * refactor the duplicate code and separate the resize from image_random * clean up the code * add resize implementation * delete the variable not used * refactor the code with structure and enum to make code more understandable * fix the lint * address comments * address comment 1. add description 2. refactor unit test and add dtype * update data type check * lint * move the common utility to image_utils * add default value for keep_ratio * change the operator doc * update the image utility function * fix lint * use Hang implementation to achieve image resize operator GPU * update the check and doc * refactor the caffe_gpu_interp2_kernel * update doc and fix the cpu compile error * update the comment * fix lint * add unit test for gpu * address comments * remove the crop and center crop utility function to make the PR clear * fix the syntax error * delete the warning * add unit test with 4D * fix typo * add more unit test * fix unit test * set atol = 1 * fix missing numpy import * fix the unit test * delete test case * fix unit test missing dependency * fix error data type * unify the style and add invalid interp * update the doc
stu1130 · Feb 1, 2019 · 2a4634b · 2a4634b
1 parent 9a3e4a0
commit 2a4634b
Show file tree

Hide file tree

Showing 11 changed files with 744 additions and 113 deletions.
diff --git a/python/mxnet/gluon/data/vision/transforms.py b/python/mxnet/gluon/data/vision/transforms.py
@@ -262,8 +262,8 @@ def forward(self, x):
         return image.center_crop(x, *self._args)[0]
 
 
-class Resize(Block):
-    """Resize an image to the given size.
+class Resize(HybridBlock):
+    """Resize an image or a batch of images (NDArray) to the given size.
     Should be applied before `mxnet.gluon.data.vision.transforms.ToTensor`.
 
     Parameters
@@ -276,44 +276,36 @@ class Resize(Block):
     interpolation : int
         Interpolation method for resizing. By default uses bilinear
         interpolation. See OpenCV's resize function for available choices.
+        Note that Resize on gpu uses the contrib.BilinearResize2D operator,
+        which only supports bilinear interpolation (1). The result will be slightly
+        different on gpu compared to cpu: OpenCV tends to align centers, while BilinearResize2D
+        uses an algorithm that aligns corners.
 
 
     Inputs:
-        - **data**: input tensor with (Hi x Wi x C) shape.
+        - **data**: input tensor with (H x W x C) or (N x H x W x C) shape.
 
     Outputs:
-        - **out**: output tensor with (H x W x C) shape.
+        - **out**: output tensor with (H x W x C) or (N x H x W x C) shape.
 
     Examples
     --------
     >>> transformer = vision.transforms.Resize(size=(1000, 500))
     >>> image = mx.nd.random.uniform(0, 255, (224, 224, 3)).astype(dtype=np.uint8)
     >>> transformer(image)
     <NDArray 500x1000x3 @cpu(0)>
+    >>> image = mx.nd.random.uniform(0, 255, (3, 224, 224, 3)).astype(dtype=np.uint8)
+    >>> transformer(image)
+    <NDArray 3x500x1000x3 @cpu(0)>
     """
     def __init__(self, size, keep_ratio=False, interpolation=1):
         super(Resize, self).__init__()
         self._keep = keep_ratio
         self._size = size
         self._interpolation = interpolation
 
-    def forward(self, x):
-        if isinstance(self._size, numeric_types):
-            if not self._keep:
-                wsize = self._size
-                hsize = self._size
-            else:
-                h, w, _ = x.shape
-                if h > w:
-                    wsize = self._size
-                    hsize = int(h * wsize / w)
-                else:
-                    hsize = self._size
-                    wsize = int(w * hsize / h)
-        else:
-            wsize, hsize = self._size
-        return image.imresize(x, wsize, hsize, self._interpolation)
-
+    def hybrid_forward(self, F, x):
+        return F.image.resize(x, self._size, self._keep, self._interpolation)
 
 class RandomFlipLeftRight(HybridBlock):
     """Randomly flip the input image left to right with a probability

diff --git a/src/io/image_io.cc b/src/io/image_io.cc
@@ -38,6 +38,7 @@
 #include <cstring>
 
 #include "../operator/elemwise_op_common.h"
+#include "../operator/image/resize-inl.h"
 
 #if MXNET_USE_OPENCV
   #include <opencv2/opencv.hpp>
@@ -285,19 +286,8 @@ inline void Imresize(const nnvm::NodeAttrs& attrs,
                      const std::vector<TBlob> &inputs,
                      const std::vector<OpReqType> &req,
                      const std::vector<TBlob> &outputs) {
-#if MXNET_USE_OPENCV
-  CHECK_NE(inputs[0].type_flag_, mshadow::kFloat16) << "imresize doesn't support fp16";
-  const int DTYPE[] = {CV_32F, CV_64F, -1, CV_8U, CV_32S};
-  int cv_type = CV_MAKETYPE(DTYPE[inputs[0].type_flag_], inputs[0].shape_[2]);
   const auto& param = nnvm::get<ResizeParam>(attrs.parsed);
-  cv::Mat buf(inputs[0].shape_[0], inputs[0].shape_[1], cv_type, inputs[0].dptr_);
-  cv::Mat dst(outputs[0].shape_[0], outputs[0].shape_[1], cv_type, outputs[0].dptr_);
-  cv::resize(buf, dst, cv::Size(param.w, param.h), 0, 0, param.interp);
-  CHECK(!dst.empty());
-  CHECK_EQ(static_cast<void*>(dst.ptr()), outputs[0].dptr_);
-#else
-  LOG(FATAL) << "Build with USE_OPENCV=1 for image io.";
-#endif  // MXNET_USE_OPENCV
+  op::image::ResizeImpl(inputs, outputs, param.h, param.w, param.interp);
 }
 
 

diff --git a/src/operator/contrib/bilinear_resize-inl.cuh b/src/operator/contrib/bilinear_resize-inl.cuh
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file bilinear_resize-inl.cuh
+ * \brief bilinear resize operator cuda implementation
+ * \author Hang Zhang, Jake Lee
+*/
+
+#ifndef MXNET_OPERATOR_CONTRIB_BILINEAR_RESIZE_CUH_
+#define MXNET_OPERATOR_CONTRIB_BILINEAR_RESIZE_CUH_
+
+#include <cuda_runtime_api.h>
+#include <algorithm>
+
+namespace mxnet {
+namespace op {
+
+using namespace mshadow;
+
+enum ImageLayout {  // axis order of an image tensor; the 4-D resize kernel branches on it
+  HWC,   // single image: height x width x channel
+  NHWC,  // batch of images: batch x height x width x channel
+  NCHW   // batch of images: batch x channel x height x width
+};
+
+template<typename In, typename Out>
+struct ScalarConvert {  // converts a value of type In to type Out with a C-style cast
+  static __host__ __device__ __forceinline__ Out to(const In v) { return (Out) v; }
+};
+
+// The maximum number of threads in a block
+static const unsigned MAX_BLOCK_SIZE = 512U;
+
+// Smallest candidate block size that is >= nElem; `smaller` limits the result to at most 256
+static unsigned getNumThreads(int nElem, const bool smaller) {
+  unsigned threadSizes[5] = {32, 64, 128, 256, MAX_BLOCK_SIZE};  // candidates, ascending
+  const int maxi = smaller ? 4 : 5;  // `smaller` drops MAX_BLOCK_SIZE from the candidates
+  for (int i = 0; i != maxi; ++i) {
+    if (static_cast<unsigned>(nElem) <= threadSizes[i]) {
+      return threadSizes[i];
+    }
+  }
+  return smaller ? (MAX_BLOCK_SIZE >> 1) : MAX_BLOCK_SIZE;  // nElem exceeds all candidates
+}
+
+// Bilinear resize of a single H x W x C image: overload taking Tensor<xpu, 3, Dtype>
+template<typename xpu, typename Dtype, typename Acctype>
+__global__ void caffe_gpu_interp2_kernel(const int n,
+    const Acctype rheight, const Acctype rwidth,
+    const Tensor<xpu, 3, Dtype> data1,
+    Tensor<xpu, 3, Dtype> data2,
+    ImageLayout layout) {  // layout is not read by this overload; axes are taken as H, W, C
+  int index = threadIdx.x + blockIdx.x * blockDim.x;  // flat output-pixel index of this thread
+  const int channels = data1.size(2);
+  const int height1 = data1.size(0);  // spatial size of the input (data1)
+  const int width1 = data1.size(1);
+  const int height2 = data2.size(0);  // spatial size of the output (data2)
+  const int width2 = data2.size(1);
+
+  if (index < n) {  // threads whose index is >= n do nothing
+    const int w2 = index % width2;  // 0:width2-1
+    const int h2 = index / width2;  // 0:height2-1
+    // special case: same spatial size, so copy every channel of the pixel unchanged
+    if (height1 == height2 && width1 == width2) {
+      const int h1 = h2;
+      const int w1 = w2;
+      for (int c = 0; c < channels; ++c) {
+        const Dtype val = data1[h1][w1][c];
+        data2[h2][w2][c] = val;
+      }
+      return;
+    }
+    // source row coordinate: h1 is its integer part, h1lambda the fractional part (weight)
+    const Acctype h1r = rheight * h2;
+    const int h1 = h1r;
+    const int h1p = (h1 < height1 - 1) ? 1 : 0;  // no +1 step when h1 is the last row
+    const Acctype h1lambda = h1r - h1;
+    const Acctype h0lambda = Acctype(1) - h1lambda;
+    // source column coordinate, handled the same way as the row
+    const Acctype w1r = rwidth * w2;
+    const int w1 = w1r;
+    const int w1p = (w1 < width1 - 1) ? 1 : 0;  // no +1 step when w1 is the last column
+    const Acctype w1lambda = w1r - w1;
+    const Acctype w0lambda = Acctype(1) - w1lambda;
+    for (int c = 0; c < channels; ++c) {  // weighted sum of the four neighbours, per channel
+      const Acctype val = h0lambda * (w0lambda * data1[h1][w1][c]
+                            + w1lambda * data1[h1][w1+w1p][c])
+                            + h1lambda * (w0lambda * data1[h1+h1p][w1][c]
+                            + w1lambda * data1[h1+h1p][w1+w1p][c]);
+      data2[h2][w2][c] = ScalarConvert<Acctype, Dtype>::to(val);
+    }
+  }
+}
+
+// Bilinear resize of a batch of images: overload taking Tensor<xpu, 4, Dtype> (NHWC or NCHW)
+template<typename xpu, typename Dtype, typename Acctype>
+__global__ void caffe_gpu_interp2_kernel(const int n,
+    const Acctype rheight, const Acctype rwidth,
+    const Tensor<xpu, 4, Dtype> data1,
+    Tensor<xpu, 4, Dtype> data2,
+    ImageLayout layout) {  // any value other than NHWC is indexed as NCHW
+  const int index = threadIdx.x + blockIdx.x * blockDim.x;  // flat output-pixel index
+  const int batch_size = data1.size(0);  // batch is axis 0 in both NHWC and NCHW
+  const int channels = (layout == NHWC) ? data1.size(3) : data1.size(1);
+  const int height1 = (layout == NHWC) ? data1.size(1) : data1.size(2);
+  const int width1 = (layout == NHWC) ? data1.size(2) : data1.size(3);
+  const int height2 = (layout == NHWC) ? data2.size(1) : data2.size(2);
+  const int width2 = (layout == NHWC) ? data2.size(2) : data2.size(3);
+
+  if (index < n) {  // threads whose index is >= n do nothing
+    const int w2 = index % width2;  // 0:width2-1
+    const int h2 = index / width2;  // 0:height2-1
+    // special case: same spatial size, so copy the pixel for every image and channel
+    if (height1 == height2 && width1 == width2) {
+      const int h1 = h2;
+      const int w1 = w2;
+
+      for (int b = 0; b < batch_size; ++b) {  // `b`, not `n`: avoids shadowing the parameter
+        for (int c = 0; c < channels; ++c) {
+          if (layout == NHWC) {
+            const Dtype val = data1[b][h1][w1][c];
+            data2[b][h2][w2][c] = val;
+          } else {
+            const Dtype val = data1[b][c][h1][w1];
+            data2[b][c][h2][w2] = val;
+          }
+        }
+      }
+      return;
+    }
+    // source row coordinate: h1 is its integer part, h1lambda the fractional part (weight)
+    const Acctype h1r = rheight * h2;
+    const int h1 = h1r;
+    const int h1p = (h1 < height1 - 1) ? 1 : 0;  // no +1 step when h1 is the last row
+    const Acctype h1lambda = h1r - h1;
+    const Acctype h0lambda = Acctype(1) - h1lambda;
+    // source column coordinate, handled the same way as the row
+    const Acctype w1r = rwidth * w2;
+    const int w1 = w1r;
+    const int w1p = (w1 < width1 - 1) ? 1 : 0;  // no +1 step when w1 is the last column
+    const Acctype w1lambda = w1r - w1;
+    const Acctype w0lambda = Acctype(1) - w1lambda;
+
+    for (int b = 0; b < batch_size; ++b) {  // weighted sum of the four neighbours
+      for (int c = 0; c < channels; ++c) {
+        if (layout == NHWC) {
+          const Acctype val = h0lambda * (w0lambda * data1[b][h1][w1][c]
+                            + w1lambda * data1[b][h1][w1+w1p][c])
+                            + h1lambda * (w0lambda * data1[b][h1+h1p][w1][c]
+                            + w1lambda * data1[b][h1+h1p][w1+w1p][c]);
+          data2[b][h2][w2][c] = ScalarConvert<Acctype, Dtype>::to(val);
+        } else {
+          const Acctype val = h0lambda * (w0lambda * data1[b][c][h1][w1]
+                            + w1lambda * data1[b][c][h1][w1+w1p])
+                            + h1lambda * (w0lambda * data1[b][c][h1+h1p][w1]
+                            + w1lambda * data1[b][c][h1+h1p][w1+w1p]);
+          data2[b][c][h2][w2] = ScalarConvert<Acctype, Dtype>::to(val);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_CONTRIB_BILINEAR_RESIZE_CUH_
diff --git a/src/operator/contrib/bilinear_resize.cu b/src/operator/contrib/bilinear_resize.cu
@@ -25,86 +25,13 @@
 #include <cuda_runtime_api.h>
 #include <algorithm>
 #include "bilinear_resize-inl.h"
+#include "bilinear_resize-inl.cuh"
 
 namespace mxnet {
 namespace op {
 
 using namespace mshadow;
 
-template<typename In, typename Out>
-struct ScalarConvert {
-  static __host__ __device__ __forceinline__ Out to(const In v) { return (Out) v; }
-};
-
-
-// The maximum number of threads in a block
-static const unsigned MAX_BLOCK_SIZE = 512U;
-
-// Number of threads in a block given an input size up to MAX_BLOCK_SIZE
-static unsigned getNumThreads(int nElem, const bool smaller) {
-  unsigned threadSizes[5] = {32, 64, 128, 256, MAX_BLOCK_SIZE};
-  const int maxi = smaller ? 4 : 5;
-  for (int i = 0; i != maxi; ++i) {
-    if (static_cast<unsigned>(nElem) <= threadSizes[i]) {
-      return threadSizes[i];
-    }
-  }
-  return smaller ? (MAX_BLOCK_SIZE >> 1) : MAX_BLOCK_SIZE;
-}
-
-template<typename xpu, typename Dtype, typename Acctype>
-__global__ void caffe_gpu_interp2_kernel(const int n,
-    const Acctype rheight, const Acctype rwidth,
-    const Tensor<xpu, 4, Dtype> data1,
-    Tensor<xpu, 4, Dtype> data2) {
-  int index = threadIdx.x + blockIdx.x * blockDim.x;
-  const int batchsize = data1.size(0);
-  const int channels = data1.size(1);
-  const int height1 = data1.size(2);
-  const int width1 = data1.size(3);
-  const int height2 = data2.size(2);
-  const int width2 = data2.size(3);
-
-  if (index < n) {
-    const int w2 = index % width2;  // 0:width2-1
-    const int h2 = index / width2;  // 0:height2-1
-    // special case: just copy
-    if (height1 == height2 && width1 == width2) {
-      const int h1 = h2;
-      const int w1 = w2;
-      for (int n = 0; n < batchsize ; n++) {
-        for (int c = 0; c < channels; ++c) {
-          const Dtype val = data1[n][c][h1][w1];
-          data2[n][c][h2][w2] = val;
-        }
-      }
-      return;
-    }
-    //
-    const Acctype h1r = rheight * h2;
-    const int h1 = h1r;
-    const int h1p = (h1 < height1 - 1) ? 1 : 0;
-    const Acctype h1lambda = h1r - h1;
-    const Acctype h0lambda = Acctype(1) - h1lambda;
-    //
-    const Acctype w1r = rwidth * w2;
-    const int w1 = w1r;
-    const int w1p = (w1 < width1 - 1) ? 1 : 0;
-    const Acctype w1lambda = w1r - w1;
-    const Acctype w0lambda = Acctype(1) - w1lambda;
-    //
-    for (int n = 0; n < batchsize ; n++) {
-        for (int c = 0; c < channels; ++c) {
-        const Acctype val = h0lambda * (w0lambda * data1[n][c][h1][w1]
-                            + w1lambda * data1[n][c][h1][w1+w1p])
-                            + h1lambda * (w0lambda * data1[n][c][h1+h1p][w1]
-                            + w1lambda * data1[n][c][h1+h1p][w1+w1p]);
-        data2[n][c][h2][w2] = ScalarConvert<Acctype, Dtype>::to(val);
-      }
-    }
-  }
-}
-
 // Backward (adjoint) operation 1 <- 2 (accumulates)
 template<typename xpu, typename Dtype, typename Acctype>
 __global__ void caffe_gpu_interp2_kernel_backward(const int n,
@@ -181,9 +108,10 @@ void SpatialUpSamplingBilinearUpdateOutput(mshadow::Stream<gpu> *s,
   dim3 blocks(static_cast<int>(num_kernels / num_threads) + 1);
   dim3 threads(num_threads);
   cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s);
+  ImageLayout layout = NCHW;
   caffe_gpu_interp2_kernel<xpu, DType, AccReal>
   <<<blocks, threads , 0, stream>>>(
-    num_kernels, rheight, rwidth, idata, odata);
+    num_kernels, rheight, rwidth, idata, odata, layout);
   MSHADOW_CUDA_POST_KERNEL_CHECK(SpatialUpSamplingBilinearUpdateOutput);
 }
 
@@ -215,6 +143,5 @@ NNVM_REGISTER_OP(_contrib_BilinearResize2D)
 
 NNVM_REGISTER_OP(_backward_contrib_BilinearResize2D)
 .set_attr<FCompute>("FCompute<gpu>", BilinearSampleOpBackward<gpu>);
-
 }  // namespace op
 }  // namespace mxnet