This repository has been archived by the owner on Nov 17, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 6.8k
/
im2col.h
386 lines (376 loc) · 15.2 KB
/
im2col.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
/*!
******************* BEGIN Caffe Copyright Notice and Disclaimer ****************
*
* COPYRIGHT
*
* All contributions by the University of California:
* Copyright (c) 2014-2017 The Regents of the University of California (Regents)
* All rights reserved.
*
* All other contributions:
* Copyright (c) 2014-2017, the respective contributors
* All rights reserved.
*
* Caffe uses a shared copyright model: each contributor holds copyright over
* their contributions to Caffe. The project versioning records all such
* contribution and copyright details. If a contributor wants to further mark
* their specific copyright on a particular contribution, they should indicate
* their copyright solely in the commit message of the change when it is
* committed.
*
* LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* CONTRIBUTION AGREEMENT
*
* By contributing to the BVLC/caffe repository through pull-request, comment,
* or otherwise, the contributor releases their content to the
* license and copyright terms herein.
*
***************** END Caffe Copyright Notice and Disclaimer ********************
*
* \file im2col.h
* \brief Function definitions of converting an image to
* column matrix based on kernel, padding, and dilation.
* These functions are mainly used in convolution operators.
* The implementation of the im2col and col2im algorithms
* are copied from Caffe with minor interface modifications
* adapting to MXNet data structures.
*/
#ifndef MXNET_OPERATOR_NN_IM2COL_H_
#define MXNET_OPERATOR_NN_IM2COL_H_
#include <mxnet/base.h>
#include <mxnet/operator.h>
#include <cstring>
#include <vector>
#include "../mxnet_op.h"
namespace mxnet {
namespace op {
// Tests 0 <= a < b with a single comparison. Casting both operands to
// unsigned maps any negative `a` onto a value above INT_MAX, while `b`
// (non-negative at every call site) keeps its magnitude, so one unsigned
// compare is equivalent to the two signed checks `a >= 0 && a < b`.
inline bool is_a_ge_zero_and_a_lt_b(int a, int b) {
  const unsigned ua = static_cast<unsigned>(a);
  const unsigned ub = static_cast<unsigned>(b);
  return ua < ub;
}
/*!
 * \brief im2col 2D cpu version.
 * DO NOT call this function directly.
 * Use the wrapper function im2col() instead.
 *
 * Unrolls one (C, H, W) image into a column buffer: one buffer row per
 * (channel, kernel_row, kernel_col) triple, one entry per output position,
 * so that a convolution reduces to a matrix product. Positions that fall
 * into the padding region are written as zeros.
 *
 * \param data_im    pointer to the first element of the (C, H, W) image
 * \param channels   number of image channels C
 * \param height     image height H
 * \param width      image width W
 * \param kernel_h   filter height
 * \param kernel_w   filter width
 * \param pad_h      zero-padding along height
 * \param pad_w      zero-padding along width
 * \param stride_h   vertical step between output positions
 * \param stride_w   horizontal step between output positions
 * \param dilation_h vertical spacing between kernel taps
 * \param dilation_w horizontal spacing between kernel taps
 * \param data_col   output buffer, C * kernel_h * kernel_w * output_h * output_w entries
 */
template <typename DType>
inline void im2col_cpu(const DType* data_im,
                       const int channels,
                       const int height,
                       const int width,
                       const int kernel_h,
                       const int kernel_w,
                       const int pad_h,
                       const int pad_w,
                       const int stride_h,
                       const int stride_w,
                       const int dilation_h,
                       const int dilation_w,
                       DType* data_col) {
  const int output_h = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int output_w = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
  const int channel_size = height * width;
  // NOTE (condensed from junwu's earlier TODO): OpenMP (with and without
  // collapse) was benchmarked on these loops and gave mixed results; the
  // convolution bottleneck likely lies in dot(), so the loops stay serial.
  DType* col = data_col;
  for (int c = 0; c < channels; ++c) {
    const DType* im = data_im + c * channel_size;
    for (int kh = 0; kh < kernel_h; ++kh) {
      for (int kw = 0; kw < kernel_w; ++kw) {
        for (int oh = 0; oh < output_h; ++oh) {
          const int row = oh * stride_h - pad_h + kh * dilation_h;
          // Unsigned compare == (row >= 0 && row < height); see
          // is_a_ge_zero_and_a_lt_b for the rationale.
          if (static_cast<unsigned>(row) >= static_cast<unsigned>(height)) {
            // The whole output row reads from padding: emit zeros.
            for (int ow = 0; ow < output_w; ++ow) {
              *col++ = static_cast<DType>(0);
            }
            continue;
          }
          for (int ow = 0; ow < output_w; ++ow) {
            const int column = ow * stride_w - pad_w + kw * dilation_w;
            *col++ = (static_cast<unsigned>(column) < static_cast<unsigned>(width))
                         ? im[row * width + column]
                         : static_cast<DType>(0);
          }
        }
      }
    }
  }
}
/*!
 * \brief core function of the n-dimensional im2col/col2im algorithm.
 * DO NOT call this function directly. Use wrapper function im2col() or
 * col2im() instead.
 * \param data_input in im2col mode: image pointer pointing at the first
 *        element of the channel dim; in col2im mode: the column buffer
 * \param im2col true selects im2col (image -> columns), false selects
 *        col2im (columns accumulated back into the image)
 * \param im_shape input image shape in dimensions (N, C, H, W, ...)
 * \param col_shape column buffer shape
 * \param kernel_shape kernel filter shape (one entry per spatial axis)
 * \param pad pad shape
 * \param stride stride shape
 * \param dilation dilation shape
 * \param data_output buffer to be filled: the column buffer in im2col
 *        mode, the image in col2im mode
 * \param req write request; kNullOp skips all work, kAddTo accumulates
 *        into data_output (col2im only), anything else overwrites
 */
template <typename DType>
inline void im2col_nd_core_cpu(const DType* data_input,
                               const bool im2col,
                               const mxnet::TShape& im_shape,
                               const mxnet::TShape& col_shape,
                               const mxnet::TShape& kernel_shape,
                               const mxnet::TShape& pad,
                               const mxnet::TShape& stride,
                               const mxnet::TShape& dilation,
                               DType* data_output,
                               OpReqType req = mxnet::kWriteTo) {
  if (mxnet::kNullOp == req)
    return;
  int num_spatial_axes = kernel_shape.ndim();
  if (!im2col) {
    // col2im scatters with +=, so the image must start zeroed unless the
    // caller asked to accumulate onto existing contents.
    index_t im_size = im_shape[1];  // skip batch dim
    for (index_t i = 0; i < num_spatial_axes; ++i) {
      im_size *= im_shape[2 + i];
    }
    if (mxnet::kAddTo != req) {
      std::fill(data_output, data_output + im_size, static_cast<DType>(0));
    }
  }
  // Total number of kernel taps; used to recover the channel index from a
  // column-row index below.
  index_t kernel_size = 1;
  for (index_t i = 0; i < num_spatial_axes; ++i) {
    kernel_size *= kernel_shape[i];
  }
  // col_shape[0] == C * kernel_size: one column row per (channel, tap).
  const index_t channels_col = col_shape[0];
  // d_offset: the per-axis kernel tap decoded from c_col.
  // d_iter: odometer-style counter over the column buffer's spatial dims.
  std::vector<index_t> d_offset(num_spatial_axes, 0);
  std::vector<index_t> d_iter(num_spatial_axes, 0);
  for (index_t c_col = 0; c_col < channels_col; ++c_col) {
    // Loop over spatial axes in reverse order to compute a per-axis offset:
    // peel mixed-radix digits of c_col with radices kernel_shape[...].
    index_t offset = c_col;
    for (int d_i = static_cast<int>(num_spatial_axes) - 1; d_i >= 0; --d_i) {
      if (d_i < static_cast<int>(num_spatial_axes) - 1) {
        offset /= kernel_shape[d_i + 1];
      }
      d_offset[d_i] = offset % kernel_shape[d_i];
    }
    for (bool incremented = true; incremented;) {
      // Loop over spatial axes in forward order to compute the indices in the
      // image and column (Horner-style row-major flattening), and whether the
      // index lies in the padding.
      index_t index_col = c_col;
      index_t index_im = c_col / kernel_size;  // channel index
      bool is_padding = false;
      for (index_t d_i = 0; d_i < num_spatial_axes; ++d_i) {
        const index_t d = d_iter[d_i];
        const int d_im = static_cast<int>(d * stride[d_i] + d_offset[d_i] * dilation[d_i]) -
                         static_cast<int>(pad[d_i]);
        is_padding |= d_im < 0 || d_im >= static_cast<int>(im_shape[d_i + 2]);
        index_col *= col_shape[d_i + 1];
        index_col += d;
        // index_im may absorb a negative d_im here, but it is only read
        // when !is_padding, so the garbage value is never dereferenced.
        index_im *= static_cast<index_t>(im_shape[d_i + 2]);
        index_im += d_im;
      }
      if (im2col) {
        if (is_padding) {
          data_output[index_col] = 0;
        } else {
          data_output[index_col] = data_input[index_im];
        }
      } else if (!is_padding) {  // col2im
        data_output[index_im] += data_input[index_col];
      }
      // Loop over spatial axes in reverse order to choose an index,
      // like counting: increment the last axis, carrying into earlier axes
      // when one wraps; when every axis wraps, this c_col row is done.
      incremented = false;
      for (int d_i = static_cast<int>(num_spatial_axes) - 1; d_i >= 0; --d_i) {
        const index_t d_max = col_shape[d_i + 1];
        CHECK_LT(d_iter[d_i], d_max);
        if (d_iter[d_i] + 1 == d_max) {
          d_iter[d_i] = 0;
        } else {  // d_iter[d_i] < d_max - 1
          ++d_iter[d_i];
          incremented = true;
          break;
        }
      }
    }  // while(incremented)
  }  // for (int c = 0; c < channels_col; ++c)
}
/*!
 * \brief cpu function of im2col algorithm.
 * Dispatches on the kernel's spatial rank: the common 2-D case goes to the
 * specialized im2col_cpu kernel, everything else to the generic n-d routine.
 * \param s device stream (unused by the cpu implementation)
 * \param data_im pointer of an image (C, H, W, ...) in the image batch
 * \param im_shape input image shape in dimensions (N, C, H, W, ...)
 * \param col_shape column buffer shape
 * \param kernel_shape kernel filter shape
 * \param pad pad shape
 * \param stride stride shape
 * \param dilation dilation shape
 * \param data_col start pointer of the column buffer to be filled
 */
template <typename DType>
inline void im2col(mshadow::Stream<cpu>* s,
                   const DType* data_im,
                   const mxnet::TShape& im_shape,
                   const mxnet::TShape& col_shape,
                   const mxnet::TShape& kernel_shape,
                   const mxnet::TShape& pad,
                   const mxnet::TShape& stride,
                   const mxnet::TShape& dilation,
                   DType* data_col) {
  if (kernel_shape.ndim() != 2) {
    // Arbitrary spatial rank: run the generic core in im2col mode.
    im2col_nd_core_cpu(
        data_im, true, im_shape, col_shape, kernel_shape, pad, stride, dilation, data_col);
    return;
  }
  // 2-D fast path; im_shape is laid out as (N, C, H, W).
  im2col_cpu(data_im,
             im_shape[1],      // channels
             im_shape[2],      // height
             im_shape[3],      // width
             kernel_shape[0],
             kernel_shape[1],
             pad[0],
             pad[1],
             stride[0],
             stride[1],
             dilation[0],
             dilation[1],
             data_col);
}
/*!
 * \brief col2im 2D cpu version.
 * DO NOT call this function directly. Use wrapper function col2im() instead.
 *
 * Inverse of im2col_cpu: scatters (accumulates) a column buffer back into
 * the (C, H, W) image layout; entries that originated from padding are
 * skipped.
 *
 * \param data_col   column buffer produced by the im2col layout
 * \param channels   number of image channels C
 * \param height     image height H
 * \param width      image width W
 * \param kernel_h   filter height
 * \param kernel_w   filter width
 * \param pad_h      zero-padding along height
 * \param pad_w      zero-padding along width
 * \param stride_h   vertical step between output positions
 * \param stride_w   horizontal step between output positions
 * \param dilation_h vertical spacing between kernel taps
 * \param dilation_w horizontal spacing between kernel taps
 * \param data_im    output image buffer of height * width * channels entries
 * \param req        write request; kNullOp skips all work, kAddTo
 *                   accumulates on top of the existing image, anything
 *                   else overwrites it
 */
template <typename DType>
inline void col2im_cpu(const DType* data_col,
                       const int channels,
                       const int height,
                       const int width,
                       const int kernel_h,
                       const int kernel_w,
                       const int pad_h,
                       const int pad_w,
                       const int stride_h,
                       const int stride_w,
                       const int dilation_h,
                       const int dilation_w,
                       DType* data_im,
                       OpReqType req) {
  if (mxnet::kNullOp == req)
    return;
  // Every pixel below is updated with +=, so start from a zeroed image
  // unless the caller asked to accumulate.
  if (mxnet::kAddTo != req) {
    std::fill(data_im, data_im + height * width * channels, static_cast<DType>(0));
  }
  const int output_h = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int output_w = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
  const int channel_size = height * width;
  // NOTE (condensed from junwu's earlier TODO): OpenMP was benchmarked on
  // these loops and gave mixed results; the convolution bottleneck likely
  // lies in dot(), so the loops stay serial.
  const DType* col = data_col;
  for (int c = 0; c < channels; ++c) {
    DType* im = data_im + c * channel_size;
    for (int kh = 0; kh < kernel_h; ++kh) {
      for (int kw = 0; kw < kernel_w; ++kw) {
        for (int oh = 0; oh < output_h; ++oh) {
          const int row = oh * stride_h - pad_h + kh * dilation_h;
          // Unsigned compare == !(row >= 0 && row < height); see
          // is_a_ge_zero_and_a_lt_b for the rationale.
          if (static_cast<unsigned>(row) >= static_cast<unsigned>(height)) {
            // Whole output row came from padding: nothing to scatter.
            col += output_w;
            continue;
          }
          for (int ow = 0; ow < output_w; ++ow, ++col) {
            const int column = ow * stride_w - pad_w + kw * dilation_w;
            if (static_cast<unsigned>(column) < static_cast<unsigned>(width)) {
              im[row * width + column] += *col;
            }
          }
        }
      }
    }
  }
}
/*!
 * \brief cpu function of col2im algorithm.
 * Dispatches on the kernel's spatial rank: the common 2-D case goes to the
 * specialized col2im_cpu kernel, everything else to the generic n-d routine.
 * \param s device stream (unused by the cpu implementation)
 * \param data_col start pointer of the column buffer to read back
 * \param im_shape input image shape in dimensions (N, C, H, W, ...)
 * \param col_shape column buffer shape
 * \param kernel_shape kernel filter shape
 * \param pad pad shape
 * \param stride stride shape
 * \param dilation dilation shape
 * \param data_im pointer of an image (C, H, W, ...) in the image batch
 * \param req write request forwarded to the underlying kernel
 */
template <typename DType>
inline void col2im(mshadow::Stream<cpu>* s,
                   const DType* data_col,
                   const mxnet::TShape& im_shape,
                   const mxnet::TShape& col_shape,
                   const mxnet::TShape& kernel_shape,
                   const mxnet::TShape& pad,
                   const mxnet::TShape& stride,
                   const mxnet::TShape& dilation,
                   DType* data_im,
                   OpReqType req) {
  const int spatial_rank = kernel_shape.ndim();
  if (spatial_rank != 2) {
    // Arbitrary spatial rank: run the generic core in col2im mode.
    im2col_nd_core_cpu(
        data_col, false, im_shape, col_shape, kernel_shape, pad, stride, dilation, data_im, req);
    return;
  }
  // 2-D fast path; im_shape is laid out as (N, C, H, W).
  col2im_cpu(data_col,
             im_shape[1],      // channels
             im_shape[2],      // height
             im_shape[3],      // width
             kernel_shape[0],
             kernel_shape[1],
             pad[0],
             pad[1],
             stride[0],
             stride[1],
             dilation[0],
             dilation[1],
             data_im,
             req);
}
} // namespace op
} // namespace mxnet
#ifdef __CUDACC__
#include "./im2col.cuh"
#endif
#endif // MXNET_OPERATOR_NN_IM2COL_H_