paddle/phi/kernels/cpu/contiguous_kernel.cc (163 additions & 11 deletions)
@@ -15,11 +15,39 @@ limitations under the License. */

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/dense_tensor_iterator.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h"

#if defined(PADDLE_WITH_OPENMP)
#include <omp.h>
#endif

namespace phi {

inline int64_t DivUp(int64_t x, int64_t y) { return (x + y - 1) / y; }

inline void DealWithStride(const DenseTensorIterator& iter, int64_t* strides) {
  for (int dim = 0; dim < iter.ndim(); dim++) {
    for (int arg = 0; arg < iter.ntensors(); arg++) {
      *strides++ = iter.strides(arg)[dim];
    }
  }
  if (iter.ndim() < 2) {
    std::fill_n(strides, (2 - iter.ndim()) * iter.ntensors(), 0);
  }
}
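// The layout produced above is interleaved per dimension:
// strides[dim * ntensors + arg]. With one output (arg 0) and one input
// (arg 1) on a 2-D iterator this gives
//   { out_stride[dim0], in_stride[dim0], out_stride[dim1], in_stride[dim1] },
// and iterators with fewer than two dims are zero-padded so the 2-D copy
// loop below can always treat them uniformly.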

inline bool FastTransposeCopyValid(const DenseTensor& self,
                                   const DenseTensor& src) {
  constexpr int64_t MIN_NUMEL = 360;
  return src.numel() != 0 && src.dims().size() == 2 && src.strides()[0] == 1 &&
         src.strides()[1] == src.dims()[0] &&
         self.dims().size() == src.dims().size() && self.numel() >= MIN_NUMEL;
}
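// The fast path matches the common "transpose view made contiguous" case: a
// non-empty 2-D source whose strides are exactly the column-major layout of
// its shape (strides == {1, dims[0]}) copied into a row-major output of the
// same rank. MIN_NUMEL is a heuristic cutoff below which the tiled transpose
// is unlikely to pay for its bookkeeping.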

template <typename T, typename Context>
void ContiguousKernel(const Context& dev_ctx,
                      const DenseTensor& input,
@@ -31,21 +59,145 @@ void ContiguousKernel(const Context& dev_ctx,

  const T* input_data = input.data<T>();
  T* output_data = dev_ctx.template Alloc<T>(out);
  auto numel = input.numel();

  if (FastTransposeCopyValid(*out, input)) {
    // Fast path for 2D transpose
    constexpr int64_t TRANS_NUMEL = 60;
    void* trans_buffer =
        malloc(phi::SizeOf(input.dtype()) * TRANS_NUMEL * TRANS_NUMEL);
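    // Scratch tile of TRANS_NUMEL x TRANS_NUMEL elements: each source block
    // is gathered column-by-column, transposed in place, then flushed
    // row-by-row, so every memcpy runs over contiguous memory.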

    const T* tmp_src_ptr = input_data;
    T* tmp_out_ptr = output_data;
    T* tmp_buf_ptr = reinterpret_cast<T*>(trans_buffer);

    int64_t dim0 = out->dims()[0];
    int64_t dim1 = out->dims()[1];

    for (int64_t d0 = 0; d0 < dim0; d0 += TRANS_NUMEL) {
      for (int64_t d1 = 0; d1 < dim1; d1 += TRANS_NUMEL) {
        const T* src_ptr_inter = tmp_src_ptr + d0 + d1 * dim0;
        T* out_ptr_inter = tmp_out_ptr + d1 + d0 * dim1;

        int nr = std::min(dim0 - d0, TRANS_NUMEL);
        int nc = std::min(dim1 - d1, TRANS_NUMEL);

        for (int c = 0; c < nc; c++) {
          memcpy(tmp_buf_ptr + c * TRANS_NUMEL,
                 src_ptr_inter + c * dim0,
                 nr * sizeof(T));
        }

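        // Transpose the tile in place by swapping elements across its
        // diagonal; only the nr x nc corner holds valid data, hence the
        // rc_min/rc_max bounds.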
        int rc_max = std::max(nr, nc);
        int rc_min = std::min(nr, nc);
        for (int r = 0; r < rc_max; r++) {
          int end = std::min(r, rc_min);
          for (int c = 0; c < end; c++) {
            T tmp = tmp_buf_ptr[r + TRANS_NUMEL * c];
            tmp_buf_ptr[r + TRANS_NUMEL * c] =
                tmp_buf_ptr[r * TRANS_NUMEL + c];
            tmp_buf_ptr[r * TRANS_NUMEL + c] = tmp;
          }
        }

        for (int r = 0; r < nr; r++) {
          memcpy(out_ptr_inter + r * dim1,
                 tmp_buf_ptr + r * TRANS_NUMEL,
                 nc * sizeof(T));
        }
      }
    }
    free(trans_buffer);
  } else {
#if defined(PADDLE_WITH_OPENMP)
    // OpenMP parallel path
    phi::DenseTensorIteratorConfig config;
    config.add_output(*out);
    config.add_const_input(input);
    config.is_alloc_out_ = true;
    phi::DenseTensorIterator iter = config.build();
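    // Assumption (inferred from usage): like other TensorIterator-style
    // designs, the iterator coalesces compatible dims, so dims 0 and 1 are
    // the innermost, fastest-moving ones for the copy loop below.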

    std::vector<int64_t> tmp_strides(
        iter.ntensors() * static_cast<size_t>(std::max(iter.ndim(), 2)));

    DealWithStride(iter, tmp_strides.data());

    std::vector<int64_t> out_stride(tmp_strides.begin() + iter.ntensors(),
                                    tmp_strides.end());
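    // Despite its name, out_stride holds the dim-1 strides of *both* tensors
    // (output at index 0, input at index 1); it advances the row pointers in
    // the inner copy loop.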

    const int64_t iter_numel = iter.numel();

    const char* in_ptr = reinterpret_cast<const char*>(input_data);
    char* out_ptr = reinterpret_cast<char*>(output_data);

    int64_t end = iter_numel;
    int64_t begin = 0;
    int64_t grain_size = 32768;
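    // Heuristic: give each thread at least grain_size elements so that small
    // copies do not pay for thread fan-out.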

    int64_t* whole_stride = tmp_strides.data();

#pragma omp parallel
    {
      int64_t num_threads = omp_get_num_threads();

      if (grain_size > 0) {
        num_threads = std::min(num_threads, DivUp(end - begin, grain_size));
      }

      int64_t tid = omp_get_thread_num();
      int64_t chunk_size = DivUp(end - begin, num_threads);
      int64_t begin_tid = begin + tid * chunk_size;

      if (begin_tid < end) {
        int64_t range_start = begin_tid;
        int64_t range_end = std::min(end, chunk_size + begin_tid);

        auto dimiter = DimIter(iter.shape(), range_start, range_end);
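        // DimIter (semantics inferred from usage): walks the multi-dim index
        // space covering flat offsets [range_start, range_end), and
        // iter_for_step() reports how far the two innermost dims can advance
        // before wrapping.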
        while (!dimiter.iter_to_end()) {
          const auto v_ndim = dimiter.values.size();
          const char* tmp_in_data = in_ptr;
          char* tmp_out_data = out_ptr;
          for (size_t dim = 0; dim < v_ndim; dim++) {
            int64_t value = dimiter.values[dim];
            tmp_out_data += value * whole_stride[dim * iter.ntensors() + 0];
            tmp_in_data += value * whole_stride[dim * iter.ntensors() + 1];
          }

          auto step = dimiter.iter_for_step();

          for (int64_t i = 0; i < step[1]; i++) {
            for (int64_t j = 0; j < step[0]; j++) {
              const char* real_in_ptr = tmp_in_data + j * whole_stride[1];
              char* real_out_ptr = tmp_out_data + j * whole_stride[0];

              *reinterpret_cast<T*>(real_out_ptr) =
                  *reinterpret_cast<const T*>(real_in_ptr);
            }
            tmp_in_data = tmp_in_data + out_stride[1];
            tmp_out_data = tmp_out_data + out_stride[0];
          }

          dimiter.iter_to_next(step);
        }
      }
    }
#else
    // Serial fallback path
    int rank = input.dims().size();
    auto dims = input.dims();
    auto input_stride = input.strides();

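    // Decompose each linear output index into per-dim coordinates, innermost
    // dim first, and accumulate the matching strided offset into the input.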
    for (int64_t i = 0; i < numel; i++) {
      int64_t input_offset = 0;
      int64_t index_tmp = i;
      for (int dim = rank - 1; dim >= 0; --dim) {
        int64_t mod = index_tmp % dims[dim];
        index_tmp = index_tmp / dims[dim];
        input_offset += mod * input_stride[dim];
      }

      output_data[i] = input_data[input_offset];
    }
#endif
  }
}
}  // namespace phi