From 2fe0eaf757a992eb645b82fb4ea294b44c44df08 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 13 Feb 2020 19:31:26 -0800 Subject: [PATCH 01/37] Vectorized loads for binary elemwise kernel --- src/operator/tensor/elemwise_binary_op.h | 24 -- .../tensor/elemwise_binary_op_basic.cu | 14 +- src/operator/tensor/elemwise_op.cuh | 273 ++++++++++++++++++ 3 files changed, 280 insertions(+), 31 deletions(-) create mode 100644 src/operator/tensor/elemwise_op.cuh diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h index bc5140a5d75f..4d3d02f1e901 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -574,30 +574,6 @@ class ElemwiseBinaryOp : public OpBase { }); } - template - static void ComputeWithHalf2(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace mxnet_op; - if (req[0] == kNullOp) return; - Stream *s = ctx.get_stream(); - CHECK_EQ(inputs.size(), 2U); - CHECK_EQ(outputs.size(), 1U); - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, { - const size_t size = (minthree(outputs[0].Size(), inputs[0].Size(), inputs[1].Size()) - + DataType::kLanes - 1) / DataType::kLanes; - if (size != 0) { - Kernel, xpu>::Launch(s, size, - outputs[0].dptr(), - inputs[0].dptr(), inputs[1].dptr()); - } - }); - }); - } - template static void ComputeEx(const nnvm::NodeAttrs &attrs, const OpContext &ctx, diff --git a/src/operator/tensor/elemwise_binary_op_basic.cu b/src/operator/tensor/elemwise_binary_op_basic.cu index 16d7fc1ad72b..3f7a0a1d574f 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_op_basic.cu @@ -26,6 +26,7 @@ #include "./elemwise_binary_op.h" #include "./elemwise_binary_op-inl.h" #include "./indexing_op.h" +#include "./elemwise_op.cuh" namespace mxnet { namespace op { @@ -218,11 +219,11 @@ void ElemwiseBinaryOp::DnsCsrDnsOp(mshadow::Stream *s, } NNVM_REGISTER_OP(elemwise_add) -.set_attr("FCompute", ElemwiseBinaryOp::ComputeWithHalf2) +.set_attr("FCompute", ComputeWithHalf2) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeEx); NNVM_REGISTER_OP(_grad_add) -.set_attr("FCompute", ElemwiseBinaryOp::ComputeWithHalf2); +.set_attr("FCompute", ComputeWithHalf2); NNVM_REGISTER_OP(_backward_add) .set_attr("FCompute", @@ -230,8 +231,7 @@ NNVM_REGISTER_OP(_backward_add) mshadow_op::identity>); NNVM_REGISTER_OP(elemwise_sub) -.set_attr("FCompute", ElemwiseBinaryOp::ComputeWithHalf2< - gpu, op::mshadow_op::minus>) +.set_attr("FCompute", ComputeWithHalf2) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeEx); NNVM_REGISTER_OP(_backward_sub) @@ -240,7 +240,7 @@ NNVM_REGISTER_OP(_backward_sub) mshadow_op::negation>); NNVM_REGISTER_OP(elemwise_mul) -.set_attr("FCompute", ElemwiseBinaryOp::ComputeWithHalf2) +.set_attr("FCompute", ComputeWithHalf2) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeDnsLRValueEx); @@ -251,7 +251,7 @@ NNVM_REGISTER_OP(_backward_mul) NNVM_REGISTER_OP(elemwise_div) .set_attr("FCompute", - ElemwiseBinaryOp::ElemwiseBinaryOp::ComputeWithHalf2); + ComputeWithHalf2); NNVM_REGISTER_OP(_backward_div) .set_attr("FCompute", @@ -259,7 +259,7 @@ NNVM_REGISTER_OP(_backward_div) mshadow_op::div_rgrad>); NNVM_REGISTER_OP(_mod) -.set_attr("FCompute", ElemwiseBinaryOp::ComputeWithHalf2); +.set_attr("FCompute", ComputeWithHalf2); NNVM_REGISTER_OP(_backward_mod) .set_attr("FCompute", diff --git 
a/src/operator/tensor/elemwise_op.cuh b/src/operator/tensor/elemwise_op.cuh new file mode 100644 index 000000000000..091d2c5ab091 --- /dev/null +++ b/src/operator/tensor/elemwise_op.cuh @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2020 by Contributors + * \file elemwise_op.cuh + * \brief GPU helpers for elementwise operators + */ + +#ifndef MXNET_OPERATOR_TENSOR_ELEMWISE_OP_CUH_ +#define MXNET_OPERATOR_TENSOR_ELEMWISE_OP_CUH_ + +#include +#include "../operator_common.h" + +#include + +#if MXNET_USE_CUDA + +namespace mxnet { +namespace op { + +template +class VectorizedStorage { + public: + constexpr static int nvec = sizeof(LType) / sizeof(DType); + union vectorized_storage { + LType aligned; + DType separate[nvec]; // NOLINT(*) + + MSHADOW_XINLINE vectorized_storage() {} + MSHADOW_XINLINE ~vectorized_storage() {} + } scratch_; +}; + +template +class VectorizedAccessor { + public: + VectorizedStorage::type, + typename std::remove_const::type> storage_; + + LType* aligned_ptr_; + DType* unaligned_ptr_; + int alignment_; + index_t n_elems_; + + MSHADOW_XINLINE VectorizedAccessor(DType* ptr, const index_t N) { + unaligned_ptr_ = ptr; + if (aligned) { + alignment_ = 0; + aligned_ptr_ = reinterpret_cast(ptr); + n_elems_ = (N + storage_.nvec - 1) / storage_.nvec; + } else { + size_t ptr_as_number = reinterpret_cast(ptr); + alignment_ = (ptr_as_number % sizeof(LType)) / sizeof(DType); + aligned_ptr_ = reinterpret_cast(ptr - alignment_); + n_elems_ = (N + alignment_ + storage_.nvec - 1) / storage_.nvec; + } + } + + MSHADOW_XINLINE index_t num_aligned_elements() const { + return n_elems_; + } + + MSHADOW_XINLINE void load(const index_t id, const index_t N) { + if (aligned) { + storage_.scratch_.aligned = aligned_ptr_[id]; + } else { + if (id > 0 && id < n_elems_ - 1) { + storage_.scratch_.aligned = aligned_ptr_[id]; + } else { +#pragma unroll + for (int j = 0; j < storage_.nvec; ++j) { + DType* ptr = reinterpret_cast(&(aligned_ptr_[id])) + j; + if (reinterpret_cast(ptr) >= reinterpret_cast(unaligned_ptr_) && + reinterpret_cast(ptr) < reinterpret_cast(unaligned_ptr_ + N)) { + storage_.scratch_.separate[j] = *ptr; + } + } + } + } + } +}; + +template +class VectorizedLoader : public VectorizedAccessor { + public: + MSHADOW_XINLINE VectorizedLoader(const DType* ptr, const index_t N) : + VectorizedAccessor(ptr, N) { + } +}; + +template +class VectorizedStorer : public VectorizedAccessor { + public: + MSHADOW_XINLINE VectorizedStorer(DType* ptr, const index_t N) : + VectorizedAccessor(ptr, N) { + } + + MSHADOW_XINLINE void store(const index_t id, const index_t N) { + if (aligned) { + this->aligned_ptr_[id] = this->storage_.scratch_.aligned; + } else { + if (id > 0 && id < this->n_elems_ - 1) { + 
this->aligned_ptr_[id] = this->storage_.scratch_.aligned; + } else { +#pragma unroll + for (int j = 0; j < this->storage_.nvec; ++j) { + DType* ptr = reinterpret_cast(&(this->aligned_ptr_[id])) + j; + if (reinterpret_cast(ptr) >= reinterpret_cast(this->unaligned_ptr_) && + reinterpret_cast(ptr) < reinterpret_cast(this->unaligned_ptr_ + N)) { + *ptr = this->storage_.scratch_.separate[j]; + } + } + } + } + } +}; + +namespace { + +template +__global__ void VectorizedElementwiseKernel(DType* output, const DType* input0, const DType* input1, index_t N) { + VectorizedLoader loader0(input0, N); + VectorizedLoader loader1(input1, N); + VectorizedStorer storer(output, N); + + const index_t M = loader0.num_aligned_elements(); + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + loader0.load(tid, N); + loader1.load(tid, N); + if (req == kAddTo) { + storer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < loader0.storage_.nvec; ++i) { + DType temp = OP::Map(loader0.storage_.scratch_.separate[i], + loader1.storage_.scratch_.separate[i]); + + if (req == kAddTo) { + storer.storage_.scratch_.separate[i] += temp; + } else { + storer.storage_.scratch_.separate[i] = temp; + } + } + storer.store(tid, N); + } +} + +enum class Alignment { + SAME_ALIGNED, + SAME_UNALIGNED, + DIFFERENT +}; + +template +int CalcAlignment(const DType* ptr) { + size_t ptr_as_number = reinterpret_cast(ptr); + return ptr_as_number % sizeof(LType); +} + +template +Alignment CheckAlignment(const std::vector& pointers) { + int align = -1; + for (const DType* ptr : pointers) { + int new_align = CalcAlignment(ptr); + if (align == -1) { + align = new_align; + } else { + if (align != new_align) { + return Alignment::DIFFERENT; + } + } + } + + return align == 0 ? Alignment::SAME_ALIGNED + : Alignment::SAME_UNALIGNED; +} + +size_t minthree(const size_t a, const size_t b, const size_t c) { + return a < b ? (a < c ? a : c) : (b < c ? 
b : c); +} + +} // namespace + +template +void ComputeWithHalf2(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace mxnet_op; + if (req[0] == kNullOp) return; + Stream *s = ctx.get_stream(); + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + if (dmlc::GetEnv("DEBUG_VECTOR", false)) { + MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, { + const size_t size = (minthree(outputs[0].Size(), inputs[0].Size(), inputs[1].Size()) + + DataType::kLanes - 1) / DataType::kLanes; + if (size != 0) { + Kernel, gpu>::Launch(s, size, + outputs[0].dptr(), + inputs[0].dptr(), inputs[1].dptr()); + } + }); + } else { + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + using LType = double; + static_assert(sizeof(LType) >= sizeof(DType), "Load type is smaller than operand type"); + if (outputs[0].Size() != 0) { + cudaStream_t stream = mshadow::Stream::GetStream(s); + constexpr int nvec = sizeof(LType) / sizeof(DType); + VectorizedLoader l(outputs[0].dptr(), outputs[0].Size()); + size_t num_elements = l.num_aligned_elements(); + constexpr int threads = 512; + index_t blocks = (num_elements + threads - 1) / threads; + auto align = CheckAlignment({outputs[0].dptr(), + inputs[0].dptr(), + inputs[1].dptr()}); + if (align == Alignment::SAME_ALIGNED && (outputs[0].Size() % nvec == 0)) { + VectorizedElementwiseKernel + <<>>(outputs[0].dptr(), + inputs[0].dptr(), + inputs[1].dptr(), + outputs[0].Size()); + } else { + if (align != Alignment::DIFFERENT) { + VectorizedElementwiseKernel + <<>>(outputs[0].dptr(), + inputs[0].dptr(), + inputs[1].dptr(), + outputs[0].Size()); + } else { + // If the pointers are aligned differently we cannot vectorize + VectorizedElementwiseKernel + <<>>(outputs[0].dptr(), + inputs[0].dptr(), + inputs[1].dptr(), + outputs[0].Size()); + } + } + } + }); + } + }); +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_CUDA +#endif // MXNET_OPERATOR_TENSOR_ELEMWISE_OP_CUH_ From 6b8950645b7c104c987e8fd437579172f08e1e6d Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 14 Feb 2020 14:59:40 -0800 Subject: [PATCH 02/37] More generalization --- .../tensor/elemwise_binary_op_basic.cu | 12 ++--- src/operator/tensor/elemwise_op.cuh | 48 +++++++++++++++---- 2 files changed, 45 insertions(+), 15 deletions(-) diff --git a/src/operator/tensor/elemwise_binary_op_basic.cu b/src/operator/tensor/elemwise_binary_op_basic.cu index 3f7a0a1d574f..e00319358a8c 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_op_basic.cu @@ -219,11 +219,11 @@ void ElemwiseBinaryOp::DnsCsrDnsOp(mshadow::Stream *s, } NNVM_REGISTER_OP(elemwise_add) -.set_attr("FCompute", ComputeWithHalf2) +.set_attr("FCompute", VectorizedCompute) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeEx); NNVM_REGISTER_OP(_grad_add) -.set_attr("FCompute", ComputeWithHalf2); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_backward_add) .set_attr("FCompute", @@ -231,7 +231,7 @@ NNVM_REGISTER_OP(_backward_add) mshadow_op::identity>); NNVM_REGISTER_OP(elemwise_sub) -.set_attr("FCompute", ComputeWithHalf2) +.set_attr("FCompute", VectorizedCompute) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeEx); NNVM_REGISTER_OP(_backward_sub) @@ -240,7 +240,7 @@ NNVM_REGISTER_OP(_backward_sub) mshadow_op::negation>); NNVM_REGISTER_OP(elemwise_mul) -.set_attr("FCompute", ComputeWithHalf2) +.set_attr("FCompute", 
VectorizedCompute) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeDnsLRValueEx); @@ -251,7 +251,7 @@ NNVM_REGISTER_OP(_backward_mul) NNVM_REGISTER_OP(elemwise_div) .set_attr("FCompute", - ComputeWithHalf2); + VectorizedCompute); NNVM_REGISTER_OP(_backward_div) .set_attr("FCompute", @@ -259,7 +259,7 @@ NNVM_REGISTER_OP(_backward_div) mshadow_op::div_rgrad>); NNVM_REGISTER_OP(_mod) -.set_attr("FCompute", ComputeWithHalf2); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_backward_mod) .set_attr("FCompute", diff --git a/src/operator/tensor/elemwise_op.cuh b/src/operator/tensor/elemwise_op.cuh index 091d2c5ab091..5d83c50312ee 100644 --- a/src/operator/tensor/elemwise_op.cuh +++ b/src/operator/tensor/elemwise_op.cuh @@ -28,6 +28,7 @@ #include #include "../operator_common.h" +#include "../../common/cuda_utils.h" #include @@ -49,6 +50,29 @@ class VectorizedStorage { } scratch_; }; +template +MSHADOW_XINLINE void ldg(LType* dst, const LType* src) { + *dst = *src; +} + +template <> +MSHADOW_XINLINE void ldg(double* dst, const double* src) { + double temp; + asm volatile ("ld.global.f64 %0, [%1];" : + "=d"(temp) : + "l"(src)); + *dst = temp; +} + +/*template <>*/ +/*MSHADOW_XINLINE void ldg(uint64_t* dst, const uint64_t* src) {*/ + /*uint64_t temp;*/ + /*asm volatile ("ld.global.u64 %0, [%1];" :*/ + /*"=l"(temp) :*/ + /*"l"(src));*/ + /**dst = temp;*/ +/*}*/ + template class VectorizedAccessor { public: @@ -80,10 +104,12 @@ class VectorizedAccessor { MSHADOW_XINLINE void load(const index_t id, const index_t N) { if (aligned) { - storage_.scratch_.aligned = aligned_ptr_[id]; + ldg::type>(&(storage_.scratch_.aligned), + aligned_ptr_ + id); } else { if (id > 0 && id < n_elems_ - 1) { - storage_.scratch_.aligned = aligned_ptr_[id]; + ldg::type>(&(storage_.scratch_.aligned), + aligned_ptr_ + id); } else { #pragma unroll for (int j = 0; j < storage_.nvec; ++j) { @@ -203,11 +229,11 @@ size_t minthree(const size_t a, const size_t b, const size_t c) { } // namespace template -void ComputeWithHalf2(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { +void VectorizedCompute(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { using namespace mxnet_op; if (req[0] == kNullOp) return; Stream *s = ctx.get_stream(); @@ -226,7 +252,7 @@ void ComputeWithHalf2(const nnvm::NodeAttrs &attrs, }); } else { MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - using LType = double; + using LType = uint4; static_assert(sizeof(LType) >= sizeof(DType), "Load type is smaller than operand type"); if (outputs[0].Size() != 0) { cudaStream_t stream = mshadow::Stream::GetStream(s); @@ -234,7 +260,8 @@ void ComputeWithHalf2(const nnvm::NodeAttrs &attrs, VectorizedLoader l(outputs[0].dptr(), outputs[0].Size()); size_t num_elements = l.num_aligned_elements(); constexpr int threads = 512; - index_t blocks = (num_elements + threads - 1) / threads; + index_t blocks = std::min(static_cast((num_elements + threads - 1) / threads), + 65535); auto align = CheckAlignment({outputs[0].dptr(), inputs[0].dptr(), inputs[1].dptr()}); @@ -252,6 +279,9 @@ void ComputeWithHalf2(const nnvm::NodeAttrs &attrs, inputs[1].dptr(), outputs[0].Size()); } else { + index_t blocks = std::min(static_cast((outputs[0].Size() + threads - 1) / + threads), + 65535); // If the pointers are aligned differently we cannot vectorize VectorizedElementwiseKernel <<>>(outputs[0].dptr(), 
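The core of the two patches above is the union in VectorizedStorage: a single wide load type (double in patch 01, uint4 from patch 02 on) aliases nvec narrow DType elements, so one global-memory transaction feeds several scalar operations, and only the edges of a misaligned buffer are touched element by element. The sketch below is a standalone illustration of that idea, not the patch's API: it assumes float data and float4 loads, and the names Pack and vec_add_kernel are invented for the example. The base pointers are assumed 16-byte aligned (true for cudaMalloc/cudaMallocManaged allocations); handling arbitrary offsets is exactly what VectorizedAccessor adds on top of this.

#include <cstdio>
#include <cuda_runtime.h>

union Pack {          // one 16-byte transaction aliases nvec = 4 floats
  float4 aligned;
  float separate[4];
};

__global__ void vec_add_kernel(const float* a, const float* b, float* out, int n) {
  constexpr int nvec = 4;
  const int num_vec = n / nvec;                           // full float4 packs
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num_vec;
       i += gridDim.x * blockDim.x) {
    Pack pa, pb, po;
    pa.aligned = reinterpret_cast<const float4*>(a)[i];   // one wide load
    pb.aligned = reinterpret_cast<const float4*>(b)[i];
#pragma unroll
    for (int j = 0; j < nvec; ++j) {
      po.separate[j] = pa.separate[j] + pb.separate[j];   // scalar math per lane
    }
    reinterpret_cast<float4*>(out)[i] = po.aligned;       // one wide store
  }
  // Tail elements that do not fill a whole float4 are handled one by one,
  // mirroring the element-wise edge path in VectorizedAccessor::load.
  for (int i = num_vec * nvec + blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    out[i] = a[i] + b[i];
  }
}

int main() {
  const int n = (1 << 20) + 3;    // deliberately not a multiple of 4
  float *a, *b, *out;
  cudaMallocManaged(&a, n * sizeof(float));
  cudaMallocManaged(&b, n * sizeof(float));
  cudaMallocManaged(&out, n * sizeof(float));
  for (int i = 0; i < n; ++i) { a[i] = 1.0f; b[i] = 2.0f; }
  vec_add_kernel<<<256, 512>>>(a, b, out, n);
  cudaDeviceSynchronize();
  printf("%f %f\n", out[0], out[n - 1]);                  // expect 3.0 3.0
  cudaFree(a); cudaFree(b); cudaFree(out);
  return 0;
}

Compiled with nvcc, the wide path covers n / 4 packs and the scalar tail covers the remaining n % 4 elements, which is the same split VectorizedAccessor makes per pointer.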
From 37d81c806c36b8ce0128f619e483aa2e1904fa70 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 19 Feb 2020 15:41:41 -0800 Subject: [PATCH 03/37] Add backwardusenone --- .../tensor/elemwise_binary_op_basic.cu | 4 +- src/operator/tensor/elemwise_op.cuh | 265 ++++++++++++------ 2 files changed, 176 insertions(+), 93 deletions(-) diff --git a/src/operator/tensor/elemwise_binary_op_basic.cu b/src/operator/tensor/elemwise_binary_op_basic.cu index e00319358a8c..1e13552d03b0 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_op_basic.cu @@ -227,7 +227,7 @@ NNVM_REGISTER_OP(_grad_add) NNVM_REGISTER_OP(_backward_add) .set_attr("FCompute", - ElemwiseBinaryOp::BackwardUseNoneWithHalf2); NNVM_REGISTER_OP(elemwise_sub) @@ -236,7 +236,7 @@ NNVM_REGISTER_OP(elemwise_sub) NNVM_REGISTER_OP(_backward_sub) .set_attr("FCompute", - ElemwiseBinaryOp::BackwardUseNoneWithHalf2); NNVM_REGISTER_OP(elemwise_mul) diff --git a/src/operator/tensor/elemwise_op.cuh b/src/operator/tensor/elemwise_op.cuh index 5d83c50312ee..46ab611085e1 100644 --- a/src/operator/tensor/elemwise_op.cuh +++ b/src/operator/tensor/elemwise_op.cuh @@ -50,29 +50,6 @@ class VectorizedStorage { } scratch_; }; -template -MSHADOW_XINLINE void ldg(LType* dst, const LType* src) { - *dst = *src; -} - -template <> -MSHADOW_XINLINE void ldg(double* dst, const double* src) { - double temp; - asm volatile ("ld.global.f64 %0, [%1];" : - "=d"(temp) : - "l"(src)); - *dst = temp; -} - -/*template <>*/ -/*MSHADOW_XINLINE void ldg(uint64_t* dst, const uint64_t* src) {*/ - /*uint64_t temp;*/ - /*asm volatile ("ld.global.u64 %0, [%1];" :*/ - /*"=l"(temp) :*/ - /*"l"(src));*/ - /**dst = temp;*/ -/*}*/ - template class VectorizedAccessor { public: @@ -104,12 +81,10 @@ class VectorizedAccessor { MSHADOW_XINLINE void load(const index_t id, const index_t N) { if (aligned) { - ldg::type>(&(storage_.scratch_.aligned), - aligned_ptr_ + id); + storage_.scratch_.aligned = aligned_ptr_[id]; } else { if (id > 0 && id < n_elems_ - 1) { - ldg::type>(&(storage_.scratch_.aligned), - aligned_ptr_ + id); + storage_.scratch_.aligned = aligned_ptr_[id]; } else { #pragma unroll for (int j = 0; j < storage_.nvec; ++j) { @@ -161,11 +136,19 @@ class VectorizedStorer : public VectorizedAccessor { namespace { +template +struct VectorizedKernelParams { + const DType* inputs[NumInputs]; + DType* outputs[NumOutputs]; +}; + + template -__global__ void VectorizedElementwiseKernel(DType* output, const DType* input0, const DType* input1, index_t N) { - VectorizedLoader loader0(input0, N); - VectorizedLoader loader1(input1, N); - VectorizedStorer storer(output, N); +__global__ void VectorizedBinaryKernelFwd(const VectorizedKernelParams params, + const index_t N) { + VectorizedLoader loader0(params.inputs[0], N); + VectorizedLoader loader1(params.inputs[1], N); + VectorizedStorer storer(params.outputs[0], N); const index_t M = loader0.num_aligned_elements(); @@ -192,6 +175,82 @@ __global__ void VectorizedElementwiseKernel(DType* output, const DType* input0, } } +template +__global__ void VectorizedBinaryKernelBwdUseNone(const VectorizedKernelParams params, + const index_t N) { + VectorizedLoader loader(params.inputs[0], N); + VectorizedStorer lstorer(params.outputs[0], N); + VectorizedStorer rstorer(params.outputs[1], N); + + const index_t M = loader.num_aligned_elements(); + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + loader.load(tid, N); + if (lreq == kAddTo) { + 
lstorer.load(tid, N); + } + if (rreq == kAddTo) { + rstorer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < loader.storage_.nvec; ++i) { + DType inp = loader.storage_.scratch_.separate[i]; + if (!((std::is_same::value && lreq == kWriteInplace) || + lreq == kNullOp)) { + DType ltemp = LOP::Map(inp); + if (lreq == kAddTo) { + lstorer.storage_.scratch_.separate[i] += ltemp; + } else { + lstorer.storage_.scratch_.separate[i] = ltemp; + } + lstorer.store(tid, N); + } + if (!((std::is_same::value && rreq == kWriteInplace) || + rreq == kNullOp)) { + DType rtemp = ROP::Map(inp); + + if (rreq == kAddTo) { + rstorer.storage_.scratch_.separate[i] += rtemp; + } else { + rstorer.storage_.scratch_.separate[i] = rtemp; + } + rstorer.store(tid, N); + } + } + } +} + +template +class VectorizedBinaryFwd { + public: + using ParamType = VectorizedKernelParams; + + template + static void Launch(const index_t blocks, const index_t threads, + cudaStream_t stream, + const ParamType params, const index_t N) { + VectorizedBinaryKernelFwd + <<>>(params, N); + } +}; + +template +class VectorizedBinaryBwdUseNone { + public: + using ParamType = VectorizedKernelParams; + + template + static void Launch(const index_t blocks, const index_t threads, + cudaStream_t stream, + const ParamType params, const index_t N) { + VectorizedBinaryKernelBwdUseNone + <<>>(params, N); + } +}; + enum class Alignment { SAME_ALIGNED, SAME_UNALIGNED, @@ -204,10 +263,22 @@ int CalcAlignment(const DType* ptr) { return ptr_as_number % sizeof(LType); } -template -Alignment CheckAlignment(const std::vector& pointers) { +template +Alignment CheckAlignment(const VectorizedKernelParams& params) { int align = -1; - for (const DType* ptr : pointers) { + + for (const DType* ptr : params.inputs) { + int new_align = CalcAlignment(ptr); + if (align == -1) { + align = new_align; + } else { + if (align != new_align) { + return Alignment::DIFFERENT; + } + } + } + + for (const DType* ptr : params.outputs) { int new_align = CalcAlignment(ptr); if (align == -1) { align = new_align; @@ -222,80 +293,92 @@ Alignment CheckAlignment(const std::vector& pointers) { : Alignment::SAME_UNALIGNED; } -size_t minthree(const size_t a, const size_t b, const size_t c) { - return a < b ? (a < c ? a : c) : (b < c ? 
b : c); +template +void VectorizedKernelLauncher(const index_t size, mshadow::Stream* s, typename Kernel::ParamType params) { + static_assert(sizeof(LType) >= sizeof(DType), "Load type is smaller than operand type"); + if (size != 0) { + cudaStream_t stream = mshadow::Stream::GetStream(s); + constexpr int nvec = sizeof(LType) / sizeof(DType); + VectorizedLoader l(params.inputs[0], size); + size_t num_elements = l.num_aligned_elements(); + constexpr int threads = 512; + constexpr int max_blocks = 65535; + index_t blocks = std::min(static_cast((num_elements + threads - 1) / threads), + max_blocks); + auto align = CheckAlignment(params); + if (align == Alignment::SAME_ALIGNED && (size % nvec == 0)) { + Kernel::template Launch(blocks, threads, stream, params, size); + } else { + if (align != Alignment::DIFFERENT) { + Kernel::template Launch(blocks, threads, stream, params, size); + } else { + index_t blocks = std::min(static_cast((size + threads - 1) / + threads), + max_blocks); + // If the pointers are aligned differently we cannot vectorize + Kernel::template Launch(blocks, threads, stream, params, size); + } + } + } } -} // namespace - template void VectorizedCompute(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - using namespace mxnet_op; if (req[0] == kNullOp) return; - Stream *s = ctx.get_stream(); + mshadow::Stream *s = ctx.get_stream(); CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1U); MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - if (dmlc::GetEnv("DEBUG_VECTOR", false)) { - MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, { - const size_t size = (minthree(outputs[0].Size(), inputs[0].Size(), inputs[1].Size()) - + DataType::kLanes - 1) / DataType::kLanes; - if (size != 0) { - Kernel, gpu>::Launch(s, size, - outputs[0].dptr(), - inputs[0].dptr(), inputs[1].dptr()); - } - }); - } else { - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + using LType = uint4; + using Kernel = VectorizedBinaryFwd; + + const index_t size = outputs[0].Size(); + typename Kernel::ParamType params; + params.inputs[0] = inputs[0].dptr(); + params.inputs[1] = inputs[1].dptr(); + params.outputs[0] = outputs[0].dptr(); + + VectorizedKernelLauncher(size, s, params); + }); + }); +} + +template +void VectorizedBackwardUseNoneCompute(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + mshadow::Stream *s = ctx.get_stream(); + cudaStream_t stream = mshadow::Stream::GetStream(s); + + MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { + const index_t size = inputs[0].Size(); + if (req[0] != kNullOp || req[1] != kNullOp) { + MXNET_ASSIGN_REQ_SWITCH(req[0], lreq, { + MXNET_ASSIGN_REQ_SWITCH(req[1], rreq, { using LType = uint4; - static_assert(sizeof(LType) >= sizeof(DType), "Load type is smaller than operand type"); - if (outputs[0].Size() != 0) { - cudaStream_t stream = mshadow::Stream::GetStream(s); - constexpr int nvec = sizeof(LType) / sizeof(DType); - VectorizedLoader l(outputs[0].dptr(), outputs[0].Size()); - size_t num_elements = l.num_aligned_elements(); - constexpr int threads = 512; - index_t blocks = std::min(static_cast((num_elements + threads - 1) / threads), - 65535); - auto align = CheckAlignment({outputs[0].dptr(), - inputs[0].dptr(), - inputs[1].dptr()}); - if (align == Alignment::SAME_ALIGNED && (outputs[0].Size() % nvec == 0)) { - VectorizedElementwiseKernel 
- <<>>(outputs[0].dptr(), - inputs[0].dptr(), - inputs[1].dptr(), - outputs[0].Size()); - } else { - if (align != Alignment::DIFFERENT) { - VectorizedElementwiseKernel - <<>>(outputs[0].dptr(), - inputs[0].dptr(), - inputs[1].dptr(), - outputs[0].Size()); - } else { - index_t blocks = std::min(static_cast((outputs[0].Size() + threads - 1) / - threads), - 65535); - // If the pointers are aligned differently we cannot vectorize - VectorizedElementwiseKernel - <<>>(outputs[0].dptr(), - inputs[0].dptr(), - inputs[1].dptr(), - outputs[0].Size()); - } - } - } + using Kernel = VectorizedBinaryBwdUseNone; + + typename Kernel::ParamType params; + params.inputs[0] = inputs[0].dptr(); + params.outputs[0] = outputs[0].dptr(); + params.outputs[1] = outputs[1].dptr(); + + VectorizedKernelLauncher(size, s, params); }); - } + }); + } }); } +} // namespace + } // namespace op } // namespace mxnet From f86da86f809c8cbad07db76a3554f23890fe05a3 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 21 Feb 2020 15:54:14 -0800 Subject: [PATCH 04/37] Remove the unused _backward_add op --- .../tensor/elemwise_binary_op_basic.cc | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc index 4bfb2c84f551..3f607b2cc23e 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_op_basic.cc @@ -156,26 +156,6 @@ static inline bool ElemwiseAddBackwardStorageType(const nnvm::NodeAttrs& attrs, return ret; } -NNVM_REGISTER_OP(_backward_add) -.set_num_inputs(1) -.set_num_outputs(2) -.set_attr("TIsBackward", true) -.set_attr("FInplaceOption", - [](const NodeAttrs &attrs) { - return std::vector >{{0, 0}, - {0, 1}}; - }) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; -}) -.set_attr("TIsMKLDNN", true) -#endif -.set_attr("FCompute", ElemwiseBinaryOp::BackwardUseNone< - cpu, mshadow_op::identity, mshadow_op::identity>) -.set_attr("FComputeEx", _backward_ElemwiseAddEx) -.set_attr("FInferStorageType", ElemwiseAddBackwardStorageType); - MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_PD(elemwise_sub, op::mshadow_op::minus) MXNET_ADD_SPARSE_OP_ALIAS(elemwise_sub) .add_alias("_sub").add_alias("_minus").add_alias("_Minus") From ea565529051c2d524606ffd63fa79f29d8340c7e Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 21 Feb 2020 16:17:00 -0800 Subject: [PATCH 05/37] Add vectorized backwardusein --- src/common/cuda_vectorization.cuh | 232 ++++++++++++++ src/operator/tensor/elemwise_binary_op.h | 22 -- .../tensor/elemwise_binary_op_basic.cu | 11 +- src/operator/tensor/elemwise_op.cuh | 296 +++++++----------- 4 files changed, 346 insertions(+), 215 deletions(-) create mode 100644 src/common/cuda_vectorization.cuh diff --git a/src/common/cuda_vectorization.cuh b/src/common/cuda_vectorization.cuh new file mode 100644 index 000000000000..d836c6c296cc --- /dev/null +++ b/src/common/cuda_vectorization.cuh @@ -0,0 +1,232 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2020 by Contributors + * \file cuda_vectorization.cuh + * \brief GPU helpers for vectorized memory accesses + */ + +#ifndef MXNET_COMMON_CUDA_VECTORIZATION_CUH_ +#define MXNET_COMMON_CUDA_VECTORIZATION_CUH_ + +#include +#include "cuda_utils.h" + +#if MXNET_USE_CUDA && __CUDACC__ + +namespace mxnet { +namespace common { +namespace cuda { + +template +class VectorizedStorage { + public: + constexpr static int nvec = sizeof(LType) / sizeof(DType); + union vectorized_storage { + LType aligned; + DType separate[nvec]; // NOLINT(*) + + MSHADOW_XINLINE vectorized_storage() {} + MSHADOW_XINLINE ~vectorized_storage() {} + } scratch_; +}; + +template +class VectorizedAccessor { + public: + using StorageType = VectorizedStorage::type, + typename std::remove_const::type>; + StorageType storage_; + + LType* aligned_ptr_; + DType* unaligned_ptr_; + int alignment_; + index_t n_elems_; + + MSHADOW_XINLINE VectorizedAccessor(DType* ptr, const index_t N) { + unaligned_ptr_ = ptr; + if (aligned) { + alignment_ = 0; + aligned_ptr_ = reinterpret_cast(ptr); + n_elems_ = (N + storage_.nvec - 1) / storage_.nvec; + } else { + size_t ptr_as_number = reinterpret_cast(ptr); + alignment_ = (ptr_as_number % sizeof(LType)) / sizeof(DType); + aligned_ptr_ = reinterpret_cast(ptr - alignment_); + n_elems_ = (N + alignment_ + storage_.nvec - 1) / storage_.nvec; + } + } + + MSHADOW_XINLINE DType* separate() { + return storage_.scratch_.separate; + } + + MSHADOW_XINLINE constexpr int nvec() const { + return storage_.nvec; + } + + MSHADOW_XINLINE index_t num_aligned_elements() const { + return n_elems_; + } + + MSHADOW_XINLINE void load(const index_t id, const index_t N) { + if (aligned) { + storage_.scratch_.aligned = aligned_ptr_[id]; + } else { + if (id > 0 && id < n_elems_ - 1) { + storage_.scratch_.aligned = aligned_ptr_[id]; + } else { +#pragma unroll + for (int j = 0; j < storage_.nvec; ++j) { + DType* ptr = reinterpret_cast(&(aligned_ptr_[id])) + j; + if (reinterpret_cast(ptr) >= reinterpret_cast(unaligned_ptr_) && + reinterpret_cast(ptr) < reinterpret_cast(unaligned_ptr_ + N)) { + storage_.scratch_.separate[j] = *ptr; + } + } + } + } + } + +}; + +template +class VectorizedLoader : public VectorizedAccessor { + public: + MSHADOW_XINLINE VectorizedLoader(const DType* ptr, const index_t N) : + VectorizedAccessor(ptr, N) { + } +}; + +template +class VectorizedStorer : public VectorizedAccessor { + public: + MSHADOW_XINLINE VectorizedStorer(DType* ptr, const index_t N) : + VectorizedAccessor(ptr, N) { + } + + MSHADOW_XINLINE void store(const index_t id, const index_t N) { + if (aligned) { + this->aligned_ptr_[id] = this->storage_.scratch_.aligned; + } else { + if (id > 0 && id < this->n_elems_ - 1) { + this->aligned_ptr_[id] = this->storage_.scratch_.aligned; + } else { +#pragma unroll + for (int j = 0; j < this->storage_.nvec; ++j) { + DType* ptr = reinterpret_cast(&(this->aligned_ptr_[id])) + j; + if (reinterpret_cast(ptr) >= reinterpret_cast(this->unaligned_ptr_) && + reinterpret_cast(ptr) < reinterpret_cast(this->unaligned_ptr_ + N)) { + *ptr = 
this->storage_.scratch_.separate[j]; + } + } + } + } + } +}; + +template +struct VectorizedKernelParams { + const DType* inputs[NumInputs]; + DType* outputs[NumOutputs]; +}; + +namespace { + +enum class Alignment { + SAME_ALIGNED, + SAME_UNALIGNED, + DIFFERENT +}; + +template +int CalcAlignment(const DType* ptr) { + size_t ptr_as_number = reinterpret_cast(ptr); + return ptr_as_number % sizeof(LType); +} + +template +Alignment CheckAlignment(const VectorizedKernelParams& params) { + int align = -1; + + for (const DType* ptr : params.inputs) { + int new_align = CalcAlignment(ptr); + if (align == -1) { + align = new_align; + } else { + if (align != new_align) { + return Alignment::DIFFERENT; + } + } + } + + for (const DType* ptr : params.outputs) { + int new_align = CalcAlignment(ptr); + if (align == -1) { + align = new_align; + } else { + if (align != new_align) { + return Alignment::DIFFERENT; + } + } + } + + return align == 0 ? Alignment::SAME_ALIGNED + : Alignment::SAME_UNALIGNED; +} + +} // namespace + +template +void VectorizedKernelLauncher(const index_t size, mshadow::Stream* s, typename Kernel::ParamType params) { + static_assert(sizeof(LType) >= sizeof(DType), "Load type is smaller than operand type"); + if (size != 0) { + cudaStream_t stream = mshadow::Stream::GetStream(s); + constexpr int nvec = sizeof(LType) / sizeof(DType); + VectorizedLoader l(params.inputs[0], size); + size_t num_elements = l.num_aligned_elements(); + constexpr int threads = 512; + constexpr int max_blocks = 65535; + index_t blocks = std::min(static_cast((num_elements + threads - 1) / threads), + max_blocks); + auto align = CheckAlignment(params); + if (align == Alignment::SAME_ALIGNED && (size % nvec == 0)) { + Kernel::template Launch(blocks, threads, stream, params, size); + } else { + if (align != Alignment::DIFFERENT) { + Kernel::template Launch(blocks, threads, stream, params, size); + } else { + index_t blocks = std::min(static_cast((size + threads - 1) / + threads), + max_blocks); + // If the pointers are aligned differently we cannot vectorize + Kernel::template Launch(blocks, threads, stream, params, size); + } + } + } +} + +} // namespace cuda +} // namespace common +} // namespace mxnet + +#endif // MXNET_USE_CUDA && __CUDACC__ + +#endif // MXNET_COMMON_CUDA_VECTORIZATION_CUH_ diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h index 4d3d02f1e901..1db6c29a3eab 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -675,17 +675,6 @@ class ElemwiseBinaryOp : public OpBase { }); } - template - static inline void BackwardUseNoneWithHalf2(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, { - BackwardUseNone_(attrs, ctx, inputs, req, outputs); - }); - } - template static inline void BackwardUseNoneEx(const nnvm::NodeAttrs &attrs, const OpContext &ctx, @@ -732,17 +721,6 @@ class ElemwiseBinaryOp : public OpBase { }); } - template - static inline void BackwardUseInWithHalf2(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, { - BackwardUseIn_(attrs, ctx, inputs, req, outputs); - }); - } - template< typename xpu, typename LOP, typename ROP, bool in0_ok_dense = false, bool in1_ok_dense = false, bool in2_ok_dense = 
false> diff --git a/src/operator/tensor/elemwise_binary_op_basic.cu b/src/operator/tensor/elemwise_binary_op_basic.cu index 1e13552d03b0..733be3fdf160 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_op_basic.cu @@ -225,11 +225,6 @@ NNVM_REGISTER_OP(elemwise_add) NNVM_REGISTER_OP(_grad_add) .set_attr("FCompute", VectorizedCompute); -NNVM_REGISTER_OP(_backward_add) -.set_attr("FCompute", - VectorizedBackwardUseNoneCompute); - NNVM_REGISTER_OP(elemwise_sub) .set_attr("FCompute", VectorizedCompute) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeEx); @@ -246,7 +241,7 @@ NNVM_REGISTER_OP(elemwise_mul) NNVM_REGISTER_OP(_backward_mul) .set_attr("FCompute", - ElemwiseBinaryOp::BackwardUseInWithHalf2); NNVM_REGISTER_OP(elemwise_div) @@ -255,7 +250,7 @@ NNVM_REGISTER_OP(elemwise_div) NNVM_REGISTER_OP(_backward_div) .set_attr("FCompute", - ElemwiseBinaryOp::BackwardUseInWithHalf2); NNVM_REGISTER_OP(_mod) @@ -263,7 +258,7 @@ NNVM_REGISTER_OP(_mod) NNVM_REGISTER_OP(_backward_mod) .set_attr("FCompute", - ElemwiseBinaryOp::BackwardUseInWithHalf2); + VectorizedBackwardUseInCompute); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_op.cuh b/src/operator/tensor/elemwise_op.cuh index 46ab611085e1..36ddf5dbd98d 100644 --- a/src/operator/tensor/elemwise_op.cuh +++ b/src/operator/tensor/elemwise_op.cuh @@ -28,7 +28,7 @@ #include #include "../operator_common.h" -#include "../../common/cuda_utils.h" +#include "../../common/cuda_vectorization.cuh" #include @@ -37,111 +37,12 @@ namespace mxnet { namespace op { -template -class VectorizedStorage { - public: - constexpr static int nvec = sizeof(LType) / sizeof(DType); - union vectorized_storage { - LType aligned; - DType separate[nvec]; // NOLINT(*) - - MSHADOW_XINLINE vectorized_storage() {} - MSHADOW_XINLINE ~vectorized_storage() {} - } scratch_; -}; - -template -class VectorizedAccessor { - public: - VectorizedStorage::type, - typename std::remove_const::type> storage_; - - LType* aligned_ptr_; - DType* unaligned_ptr_; - int alignment_; - index_t n_elems_; - - MSHADOW_XINLINE VectorizedAccessor(DType* ptr, const index_t N) { - unaligned_ptr_ = ptr; - if (aligned) { - alignment_ = 0; - aligned_ptr_ = reinterpret_cast(ptr); - n_elems_ = (N + storage_.nvec - 1) / storage_.nvec; - } else { - size_t ptr_as_number = reinterpret_cast(ptr); - alignment_ = (ptr_as_number % sizeof(LType)) / sizeof(DType); - aligned_ptr_ = reinterpret_cast(ptr - alignment_); - n_elems_ = (N + alignment_ + storage_.nvec - 1) / storage_.nvec; - } - } - - MSHADOW_XINLINE index_t num_aligned_elements() const { - return n_elems_; - } - - MSHADOW_XINLINE void load(const index_t id, const index_t N) { - if (aligned) { - storage_.scratch_.aligned = aligned_ptr_[id]; - } else { - if (id > 0 && id < n_elems_ - 1) { - storage_.scratch_.aligned = aligned_ptr_[id]; - } else { -#pragma unroll - for (int j = 0; j < storage_.nvec; ++j) { - DType* ptr = reinterpret_cast(&(aligned_ptr_[id])) + j; - if (reinterpret_cast(ptr) >= reinterpret_cast(unaligned_ptr_) && - reinterpret_cast(ptr) < reinterpret_cast(unaligned_ptr_ + N)) { - storage_.scratch_.separate[j] = *ptr; - } - } - } - } - } -}; - -template -class VectorizedLoader : public VectorizedAccessor { - public: - MSHADOW_XINLINE VectorizedLoader(const DType* ptr, const index_t N) : - VectorizedAccessor(ptr, N) { - } -}; - -template -class VectorizedStorer : public VectorizedAccessor { - public: - MSHADOW_XINLINE VectorizedStorer(DType* ptr, const index_t N) : - 
VectorizedAccessor(ptr, N) { - } - - MSHADOW_XINLINE void store(const index_t id, const index_t N) { - if (aligned) { - this->aligned_ptr_[id] = this->storage_.scratch_.aligned; - } else { - if (id > 0 && id < this->n_elems_ - 1) { - this->aligned_ptr_[id] = this->storage_.scratch_.aligned; - } else { -#pragma unroll - for (int j = 0; j < this->storage_.nvec; ++j) { - DType* ptr = reinterpret_cast(&(this->aligned_ptr_[id])) + j; - if (reinterpret_cast(ptr) >= reinterpret_cast(this->unaligned_ptr_) && - reinterpret_cast(ptr) < reinterpret_cast(this->unaligned_ptr_ + N)) { - *ptr = this->storage_.scratch_.separate[j]; - } - } - } - } - } -}; - namespace { -template -struct VectorizedKernelParams { - const DType* inputs[NumInputs]; - DType* outputs[NumOutputs]; -}; - +using common::cuda::VectorizedKernelParams; +using common::cuda::VectorizedKernelLauncher; +using common::cuda::VectorizedLoader; +using common::cuda::VectorizedStorer; template __global__ void VectorizedBinaryKernelFwd(const VectorizedKernelParams params, @@ -161,14 +62,14 @@ __global__ void VectorizedBinaryKernelFwd(const VectorizedKernelParams::value && lreq == kWriteInplace) || lreq == kNullOp)) { DType ltemp = LOP::Map(inp); if (lreq == kAddTo) { - lstorer.storage_.scratch_.separate[i] += ltemp; + lstorer.separate()[i] += ltemp; } else { - lstorer.storage_.scratch_.separate[i] = ltemp; + lstorer.separate()[i] = ltemp; } lstorer.store(tid, N); } @@ -213,9 +114,61 @@ __global__ void VectorizedBinaryKernelBwdUseNone(const VectorizedKernelParams
+__global__ void VectorizedBinaryKernelBwdUseIn(const VectorizedKernelParams params, + const index_t N) { + VectorizedLoader ograd_loader(params.inputs[0], N); + VectorizedLoader linput_loader(params.inputs[1], N); + VectorizedLoader rinput_loader(params.inputs[2], N); + VectorizedStorer lstorer(params.outputs[0], N); + VectorizedStorer rstorer(params.outputs[1], N); + + const index_t M = ograd_loader.num_aligned_elements(); + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + ograd_loader.load(tid, N); + linput_loader.load(tid, N); + rinput_loader.load(tid, N); + if (lreq == kAddTo) { + lstorer.load(tid, N); + } + if (rreq == kAddTo) { + rstorer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < ograd_loader.nvec(); ++i) { + DType ograd = ograd_loader.separate()[i]; + DType linput = linput_loader.separate()[i]; + DType rinput = rinput_loader.separate()[i]; + if (!(lreq == kNullOp)) { + DType ltemp = ograd * LOP::Map(linput, rinput); + if (lreq == kAddTo) { + lstorer.separate()[i] += ltemp; + } else { + lstorer.separate()[i] = ltemp; + } + lstorer.store(tid, N); + } + if (!(rreq == kNullOp)) { + DType rtemp = ograd * ROP::Map(linput, rinput); + + if (rreq == kAddTo) { + rstorer.separate()[i] += rtemp; } else { - rstorer.storage_.scratch_.separate[i] = rtemp; + rstorer.separate()[i] = rtemp; } rstorer.store(tid, N); } @@ -251,76 +204,19 @@ class VectorizedBinaryBwdUseNone { } }; -enum class Alignment { - SAME_ALIGNED, - SAME_UNALIGNED, - DIFFERENT -}; - -template -int CalcAlignment(const DType* ptr) { - size_t ptr_as_number = reinterpret_cast(ptr); - return ptr_as_number % sizeof(LType); -} - -template -Alignment CheckAlignment(const VectorizedKernelParams& params) { - int align = -1; - - for (const DType* ptr : params.inputs) { - int new_align = CalcAlignment(ptr); - if (align == -1) { - align = new_align; - } else { - if (align != new_align) { - return Alignment::DIFFERENT; - } - } - } - - for (const DType* ptr : params.outputs) { - int new_align = CalcAlignment(ptr); - if (align == -1) { - align = new_align; - } else { - if (align != new_align) { - return Alignment::DIFFERENT; - } - } - } - - return align == 0 ? 
Alignment::SAME_ALIGNED - : Alignment::SAME_UNALIGNED; -} +template +class VectorizedBinaryBwdUseIn { + public: + using ParamType = VectorizedKernelParams; -template -void VectorizedKernelLauncher(const index_t size, mshadow::Stream* s, typename Kernel::ParamType params) { - static_assert(sizeof(LType) >= sizeof(DType), "Load type is smaller than operand type"); - if (size != 0) { - cudaStream_t stream = mshadow::Stream::GetStream(s); - constexpr int nvec = sizeof(LType) / sizeof(DType); - VectorizedLoader l(params.inputs[0], size); - size_t num_elements = l.num_aligned_elements(); - constexpr int threads = 512; - constexpr int max_blocks = 65535; - index_t blocks = std::min(static_cast((num_elements + threads - 1) / threads), - max_blocks); - auto align = CheckAlignment(params); - if (align == Alignment::SAME_ALIGNED && (size % nvec == 0)) { - Kernel::template Launch(blocks, threads, stream, params, size); - } else { - if (align != Alignment::DIFFERENT) { - Kernel::template Launch(blocks, threads, stream, params, size); - } else { - index_t blocks = std::min(static_cast((size + threads - 1) / - threads), - max_blocks); - // If the pointers are aligned differently we cannot vectorize - Kernel::template Launch(blocks, threads, stream, params, size); - } - } + template + static void Launch(const index_t blocks, const index_t threads, + cudaStream_t stream, + const ParamType params, const index_t N) { + VectorizedBinaryKernelBwdUseIn + <<>>(params, N); } -} +}; template void VectorizedCompute(const nnvm::NodeAttrs &attrs, @@ -360,8 +256,8 @@ void VectorizedBackwardUseNoneCompute(const nnvm::NodeAttrs &attrs, MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { const index_t size = inputs[0].Size(); if (req[0] != kNullOp || req[1] != kNullOp) { - MXNET_ASSIGN_REQ_SWITCH(req[0], lreq, { - MXNET_ASSIGN_REQ_SWITCH(req[1], rreq, { + MXNET_REQ_TYPE_SWITCH(req[0], lreq, { + MXNET_REQ_TYPE_SWITCH(req[1], rreq, { using LType = uint4; using Kernel = VectorizedBinaryBwdUseNone; @@ -377,6 +273,36 @@ void VectorizedBackwardUseNoneCompute(const nnvm::NodeAttrs &attrs, }); } +template +void VectorizedBackwardUseInCompute(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + mshadow::Stream *s = ctx.get_stream(); + if (req[0] != kNullOp || req[1] != kNullOp) { + MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { + MXNET_REQ_TYPE_SWITCH(req[0], lreq, { + MXNET_REQ_TYPE_SWITCH(req[1], rreq, { + const index_t size = inputs[0].Size(); + // Using 64 bit loads to reduce register pressure + using LType = uint2; + using Kernel = VectorizedBinaryBwdUseIn; + + typename Kernel::ParamType params; + params.inputs[0] = inputs[0].dptr(); + params.inputs[1] = inputs[1].dptr(); + params.inputs[2] = inputs[2].dptr(); + params.outputs[0] = outputs[0].dptr(); + params.outputs[1] = outputs[1].dptr(); + + VectorizedKernelLauncher(size, s, params); + }); + }); + }); + } +} + } // namespace } // namespace op From ec08749cf9b40120fdc74041110021a5adb86bda Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 25 Feb 2020 15:14:03 -0800 Subject: [PATCH 06/37] Extending vectorization to more binary ops, binary ops with scalar and unary ops --- src/common/cuda_vectorization.cuh | 10 +- ...elemwise_op.cuh => elemwise_binary_op.cuh} | 33 +-- .../tensor/elemwise_binary_op_basic.cu | 2 +- .../tensor/elemwise_binary_op_extended.cu | 17 +- .../tensor/elemwise_binary_op_logic.cu | 19 +- .../tensor/elemwise_binary_scalar_op.cuh | 205 ++++++++++++++++++ 
.../tensor/elemwise_binary_scalar_op_basic.cu | 30 ++- .../elemwise_binary_scalar_op_extended.cu | 30 ++- .../tensor/elemwise_binary_scalar_op_logic.cu | 19 +- src/operator/tensor/elemwise_unary_op.cuh | 126 +++++++++++ src/operator/tensor/elemwise_unary_op.h | 17 -- .../tensor/elemwise_unary_op_basic.cu | 51 +++-- 12 files changed, 435 insertions(+), 124 deletions(-) rename src/operator/tensor/{elemwise_op.cuh => elemwise_binary_op.cuh} (91%) create mode 100644 src/operator/tensor/elemwise_binary_scalar_op.cuh create mode 100644 src/operator/tensor/elemwise_unary_op.cuh diff --git a/src/common/cuda_vectorization.cuh b/src/common/cuda_vectorization.cuh index d836c6c296cc..556313f58e8f 100644 --- a/src/common/cuda_vectorization.cuh +++ b/src/common/cuda_vectorization.cuh @@ -142,12 +142,6 @@ class VectorizedStorer : public VectorizedAccessor { } }; -template -struct VectorizedKernelParams { - const DType* inputs[NumInputs]; - DType* outputs[NumOutputs]; -}; - namespace { enum class Alignment { @@ -162,8 +156,8 @@ int CalcAlignment(const DType* ptr) { return ptr_as_number % sizeof(LType); } -template -Alignment CheckAlignment(const VectorizedKernelParams& params) { +template +Alignment CheckAlignment(const Params& params) { int align = -1; for (const DType* ptr : params.inputs) { diff --git a/src/operator/tensor/elemwise_op.cuh b/src/operator/tensor/elemwise_binary_op.cuh similarity index 91% rename from src/operator/tensor/elemwise_op.cuh rename to src/operator/tensor/elemwise_binary_op.cuh index 36ddf5dbd98d..db03cb807d79 100644 --- a/src/operator/tensor/elemwise_op.cuh +++ b/src/operator/tensor/elemwise_binary_op.cuh @@ -19,12 +19,12 @@ /*! * Copyright (c) 2020 by Contributors - * \file elemwise_op.cuh + * \file elemwise_binary_op.cuh * \brief GPU helpers for elementwise operators */ -#ifndef MXNET_OPERATOR_TENSOR_ELEMWISE_OP_CUH_ -#define MXNET_OPERATOR_TENSOR_ELEMWISE_OP_CUH_ +#ifndef MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_OP_CUH_ +#define MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_OP_CUH_ #include #include "../operator_common.h" @@ -39,13 +39,18 @@ namespace op { namespace { -using common::cuda::VectorizedKernelParams; using common::cuda::VectorizedKernelLauncher; using common::cuda::VectorizedLoader; using common::cuda::VectorizedStorer; +template +struct VectorizedBinaryKernelParams { + const DType* inputs[NumInputs]; + DType* outputs[NumOutputs]; +}; + template -__global__ void VectorizedBinaryKernelFwd(const VectorizedKernelParams params, +__global__ void VectorizedBinaryKernelFwd(const VectorizedBinaryKernelParams params, const index_t N) { VectorizedLoader loader0(params.inputs[0], N); VectorizedLoader loader1(params.inputs[1], N); @@ -78,8 +83,9 @@ __global__ void VectorizedBinaryKernelFwd(const VectorizedKernelParams -__global__ void VectorizedBinaryKernelBwdUseNone(const VectorizedKernelParams params, - const index_t N) { +__global__ void VectorizedBinaryKernelBwdUseNone( + const VectorizedBinaryKernelParams params, + const index_t N) { VectorizedLoader loader(params.inputs[0], N); VectorizedStorer lstorer(params.outputs[0], N); VectorizedStorer rstorer(params.outputs[1], N); @@ -126,8 +132,9 @@ __global__ void VectorizedBinaryKernelBwdUseNone(const VectorizedKernelParams
-__global__ void VectorizedBinaryKernelBwdUseIn(const VectorizedKernelParams params, - const index_t N) { +__global__ void VectorizedBinaryKernelBwdUseIn( + const VectorizedBinaryKernelParams params, + const index_t N) { VectorizedLoader ograd_loader(params.inputs[0], N); VectorizedLoader linput_loader(params.inputs[1], N); VectorizedLoader rinput_loader(params.inputs[2], N); @@ -179,7 +186,7 @@ __global__ void VectorizedBinaryKernelBwdUseIn(const VectorizedKernelParams class VectorizedBinaryFwd { public: - using ParamType = VectorizedKernelParams; + using ParamType = VectorizedBinaryKernelParams; template static void Launch(const index_t blocks, const index_t threads, @@ -193,7 +200,7 @@ class VectorizedBinaryFwd { template class VectorizedBinaryBwdUseNone { public: - using ParamType = VectorizedKernelParams; + using ParamType = VectorizedBinaryKernelParams; template static void Launch(const index_t blocks, const index_t threads, @@ -207,7 +214,7 @@ class VectorizedBinaryBwdUseNone { template class VectorizedBinaryBwdUseIn { public: - using ParamType = VectorizedKernelParams; + using ParamType = VectorizedBinaryKernelParams; template static void Launch(const index_t blocks, const index_t threads, @@ -309,4 +316,4 @@ void VectorizedBackwardUseInCompute(const nnvm::NodeAttrs &attrs, } // namespace mxnet #endif // MXNET_USE_CUDA -#endif // MXNET_OPERATOR_TENSOR_ELEMWISE_OP_CUH_ +#endif // MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_OP_CUH_ diff --git a/src/operator/tensor/elemwise_binary_op_basic.cu b/src/operator/tensor/elemwise_binary_op_basic.cu index 733be3fdf160..54cb2b4c80d9 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_op_basic.cu @@ -26,7 +26,7 @@ #include "./elemwise_binary_op.h" #include "./elemwise_binary_op-inl.h" #include "./indexing_op.h" -#include "./elemwise_op.cuh" +#include "./elemwise_binary_op.cuh" namespace mxnet { namespace op { diff --git a/src/operator/tensor/elemwise_binary_op_extended.cu b/src/operator/tensor/elemwise_binary_op_extended.cu index 0ae6ac966a2b..0cd0d79d3453 100644 --- a/src/operator/tensor/elemwise_binary_op_extended.cu +++ b/src/operator/tensor/elemwise_binary_op_extended.cu @@ -24,35 +24,36 @@ */ #include "./elemwise_unary_op.h" #include "./elemwise_binary_op.h" +#include "./elemwise_binary_op.cuh" namespace mxnet { namespace op { NNVM_REGISTER_OP(_power) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_backward_power) -.set_attr("FCompute", ElemwiseBinaryOp::BackwardUseIn("FCompute", VectorizedBackwardUseInCompute< mshadow_op::power_grad, mshadow_op::power_rgrad>); NNVM_REGISTER_OP(_maximum) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_backward_maximum) -.set_attr("FCompute", ElemwiseBinaryOp::BackwardUseIn("FCompute", VectorizedBackwardUseInCompute); NNVM_REGISTER_OP(_minimum) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_backward_minimum) -.set_attr("FCompute", ElemwiseBinaryOp::BackwardUseIn("FCompute", VectorizedBackwardUseInCompute); NNVM_REGISTER_OP(_hypot) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_backward_hypot) -.set_attr("FCompute", ElemwiseBinaryOp::BackwardUseIn("FCompute", VectorizedBackwardUseInCompute< mshadow_op::hypot_grad_left, mshadow_op::hypot_grad_right>); } // namespace op diff --git 
a/src/operator/tensor/elemwise_binary_op_logic.cu b/src/operator/tensor/elemwise_binary_op_logic.cu index e36e6971148f..cb1eaa4612c5 100644 --- a/src/operator/tensor/elemwise_binary_op_logic.cu +++ b/src/operator/tensor/elemwise_binary_op_logic.cu @@ -24,35 +24,36 @@ */ #include "./elemwise_unary_op.h" #include "./elemwise_binary_op.h" +#include "./elemwise_binary_op.cuh" namespace mxnet { namespace op { NNVM_REGISTER_OP(_equal) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_not_equal) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_greater) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_greater_equal) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_lesser) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_lesser_equal) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_logical_and) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_logical_or) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_logical_xor) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", VectorizedCompute); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_scalar_op.cuh b/src/operator/tensor/elemwise_binary_scalar_op.cuh new file mode 100644 index 000000000000..c9d21a5ec9ba --- /dev/null +++ b/src/operator/tensor/elemwise_binary_scalar_op.cuh @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2020 by Contributors + * \file elemwise_binary_scalar_op.cuh + * \brief GPU helpers for binary elementwise operators with scalar + */ + +#ifndef MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_SCALAR_OP_CUH_ +#define MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_SCALAR_OP_CUH_ + +#include +#include "../operator_common.h" +#include "../../common/cuda_vectorization.cuh" + +#include + +#if MXNET_USE_CUDA + +namespace mxnet { +namespace op { + +namespace { + +using common::cuda::VectorizedKernelLauncher; +using common::cuda::VectorizedLoader; +using common::cuda::VectorizedStorer; + +template +struct VectorizedKernelParams { + const DType* inputs[NumInputs]; + DType* outputs[NumOutputs]; + DType scalar; +}; + +template +__global__ void VectorizedBinaryScalarKernelFwd(const VectorizedKernelParams params, + const index_t N) { + VectorizedLoader loader0(params.inputs[0], N); + VectorizedStorer storer(params.outputs[0], N); + + const index_t M = loader0.num_aligned_elements(); + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + loader0.load(tid, N); + if (req == kAddTo) { + storer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < loader0.nvec(); ++i) { + DType temp = OP::Map(loader0.separate()[i], + params.scalar); + + if (req == kAddTo) { + storer.separate()[i] += temp; + } else { + storer.separate()[i] = temp; + } + } + storer.store(tid, N); + } +} + +template +__global__ void VectorizedBinaryScalarKernelBwd(const VectorizedKernelParams params, + const index_t N) { + VectorizedLoader ograd_loader(params.inputs[0], N); + VectorizedLoader input_loader(params.inputs[1], N); + VectorizedStorer storer(params.outputs[0], N); + + const index_t M = ograd_loader.num_aligned_elements(); + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + ograd_loader.load(tid, N); + input_loader.load(tid, N); + if (req == kAddTo) { + storer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < ograd_loader.nvec(); ++i) { + DType ograd = ograd_loader.separate()[i]; + DType temp = ograd * OP::Map(input_loader.separate()[i], + params.scalar); + + if (req == kAddTo) { + storer.separate()[i] += temp; + } else { + storer.separate()[i] = temp; + } + } + storer.store(tid, N); + } +} + +template +class VectorizedBinaryScalarFwd { + public: + using ParamType = VectorizedKernelParams; + + template + static void Launch(const index_t blocks, const index_t threads, + cudaStream_t stream, + const ParamType params, const index_t N) { + VectorizedBinaryScalarKernelFwd + <<>>(params, N); + } +}; + +template +class VectorizedBinaryScalarBwd { + public: + using ParamType = VectorizedKernelParams; + + template + static void Launch(const index_t blocks, const index_t threads, + cudaStream_t stream, + const ParamType params, const index_t N) { + VectorizedBinaryScalarKernelBwd + <<>>(params, N); + } +}; + +template +void VectorizedCompute(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + if (req[0] == kNullOp) return; + mshadow::Stream *s = ctx.get_stream(); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + const double alpha = nnvm::get(attrs.parsed); + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + using LType = uint4; + using Kernel = VectorizedBinaryScalarFwd; + + const index_t size = outputs[0].Size(); + typename Kernel::ParamType params; + params.inputs[0] = 
inputs[0].dptr(); + params.outputs[0] = outputs[0].dptr(); + params.scalar = (DType)alpha; + + VectorizedKernelLauncher(size, s, params); + }); + }); +} + +template +void VectorizedBwdCompute(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + if (req[0] == kNullOp) return; + mshadow::Stream *s = ctx.get_stream(); + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + const double alpha = nnvm::get(attrs.parsed); + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + using LType = uint4; + using Kernel = VectorizedBinaryScalarBwd; + + const index_t size = outputs[0].Size(); + typename Kernel::ParamType params; + params.inputs[0] = inputs[0].dptr(); + params.inputs[1] = inputs[1].dptr(); + params.outputs[0] = outputs[0].dptr(); + params.scalar = (DType)alpha; + + VectorizedKernelLauncher(size, s, params); + }); + }); +} + +} // namespace + +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_CUDA +#endif // MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_SCALAR_OP_CUH_ diff --git a/src/operator/tensor/elemwise_binary_scalar_op_basic.cu b/src/operator/tensor/elemwise_binary_scalar_op_basic.cu index 3c839205683a..ccfb324bcd2d 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_scalar_op_basic.cu @@ -25,54 +25,52 @@ #include "./elemwise_unary_op.h" #include "./elemwise_binary_op.h" #include "./elemwise_binary_scalar_op.h" +#include "./elemwise_binary_scalar_op.cuh" namespace mxnet { namespace op { NNVM_REGISTER_OP(_plus_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_minus_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_rminus_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_mul_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FCompute", VectorizedCompute) .set_attr("FComputeEx", BinaryScalarOp::ComputeEx); NNVM_REGISTER_OP(_backward_mul_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FCompute", VectorizedCompute) .set_attr("FComputeEx", BinaryScalarOp::ComputeEx); NNVM_REGISTER_OP(_div_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FCompute", VectorizedCompute) .set_attr("FComputeEx", BinaryScalarOp::ComputeEx); NNVM_REGISTER_OP(_backward_div_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FCompute", VectorizedCompute) .set_attr("FComputeEx", BinaryScalarOp::ComputeEx); NNVM_REGISTER_OP(_rdiv_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_backward_rdiv_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", VectorizedBwdCompute); NNVM_REGISTER_OP(_mod_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_backward_mod_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward< - gpu, mshadow_op::mod_grad>); +.set_attr("FCompute", VectorizedBwdCompute); NNVM_REGISTER_OP(_rmod_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_backward_rmod_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward< - gpu, mshadow_op::rmod_grad>); +.set_attr("FCompute", VectorizedBwdCompute); } // namespace op } 
// namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_scalar_op_extended.cu b/src/operator/tensor/elemwise_binary_scalar_op_extended.cu index 2bd52d7b9d7c..7864c465e19f 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_extended.cu +++ b/src/operator/tensor/elemwise_binary_scalar_op_extended.cu @@ -25,49 +25,45 @@ #include "./elemwise_unary_op.h" #include "./elemwise_binary_op.h" #include "./elemwise_binary_scalar_op.h" +#include "./elemwise_binary_scalar_op.cuh" namespace mxnet { namespace op { NNVM_REGISTER_OP(_maximum_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_backward_maximum_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", VectorizedBwdCompute); NNVM_REGISTER_OP(_minimum_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_backward_minimum_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", VectorizedBwdCompute); NNVM_REGISTER_OP(_power_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_backward_power_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward< - gpu, mshadow_op::power_grad>); +.set_attr("FCompute", VectorizedBwdCompute); NNVM_REGISTER_OP(_rpower_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_backward_rpower_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward< - gpu, mshadow_op::rpower_grad>); +.set_attr("FCompute", VectorizedBwdCompute); NNVM_REGISTER_OP(_hypot_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_backward_hypot_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward< - gpu, mshadow_op::hypot_grad_left>); +.set_attr("FCompute", VectorizedBwdCompute); NNVM_REGISTER_OP(smooth_l1) -.set_attr("FCompute", BinaryScalarOp::Compute< - gpu, mshadow_op::smooth_l1_loss>); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_backward_smooth_l1) -.set_attr("FCompute", BinaryScalarOp::Backward< - gpu, mshadow_op::smooth_l1_gradient>); +.set_attr("FCompute", VectorizedBwdCompute); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_scalar_op_logic.cu b/src/operator/tensor/elemwise_binary_scalar_op_logic.cu index 6c393e0719a5..4b61542f00c7 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_logic.cu +++ b/src/operator/tensor/elemwise_binary_scalar_op_logic.cu @@ -23,42 +23,43 @@ * \brief GPU Implementation of binary scalar logic functions. 
*/ #include "elemwise_binary_scalar_op.h" +#include "elemwise_binary_scalar_op.cuh" namespace mxnet { namespace op { NNVM_REGISTER_OP(_equal_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FCompute", VectorizedCompute) .set_attr("FComputeEx", BinaryScalarOp::LogicComputeEx); NNVM_REGISTER_OP(_not_equal_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FCompute", VectorizedCompute) .set_attr("FComputeEx", BinaryScalarOp::LogicComputeEx); NNVM_REGISTER_OP(_greater_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FCompute", VectorizedCompute) .set_attr("FComputeEx", BinaryScalarOp::LogicComputeEx); NNVM_REGISTER_OP(_greater_equal_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FCompute", VectorizedCompute) .set_attr("FComputeEx", BinaryScalarOp::LogicComputeEx); NNVM_REGISTER_OP(_lesser_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FCompute", VectorizedCompute) .set_attr("FComputeEx", BinaryScalarOp::LogicComputeEx); NNVM_REGISTER_OP(_lesser_equal_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FCompute", VectorizedCompute) .set_attr("FComputeEx", BinaryScalarOp::LogicComputeEx); NNVM_REGISTER_OP(_logical_and_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_logical_or_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", VectorizedCompute); NNVM_REGISTER_OP(_logical_xor_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", VectorizedCompute); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_unary_op.cuh b/src/operator/tensor/elemwise_unary_op.cuh new file mode 100644 index 000000000000..e0e2c84c7ff7 --- /dev/null +++ b/src/operator/tensor/elemwise_unary_op.cuh @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2020 by Contributors + * \file elemwise_unary_op.cuh + * \brief GPU helpers for unary elementwise operators + */ + +#ifndef MXNET_OPERATOR_TENSOR_ELEMWISE_UNARY_OP_CUH_ +#define MXNET_OPERATOR_TENSOR_ELEMWISE_UNARY_OP_CUH_ + +#include +#include "../operator_common.h" +#include "../../common/cuda_vectorization.cuh" + +#include + +#if MXNET_USE_CUDA + +namespace mxnet { +namespace op { + +namespace { + +using common::cuda::VectorizedKernelLauncher; +using common::cuda::VectorizedLoader; +using common::cuda::VectorizedStorer; + +template +struct VectorizedKernelParams { + const DType* inputs[NumInputs]; + DType* outputs[NumOutputs]; +}; + +template +__global__ void VectorizedUnaryScalarKernelFwd(const VectorizedKernelParams params, + const index_t N) { + VectorizedLoader loader(params.inputs[0], N); + VectorizedStorer storer(params.outputs[0], N); + + const index_t M = loader.num_aligned_elements(); + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + loader.load(tid, N); + if (req == kAddTo) { + storer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < loader.nvec(); ++i) { + DType temp = OP::Map(loader.separate()[i]); + + if (req == kAddTo) { + storer.separate()[i] += temp; + } else { + storer.separate()[i] = temp; + } + } + storer.store(tid, N); + } +} + +template +class VectorizedUnaryScalarFwd { + public: + using ParamType = VectorizedKernelParams; + + template + static void Launch(const index_t blocks, const index_t threads, + cudaStream_t stream, + const ParamType params, const index_t N) { + VectorizedUnaryScalarKernelFwd + <<>>(params, N); + } +}; + +template +void VectorizedUnaryCompute(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + if (req[0] == kNullOp) return; + mshadow::Stream *s = ctx.get_stream(); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + using LType = uint4; + using Kernel = VectorizedUnaryScalarFwd; + + const index_t size = outputs[0].Size(); + typename Kernel::ParamType params; + params.inputs[0] = inputs[0].dptr(); + params.outputs[0] = outputs[0].dptr(); + + VectorizedKernelLauncher(size, s, params); + }); + }); +} + +} // namespace + +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_CUDA +#endif // MXNET_OPERATOR_TENSOR_ELEMWISE_UNARY_OP_CUH_ diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index dcbd53aac69b..e731db70f3bf 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -344,23 +344,6 @@ class UnaryOp : public OpBase { } #endif - template - static void ComputeWithHalf2(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace mshadow; - using namespace mxnet_op; - Stream *s = ctx.get_stream(); - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); - MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, { - Kernel::Launch(s, outputs[0].Size(), - outputs[0].dptr(), inputs[0].dptr()); - }); - } - template static void IdentityCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, diff --git a/src/operator/tensor/elemwise_unary_op_basic.cu b/src/operator/tensor/elemwise_unary_op_basic.cu index e5b60b1726e6..8d41c53b5ba2 100644 --- 
a/src/operator/tensor/elemwise_unary_op_basic.cu +++ b/src/operator/tensor/elemwise_unary_op_basic.cu @@ -22,11 +22,13 @@ * \brief GPU Implementation of unary functions. */ #include "./elemwise_binary_op.h" +#include "./elemwise_unary_op.cuh" +#include "./elemwise_binary_op.cuh" namespace mxnet { namespace op { NNVM_REGISTER_OP(relu) -.set_attr("FCompute", UnaryOp::Compute) +.set_attr("FCompute", VectorizedUnaryCompute) .set_attr("FComputeEx", UnaryOp::ComputeEx); NNVM_REGISTER_OP(_backward_relu) @@ -34,7 +36,7 @@ NNVM_REGISTER_OP(_backward_relu) gpu, unary_bwd>); NNVM_REGISTER_OP(sigmoid) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", VectorizedUnaryCompute); NNVM_REGISTER_OP(_backward_sigmoid) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -48,7 +50,7 @@ NNVM_REGISTER_OP(_backward_hard_sigmoid) // softsign NNVM_REGISTER_OP(softsign) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", VectorizedUnaryCompute); NNVM_REGISTER_OP(_backward_softsign) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -56,19 +58,19 @@ NNVM_REGISTER_OP(_backward_softsign) // erf NNVM_REGISTER_OP(erf) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", VectorizedUnaryCompute); NNVM_REGISTER_OP(_backward_erf) .set_attr("FCompute", - ElemwiseBinaryOp::Compute>); + VectorizedCompute>); // erfinv NNVM_REGISTER_OP(erfinv) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", VectorizedUnaryCompute); NNVM_REGISTER_OP(_backward_erfinv) .set_attr("FCompute", - ElemwiseBinaryOp::Compute>); + VectorizedCompute>); // copy NNVM_REGISTER_OP(_copy) @@ -151,75 +153,72 @@ NNVM_REGISTER_OP(_backward_cast) // negative NNVM_REGISTER_OP(negative) -.set_attr("FCompute", UnaryOp::Compute) +.set_attr("FCompute", VectorizedUnaryCompute) .set_attr("FComputeEx", UnaryOp::ComputeEx); // abs NNVM_REGISTER_OP(abs) -.set_attr("FCompute", UnaryOp::Compute) +.set_attr("FCompute", VectorizedUnaryCompute) .set_attr("FComputeEx", UnaryOp::ComputeEx); NNVM_REGISTER_OP(_backward_abs) -.set_attr("FCompute", ElemwiseBinaryOp::Compute >); +.set_attr("FCompute", VectorizedCompute >); // sign NNVM_REGISTER_OP(sign) -.set_attr("FCompute", UnaryOp::Compute) +.set_attr("FCompute", VectorizedUnaryCompute) .set_attr("FComputeEx", UnaryOp::ComputeEx); NNVM_REGISTER_OP(_backward_sign) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", VectorizedCompute >); // round NNVM_REGISTER_OP(round) -.set_attr("FCompute", UnaryOp::Compute) +.set_attr("FCompute", VectorizedUnaryCompute) .set_attr("FComputeEx", UnaryOp::ComputeEx); // ceil NNVM_REGISTER_OP(ceil) -.set_attr("FCompute", UnaryOp::Compute) +.set_attr("FCompute", VectorizedUnaryCompute) .set_attr("FComputeEx", UnaryOp::ComputeEx); // floor NNVM_REGISTER_OP(floor) -.set_attr("FCompute", UnaryOp::Compute) +.set_attr("FCompute", VectorizedUnaryCompute) .set_attr("FComputeEx", UnaryOp::ComputeEx); // trunc NNVM_REGISTER_OP(trunc) -.set_attr("FCompute", UnaryOp::Compute) +.set_attr("FCompute", VectorizedUnaryCompute) .set_attr("FComputeEx", UnaryOp::ComputeEx); // rint NNVM_REGISTER_OP(rint) -.set_attr("FCompute", UnaryOp::Compute) +.set_attr("FCompute", VectorizedUnaryCompute) .set_attr("FComputeEx", UnaryOp::ComputeEx); // fix NNVM_REGISTER_OP(fix) -.set_attr("FCompute", UnaryOp::Compute) +.set_attr("FCompute", VectorizedUnaryCompute) .set_attr("FComputeEx", UnaryOp::ComputeEx); // gamma NNVM_REGISTER_OP(gamma) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", VectorizedUnaryCompute); 
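The vectorized kernels registered above all load and store through a 16-byte uint4 LType, so one global-memory transaction covers sizeof(LType) / sizeof(DType) elements: 2 doubles, 4 floats, or 8 half-precision values per load. A minimal host-side sketch of that relation follows; uint4_like and ElementsPerLoad are illustrative stand-ins, not code from this patch.

#include <cstdint>

// uint4_like is a host-side stand-in for CUDA's 16-byte uint4 vector type.
struct alignas(16) uint4_like {
  std::uint32_t x, y, z, w;
};

// Hypothetical helper: exposes the sizeof(LType) / sizeof(DType) ratio
// (the "nvec" of the vectorized loaders) at compile time.
template <typename DType, typename LType = uint4_like>
struct ElementsPerLoad {
  static_assert(sizeof(LType) % sizeof(DType) == 0,
                "LType must be a whole multiple of DType in size");
  static const int value = sizeof(LType) / sizeof(DType);
};

static_assert(ElementsPerLoad<double>::value == 2, "2 doubles per 16-byte load");
static_assert(ElementsPerLoad<float>::value == 4, "4 floats per 16-byte load");
static_assert(ElementsPerLoad<std::uint16_t>::value == 8,
              "8 16-bit (half-precision-sized) elements per 16-byte load");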
NNVM_REGISTER_OP(_backward_gamma) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", VectorizedCompute >); // gammaln NNVM_REGISTER_OP(gammaln) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", VectorizedUnaryCompute); NNVM_REGISTER_OP(_backward_gammaln) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", VectorizedCompute >); // logical not NNVM_REGISTER_OP(logical_not) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", VectorizedUnaryCompute); } // namespace op } // namespace mxnet From 28e5877aaa0f6bc09e9c34de7eb85dbe048938de Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 4 Mar 2020 12:41:35 -0800 Subject: [PATCH 07/37] Handling ElementwiseSum --- src/common/cuda_vectorization.cuh | 4 +- src/operator/tensor/elemwise_sum.cu | 111 +++++++++++++++++++++++++++- src/operator/tensor/elemwise_sum.h | 12 --- 3 files changed, 113 insertions(+), 14 deletions(-) diff --git a/src/common/cuda_vectorization.cuh b/src/common/cuda_vectorization.cuh index 556313f58e8f..b82029ef2222 100644 --- a/src/common/cuda_vectorization.cuh +++ b/src/common/cuda_vectorization.cuh @@ -186,6 +186,8 @@ Alignment CheckAlignment(const Params& params) { : Alignment::SAME_UNALIGNED; } +constexpr int vectorized_kernel_thread_num = 512; + } // namespace template @@ -196,7 +198,7 @@ void VectorizedKernelLauncher(const index_t size, mshadow::Stream* s, typen constexpr int nvec = sizeof(LType) / sizeof(DType); VectorizedLoader l(params.inputs[0], size); size_t num_elements = l.num_aligned_elements(); - constexpr int threads = 512; + constexpr int threads = vectorized_kernel_thread_num; constexpr int max_blocks = 65535; index_t blocks = std::min(static_cast((num_elements + threads - 1) / threads), max_blocks); diff --git a/src/operator/tensor/elemwise_sum.cu b/src/operator/tensor/elemwise_sum.cu index f9a248214e85..8947de2fb52e 100644 --- a/src/operator/tensor/elemwise_sum.cu +++ b/src/operator/tensor/elemwise_sum.cu @@ -24,10 +24,117 @@ */ #include "./elemwise_sum.h" #include "../../ndarray/ndarray_function.h" +#include "../../common/cuda_vectorization.cuh" namespace mxnet { namespace op { +using common::cuda::VectorizedKernelLauncher; +using common::cuda::VectorizedLoader; +using common::cuda::VectorizedStorer; + +namespace { + +constexpr size_t num_inputs_per_kernel = 4; + +template +struct VectorizedElementwiseSumKernelParams { + int num_inputs; + const DType* inputs[NumInputs]; + DType* outputs[1]; +}; + +template +__launch_bounds__(mxnet::common::cuda::vectorized_kernel_thread_num) +__global__ void VectorizedElementwiseSumKernel( + const VectorizedElementwiseSumKernelParams params, + const index_t N) { + VectorizedStorer storer(params.outputs[0], N); + + const index_t M = storer.num_aligned_elements(); + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + if (req == kAddTo) { + storer.load(tid, N); + } else { +#pragma unroll + for (int i = 0; i < storer.nvec(); ++i) { + storer.separate()[i] = 0; + } + } +#pragma unroll + for (int i = 0; i < num_inputs_per_kernel; ++i) { + if (i < params.num_inputs) { + VectorizedLoader loader(params.inputs[i], N); + loader.load(tid, N); +#pragma unroll + for (int i = 0; i < loader.nvec(); ++i) { + storer.separate()[i] += loader.separate()[i]; + } + } + } + + storer.store(tid, N); + } +} + + +template +class VectorizedElementwiseSumFwd { + public: + using ParamType = VectorizedElementwiseSumKernelParams; + + 
template + static void Launch(const index_t blocks, const index_t threads, + cudaStream_t stream, + const ParamType params, const index_t N) { + VectorizedElementwiseSumKernel + <<>>(params, N); + } +}; + +void VectorizedElementwiseSum(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + mshadow::Stream *s = ctx.get_stream(); + if (req[0] == kNullOp) return; + CHECK_EQ(outputs.size(), 1U); + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + using LType = uint2; + const index_t size = inputs[0].Size(); + for (size_t i = 0; i < inputs.size(); i += num_inputs_per_kernel) { + if (i == 0) { + using Kernel = VectorizedElementwiseSumFwd; + typename Kernel::ParamType params; + params.num_inputs = std::min(num_inputs_per_kernel, inputs.size() - i); + for (int j = 0; j < params.num_inputs; ++j) { + params.inputs[j] = inputs[i + j].dptr(); + } + params.outputs[0] = outputs[0].dptr(); + VectorizedKernelLauncher(size, s, params); + } else { + /* During subsequent launches we need to + accumulate into the previous outputs + */ + using Kernel = VectorizedElementwiseSumFwd; + typename Kernel::ParamType params; + params.num_inputs = std::min(num_inputs_per_kernel, inputs.size() - i); + for (int j = 0; j < params.num_inputs; ++j) { + params.inputs[j] = inputs[i + j].dptr(); + } + params.outputs[0] = outputs[0].dptr(); + VectorizedKernelLauncher(size, s, params); + } + } + }); + }); +} + void ElementWiseSumComputeExGPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, @@ -51,8 +158,10 @@ void ElementWiseSumComputeExGPU(const nnvm::NodeAttrs& attrs, } } +} // namespace + NNVM_REGISTER_OP(add_n) -.set_attr("FCompute", ElementWiseSumComputeWithHalf2) +.set_attr("FCompute", VectorizedElementwiseSum) .set_attr("FComputeEx", ElementWiseSumComputeExGPU); } // namespace op diff --git a/src/operator/tensor/elemwise_sum.h b/src/operator/tensor/elemwise_sum.h index e89e9d799903..5420adcdfbd3 100644 --- a/src/operator/tensor/elemwise_sum.h +++ b/src/operator/tensor/elemwise_sum.h @@ -113,18 +113,6 @@ void ElementWiseSumCompute(const nnvm::NodeAttrs& attrs, }); } -template -void ElementWiseSumComputeWithHalf2(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CHECK_EQ(outputs.size(), 1U); - MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, { - ElementWiseSumCompute_(attrs, ctx, inputs, req, outputs); - }); -} - } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_TENSOR_ELEMWISE_SUM_H_ From 541aebbec710afdf14726f10ec6dcd63e46fdff8 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 4 Mar 2020 14:36:28 -0800 Subject: [PATCH 08/37] Get rid of half2 in mshadow --- 3rdparty/mshadow/mshadow/base.h | 48 ----------- 3rdparty/mshadow/mshadow/half2.h | 143 ------------------------------- src/operator/mshadow_op.h | 67 --------------- 3 files changed, 258 deletions(-) delete mode 100755 3rdparty/mshadow/mshadow/half2.h diff --git a/3rdparty/mshadow/mshadow/base.h b/3rdparty/mshadow/mshadow/base.h index a99838422348..3b47bd99a09a 100755 --- a/3rdparty/mshadow/mshadow/base.h +++ b/3rdparty/mshadow/mshadow/base.h @@ -277,7 +277,6 @@ extern "C" { } #include "./half.h" -#include "./half2.h" #include "./bfloat.h" #define MSHADOW_HALF_BF_OPERATOR(RTYPE, OP) \ MSHADOW_XINLINE RTYPE operator OP(mshadow::half::half_t a, mshadow::bfloat::bf16_t b) { \ @@ -392,11 
+391,6 @@ struct DataType { #endif }; template<> -struct DataType { - static const int kFlag = kFloat16; - static const int kLanes = 2; -}; -template<> struct DataType { static const int kFlag = kBfloat16; static const int kLanes = 1; @@ -1149,48 +1143,6 @@ struct minimum { } #endif -#define MSHADOW_TYPE_SWITCH_WITH_HALF2(type, DType, ...) \ - switch (type) { \ - case mshadow::kFloat32: \ - { \ - typedef float DType; \ - {__VA_ARGS__} \ - } \ - break; \ - case mshadow::kFloat64: \ - { \ - typedef double DType; \ - {__VA_ARGS__} \ - } \ - break; \ - case mshadow::kFloat16: \ - { \ - typedef mshadow::half::half2_t DType; \ - {__VA_ARGS__} \ - } \ - break; \ - case mshadow::kUint8: \ - { \ - typedef uint8_t DType; \ - {__VA_ARGS__} \ - } \ - break; \ - case mshadow::kInt32: \ - { \ - typedef int32_t DType; \ - {__VA_ARGS__} \ - } \ - break; \ - case mshadow::kInt64: \ - { \ - typedef int64_t DType; \ - {__VA_ARGS__} \ - } \ - break; \ - default: \ - LOG(FATAL) << "Unknown type enum " << type; \ - } - #define MSHADOW_SGL_DBL_TYPE_SWITCH(type, DType, ...) \ switch (type) { \ case mshadow::kFloat32: \ diff --git a/3rdparty/mshadow/mshadow/half2.h b/3rdparty/mshadow/mshadow/half2.h deleted file mode 100755 index 3e130c85ba63..000000000000 --- a/3rdparty/mshadow/mshadow/half2.h +++ /dev/null @@ -1,143 +0,0 @@ -/*! - * Copyright (c) 2017 by Contributors - * \file half2.h - * \brief definition of vector float16, half2 type. - * - * \author Antti-Pekka Hynninen - */ -#ifndef MSHADOW_HALF2_H_ -#define MSHADOW_HALF2_H_ - -#if (defined(__CUDACC__) && __CUDA_ARCH__ >= 530 && MSHADOW_USE_CUDA && CUDA_VERSION >= 7050) - #define MSHADOW_CUDA_HALF2 1 - #include -#else - #define MSHADOW_CUDA_HALF2 0 -#endif - -#include - -/*! \brief namespace for mshadow */ -namespace mshadow { -/* \brief name space for host/device portable half-precision floats */ -namespace half { - -#define MSHADOW_HALF2_ASSIGNOP(AOP, OP) \ - template \ - MSHADOW_XINLINE half2_t operator AOP (const T& a) { \ - return *this = half2_t(*this OP a); /* NOLINT(*)*/ \ - } \ - -class MSHADOW_ALIGNED(4) half2_t { - public: -#if MSHADOW_CUDA_HALF2 - half2 half2_; -#else - half_t half_t2[2]; -#endif - - MSHADOW_XINLINE half2_t() {} - -#if MSHADOW_CUDA_HALF2 - MSHADOW_XINLINE explicit half2_t(half2 a) : half2_(a) {} -#else - MSHADOW_XINLINE explicit half2_t(half_t a, half_t b) { - half_t2[0] = a; - half_t2[1] = b; - } -#endif - - MSHADOW_XINLINE explicit half2_t(int a) { -#if MSHADOW_CUDA_HALF2 - half2_ = __half2half2(__int2half_rz(a)); -#else - half_t2[0] = (half_t)a; - half_t2[1] = (half_t)a; -#endif - } - - MSHADOW_XINLINE half2_t operator+() { - return *this; - } - - MSHADOW_XINLINE half2_t operator-() { -#if MSHADOW_CUDA_HALF2 - return half2_t(__hneg2(half2_)); -#else - return half2_t(-half_t2[0], -half_t2[1]); -#endif - } - - MSHADOW_XINLINE half2_t operator=(const half2_t& a) { -#if MSHADOW_CUDA_HALF2 - half2_ = a.half2_; -#else - half_t2[0] = a.half_t2[0]; - half_t2[1] = a.half_t2[1]; -#endif - return a; - } - - MSHADOW_HALF2_ASSIGNOP(+=, +) - MSHADOW_HALF2_ASSIGNOP(-=, -) - MSHADOW_HALF2_ASSIGNOP(*=, *) - MSHADOW_HALF2_ASSIGNOP(/=, /) -}; - -/*! \brief overloaded + operator for half2_t */ -MSHADOW_XINLINE half2_t operator+(half2_t a, half2_t b) { -#if MSHADOW_CUDA_HALF2 - return half2_t(__floats2half2_rn(__low2float(a.half2_) + __low2float(b.half2_), - __high2float(a.half2_) + __high2float(b.half2_))); -#else - return half2_t(a.half_t2[0] + b.half_t2[0], a.half_t2[1] + b.half_t2[1]); -#endif -} -/*! 
\brief overloaded - operator for half2_t */ -MSHADOW_XINLINE half2_t operator-(half2_t a, half2_t b) { -#if MSHADOW_CUDA_HALF2 - return half2_t(__floats2half2_rn(__low2float(a.half2_) - __low2float(b.half2_), - __high2float(a.half2_) - __high2float(b.half2_))); -#else - return half2_t(a.half_t2[0] - b.half_t2[0], a.half_t2[1] - b.half_t2[1]); -#endif -} -/*! \brief overloaded * operator for half2_t */ -MSHADOW_XINLINE half2_t operator*(half2_t a, half2_t b) { -#if MSHADOW_CUDA_HALF2 - return half2_t(__floats2half2_rn(__low2float(a.half2_) * __low2float(b.half2_), - __high2float(a.half2_) * __high2float(b.half2_))); -#else - return half2_t(a.half_t2[0] * b.half_t2[0], a.half_t2[1] * b.half_t2[1]); -#endif -} -/*! \brief overloaded / operator for half2_t */ -MSHADOW_XINLINE half2_t operator/(half2_t a, half2_t b) { -#if MSHADOW_CUDA_HALF2 - return half2_t(__floats2half2_rn(__low2float(a.half2_) / __low2float(b.half2_), - __high2float(a.half2_) / __high2float(b.half2_))); -#else - return half2_t(a.half_t2[0] / b.half_t2[0], a.half_t2[1] / b.half_t2[1]); -#endif -} -/*! \brief overloaded % operator for half2_t */ -MSHADOW_XINLINE half2_t operator%(half2_t a, half2_t b) { -#if MSHADOW_CUDA_HALF2 - return half2_t(__floats2half2_rn(::fmod(__low2float(a.half2_), __low2float(b.half2_)), - ::fmod(__high2float(a.half2_), __high2float(b.half2_)))); -#else - return half2_t(::fmod(a.half_t2[0], b.half_t2[0]), ::fmod(a.half_t2[1], b.half_t2[1])); -#endif -} -/*! \brief overloaded == operator for half2_t */ -MSHADOW_XINLINE bool operator==(half2_t a, half2_t b) { -#if MSHADOW_CUDA_HALF2 - return __hbeq2(a.half2_, b.half2_); -#else - return (a.half_t2[0] == b.half_t2[0] && a.half_t2[1] == b.half_t2[1]); -#endif -} - -} // namespace half -} // namespace mshadow -#endif // MSHADOW_HALF2_H_ diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index 9106ee222542..7e11b56d0633 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -742,22 +742,8 @@ MXNET_BINARY_MATH_OP(rminus, b - a); MXNET_BINARY_MATH_OP(div_grad, 1.0f / math::id(b)); -template<> -MSHADOW_XINLINE mshadow::half::half2_t div_grad::Map - (mshadow::half::half2_t a, - mshadow::half::half2_t b) { - return mshadow::half::half2_t(1) / b; -} - MXNET_BINARY_MATH_OP(div_rgrad, -math::id(a) / math::sqr(b)); -template<> -MSHADOW_XINLINE mshadow::half::half2_t div_rgrad::Map - (mshadow::half::half2_t a, - mshadow::half::half2_t b) { - return -a / (b * b); -} - MXNET_BINARY_MATH_OP(rdiv, math::id(b) / math::id(a)); MXNET_BINARY_MATH_OP(rdiv_grad, -math::id(b) / math::sqr(a)); @@ -807,13 +793,6 @@ struct mod : public mxnet_op::tunable { } }; -template<> -MSHADOW_XINLINE mshadow::half::half2_t mod::Map - (mshadow::half::half2_t a, - mshadow::half::half2_t b) { - return a%b; -} - struct mod_grad : public mxnet_op::tunable { template MSHADOW_XINLINE static DType Map(DType a, DType b) { @@ -835,19 +814,6 @@ MSHADOW_XINLINE mshadow::half::half_t mod_grad::Map mshadow::half::half_t b) { return mshadow::half::half_t(1.0f); } -template<> -MSHADOW_XINLINE mshadow::half::half2_t mod_grad::Map - (mshadow::half::half2_t a, - mshadow::half::half2_t b) { - mshadow::half::half2_t result = mshadow::half::half2_t(); -#if (defined(__CUDACC__) && MSHADOW_CUDA_HALF2) - result.half2_ = ::__float2half2_rn(1.0f); -#else - result.half_t2[0] = mshadow::half::half_t(0.0f); - result.half_t2[1] = mshadow::half::half_t(1.0f); -#endif - return result; -} struct mod_rgrad : public mxnet_op::tunable { template @@ -870,19 +836,6 @@ MSHADOW_XINLINE 
mshadow::half::half_t mod_rgrad::Map mshadow::half::half_t b) { return mshadow::half::half_t(-::floorf(static_cast(a/b))); } -template<> -MSHADOW_XINLINE mshadow::half::half2_t mod_rgrad::Map - (mshadow::half::half2_t a, - mshadow::half::half2_t b) { -#if (defined(__CUDACC__) && MSHADOW_CUDA_HALF2) - return mshadow::half::half2_t(__hneg2(::h2floor((a/b).half2_))); -#else - return mshadow::half::half2_t(mshadow::half::half_t(-::floorf( - static_cast(a.half_t2[0]/b.half_t2[0]))), - mshadow::half::half_t(-::floorf( - static_cast(a.half_t2[1]/b.half_t2[1])))); -#endif -} struct rmod : public mxnet_op::tunable { template @@ -919,13 +872,6 @@ struct rmod : public mxnet_op::tunable { } }; -template<> -MSHADOW_XINLINE mshadow::half::half2_t rmod::Map - (mshadow::half::half2_t a, - mshadow::half::half2_t b) { - return b%a; -} - struct rmod_grad { template MSHADOW_XINLINE static DType Map(DType a, DType b) { @@ -947,19 +893,6 @@ MSHADOW_XINLINE mshadow::half::half_t rmod_grad::Map mshadow::half::half_t b) { return mshadow::half::half_t(-::floorf(static_cast(b/a))); } -template<> -MSHADOW_XINLINE mshadow::half::half2_t rmod_grad::Map - (mshadow::half::half2_t a, - mshadow::half::half2_t b) { -#if (defined(__CUDACC__) && MSHADOW_CUDA_HALF2) - return mshadow::half::half2_t(::__hneg2(::h2floor((b/a).half2_))); -#else - return mshadow::half::half2_t(mshadow::half::half_t(-::floorf( - static_cast(b.half_t2[0]/a.half_t2[0]))), - mshadow::half::half_t(-::floorf( - static_cast(b.half_t2[1]/a.half_t2[1])))); -#endif -} struct clip : public mxnet_op::tunable { template From 7729114caf6a1718c08ce1f35529d2267057d515 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 4 Mar 2020 17:01:28 -0800 Subject: [PATCH 09/37] Remove backward_elemwiseaddex --- .../tensor/elemwise_binary_op_basic.cc | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc index 3f607b2cc23e..054e12057008 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_op_basic.cc @@ -114,29 +114,6 @@ The storage type of ``elemwise_add`` output depends on storage types of inputs // this must differ from elemwise_add to prevent add to optimization in forward pass. MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(_grad_add, op::mshadow_op::plus); -static void _backward_ElemwiseAddEx(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 2U); -#if MXNET_USE_MKLDNN == 1 - if (inputs[0].IsMKLDNNData()) { - MKLDNNRun(MKLDNNCopy, attrs, ctx, inputs[0], req[0], outputs[0]); - MKLDNNRun(MKLDNNCopy, attrs, ctx, inputs[0], req[1], outputs[1]); - return; - } else if (common::ContainsOnlyStorage(inputs, kDefaultStorage)) { - FallBackCompute( - ElemwiseBinaryOp::BackwardUseNone, - attrs, ctx, inputs, req, outputs); - return; - } -#endif - ElemwiseBinaryOp::BackwardUseNoneEx( - attrs, ctx, inputs, req, outputs); -} - static inline bool ElemwiseAddBackwardStorageType(const nnvm::NodeAttrs& attrs, const int dev_mask, DispatchMode* dispatch_mode, From 8455c0d6d9b2eefdf7a12a8743ae44feb0a5f8dd Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 5 Mar 2020 11:19:29 -0800 Subject: [PATCH 10/37] Revert "Remove the unused _backward_add op" This reverts commit f86da86f809c8cbad07db76a3554f23890fe05a3. 
--- .../tensor/elemwise_binary_op_basic.cc | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc index 054e12057008..eb2257a5fdd3 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_op_basic.cc @@ -133,6 +133,26 @@ static inline bool ElemwiseAddBackwardStorageType(const nnvm::NodeAttrs& attrs, return ret; } +NNVM_REGISTER_OP(_backward_add) +.set_num_inputs(1) +.set_num_outputs(2) +.set_attr("TIsBackward", true) +.set_attr("FInplaceOption", + [](const NodeAttrs &attrs) { + return std::vector >{{0, 0}, + {0, 1}}; + }) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +.set_attr("TIsMKLDNN", true) +#endif +.set_attr("FCompute", ElemwiseBinaryOp::BackwardUseNone< + cpu, mshadow_op::identity, mshadow_op::identity>) +.set_attr("FComputeEx", _backward_ElemwiseAddEx) +.set_attr("FInferStorageType", ElemwiseAddBackwardStorageType); + MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_PD(elemwise_sub, op::mshadow_op::minus) MXNET_ADD_SPARSE_OP_ALIAS(elemwise_sub) .add_alias("_sub").add_alias("_minus").add_alias("_Minus") From 402bb59ed327b85751994a5692a716e701a61ea3 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 5 Mar 2020 11:19:41 -0800 Subject: [PATCH 11/37] Revert "Remove backward_elemwiseaddex" This reverts commit 7729114caf6a1718c08ce1f35529d2267057d515. --- .../tensor/elemwise_binary_op_basic.cc | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc index eb2257a5fdd3..4bfb2c84f551 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_op_basic.cc @@ -114,6 +114,29 @@ The storage type of ``elemwise_add`` output depends on storage types of inputs // this must differ from elemwise_add to prevent add to optimization in forward pass. 
MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(_grad_add, op::mshadow_op::plus); +static void _backward_ElemwiseAddEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 2U); +#if MXNET_USE_MKLDNN == 1 + if (inputs[0].IsMKLDNNData()) { + MKLDNNRun(MKLDNNCopy, attrs, ctx, inputs[0], req[0], outputs[0]); + MKLDNNRun(MKLDNNCopy, attrs, ctx, inputs[0], req[1], outputs[1]); + return; + } else if (common::ContainsOnlyStorage(inputs, kDefaultStorage)) { + FallBackCompute( + ElemwiseBinaryOp::BackwardUseNone, + attrs, ctx, inputs, req, outputs); + return; + } +#endif + ElemwiseBinaryOp::BackwardUseNoneEx( + attrs, ctx, inputs, req, outputs); +} + static inline bool ElemwiseAddBackwardStorageType(const nnvm::NodeAttrs& attrs, const int dev_mask, DispatchMode* dispatch_mode, From 716aa1a32f976a04218487cc0b08047632eed3a7 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 5 Mar 2020 11:21:39 -0800 Subject: [PATCH 12/37] Add back the backward_add since C++ test relies on it --- src/operator/tensor/elemwise_binary_op_basic.cu | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/operator/tensor/elemwise_binary_op_basic.cu b/src/operator/tensor/elemwise_binary_op_basic.cu index 54cb2b4c80d9..772fcb4db5cc 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_op_basic.cu @@ -225,6 +225,11 @@ NNVM_REGISTER_OP(elemwise_add) NNVM_REGISTER_OP(_grad_add) .set_attr("FCompute", VectorizedCompute); +NNVM_REGISTER_OP(_backward_add) +.set_attr("FCompute", + VectorizedBackwardUseNoneCompute); + NNVM_REGISTER_OP(elemwise_sub) .set_attr("FCompute", VectorizedCompute) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeEx); From 948cea104540136946244d164c866d98b5c52940 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 10 Mar 2020 09:38:24 -0700 Subject: [PATCH 13/37] Test bcast implementations --- .../tensor/elemwise_binary_broadcast_op.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index ffd0f123070a..28cb528020f0 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -154,11 +154,16 @@ inline int BinaryBroadcastShapeCompact(const mxnet::TShape& lshape, const mxnet: int j = 0; index_t lprod = 1, rprod = 1, oprod = 1; for (int i = 0; i < oshape.ndim(); ++i) { + std::cout << "Doing " << i << std::endl; + std::cout << "lprod " << lprod << std::endl; + std::cout << "rprod " << rprod << std::endl; index_t l = 1, r = 1, o = oshape[i]; if (i >= bl) l = lshape[i-bl]; if (i >= br) r = rshape[i-br]; + std::cout << "lr " << l << " " << r << std::endl; if ((lprod != rprod || l != r) && lprod*l > 1 && rprod*r > 1) { + std::cout << "Changing new shapes" << std::endl; (*new_lshape)[j] = lprod; (*new_rshape)[j] = rprod; (*new_oshape)[j] = oprod; @@ -167,6 +172,9 @@ inline int BinaryBroadcastShapeCompact(const mxnet::TShape& lshape, const mxnet: lprod *= l; rprod *= r; oprod *= o; + std::cout << *new_lshape << std::endl; + std::cout << *new_rshape << std::endl; + std::cout << *new_oshape << std::endl; } if (lprod > 1 || rprod > 1) { (*new_lshape)[j] = lprod; @@ -183,6 +191,10 @@ inline int BinaryBroadcastShapeCompact(const mxnet::TShape& lshape, const mxnet: } else { LOG(FATAL) << "Too many broadcast dimensions with operands " << 
lshape << " " << rshape; } + std::cout << "lshape " << lshape << " -> " << *new_lshape << std::endl; + std::cout << "rshape " << rshape << " -> " << *new_rshape << std::endl; + std::cout << "oshape " << oshape << " -> " << *new_oshape << std::endl; + return j; } @@ -383,12 +395,17 @@ void BinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, } MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(ndim, NDim, { + if (dmlc::GetEnv("DEBUG_BCAST", false)) { + broadcast::BinaryBroadcastComputeImpl(s, req[0], inputs[0].reshape(new_lshape), + inputs[1].reshape(new_rshape), outputs[0].reshape(new_oshape)); + } else { mshadow::Shape oshape = new_oshape.get(); mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); mshadow::Shape rstride = mxnet_op::calc_stride(new_rshape.get()); mxnet_op::Kernel, xpu>:: template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, inputs[0].dptr(), inputs[1].dptr(), outputs[0].dptr()); + } }); }); } From f326f7ec6b02e28bec878c6c9cb157c151a179ea Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 11 Mar 2020 14:09:17 -0700 Subject: [PATCH 14/37] First version of vecotrized bcast --- src/operator/tensor/broadcast_reduce-inl.cuh | 120 +++++++++++++++++- src/operator/tensor/broadcast_reduce-inl.h | 6 + .../tensor/elemwise_binary_broadcast_op.h | 8 +- 3 files changed, 129 insertions(+), 5 deletions(-) diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh b/src/operator/tensor/broadcast_reduce-inl.cuh index 379443dc1688..c9acfb164a96 100644 --- a/src/operator/tensor/broadcast_reduce-inl.cuh +++ b/src/operator/tensor/broadcast_reduce-inl.cuh @@ -54,6 +54,75 @@ __global__ void binary_broadcast_kernel(const int N, const bool addto, } } +template +struct VectorizedBinaryBroadcastParam { + const DType* inputs[2]; + DType* outputs[1]; + Shape lstride; + //Shape lshape; + Shape rstride; + //Shape rshape; + Shape oshape; + index_t size[2]; +}; + +template +MSHADOW_XINLINE index_t calc_index(const index_t idx, + const Shape& stride, + const Shape& shape) { + index_t ret = idx; +#pragma unroll + for (int i = 0; i < ndim; ++i) { + ret -= ((idx / stride[i]) % shape[i]) * stride[i]; + } + + return ret; +} + +using common::cuda::VectorizedLoader; +using common::cuda::VectorizedStorer; + +template +__global__ void VectorizedBinaryBroadcastKernel( + const VectorizedBinaryBroadcastParam param, + const index_t N) { + constexpr int nvec = sizeof(LType) / sizeof(DType); + const index_t M = N / nvec; + + VectorizedLoader lloader(param.inputs[0], param.size[0]); + VectorizedLoader rloader(param.inputs[1], param.size[1]); + VectorizedStorer storer(param.outputs[0], N); + + for (index_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < M; + idx += gridDim.x * blockDim.x) { + index_t lindex, rindex; + unravel_dot(idx * nvec, param.oshape, + param.lstride, param.rstride, + &lindex, &rindex); + //index_t lindex = calc_index(idx * nvec, lstride); + //index_t rindex = calc_index(idx * nvec, rstride); + lloader.load(lindex / nvec, param.size[0]); + rloader.load(rindex / nvec, param.size[1]); + + if (req == kAddTo) { + storer.load(idx, N); + } +#pragma unroll + for (int i = 0; i < lloader.nvec(); ++i) { + DType temp = OP::Map(lloader.separate()[i], + rloader.separate()[i]); + + if (req == kAddTo) { + storer.separate()[i] += temp; + } else { + storer.separate()[i] = temp; + } + } + storer.store(idx, N); + } +} + template void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, const TBlob& lhs, const TBlob& rhs, const TBlob& out) { @@ -71,6 
+140,49 @@ void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, out.shape_.get()); } +template +class VectorizedBinaryBroadcastFwd { + public: + using ParamType = VectorizedBinaryBroadcastParam; + + template + static void Launch(const index_t blocks, const index_t threads, + cudaStream_t stream, + const ParamType params, const index_t N) { + VectorizedBinaryBroadcastKernel + <<>>(params, N); + } +}; + +using common::cuda::VectorizedKernelLauncher; + +template +void BinaryBroadcastComputeImpl2(Stream *s, const OpReqType req, + const TBlob& lhs, const TBlob& rhs, const TBlob& out) { + if (req == kNullOp) return; + cudaStream_t stream = Stream::GetStream(s); + const index_t N = out.shape_.Size(); + + Shape lstride = calc_stride(lhs.shape_.get()); + Shape rstride = calc_stride(rhs.shape_.get()); + constexpr int Req = kWriteTo; + //MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + using LType = uint2; + using Kernel = VectorizedBinaryBroadcastFwd; + + typename Kernel::ParamType param; + + param.inputs[0] = lhs.dptr(); + param.inputs[1] = rhs.dptr(); + param.outputs[0] = out.dptr(); + param.lstride = lstride; + param.rstride = rstride; + param.oshape = out.shape_.get(); + + VectorizedKernelLauncher(N, s, param); + //}); +} + const int nthread_reduce = kMaxThreadsPerBlock; template __launch_bounds__(nthread_reduce) @@ -660,16 +772,16 @@ void Reduce(Stream *s, const TBlob& small, const OpReqType req, } template -size_t ReduceWorkspaceSize(Stream *s, const mxnet::TShape& small, const OpReqType req, - const mxnet::TShape& big) { +size_t ReduceWorkspaceSize(Stream *s, const ::mxnet::TShape& small, const OpReqType req, + const ::mxnet::TShape& big) { if (req == kNullOp) return 0; ReduceImplConfig config = ConfigureReduceImpl(small, big, nullptr, nullptr); return config.workspace_size; } template -size_t ReduceWorkspaceSize(Stream *s, const mxnet::TShape& small, const OpReqType req, - const mxnet::TShape& big, const mxnet::TShape& lhs, const mxnet::TShape& rhs) { +size_t ReduceWorkspaceSize(Stream *s, const ::mxnet::TShape& small, const OpReqType req, + const ::mxnet::TShape& big, const ::mxnet::TShape& lhs, const ::mxnet::TShape& rhs) { if (req == kNullOp) return 0; ReduceImplConfig config = ConfigureReduceImpl(small, big, &lhs, &rhs); return config.workspace_size; diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index 841fbcd28a68..199e696e8a5d 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -32,6 +32,7 @@ #include #include "../mshadow_op.h" #include "../operator_common.h" +#include "../../common/cuda_vectorization.cuh" namespace mxnet { namespace op { @@ -179,6 +180,11 @@ MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const #else +template +void BinaryBroadcastComputeImpl2(Stream *s, const OpReqType req, + const TBlob& lhs, const TBlob& rhs, const TBlob& out) { +} + template void binary_broadcast_compute(const size_t N, const bool addto, const DType *lhs, const DType *rhs, DType *out, const Shape lshape, diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index 28cb528020f0..a81eefe7164c 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -395,10 +395,16 @@ void BinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, } MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(ndim, NDim, { - if 
(dmlc::GetEnv("DEBUG_BCAST", false)) { + int choice = dmlc::GetEnv("DEBUG_BCAST", 0); + if (choice != 0) { + if (choice == 1) { broadcast::BinaryBroadcastComputeImpl(s, req[0], inputs[0].reshape(new_lshape), inputs[1].reshape(new_rshape), outputs[0].reshape(new_oshape)); } else { + broadcast::BinaryBroadcastComputeImpl2(s, req[0], inputs[0].reshape(new_lshape), + inputs[1].reshape(new_rshape), outputs[0].reshape(new_oshape)); + } + } else { mshadow::Shape oshape = new_oshape.get(); mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); mshadow::Shape rstride = mxnet_op::calc_stride(new_rshape.get()); From 85f607096c75d546f6be49ab0d55bf9061bb4e85 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 16 Mar 2020 11:50:33 -0700 Subject: [PATCH 15/37] Adding single side vectorized bcast kernel --- src/operator/tensor/broadcast_reduce-inl.cuh | 79 ++++++++++++++------ 1 file changed, 55 insertions(+), 24 deletions(-) diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh b/src/operator/tensor/broadcast_reduce-inl.cuh index c9acfb164a96..72797a7dd507 100644 --- a/src/operator/tensor/broadcast_reduce-inl.cuh +++ b/src/operator/tensor/broadcast_reduce-inl.cuh @@ -58,27 +58,11 @@ template struct VectorizedBinaryBroadcastParam { const DType* inputs[2]; DType* outputs[1]; - Shape lstride; - //Shape lshape; - Shape rstride; - //Shape rshape; + Shape stride[2]; Shape oshape; index_t size[2]; }; -template -MSHADOW_XINLINE index_t calc_index(const index_t idx, - const Shape& stride, - const Shape& shape) { - index_t ret = idx; -#pragma unroll - for (int i = 0; i < ndim; ++i) { - ret -= ((idx / stride[i]) % shape[i]) * stride[i]; - } - - return ret; -} - using common::cuda::VectorizedLoader; using common::cuda::VectorizedStorer; @@ -98,10 +82,8 @@ __global__ void VectorizedBinaryBroadcastKernel( idx += gridDim.x * blockDim.x) { index_t lindex, rindex; unravel_dot(idx * nvec, param.oshape, - param.lstride, param.rstride, + param.stride[0], param.stride[1], &lindex, &rindex); - //index_t lindex = calc_index(idx * nvec, lstride); - //index_t rindex = calc_index(idx * nvec, rstride); lloader.load(lindex / nvec, param.size[0]); rloader.load(rindex / nvec, param.size[1]); @@ -123,6 +105,55 @@ __global__ void VectorizedBinaryBroadcastKernel( } } +template +__global__ void VectorizedBinaryBroadcastSingleSideKernel( + const VectorizedBinaryBroadcastParam param, + const index_t N) { + constexpr int nvec = sizeof(LType) / sizeof(DType); + const index_t M = N / nvec; + constexpr int other_side = 1 - side; + + VectorizedLoader lloader(param.inputs[side], param.size[side]); + VectorizedStorer storer(param.outputs[0], N); + + for (index_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < M; + idx += gridDim.x * blockDim.x) { + index_t lindex, rindex; + unravel_dot(idx * nvec, param.oshape, + param.stride[side], param.stride[other_side], + &lindex, &rindex); + lloader.load(lindex / nvec, param.size[side]); + + if (req == kAddTo) { + storer.load(idx, N); + } +#pragma unroll + for (int i = 0; i < lloader.nvec(); ++i) { + if (i != 0) { + rindex = unravel_dot(idx * nvec + i, param.oshape, param.stride[other_side]); + } + DType rinput = param.inputs[other_side][rindex]; + DType temp; + if (side == 0) { + // Left side is vectorized + temp = OP::Map(lloader.separate()[i], + rinput); + } else { + // Right side is vectorized + temp = OP::Map(rinput, + lloader.separate()[i]); + } + + if (req == kAddTo) { + storer.separate()[i] += temp; + } else { + storer.separate()[i] = temp; + } + } + storer.store(idx, N); + 
} +} template void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, const TBlob& lhs, const TBlob& rhs, const TBlob& out) { @@ -166,7 +197,7 @@ void BinaryBroadcastComputeImpl2(Stream *s, const OpReqType req, Shape lstride = calc_stride(lhs.shape_.get()); Shape rstride = calc_stride(rhs.shape_.get()); constexpr int Req = kWriteTo; - //MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MXNET_ASSIGN_REQ_SWITCH(req, Req, { using LType = uint2; using Kernel = VectorizedBinaryBroadcastFwd; @@ -175,12 +206,12 @@ void BinaryBroadcastComputeImpl2(Stream *s, const OpReqType req, param.inputs[0] = lhs.dptr(); param.inputs[1] = rhs.dptr(); param.outputs[0] = out.dptr(); - param.lstride = lstride; - param.rstride = rstride; + param.stride[0] = lstride; + param.stride[1] = rstride; param.oshape = out.shape_.get(); VectorizedKernelLauncher(N, s, param); - //}); + }); } const int nthread_reduce = kMaxThreadsPerBlock; From ed8d745c0c284fb1644d1777ccc78a6b2ee9402f Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 16 Mar 2020 12:50:13 -0700 Subject: [PATCH 16/37] Removing debug prints --- src/operator/tensor/elemwise_binary_broadcast_op.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index a81eefe7164c..360affb4e780 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -154,16 +154,11 @@ inline int BinaryBroadcastShapeCompact(const mxnet::TShape& lshape, const mxnet: int j = 0; index_t lprod = 1, rprod = 1, oprod = 1; for (int i = 0; i < oshape.ndim(); ++i) { - std::cout << "Doing " << i << std::endl; - std::cout << "lprod " << lprod << std::endl; - std::cout << "rprod " << rprod << std::endl; index_t l = 1, r = 1, o = oshape[i]; if (i >= bl) l = lshape[i-bl]; if (i >= br) r = rshape[i-br]; - std::cout << "lr " << l << " " << r << std::endl; if ((lprod != rprod || l != r) && lprod*l > 1 && rprod*r > 1) { - std::cout << "Changing new shapes" << std::endl; (*new_lshape)[j] = lprod; (*new_rshape)[j] = rprod; (*new_oshape)[j] = oprod; @@ -172,9 +167,6 @@ inline int BinaryBroadcastShapeCompact(const mxnet::TShape& lshape, const mxnet: lprod *= l; rprod *= r; oprod *= o; - std::cout << *new_lshape << std::endl; - std::cout << *new_rshape << std::endl; - std::cout << *new_oshape << std::endl; } if (lprod > 1 || rprod > 1) { (*new_lshape)[j] = lprod; @@ -191,9 +183,6 @@ inline int BinaryBroadcastShapeCompact(const mxnet::TShape& lshape, const mxnet: } else { LOG(FATAL) << "Too many broadcast dimensions with operands " << lshape << " " << rshape; } - std::cout << "lshape " << lshape << " -> " << *new_lshape << std::endl; - std::cout << "rshape " << rshape << " -> " << *new_rshape << std::endl; - std::cout << "oshape " << oshape << " -> " << *new_oshape << std::endl; return j; } From 3d84675d3d2d9ccff7047ec8f5052f2a49617848 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 16 Mar 2020 13:13:11 -0700 Subject: [PATCH 17/37] Actually run the single side kernel --- src/operator/tensor/broadcast_reduce-inl.cuh | 42 ++++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh b/src/operator/tensor/broadcast_reduce-inl.cuh index 72797a7dd507..500b92c6ee07 100644 --- a/src/operator/tensor/broadcast_reduce-inl.cuh +++ b/src/operator/tensor/broadcast_reduce-inl.cuh @@ -57,7 +57,8 @@ __global__ void binary_broadcast_kernel(const int N, const bool addto, template 
struct VectorizedBinaryBroadcastParam { const DType* inputs[2]; - DType* outputs[1]; + DType* outputs[2]; // Only the first one is used in the computation + // the other one is used for alignment checking Shape stride[2]; Shape oshape; index_t size[2]; @@ -180,8 +181,29 @@ class VectorizedBinaryBroadcastFwd { static void Launch(const index_t blocks, const index_t threads, cudaStream_t stream, const ParamType params, const index_t N) { - VectorizedBinaryBroadcastKernel - <<>>(params, N); + int common_shape = 1; + int first_different = -1; + for (int i = ndim - 1; i >= 0; --i) { + if (params.stride[0][i] == params.stride[1][i]) { + common_shape *= params.oshape[i]; + } else { + first_different = i; + break; + } + } + + if (common_shape != 1) { + VectorizedBinaryBroadcastKernel + <<>>(params, N); + } else { + if (params.stride[0][first_different] == 0) { + VectorizedBinaryBroadcastSingleSideKernel + <<>>(params, N); + } else { + VectorizedBinaryBroadcastSingleSideKernel + <<>>(params, N); + } + } } }; @@ -210,6 +232,20 @@ void BinaryBroadcastComputeImpl2(Stream *s, const OpReqType req, param.stride[1] = rstride; param.oshape = out.shape_.get(); + for (int i = ndim - 1; i >= 0; --i) { + /* Find the first non-1 dimension + to check the alignment + */ + if (param.oshape[i] != 1) { + param.outputs[1] = param.outputs[0] + param.oshape[i]; + break; + } + if (i == 0) { + /* All dimensions are 1 */ + param.outputs[1] = param.outputs[0]; + } + } + VectorizedKernelLauncher(N, s, param); }); } From 32274763e4c62d2d7c2a537a1c3556b926184f39 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 16 Mar 2020 18:07:14 -0700 Subject: [PATCH 18/37] Move the default implementation of bcast to the vectorized one --- src/operator/tensor/elemwise_binary_broadcast_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index 360affb4e780..afc7ed94d600 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -384,7 +384,7 @@ void BinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, } MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(ndim, NDim, { - int choice = dmlc::GetEnv("DEBUG_BCAST", 0); + int choice = dmlc::GetEnv("DEBUG_BCAST", 2); if (choice != 0) { if (choice == 1) { broadcast::BinaryBroadcastComputeImpl(s, req[0], inputs[0].reshape(new_lshape), From 2017f75a19fb75c71b24ce5c5383a10fed14aaec Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 17 Mar 2020 08:36:11 -0700 Subject: [PATCH 19/37] Limit the new implementation to GPU only --- src/operator/tensor/elemwise_binary_broadcast_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index afc7ed94d600..c6a2e016bf48 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -385,7 +385,7 @@ void BinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(ndim, NDim, { int choice = dmlc::GetEnv("DEBUG_BCAST", 2); - if (choice != 0) { + if (choice != 0 && ctx.run_ctx.ctx.dev_type == kGPU) { if (choice == 1) { broadcast::BinaryBroadcastComputeImpl(s, req[0], inputs[0].reshape(new_lshape), inputs[1].reshape(new_rshape), outputs[0].reshape(new_oshape)); From 320e91aa0d1ed8f6529c15cd5e8fcf6398848c67 Mon 
Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 17 Mar 2020 10:17:14 -0700 Subject: [PATCH 20/37] Enabling vectorization when broadcast does not actually do broadcast --- src/operator/tensor/broadcast_reduce-inl.cuh | 1 - .../tensor/elemwise_binary_broadcast_op.h | 12 +++-- src/operator/tensor/elemwise_binary_op.cuh | 16 ++++-- src/operator/tensor/elemwise_binary_op.h | 41 +++++++++++---- .../tensor/elemwise_binary_op_basic.cu | 23 ++++---- .../tensor/elemwise_binary_op_extended.cu | 17 +++--- .../tensor/elemwise_binary_op_logic.cu | 19 ++++--- .../tensor/elemwise_binary_scalar_op.cuh | 16 ++++-- .../tensor/elemwise_binary_scalar_op.h | 39 +++++++++++--- .../tensor/elemwise_binary_scalar_op_basic.cu | 27 +++++----- .../elemwise_binary_scalar_op_extended.cu | 25 +++++---- .../tensor/elemwise_binary_scalar_op_logic.cu | 19 ++++--- src/operator/tensor/elemwise_unary_op.cuh | 24 ++++++--- src/operator/tensor/elemwise_unary_op.h | 39 +++++++++++--- .../tensor/elemwise_unary_op_basic.cu | 52 ++++++++++--------- src/operator/tensor/elemwise_unary_op_pow.cu | 1 + src/operator/tensor/elemwise_unary_op_trig.cu | 1 + 17 files changed, 233 insertions(+), 139 deletions(-) diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh b/src/operator/tensor/broadcast_reduce-inl.cuh index 500b92c6ee07..641bb5225958 100644 --- a/src/operator/tensor/broadcast_reduce-inl.cuh +++ b/src/operator/tensor/broadcast_reduce-inl.cuh @@ -218,7 +218,6 @@ void BinaryBroadcastComputeImpl2(Stream *s, const OpReqType req, Shape lstride = calc_stride(lhs.shape_.get()); Shape rstride = calc_stride(rhs.shape_.get()); - constexpr int Req = kWriteTo; MXNET_ASSIGN_REQ_SWITCH(req, Req, { using LType = uint2; using Kernel = VectorizedBinaryBroadcastFwd; diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index c6a2e016bf48..70aa11e330ce 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -387,11 +387,15 @@ void BinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, int choice = dmlc::GetEnv("DEBUG_BCAST", 2); if (choice != 0 && ctx.run_ctx.ctx.dev_type == kGPU) { if (choice == 1) { - broadcast::BinaryBroadcastComputeImpl(s, req[0], inputs[0].reshape(new_lshape), - inputs[1].reshape(new_rshape), outputs[0].reshape(new_oshape)); + broadcast::BinaryBroadcastComputeImpl(s, req[0], + inputs[0].reshape(new_lshape), + inputs[1].reshape(new_rshape), + outputs[0].reshape(new_oshape)); } else { - broadcast::BinaryBroadcastComputeImpl2(s, req[0], inputs[0].reshape(new_lshape), - inputs[1].reshape(new_rshape), outputs[0].reshape(new_oshape)); + broadcast::BinaryBroadcastComputeImpl2(s, req[0], + inputs[0].reshape(new_lshape), + inputs[1].reshape(new_rshape), + outputs[0].reshape(new_oshape)); } } else { mshadow::Shape oshape = new_oshape.get(); diff --git a/src/operator/tensor/elemwise_binary_op.cuh b/src/operator/tensor/elemwise_binary_op.cuh index db03cb807d79..72a934de345e 100644 --- a/src/operator/tensor/elemwise_binary_op.cuh +++ b/src/operator/tensor/elemwise_binary_op.cuh @@ -37,7 +37,7 @@ namespace mxnet { namespace op { -namespace { +namespace binary { using common::cuda::VectorizedKernelLauncher; using common::cuda::VectorizedLoader; @@ -227,12 +227,11 @@ class VectorizedBinaryBwdUseIn { template void VectorizedCompute(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, + mshadow::Stream *s, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { if (req[0] == kNullOp) 
return; - mshadow::Stream *s = ctx.get_stream(); CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1U); MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { @@ -310,7 +309,16 @@ void VectorizedBackwardUseInCompute(const nnvm::NodeAttrs &attrs, } } -} // namespace +} // namespace binary + +template +void ElemwiseBinaryOp::Compute_(const nnvm::NodeAttrs &attrs, + mshadow::Stream *s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + binary::VectorizedCompute(attrs, s, inputs, req, outputs); +} } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h index 1db6c29a3eab..2dcf143d42df 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -498,15 +498,13 @@ class ElemwiseBinaryOp : public OpBase { }); } - template - static void Compute(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { + template + static void Compute_(const nnvm::NodeAttrs &attrs, + mshadow::Stream *s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { using namespace mxnet_op; - if (req[0] == kNullOp) return; - Stream *s = ctx.get_stream(); CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1U); if (outputs[0].type_flag_ == mshadow::kBool) { @@ -517,7 +515,7 @@ class ElemwiseBinaryOp : public OpBase { const size_t size = (minthree(outputs[0].Size(), inputs[0].Size(), inputs[1].Size()) + DataType::kLanes - 1) / DataType::kLanes; if (size != 0) { - Kernel, xpu>::Launch(s, size, + Kernel, cpu>::Launch(s, size, outputs[0].dptr(), inputs[0].dptr(), inputs[1].dptr()); } @@ -525,6 +523,26 @@ class ElemwiseBinaryOp : public OpBase { }); } +#if MXNET_USE_CUDA + template + static void Compute_(const nnvm::NodeAttrs &attrs, + mshadow::Stream *s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs); +#endif + + template + static void Compute(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + if (req[0] == kNullOp) return; + mshadow::Stream *s = ctx.get_stream(); + Compute_(attrs, s, inputs, req, outputs); + } + template static void ComputeWithBool(const nnvm::NodeAttrs &attrs, const OpContext &ctx, @@ -817,4 +835,9 @@ class ElemwiseBinaryOp : public OpBase { } // namespace op } // namespace mxnet + +#ifdef __CUDACC__ +#include "elemwise_binary_op.cuh" +#endif // __CUDACC__ + #endif // MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_OP_H_ diff --git a/src/operator/tensor/elemwise_binary_op_basic.cu b/src/operator/tensor/elemwise_binary_op_basic.cu index 772fcb4db5cc..b2213faf577c 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_op_basic.cu @@ -26,7 +26,6 @@ #include "./elemwise_binary_op.h" #include "./elemwise_binary_op-inl.h" #include "./indexing_op.h" -#include "./elemwise_binary_op.cuh" namespace mxnet { namespace op { @@ -219,51 +218,51 @@ void ElemwiseBinaryOp::DnsCsrDnsOp(mshadow::Stream *s, } NNVM_REGISTER_OP(elemwise_add) -.set_attr("FCompute", VectorizedCompute) +.set_attr("FCompute", ElemwiseBinaryOp::Compute) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeEx); NNVM_REGISTER_OP(_grad_add) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_backward_add) .set_attr("FCompute", - VectorizedBackwardUseNoneCompute); 
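// Illustrative sketch (assumptions, not part of the patch): the refactor above keeps a
// generic cpu-stream overload of Compute_ in the header and only declares the gpu-stream
// overload there; its definition lives in elemwise_binary_op.cuh, which the header includes
// under __CUDACC__, so host-only translation units never see the vectorized CUDA code.
// CpuStream/GpuStream and the functors below are hypothetical stand-ins for the mshadow types.
#include <cstdio>

struct CpuStream {};  // stand-in for mshadow::Stream<cpu>
struct GpuStream {};  // stand-in for mshadow::Stream<gpu>

struct BinaryOpSketch {
  // CPU path: defined inline in the header, compiled by any host compiler.
  template <typename OP>
  static void Compute_(CpuStream*, const float* a, const float* b, float* out, int n) {
    for (int i = 0; i < n; ++i) out[i] = OP::Map(a[i], b[i]);
  }

#ifdef SKETCH_USE_CUDA
  // GPU path: declared only; the definition would sit in a .cuh pulled in under __CUDACC__.
  template <typename OP>
  static void Compute_(GpuStream*, const float* a, const float* b, float* out, int n);
#endif

  // Public entry point: overload resolution on the stream type picks the cpu or gpu path.
  template <typename OP, typename StreamT>
  static void Compute(StreamT* s, const float* a, const float* b, float* out, int n) {
    Compute_<OP>(s, a, b, out, n);
  }
};

struct PlusSketch { static float Map(float a, float b) { return a + b; } };

int main() {
  float a[3] = {1, 2, 3}, b[3] = {10, 20, 30}, out[3];
  CpuStream s;
  BinaryOpSketch::Compute<PlusSketch>(&s, a, b, out, 3);
  std::printf("%g %g %g\n", out[0], out[1], out[2]);
  return 0;
}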
NNVM_REGISTER_OP(elemwise_sub) -.set_attr("FCompute", VectorizedCompute) +.set_attr("FCompute", ElemwiseBinaryOp::Compute) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeEx); NNVM_REGISTER_OP(_backward_sub) .set_attr("FCompute", - VectorizedBackwardUseNoneCompute); NNVM_REGISTER_OP(elemwise_mul) -.set_attr("FCompute", VectorizedCompute) +.set_attr("FCompute", ElemwiseBinaryOp::Compute) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeDnsLRValueEx); NNVM_REGISTER_OP(_backward_mul) .set_attr("FCompute", - VectorizedBackwardUseInCompute); NNVM_REGISTER_OP(elemwise_div) .set_attr("FCompute", - VectorizedCompute); + ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_backward_div) .set_attr("FCompute", - VectorizedBackwardUseInCompute); NNVM_REGISTER_OP(_mod) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_backward_mod) .set_attr("FCompute", - VectorizedBackwardUseInCompute); + binary::VectorizedBackwardUseInCompute); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_op_extended.cu b/src/operator/tensor/elemwise_binary_op_extended.cu index 0cd0d79d3453..dc2d11808bb0 100644 --- a/src/operator/tensor/elemwise_binary_op_extended.cu +++ b/src/operator/tensor/elemwise_binary_op_extended.cu @@ -24,36 +24,35 @@ */ #include "./elemwise_unary_op.h" #include "./elemwise_binary_op.h" -#include "./elemwise_binary_op.cuh" namespace mxnet { namespace op { NNVM_REGISTER_OP(_power) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_backward_power) -.set_attr("FCompute", VectorizedBackwardUseInCompute< +.set_attr("FCompute", binary::VectorizedBackwardUseInCompute< mshadow_op::power_grad, mshadow_op::power_rgrad>); NNVM_REGISTER_OP(_maximum) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_backward_maximum) -.set_attr("FCompute", VectorizedBackwardUseInCompute("FCompute", binary::VectorizedBackwardUseInCompute); NNVM_REGISTER_OP(_minimum) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_backward_minimum) -.set_attr("FCompute", VectorizedBackwardUseInCompute("FCompute", binary::VectorizedBackwardUseInCompute); NNVM_REGISTER_OP(_hypot) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_backward_hypot) -.set_attr("FCompute", VectorizedBackwardUseInCompute< +.set_attr("FCompute", binary::VectorizedBackwardUseInCompute< mshadow_op::hypot_grad_left, mshadow_op::hypot_grad_right>); } // namespace op diff --git a/src/operator/tensor/elemwise_binary_op_logic.cu b/src/operator/tensor/elemwise_binary_op_logic.cu index cb1eaa4612c5..e36e6971148f 100644 --- a/src/operator/tensor/elemwise_binary_op_logic.cu +++ b/src/operator/tensor/elemwise_binary_op_logic.cu @@ -24,36 +24,35 @@ */ #include "./elemwise_unary_op.h" #include "./elemwise_binary_op.h" -#include "./elemwise_binary_op.cuh" namespace mxnet { namespace op { NNVM_REGISTER_OP(_equal) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_not_equal) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_greater) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_greater_equal) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", 
ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_lesser) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_lesser_equal) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_logical_and) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_logical_or) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_logical_xor) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", ElemwiseBinaryOp::Compute); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_scalar_op.cuh b/src/operator/tensor/elemwise_binary_scalar_op.cuh index c9d21a5ec9ba..b4658377f78d 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.cuh +++ b/src/operator/tensor/elemwise_binary_scalar_op.cuh @@ -37,7 +37,7 @@ namespace mxnet { namespace op { -namespace { +namespace binary_scalar { using common::cuda::VectorizedKernelLauncher; using common::cuda::VectorizedLoader; @@ -143,12 +143,11 @@ class VectorizedBinaryScalarBwd { template void VectorizedCompute(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, + mshadow::Stream* s, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { if (req[0] == kNullOp) return; - mshadow::Stream *s = ctx.get_stream(); CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); const double alpha = nnvm::get(attrs.parsed); @@ -196,7 +195,16 @@ void VectorizedBwdCompute(const nnvm::NodeAttrs &attrs, }); } -} // namespace +} // namespace binary_scalar + +template +void BinaryScalarOp::Compute_(const nnvm::NodeAttrs &attrs, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + binary_scalar::VectorizedCompute(attrs, s, inputs, req, outputs); +} } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_scalar_op.h b/src/operator/tensor/elemwise_binary_scalar_op.h index 3e8702813a7c..6791b87607ca 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.h +++ b/src/operator/tensor/elemwise_binary_scalar_op.h @@ -224,26 +224,44 @@ class BinaryScalarOp : public UnaryOp { } public: - template - static void Compute(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { + template + static void Compute_(const nnvm::NodeAttrs &attrs, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { DCHECK_EQ(inputs.size(), 1); DCHECK_EQ(outputs.size(), 1); using namespace mshadow; using namespace mshadow::expr; - Stream *s = ctx.get_stream(); const double alpha = nnvm::get(attrs.parsed); MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - mxnet_op::Kernel, xpu>::Launch( + mxnet_op::Kernel, cpu>::Launch( s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr(), DType(alpha)); }); }); } +#if MXNET_USE_CUDA + template + static void Compute_(const nnvm::NodeAttrs &attrs, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs); +#endif + + template + static void Compute(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + mshadow::Stream *s = ctx.get_stream(); + Compute_(attrs, s, inputs, req, outputs); + } + 
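// Illustrative sketch (stand-in names, not the MXNet kernels): the cpu Compute_ above reads
// the scalar once from attrs.parsed and applies OP::Map(x[i], alpha) elementwise, with the
// request type deciding whether the result overwrites or accumulates into the output.
#include <cstdio>
#include <vector>

enum ReqSketch { kWriteToSketch, kAddToSketch };  // stand-ins for the OpReqType values

struct PlusScalarSketch { static float Map(float x, float alpha) { return x + alpha; } };

template <typename OP>
void ScalarComputeSketch(ReqSketch req, const std::vector<float>& in, float alpha,
                         std::vector<float>* out) {
  for (std::size_t i = 0; i < in.size(); ++i) {
    const float v = OP::Map(in[i], alpha);
    (*out)[i] = (req == kAddToSketch) ? (*out)[i] + v : v;
  }
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f}, y(3, 0.f);
  ScalarComputeSketch<PlusScalarSketch>(kWriteToSketch, x, 2.5f, &y);  // like _plus_scalar
  std::printf("%g %g %g\n", y[0], y[1], y[2]);
  return 0;
}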
template static void ComputeInt(const nnvm::NodeAttrs &attrs, const OpContext &ctx, @@ -375,4 +393,9 @@ class BinaryScalarOp : public UnaryOp { } // namespace op } // namespace mxnet + +#ifdef __CUDACC__ +#include "elemwise_binary_scalar_op.cuh" +#endif + #endif // MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_SCALAR_OP_H_ diff --git a/src/operator/tensor/elemwise_binary_scalar_op_basic.cu b/src/operator/tensor/elemwise_binary_scalar_op_basic.cu index ccfb324bcd2d..c1d27ace98a3 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_scalar_op_basic.cu @@ -25,52 +25,51 @@ #include "./elemwise_unary_op.h" #include "./elemwise_binary_op.h" #include "./elemwise_binary_scalar_op.h" -#include "./elemwise_binary_scalar_op.cuh" namespace mxnet { namespace op { NNVM_REGISTER_OP(_plus_scalar) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_minus_scalar) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_rminus_scalar) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_mul_scalar) -.set_attr("FCompute", VectorizedCompute) +.set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FComputeEx", BinaryScalarOp::ComputeEx); NNVM_REGISTER_OP(_backward_mul_scalar) -.set_attr("FCompute", VectorizedCompute) +.set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FComputeEx", BinaryScalarOp::ComputeEx); NNVM_REGISTER_OP(_div_scalar) -.set_attr("FCompute", VectorizedCompute) +.set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FComputeEx", BinaryScalarOp::ComputeEx); NNVM_REGISTER_OP(_backward_div_scalar) -.set_attr("FCompute", VectorizedCompute) +.set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FComputeEx", BinaryScalarOp::ComputeEx); NNVM_REGISTER_OP(_rdiv_scalar) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_rdiv_scalar) -.set_attr("FCompute", VectorizedBwdCompute); +.set_attr("FCompute", binary_scalar::VectorizedBwdCompute); NNVM_REGISTER_OP(_mod_scalar) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_mod_scalar) -.set_attr("FCompute", VectorizedBwdCompute); +.set_attr("FCompute", binary_scalar::VectorizedBwdCompute); NNVM_REGISTER_OP(_rmod_scalar) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_rmod_scalar) -.set_attr("FCompute", VectorizedBwdCompute); +.set_attr("FCompute", binary_scalar::VectorizedBwdCompute); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_scalar_op_extended.cu b/src/operator/tensor/elemwise_binary_scalar_op_extended.cu index 7864c465e19f..981efc591480 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_extended.cu +++ b/src/operator/tensor/elemwise_binary_scalar_op_extended.cu @@ -25,45 +25,44 @@ #include "./elemwise_unary_op.h" #include "./elemwise_binary_op.h" #include "./elemwise_binary_scalar_op.h" -#include "./elemwise_binary_scalar_op.cuh" namespace mxnet { namespace op { NNVM_REGISTER_OP(_maximum_scalar) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_maximum_scalar) -.set_attr("FCompute", VectorizedBwdCompute); +.set_attr("FCompute", binary_scalar::VectorizedBwdCompute); NNVM_REGISTER_OP(_minimum_scalar) 
-.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_minimum_scalar) -.set_attr("FCompute", VectorizedBwdCompute); +.set_attr("FCompute", binary_scalar::VectorizedBwdCompute); NNVM_REGISTER_OP(_power_scalar) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_power_scalar) -.set_attr("FCompute", VectorizedBwdCompute); +.set_attr("FCompute", binary_scalar::VectorizedBwdCompute); NNVM_REGISTER_OP(_rpower_scalar) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_rpower_scalar) -.set_attr("FCompute", VectorizedBwdCompute); +.set_attr("FCompute", binary_scalar::VectorizedBwdCompute); NNVM_REGISTER_OP(_hypot_scalar) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_hypot_scalar) -.set_attr("FCompute", VectorizedBwdCompute); +.set_attr("FCompute", binary_scalar::VectorizedBwdCompute); NNVM_REGISTER_OP(smooth_l1) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_smooth_l1) -.set_attr("FCompute", VectorizedBwdCompute); +.set_attr("FCompute", binary_scalar::VectorizedBwdCompute); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_scalar_op_logic.cu b/src/operator/tensor/elemwise_binary_scalar_op_logic.cu index 4b61542f00c7..6c393e0719a5 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_logic.cu +++ b/src/operator/tensor/elemwise_binary_scalar_op_logic.cu @@ -23,43 +23,42 @@ * \brief GPU Implementation of binary scalar logic functions. */ #include "elemwise_binary_scalar_op.h" -#include "elemwise_binary_scalar_op.cuh" namespace mxnet { namespace op { NNVM_REGISTER_OP(_equal_scalar) -.set_attr("FCompute", VectorizedCompute) +.set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FComputeEx", BinaryScalarOp::LogicComputeEx); NNVM_REGISTER_OP(_not_equal_scalar) -.set_attr("FCompute", VectorizedCompute) +.set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FComputeEx", BinaryScalarOp::LogicComputeEx); NNVM_REGISTER_OP(_greater_scalar) -.set_attr("FCompute", VectorizedCompute) +.set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FComputeEx", BinaryScalarOp::LogicComputeEx); NNVM_REGISTER_OP(_greater_equal_scalar) -.set_attr("FCompute", VectorizedCompute) +.set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FComputeEx", BinaryScalarOp::LogicComputeEx); NNVM_REGISTER_OP(_lesser_scalar) -.set_attr("FCompute", VectorizedCompute) +.set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FComputeEx", BinaryScalarOp::LogicComputeEx); NNVM_REGISTER_OP(_lesser_equal_scalar) -.set_attr("FCompute", VectorizedCompute) +.set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FComputeEx", BinaryScalarOp::LogicComputeEx); NNVM_REGISTER_OP(_logical_and_scalar) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_logical_or_scalar) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_logical_xor_scalar) -.set_attr("FCompute", VectorizedCompute); +.set_attr("FCompute", BinaryScalarOp::Compute); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_unary_op.cuh b/src/operator/tensor/elemwise_unary_op.cuh index e0e2c84c7ff7..e4d0f2072e77 100644 --- 
a/src/operator/tensor/elemwise_unary_op.cuh +++ b/src/operator/tensor/elemwise_unary_op.cuh @@ -37,7 +37,7 @@ namespace mxnet { namespace op { -namespace { +namespace unary { using common::cuda::VectorizedKernelLauncher; using common::cuda::VectorizedLoader; @@ -93,13 +93,12 @@ class VectorizedUnaryScalarFwd { }; template -void VectorizedUnaryCompute(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { +void VectorizedCompute(const nnvm::NodeAttrs &attrs, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { if (req[0] == kNullOp) return; - mshadow::Stream *s = ctx.get_stream(); CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { @@ -117,7 +116,16 @@ void VectorizedUnaryCompute(const nnvm::NodeAttrs &attrs, }); } -} // namespace +} // namespace unary + +template +void UnaryOp::Compute_(const nnvm::NodeAttrs& attrs, + mshadow::Stream* s, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + unary::VectorizedCompute(attrs, s, inputs, req, outputs); +} } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index e731db70f3bf..86686c6f1278 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -235,23 +235,42 @@ class UnaryOp : public OpBase { } } - template - static void Compute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - mshadow::Stream *s = ctx.get_stream(); + template + static void Compute_(const nnvm::NodeAttrs& attrs, + mshadow::Stream* s, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { if (inputs[0].Size() != 0) { - mxnet_op::Kernel, xpu>::Launch( + mxnet_op::Kernel, cpu>::Launch( s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr()); } }); }); } +#if MXNET_USE_CUDA + template + static void Compute_(const nnvm::NodeAttrs& attrs, + mshadow::Stream* s, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); + +#endif + + template + static void Compute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + mshadow::Stream *s = ctx.get_stream(); + Compute_(attrs, s, inputs, req, outputs); + } + template static void ComputeInt(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -860,4 +879,8 @@ void NumpyNanToNumOpBackward(const nnvm::NodeAttrs& attrs, } // namespace op } // namespace mxnet +#ifdef __CUDACC__ +#include "elemwise_unary_op.cuh" +#endif + #endif // MXNET_OPERATOR_TENSOR_ELEMWISE_UNARY_OP_H_ diff --git a/src/operator/tensor/elemwise_unary_op_basic.cu b/src/operator/tensor/elemwise_unary_op_basic.cu index 8d41c53b5ba2..7c0550735519 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cu +++ b/src/operator/tensor/elemwise_unary_op_basic.cu @@ -22,13 +22,12 @@ * \brief GPU Implementation of unary functions. 
*/ #include "./elemwise_binary_op.h" -#include "./elemwise_unary_op.cuh" -#include "./elemwise_binary_op.cuh" +#include "./elemwise_unary_op.h" namespace mxnet { namespace op { NNVM_REGISTER_OP(relu) -.set_attr("FCompute", VectorizedUnaryCompute) +.set_attr("FCompute", UnaryOp::Compute) .set_attr("FComputeEx", UnaryOp::ComputeEx); NNVM_REGISTER_OP(_backward_relu) @@ -36,7 +35,7 @@ NNVM_REGISTER_OP(_backward_relu) gpu, unary_bwd>); NNVM_REGISTER_OP(sigmoid) -.set_attr("FCompute", VectorizedUnaryCompute); +.set_attr("FCompute", UnaryOp::Compute); NNVM_REGISTER_OP(_backward_sigmoid) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -50,7 +49,7 @@ NNVM_REGISTER_OP(_backward_hard_sigmoid) // softsign NNVM_REGISTER_OP(softsign) -.set_attr("FCompute", VectorizedUnaryCompute); +.set_attr("FCompute", UnaryOp::Compute); NNVM_REGISTER_OP(_backward_softsign) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -58,19 +57,19 @@ NNVM_REGISTER_OP(_backward_softsign) // erf NNVM_REGISTER_OP(erf) -.set_attr("FCompute", VectorizedUnaryCompute); +.set_attr("FCompute", UnaryOp::Compute); NNVM_REGISTER_OP(_backward_erf) .set_attr("FCompute", - VectorizedCompute>); + ElemwiseBinaryOp::Compute>); // erfinv NNVM_REGISTER_OP(erfinv) -.set_attr("FCompute", VectorizedUnaryCompute); +.set_attr("FCompute", UnaryOp::Compute); NNVM_REGISTER_OP(_backward_erfinv) .set_attr("FCompute", - VectorizedCompute>); + ElemwiseBinaryOp::Compute>); // copy NNVM_REGISTER_OP(_copy) @@ -153,72 +152,75 @@ NNVM_REGISTER_OP(_backward_cast) // negative NNVM_REGISTER_OP(negative) -.set_attr("FCompute", VectorizedUnaryCompute) +.set_attr("FCompute", UnaryOp::Compute) .set_attr("FComputeEx", UnaryOp::ComputeEx); // abs NNVM_REGISTER_OP(abs) -.set_attr("FCompute", VectorizedUnaryCompute) +.set_attr("FCompute", UnaryOp::Compute) .set_attr("FComputeEx", UnaryOp::ComputeEx); NNVM_REGISTER_OP(_backward_abs) -.set_attr("FCompute", VectorizedCompute >); +.set_attr("FCompute", ElemwiseBinaryOp::Compute >); // sign NNVM_REGISTER_OP(sign) -.set_attr("FCompute", VectorizedUnaryCompute) +.set_attr("FCompute", UnaryOp::Compute) .set_attr("FComputeEx", UnaryOp::ComputeEx); NNVM_REGISTER_OP(_backward_sign) -.set_attr("FCompute", VectorizedCompute >); +.set_attr("FCompute", ElemwiseBinaryOp::Compute< + gpu, unary_bwd >); // round NNVM_REGISTER_OP(round) -.set_attr("FCompute", VectorizedUnaryCompute) +.set_attr("FCompute", UnaryOp::Compute) .set_attr("FComputeEx", UnaryOp::ComputeEx); // ceil NNVM_REGISTER_OP(ceil) -.set_attr("FCompute", VectorizedUnaryCompute) +.set_attr("FCompute", UnaryOp::Compute) .set_attr("FComputeEx", UnaryOp::ComputeEx); // floor NNVM_REGISTER_OP(floor) -.set_attr("FCompute", VectorizedUnaryCompute) +.set_attr("FCompute", UnaryOp::Compute) .set_attr("FComputeEx", UnaryOp::ComputeEx); // trunc NNVM_REGISTER_OP(trunc) -.set_attr("FCompute", VectorizedUnaryCompute) +.set_attr("FCompute", UnaryOp::Compute) .set_attr("FComputeEx", UnaryOp::ComputeEx); // rint NNVM_REGISTER_OP(rint) -.set_attr("FCompute", VectorizedUnaryCompute) +.set_attr("FCompute", UnaryOp::Compute) .set_attr("FComputeEx", UnaryOp::ComputeEx); // fix NNVM_REGISTER_OP(fix) -.set_attr("FCompute", VectorizedUnaryCompute) +.set_attr("FCompute", UnaryOp::Compute) .set_attr("FComputeEx", UnaryOp::ComputeEx); // gamma NNVM_REGISTER_OP(gamma) -.set_attr("FCompute", VectorizedUnaryCompute); +.set_attr("FCompute", UnaryOp::Compute); NNVM_REGISTER_OP(_backward_gamma) -.set_attr("FCompute", VectorizedCompute >); +.set_attr("FCompute", ElemwiseBinaryOp::Compute< + gpu, unary_bwd >); 
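// Illustrative sketch (assumed semantics, simplified names): the _backward_* ops above are
// registered as binary elementwise ops whose two inputs are the incoming gradient and the
// forward value, combined roughly as ograd * f'(value); UnaryBwdSketch below is a stand-in
// for the unary_bwd<> wrapper, not MXNet's exact definition.
#include <cmath>
#include <cstdio>

struct SigmoidGradSketch {
  // sigmoid'(x) expressed through the forward output y = sigmoid(x)
  static float Map(float y) { return y * (1.f - y); }
};

template <typename GradOp>
struct UnaryBwdSketch {
  static float Map(float ograd, float value) { return ograd * GradOp::Map(value); }
};

int main() {
  const float x = 0.3f;
  const float y = 1.f / (1.f + std::exp(-x));  // forward sigmoid output
  const float ograd = 2.f;                     // gradient flowing in from above
  std::printf("input grad = %g\n", UnaryBwdSketch<SigmoidGradSketch>::Map(ograd, y));
  return 0;
}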
// gammaln NNVM_REGISTER_OP(gammaln) -.set_attr("FCompute", VectorizedUnaryCompute); +.set_attr("FCompute", UnaryOp::Compute); NNVM_REGISTER_OP(_backward_gammaln) -.set_attr("FCompute", VectorizedCompute >); +.set_attr("FCompute", ElemwiseBinaryOp::Compute< + gpu, unary_bwd >); // logical not NNVM_REGISTER_OP(logical_not) -.set_attr("FCompute", VectorizedUnaryCompute); +.set_attr("FCompute", UnaryOp::Compute); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_unary_op_pow.cu b/src/operator/tensor/elemwise_unary_op_pow.cu index 4dbdf349cdb0..287a2e87be73 100644 --- a/src/operator/tensor/elemwise_unary_op_pow.cu +++ b/src/operator/tensor/elemwise_unary_op_pow.cu @@ -22,6 +22,7 @@ * \brief GPU Implementation of power (x^k for fixed k) functions. */ #include "./elemwise_binary_op.h" +#include "./elemwise_unary_op.h" namespace mxnet { namespace op { diff --git a/src/operator/tensor/elemwise_unary_op_trig.cu b/src/operator/tensor/elemwise_unary_op_trig.cu index 8e28b9c609fa..f5e9d1ccbd6c 100644 --- a/src/operator/tensor/elemwise_unary_op_trig.cu +++ b/src/operator/tensor/elemwise_unary_op_trig.cu @@ -22,6 +22,7 @@ * \brief GPU Implementation of unary trigonometric function. */ #include "./elemwise_binary_op.h" +#include "./elemwise_unary_op.h" namespace mxnet { namespace op { From 4decacdd0d0cbb10b0e85b927af1646e44083111 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 17 Mar 2020 15:38:01 -0700 Subject: [PATCH 21/37] Cleaning --- src/operator/tensor/elemwise_binary_op.cuh | 48 +++---- src/operator/tensor/elemwise_binary_op.h | 119 ++++++++++-------- .../tensor/elemwise_binary_op_basic.cu | 10 +- .../tensor/elemwise_binary_op_extended.cu | 8 +- .../tensor/elemwise_binary_scalar_op.cuh | 40 +++--- .../tensor/elemwise_binary_scalar_op.h | 36 ++++-- .../tensor/elemwise_binary_scalar_op_basic.cu | 6 +- .../elemwise_binary_scalar_op_extended.cu | 12 +- src/operator/tensor/elemwise_unary_op.cuh | 24 ++-- 9 files changed, 158 insertions(+), 145 deletions(-) diff --git a/src/operator/tensor/elemwise_binary_op.cuh b/src/operator/tensor/elemwise_binary_op.cuh index 72a934de345e..9ddb82a7ad53 100644 --- a/src/operator/tensor/elemwise_binary_op.cuh +++ b/src/operator/tensor/elemwise_binary_op.cuh @@ -225,12 +225,15 @@ class VectorizedBinaryBwdUseIn { } }; +} // namespace binary + template -void VectorizedCompute(const nnvm::NodeAttrs &attrs, - mshadow::Stream *s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { +void ElemwiseBinaryOp::Compute_(const nnvm::NodeAttrs &attrs, + mshadow::Stream *s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace binary; if (req[0] == kNullOp) return; CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1U); @@ -251,12 +254,12 @@ void VectorizedCompute(const nnvm::NodeAttrs &attrs, } template -void VectorizedBackwardUseNoneCompute(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - mshadow::Stream *s = ctx.get_stream(); +void ElemwiseBinaryOp::BackwardUseNone_(const nnvm::NodeAttrs &attrs, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace binary; cudaStream_t stream = mshadow::Stream::GetStream(s); MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { @@ -280,12 +283,12 @@ void VectorizedBackwardUseNoneCompute(const nnvm::NodeAttrs &attrs, } template -void 
VectorizedBackwardUseInCompute(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - mshadow::Stream *s = ctx.get_stream(); +void ElemwiseBinaryOp::BackwardUseIn_(const nnvm::NodeAttrs &attrs, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace binary; if (req[0] != kNullOp || req[1] != kNullOp) { MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { MXNET_REQ_TYPE_SWITCH(req[0], lreq, { @@ -309,17 +312,6 @@ void VectorizedBackwardUseInCompute(const nnvm::NodeAttrs &attrs, } } -} // namespace binary - -template -void ElemwiseBinaryOp::Compute_(const nnvm::NodeAttrs &attrs, - mshadow::Stream *s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - binary::VectorizedCompute(attrs, s, inputs, req, outputs); -} - } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h index 2dcf143d42df..5c56edf41f3f 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -106,63 +106,82 @@ class ElemwiseBinaryOp : public OpBase { } private: - template + template static void BackwardUseNone_(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, + mshadow::Stream* s, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - using namespace mxnet_op; - Stream *s = ctx.get_stream(); - const int size = static_cast((outputs[0].Size() + DataType::kLanes - 1) - / DataType::kLanes); - const DType *ograd_dptr = inputs[0].dptr(); - if (std::is_same::value && req[0] == kWriteInplace) { - CHECK_EQ(ograd_dptr, outputs[0].dptr()); - } else if (req[0] != kNullOp) { - DType *lgrad_dptr = outputs[0].dptr(); - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - Kernel, xpu>::Launch(s, size, lgrad_dptr, ograd_dptr); - }); - } - if (std::is_same::value && req[1] == kWriteInplace) { - CHECK_EQ(ograd_dptr, outputs[1].dptr()); - } else if (req[1] != kNullOp) { - DType *rgrad_dptr = outputs[1].dptr(); - MXNET_ASSIGN_REQ_SWITCH(req[1], Req, { - Kernel, xpu>::Launch(s, size, rgrad_dptr, ograd_dptr); - }); - } + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + using namespace mxnet_op; + const int size = static_cast((outputs[0].Size() + DataType::kLanes - 1) + / DataType::kLanes); + const DType *ograd_dptr = inputs[0].dptr(); + if (std::is_same::value && req[0] == kWriteInplace) { + CHECK_EQ(ograd_dptr, outputs[0].dptr()); + } else if (req[0] != kNullOp) { + DType *lgrad_dptr = outputs[0].dptr(); + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + Kernel, cpu>::Launch(s, size, lgrad_dptr, ograd_dptr); + }); + } + if (std::is_same::value && req[1] == kWriteInplace) { + CHECK_EQ(ograd_dptr, outputs[1].dptr()); + } else if (req[1] != kNullOp) { + DType *rgrad_dptr = outputs[1].dptr(); + MXNET_ASSIGN_REQ_SWITCH(req[1], Req, { + Kernel, cpu>::Launch(s, size, rgrad_dptr, ograd_dptr); + }); + } + }); } +#if MXNET_USE_CUDA + template + static void BackwardUseNone_(const nnvm::NodeAttrs &attrs, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs); +#endif - template + template static void BackwardUseIn_(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, + mshadow::Stream* s, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - DCHECK_EQ(outputs.size(), 2U); - DCHECK_EQ(inputs.size(), 3U); - mxnet_op::Stream *s = ctx.get_stream(); - 
const DType *ograd_dptr = inputs[0].dptr(); - const DType *lhs_dptr = inputs[1].dptr(); - const DType *rhs_dptr = inputs[2].dptr(); - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - const int size = static_cast( - (outputs[0].Size() + mxnet_op::DataType::kLanes - 1) - / mxnet_op::DataType::kLanes); - DType * lgrad_dptr = outputs[0].dptr(); - mxnet_op::Kernel, Req>, xpu>::Launch( - s, size, lgrad_dptr, ograd_dptr, lhs_dptr, rhs_dptr);}); - MXNET_ASSIGN_REQ_SWITCH(req[1], Req, { - const int size = static_cast( - (outputs[1].Size() + mxnet_op::DataType::kLanes - 1) - / mxnet_op::DataType::kLanes); - DType * rgrad_dptr = outputs[1].dptr(); - mxnet_op::Kernel, Req>, xpu>::Launch( - s, size, rgrad_dptr, ograd_dptr, lhs_dptr, rhs_dptr);}); + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + DCHECK_EQ(outputs.size(), 2U); + DCHECK_EQ(inputs.size(), 3U); + const DType *ograd_dptr = inputs[0].dptr(); + const DType *lhs_dptr = inputs[1].dptr(); + const DType *rhs_dptr = inputs[2].dptr(); + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + const int size = static_cast( + (outputs[0].Size() + mxnet_op::DataType::kLanes - 1) + / mxnet_op::DataType::kLanes); + DType * lgrad_dptr = outputs[0].dptr(); + mxnet_op::Kernel, Req>, cpu>::Launch( + s, size, lgrad_dptr, ograd_dptr, lhs_dptr, rhs_dptr);}); + MXNET_ASSIGN_REQ_SWITCH(req[1], Req, { + const int size = static_cast( + (outputs[1].Size() + mxnet_op::DataType::kLanes - 1) + / mxnet_op::DataType::kLanes); + DType * rgrad_dptr = outputs[1].dptr(); + mxnet_op::Kernel, Req>, cpu>::Launch( + s, size, rgrad_dptr, ograd_dptr, lhs_dptr, rhs_dptr);}); + }); } +#if MXNET_USE_CUDA + template + static void BackwardUseIn_(const nnvm::NodeAttrs &attrs, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs); +#endif + template< typename xpu, typename LOP, @@ -688,9 +707,8 @@ class ElemwiseBinaryOp : public OpBase { const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - BackwardUseNone_(attrs, ctx, inputs, req, outputs); - }); + mshadow::Stream *s = ctx.get_stream(); + BackwardUseNone_(attrs, s, inputs, req, outputs); } template @@ -734,9 +752,8 @@ class ElemwiseBinaryOp : public OpBase { const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - BackwardUseIn_(attrs, ctx, inputs, req, outputs); - }); + mshadow::Stream *s = ctx.get_stream(); + BackwardUseIn_(attrs, s, inputs, req, outputs); } template< diff --git a/src/operator/tensor/elemwise_binary_op_basic.cu b/src/operator/tensor/elemwise_binary_op_basic.cu index b2213faf577c..c88b0300dd0c 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_op_basic.cu @@ -226,7 +226,7 @@ NNVM_REGISTER_OP(_grad_add) NNVM_REGISTER_OP(_backward_add) .set_attr("FCompute", - binary::VectorizedBackwardUseNoneCompute); NNVM_REGISTER_OP(elemwise_sub) @@ -235,7 +235,7 @@ NNVM_REGISTER_OP(elemwise_sub) NNVM_REGISTER_OP(_backward_sub) .set_attr("FCompute", - binary::VectorizedBackwardUseNoneCompute); NNVM_REGISTER_OP(elemwise_mul) @@ -245,7 +245,7 @@ NNVM_REGISTER_OP(elemwise_mul) NNVM_REGISTER_OP(_backward_mul) .set_attr("FCompute", - binary::VectorizedBackwardUseInCompute); NNVM_REGISTER_OP(elemwise_div) @@ -254,7 +254,7 @@ NNVM_REGISTER_OP(elemwise_div) NNVM_REGISTER_OP(_backward_div) .set_attr("FCompute", - binary::VectorizedBackwardUseInCompute); NNVM_REGISTER_OP(_mod) @@ -262,7 +262,7 @@ 
NNVM_REGISTER_OP(_mod) NNVM_REGISTER_OP(_backward_mod) .set_attr("FCompute", - binary::VectorizedBackwardUseInCompute); + ElemwiseBinaryOp::BackwardUseIn); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_op_extended.cu b/src/operator/tensor/elemwise_binary_op_extended.cu index dc2d11808bb0..0ae6ac966a2b 100644 --- a/src/operator/tensor/elemwise_binary_op_extended.cu +++ b/src/operator/tensor/elemwise_binary_op_extended.cu @@ -31,28 +31,28 @@ NNVM_REGISTER_OP(_power) .set_attr("FCompute", ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_backward_power) -.set_attr("FCompute", binary::VectorizedBackwardUseInCompute< +.set_attr("FCompute", ElemwiseBinaryOp::BackwardUseIn); NNVM_REGISTER_OP(_maximum) .set_attr("FCompute", ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_backward_maximum) -.set_attr("FCompute", binary::VectorizedBackwardUseInCompute("FCompute", ElemwiseBinaryOp::BackwardUseIn); NNVM_REGISTER_OP(_minimum) .set_attr("FCompute", ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_backward_minimum) -.set_attr("FCompute", binary::VectorizedBackwardUseInCompute("FCompute", ElemwiseBinaryOp::BackwardUseIn); NNVM_REGISTER_OP(_hypot) .set_attr("FCompute", ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_backward_hypot) -.set_attr("FCompute", binary::VectorizedBackwardUseInCompute< +.set_attr("FCompute", ElemwiseBinaryOp::BackwardUseIn); } // namespace op diff --git a/src/operator/tensor/elemwise_binary_scalar_op.cuh b/src/operator/tensor/elemwise_binary_scalar_op.cuh index b4658377f78d..1de262360109 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.cuh +++ b/src/operator/tensor/elemwise_binary_scalar_op.cuh @@ -141,12 +141,15 @@ class VectorizedBinaryScalarBwd { } }; -template -void VectorizedCompute(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { +} // namespace binary_scalar + +template +void BinaryScalarOp::Compute_(const nnvm::NodeAttrs &attrs, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace binary_scalar; if (req[0] == kNullOp) return; CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); @@ -167,14 +170,14 @@ void VectorizedCompute(const nnvm::NodeAttrs &attrs, }); } -template -void VectorizedBwdCompute(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { +template +void BinaryScalarOp::Backward_(const nnvm::NodeAttrs &attrs, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace binary_scalar; if (req[0] == kNullOp) return; - mshadow::Stream *s = ctx.get_stream(); CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1U); const double alpha = nnvm::get(attrs.parsed); @@ -195,17 +198,6 @@ void VectorizedBwdCompute(const nnvm::NodeAttrs &attrs, }); } -} // namespace binary_scalar - -template -void BinaryScalarOp::Compute_(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - binary_scalar::VectorizedCompute(attrs, s, inputs, req, outputs); -} - } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_scalar_op.h b/src/operator/tensor/elemwise_binary_scalar_op.h index 6791b87607ca..f974332252d8 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.h +++ 
b/src/operator/tensor/elemwise_binary_scalar_op.h @@ -353,26 +353,46 @@ class BinaryScalarOp : public UnaryOp { } } - template - static void Backward(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { + template + static void Backward_(const nnvm::NodeAttrs &attrs, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { using namespace mshadow; using namespace mshadow::expr; - Stream *s = ctx.get_stream(); const double alpha = nnvm::get(attrs.parsed); MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { mxnet::op::mxnet_op::Kernel, Req>, xpu>:: + mxnet::op::mxnet_op::backward_grad_tuned, Req>, cpu>:: Launch(s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr(), inputs[1].dptr(), DType(alpha)); }); }); } + +#if MXNET_USE_CUDA + template + static void Backward_(const nnvm::NodeAttrs &attrs, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs); +#endif + + template + static void Backward(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = ctx.get_stream(); + Backward_(attrs, s, inputs, req, outputs); + } }; #define MXNET_OPERATOR_REGISTER_BINARY_SCALAR(name) \ diff --git a/src/operator/tensor/elemwise_binary_scalar_op_basic.cu b/src/operator/tensor/elemwise_binary_scalar_op_basic.cu index c1d27ace98a3..3fd017f09ec7 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_scalar_op_basic.cu @@ -57,19 +57,19 @@ NNVM_REGISTER_OP(_rdiv_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_rdiv_scalar) -.set_attr("FCompute", binary_scalar::VectorizedBwdCompute); +.set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(_mod_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_mod_scalar) -.set_attr("FCompute", binary_scalar::VectorizedBwdCompute); +.set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(_rmod_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_rmod_scalar) -.set_attr("FCompute", binary_scalar::VectorizedBwdCompute); +.set_attr("FCompute", BinaryScalarOp::Backward); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_scalar_op_extended.cu b/src/operator/tensor/elemwise_binary_scalar_op_extended.cu index 981efc591480..f09e40a2eee7 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_extended.cu +++ b/src/operator/tensor/elemwise_binary_scalar_op_extended.cu @@ -32,37 +32,37 @@ NNVM_REGISTER_OP(_maximum_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_maximum_scalar) -.set_attr("FCompute", binary_scalar::VectorizedBwdCompute); +.set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(_minimum_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_minimum_scalar) -.set_attr("FCompute", binary_scalar::VectorizedBwdCompute); +.set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(_power_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_power_scalar) -.set_attr("FCompute", binary_scalar::VectorizedBwdCompute); +.set_attr("FCompute", BinaryScalarOp::Backward); 
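// Illustrative sketch (stand-in functor, assumed argument layout): BinaryScalarOp::Backward
// above receives (ograd, forward input) plus the scalar alpha and writes
// igrad[i] = ograd[i] * d/dx OP(x[i], alpha); PowerScalarGradSketch mimics what a *_grad
// functor for _backward_power_scalar would compute.
#include <cmath>
#include <cstdio>
#include <vector>

struct PowerScalarGradSketch {
  // d/dx x^alpha = alpha * x^(alpha - 1)
  static float Map(float x, float alpha) { return alpha * std::pow(x, alpha - 1.f); }
};

template <typename GradOp>
void ScalarBackwardSketch(const std::vector<float>& ograd, const std::vector<float>& x,
                          float alpha, std::vector<float>* igrad) {
  for (std::size_t i = 0; i < x.size(); ++i)
    (*igrad)[i] = ograd[i] * GradOp::Map(x[i], alpha);
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f}, ograd(3, 1.f), igrad(3, 0.f);
  ScalarBackwardSketch<PowerScalarGradSketch>(ograd, x, 3.f, &igrad);  // y = x^3
  std::printf("%g %g %g\n", igrad[0], igrad[1], igrad[2]);
  return 0;
}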
NNVM_REGISTER_OP(_rpower_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_rpower_scalar) -.set_attr("FCompute", binary_scalar::VectorizedBwdCompute); +.set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(_hypot_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_hypot_scalar) -.set_attr("FCompute", binary_scalar::VectorizedBwdCompute); +.set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(smooth_l1) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_smooth_l1) -.set_attr("FCompute", binary_scalar::VectorizedBwdCompute); +.set_attr("FCompute", BinaryScalarOp::Backward); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_unary_op.cuh b/src/operator/tensor/elemwise_unary_op.cuh index e4d0f2072e77..5b8467f5c26c 100644 --- a/src/operator/tensor/elemwise_unary_op.cuh +++ b/src/operator/tensor/elemwise_unary_op.cuh @@ -92,12 +92,15 @@ class VectorizedUnaryScalarFwd { } }; +} // namespace unary + template -void VectorizedCompute(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { +void UnaryOp::Compute_(const nnvm::NodeAttrs& attrs, + mshadow::Stream* s, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace unary; if (req[0] == kNullOp) return; CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); @@ -116,17 +119,6 @@ void VectorizedCompute(const nnvm::NodeAttrs &attrs, }); } -} // namespace unary - -template -void UnaryOp::Compute_(const nnvm::NodeAttrs& attrs, - mshadow::Stream* s, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - unary::VectorizedCompute(attrs, s, inputs, req, outputs); -} - } // namespace op } // namespace mxnet From a16cec04360b980102b0303229b0aecc93e6071f Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 17 Mar 2020 17:22:26 -0700 Subject: [PATCH 22/37] Cleaning part 2 --- src/common/cuda_vectorization.cuh | 3 +- src/operator/numpy/np_diff-inl.h | 4 +- src/operator/tensor/broadcast_reduce-inl.cuh | 87 ++---- src/operator/tensor/broadcast_reduce-inl.h | 272 +++++++++++------- src/operator/tensor/broadcast_reduce_op.h | 4 +- .../tensor/elemwise_binary_broadcast_op.h | 163 ----------- 6 files changed, 204 insertions(+), 329 deletions(-) diff --git a/src/common/cuda_vectorization.cuh b/src/common/cuda_vectorization.cuh index b82029ef2222..0d04d9da0cea 100644 --- a/src/common/cuda_vectorization.cuh +++ b/src/common/cuda_vectorization.cuh @@ -26,10 +26,11 @@ #ifndef MXNET_COMMON_CUDA_VECTORIZATION_CUH_ #define MXNET_COMMON_CUDA_VECTORIZATION_CUH_ +#if MXNET_USE_CUDA && __CUDACC__ + #include #include "cuda_utils.h" -#if MXNET_USE_CUDA && __CUDACC__ namespace mxnet { namespace common { diff --git a/src/operator/numpy/np_diff-inl.h b/src/operator/numpy/np_diff-inl.h index 69f175e802dd..fac03b79bfb0 100644 --- a/src/operator/numpy/np_diff-inl.h +++ b/src/operator/numpy/np_diff-inl.h @@ -67,7 +67,7 @@ struct diff_forward { const int stride, const mshadow::Shape oshape, const mshadow::Shape ishape) { - using namespace broadcast; + using namespace mxnet_op; // j represent the memory index of the corresponding input entry int j = ravel(unravel(i, oshape), ishape); @@ -139,7 +139,7 @@ struct diff_backward { const int stride, const int axis, const mshadow::Shape oshape, const mshadow::Shape ishape) { - using namespace broadcast; + using namespace mxnet_op; if (n 
== 0) { igrad[i] = ograd[i]; return; diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh b/src/operator/tensor/broadcast_reduce-inl.cuh index 641bb5225958..0599d0aec2aa 100644 --- a/src/operator/tensor/broadcast_reduce-inl.cuh +++ b/src/operator/tensor/broadcast_reduce-inl.cuh @@ -28,32 +28,6 @@ using namespace mshadow::cuda; -template -__launch_bounds__(kMaxThreadsPerBlock) -__global__ void binary_broadcast_kernel(const int N, const bool addto, - const DType* __restrict lhs, - const DType* __restrict rhs, DType *out, - const Shape lstride, const Shape rstride, - const Shape oshape) { - for (int idx = blockIdx.x * blockDim.x * unroll + threadIdx.x; idx < N; - idx += blockDim.x * gridDim.x * unroll) - { - int j[unroll]; - int k[unroll]; - DType val[unroll]; - #pragma unroll - for (int i=0;i < unroll;i++) { - unravel_dot(idx + i*blockDim.x, oshape, lstride, rstride, &j[i], &k[i]); - val[i] = OP::Map(lhs[j[i]], rhs[k[i]]); - } - #pragma unroll - for (int i=0;i < unroll;i++) { - if (idx + i*blockDim.x < N) assign(&out[idx + i*blockDim.x], addto, val[i]); - } - - } -} - template struct VectorizedBinaryBroadcastParam { const DType* inputs[2]; @@ -132,7 +106,7 @@ __global__ void VectorizedBinaryBroadcastSingleSideKernel( #pragma unroll for (int i = 0; i < lloader.nvec(); ++i) { if (i != 0) { - rindex = unravel_dot(idx * nvec + i, param.oshape, param.stride[other_side]); + rindex = mxnet_op::unravel_dot(idx * nvec + i, param.oshape, param.stride[other_side]); } DType rinput = param.inputs[other_side][rindex]; DType temp; @@ -155,22 +129,6 @@ __global__ void VectorizedBinaryBroadcastSingleSideKernel( storer.store(idx, N); } } -template -void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, - const TBlob& lhs, const TBlob& rhs, const TBlob& out) { - if (req == kNullOp) return; - cudaStream_t stream = Stream::GetStream(s); - int N = out.shape_.Size(); - const int warpSize = 32; - const int unroll = 2; - int nthread = std::min(kMaxThreadsPerBlock, ((N + warpSize - 1)/warpSize)*warpSize ); - int ngrid = std::min(kBaseGridNum, (N + nthread*unroll - 1) / (nthread*unroll)); - Shape lstride = calc_stride(lhs.shape_.get()); - Shape rstride = calc_stride(rhs.shape_.get()); - binary_broadcast_kernel<<>>( - N, req == kAddTo, lhs.dptr(), rhs.dptr(), out.dptr(), lstride, rstride, - out.shape_.get()); -} template class VectorizedBinaryBroadcastFwd { @@ -207,17 +165,16 @@ class VectorizedBinaryBroadcastFwd { } }; -using common::cuda::VectorizedKernelLauncher; - template -void BinaryBroadcastComputeImpl2(Stream *s, const OpReqType req, - const TBlob& lhs, const TBlob& rhs, const TBlob& out) { +void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, + const TBlob& lhs, const TBlob& rhs, const TBlob& out) { + using common::cuda::VectorizedKernelLauncher; if (req == kNullOp) return; cudaStream_t stream = Stream::GetStream(s); const index_t N = out.shape_.Size(); - Shape lstride = calc_stride(lhs.shape_.get()); - Shape rstride = calc_stride(rhs.shape_.get()); + Shape lstride = mxnet_op::calc_stride(lhs.shape_.get()); + Shape rstride = mxnet_op::calc_stride(rhs.shape_.get()); MXNET_ASSIGN_REQ_SWITCH(req, Req, { using LType = uint2; using Kernel = VectorizedBinaryBroadcastFwd; @@ -270,8 +227,8 @@ __global__ void reduce_kernel(const int N, const int M, const bool addto, const int Mend = (int)((uint64_t)M*(uint64_t)(m0 + 1)/(uint64_t)Mnext); for (int idx0 = blockIdx.x*bx; idx0 < N; idx0 += bx*gridDim.x) { int idx = idx0 + tidx; - Shape coord = unravel(idx, small_shape); - int idx_big0 = ravel(coord, 
big_shape0); + Shape coord = mxnet_op::unravel(idx, small_shape); + int idx_big0 = mxnet_op::ravel(coord, big_shape0); AType val, residual; Reducer::SetInitValue(val, residual); @@ -280,7 +237,7 @@ __global__ void reduce_kernel(const int N, const int M, const bool addto, int idx_big[unroll]; #pragma unroll for (int u=0;u < unroll;u++) { - idx_big[u] = idx_big0 + unravel_dot(k + u*by, big_shape, big_stride); + idx_big[u] = idx_big0 + mxnet_op::unravel_dot(k + u*by, big_shape, big_stride); } DType tmp[unroll]; #pragma unroll @@ -353,10 +310,10 @@ __global__ void reduce_kernel(const int N, const int M, const bool addto, const int Mend = (int)((uint64_t)M*(uint64_t)(m0 + 1)/(uint64_t)Mnext); for (int idx0 = blockIdx.x*bx; idx0 < N; idx0 += bx*gridDim.x) { int idx = idx0 + tidx; - Shape coord = unravel(idx, small_shape); - int idx_big0 = ravel(coord, big_shape0); - int idx_lhs0 = ravel(coord, lhs_shape0); - int idx_rhs0 = ravel(coord, rhs_shape0); + Shape coord = mxnet_op::unravel(idx, small_shape); + int idx_big0 = mxnet_op::ravel(coord, big_shape0); + int idx_lhs0 = mxnet_op::ravel(coord, lhs_shape0); + int idx_rhs0 = mxnet_op::ravel(coord, rhs_shape0); DType val, residual; Reducer::SetInitValue(val, residual); @@ -367,9 +324,9 @@ __global__ void reduce_kernel(const int N, const int M, const bool addto, int idx_rhs[unroll]; #pragma unroll for (int u=0;u < unroll;u++) { - idx_big[u] = idx_big0 + unravel_dot(k + u*by, big_shape, big_stride); - idx_lhs[u] = idx_lhs0 + unravel_dot(k + u*by, lhs_shape, lhs_stride); - idx_rhs[u] = idx_rhs0 + unravel_dot(k + u*by, rhs_shape, rhs_stride); + idx_big[u] = idx_big0 + mxnet_op::unravel_dot(k + u*by, big_shape, big_stride); + idx_lhs[u] = idx_lhs0 + mxnet_op::unravel_dot(k + u*by, lhs_shape, lhs_stride); + idx_rhs[u] = idx_rhs0 + mxnet_op::unravel_dot(k + u*by, rhs_shape, rhs_stride); } DType tmp[unroll]; #pragma unroll @@ -445,8 +402,8 @@ __global__ void reduce_kernel_M1(const int N, const bool addto, const DType* __restrict big, OType *small, const Shape bshape, const Shape sshape) { for (int idx = threadIdx.x + blockIdx.x*blockDim.x; idx < N; idx += blockDim.x*gridDim.x) { - Shape coord = unravel(idx, sshape); - int j = ravel(coord, bshape); + Shape coord = mxnet_op::unravel(idx, sshape); + int j = mxnet_op::ravel(coord, bshape); AType val, residual; Reducer::SetInitValue(val, residual); Reducer::Reduce(val, AType(OP::Map(big[j])), residual); @@ -467,10 +424,10 @@ __global__ void reduce_kernel_M1(const int N, const bool addto, const Shape rhs_shape, const Shape small_shape) { for (int idx = threadIdx.x + blockIdx.x*blockDim.x; idx < N; idx += blockDim.x*gridDim.x) { - Shape coord = unravel(idx, small_shape); - int idx_big = ravel(coord, big_shape); - int idx_lhs = ravel(coord, lhs_shape); - int idx_rhs = ravel(coord, rhs_shape); + Shape coord = mxnet_op::unravel(idx, small_shape); + int idx_big = mxnet_op::ravel(coord, big_shape); + int idx_lhs = mxnet_op::ravel(coord, lhs_shape); + int idx_rhs = mxnet_op::ravel(coord, rhs_shape); DType val, residual; Reducer::SetInitValue(val, residual); Reducer::Reduce(val, OP1::Map(big[idx_big], OP2::Map(lhs[idx_lhs], rhs[idx_rhs])), residual); diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index 199e696e8a5d..6570ea9e9dd5 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -31,28 +31,165 @@ #include #include #include "../mshadow_op.h" +#include "../mxnet_op.h" #include "../operator_common.h" +#if 
MXNET_USE_CUDA #include "../../common/cuda_vectorization.cuh" +#endif namespace mxnet { namespace op { +namespace mxnet_op { +template +struct binary_broadcast_kernel { + /*! \brief Map function for binary_broadcast_kernel */ + template + MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, + const Shape &lstride, const Shape &rstride, + const Shape &oshape, IType *lhs, IType *rhs, + DType *out) { + Shape coord = unravel(base, oshape); + auto lidx = static_cast(dot(coord, lstride)); + auto ridx = static_cast(dot(coord, rstride)); + KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); + // starts from 1 to avoid extra inc at end of loop + for (index_t i = 1; i < length; ++i) { + inc(&coord, oshape, &lidx, lstride, &ridx, rstride); + // When tuning, don't actually run the op, since it's not going to be tuned against + // the actual op we'll eventually be using + KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs[lidx], rhs[ridx])); + } + } + + /*! \brief Map function for binary_broadcast_kernel */ + template + MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, + const Shape &lstride, const Shape &rstride, + const Shape &oshape, IType lhs, IType *rhs, + DType *out) { + Shape coord = unravel(base, oshape); + auto lidx = static_cast(dot(coord, lstride)); + auto ridx = static_cast(dot(coord, rstride)); + KERNEL_ASSIGN(out[base], req, OP::Map(lhs, rhs[ridx])); + // starts from 1 to avoid extra inc at end of loop + for (index_t i = 1; i < length; ++i) { + inc(&coord, oshape, &lidx, lstride, &ridx, rstride); + // When tuning, don't actually run the op, since it's not going to be tuned against + // the actual op we'll eventually be using + KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs, rhs[ridx])); + } + } + +#ifndef _WIN32 + /*! \brief Map function for binary_broadcast_kernel */ + template::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, + const Shape &lstride, const Shape &rstride, + const Shape &oshape, IType *lhs, DType *rhs, + DType *out) { + Shape coord = unravel(base, oshape); + auto lidx = static_cast(dot(coord, lstride)); + auto ridx = static_cast(dot(coord, rstride)); + KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); + // starts from 1 to avoid extra inc at end of loop + for (index_t i = 1; i < length; ++i) { + inc(&coord, oshape, &lidx, lstride, &ridx, rstride); + // When tuning, don't actually run the op, since it's not going to be tuned against + // the actual op we'll eventually be using + KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs[lidx], rhs[ridx])); + } + } + + /*! \brief Map function for binary_broadcast_kernel */ + template::value && + !std::is_pointer::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, + const Shape &lstride, const Shape &rstride, + const Shape &oshape, IType lhs, DType *rhs, + DType *out) { + Shape coord = unravel(base, oshape); + auto lidx = static_cast(dot(coord, lstride)); + auto ridx = static_cast(dot(coord, rstride)); + KERNEL_ASSIGN(out[base], req, OP::Map(lhs, rhs[ridx])); + // starts from 1 to avoid extra inc at end of loop + for (index_t i = 1; i < length; ++i) { + inc(&coord, oshape, &lidx, lstride, &ridx, rstride); + // When tuning, don't actually run the op, since it's not going to be tuned against + // the actual op we'll eventually be using + KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs, rhs[ridx])); + } + } +#endif +}; + +template +struct csr_dns_csr_broadcast_kernel { + /*! 
+ * \brief Map function for broadcast between csr and 1D vector + * \param row global thread id/assigned row id + * \param csr_data ptr to data buffer of csr matrix + * \param csr_indices ptr to indices buffer of csr matrix + * \param csr_indptr ptr to indptr buffer of csr matrix + * \param dns ptr to data buffer of the dense vector + * \param out ptr to the data buffer of the result csr matrix + */ + template + MSHADOW_XINLINE static void Map(index_t row, const DType *csr_data, const CType *csr_indices, + const RType *csr_indptr, const DType *dns, DType *out) { + const nnvm::dim_t curr_row_i = csr_indptr[row]; + const nnvm::dim_t next_row_i = csr_indptr[row + 1]; + for (nnvm::dim_t iter = curr_row_i; iter < next_row_i; iter++) { + KERNEL_ASSIGN(out[iter], req, OP::Map(csr_data[iter], + (col_vec)? dns[row] : dns[csr_indices[iter]])); + } + } + + /*! + * \brief Map function for broadcast between csr and a scalar + * \param i global thread id + * \param csr_data ptr to data buffer of csr matrix + * \param scalar_ptr ptr to data buffer of the scalar tensor, only the 0-th element is used + * \param out ptr to the data buffer of output csr matrix + * \param nnz number of non-zero elements in input csr matrix + */ + template + MSHADOW_XINLINE static void Map(index_t i, const DType *csr_data, const DType* scalar_ptr, + DType *out, const nnvm::dim_t nnz) { + const DType scale = scalar_ptr[0]; + if (i < nnz) { + KERNEL_ASSIGN(out[i], req, OP::Map(csr_data[i], scale)); + } + } +}; + +template +struct csr_dns_map_kernel { + template + MSHADOW_XINLINE static void Map(index_t row, const DType *csr_data, const CType *csr_indices, + const RType *csr_indptr, DType *out, const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + if (row < num_rows) { + const nnvm::dim_t curr_row_i = csr_indptr[row]; + const nnvm::dim_t next_row_i = csr_indptr[row + 1]; + for (nnvm::dim_t iter = curr_row_i; iter < next_row_i; iter++) { + const nnvm::dim_t target = row * num_cols + csr_indices[iter]; + KERNEL_ASSIGN(out[target], req, + reverse ? OP::Map(out[target], csr_data[iter]) : + OP::Map(csr_data[iter], out[target])); + } + } + } +}; + +} // namespace mxnet_op + namespace broadcast { using namespace mshadow; const int MAX_DIM = 5; -template -MSHADOW_XINLINE Shape calc_stride(const Shape& shape) { - Shape stride; - index_t cumprod = 1; - #pragma unroll - for (int i = ndim - 1; i >= 0; --i) { - stride[i] = (shape[i] > 1) ? 
cumprod : 0; - cumprod *= shape[i]; - } - return stride; -} - template MSHADOW_XINLINE void unravel_dot(const index_t idx, const Shape& shape, const Shape& stridej, const Shape& stridek, index_t* j, index_t* k) { @@ -68,28 +205,6 @@ MSHADOW_XINLINE void unravel_dot(const index_t idx, const Shape& shape, } } -template -MSHADOW_XINLINE Shape unravel(const index_t idx, const Shape& shape) { - Shape ret; - #pragma unroll - for (index_t i = ndim-1, j = idx; i >=0; --i) { - auto tmp = j / shape[i]; - ret[i] = j - tmp*shape[i]; - j = tmp; - } - return ret; -} - -template -MSHADOW_XINLINE index_t ravel(const Shape& coord, const Shape& shape) { - index_t ret = 0; - #pragma unroll - for (index_t i = 0; i < ndim; ++i) { - ret = ret * shape[i] + (shape[i] > 1) * coord[i]; - } - return ret; -} - template MSHADOW_XINLINE int diff(const Shape& small, const Shape& big, @@ -115,28 +230,6 @@ MSHADOW_XINLINE int diff(const Shape& small, return mdim; } -template -MSHADOW_XINLINE index_t unravel_dot(const index_t idx, const Shape& shape, - const Shape& stride) { - index_t ret = 0; - #pragma unroll - for (index_t i = ndim-1, j = idx; i >=0; --i) { - auto tmp = j / shape[i]; - ret += (j - tmp*shape[i])*stride[i]; - j = tmp; - } - return ret; -} - -template -MSHADOW_XINLINE index_t dot(const Shape& coord, const Shape& stride) { - index_t ret = 0; - #pragma unroll - for (int i = 0; i < ndim; ++i) - ret += coord[i] * stride[i]; - return ret; -} - template MSHADOW_XINLINE void assign(DType* dst, const bool addto, const DType src) { if (addto) { @@ -152,9 +245,9 @@ MSHADOW_XINLINE void binary_broadcast_assign(const index_t idx, const bool addto const DType* __restrict rhs, DType* out, const Shape& lshape, const Shape& rshape, const Shape& oshape) { - const Shape coord = unravel(idx, oshape); - const index_t j = ravel(coord, lshape); - const index_t k = ravel(coord, rshape); + const Shape coord = mxnet_op::unravel(idx, oshape); + const index_t j = mxnet_op::ravel(coord, lshape); + const index_t k = mxnet_op::ravel(coord, rshape); assign(&out[idx], addto, OP::Map(lhs[j], rhs[k])); } @@ -163,13 +256,13 @@ MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const const DType* __restrict big, OType *small, const Shape& bshape, const Shape& sshape, const Shape& rshape, const Shape& rstride) { - Shape coord = unravel(idx, sshape); - index_t j = ravel(coord, bshape); + Shape coord = mxnet_op::unravel(idx, sshape); + index_t j = mxnet_op::ravel(coord, bshape); AType val, residual; Reducer::SetInitValue(val, residual); for (size_t k = 0; k < M; ++k) { - coord = unravel(k, rshape); - Reducer::Reduce(val, AType(OP::Map(big[j + dot(coord, rstride)])), residual); + coord = mxnet_op::unravel(k, rshape); + Reducer::Reduce(val, AType(OP::Map(big[j + mxnet_op::dot(coord, rstride)])), residual); } Reducer::Finalize(val, residual); assign(&small[idx], addto, OType(val)); @@ -180,28 +273,15 @@ MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const #else -template -void BinaryBroadcastComputeImpl2(Stream *s, const OpReqType req, - const TBlob& lhs, const TBlob& rhs, const TBlob& out) { -} - -template -void binary_broadcast_compute(const size_t N, const bool addto, const DType *lhs, - const DType *rhs, DType *out, const Shape lshape, - const Shape rshape, const Shape oshape) { - for (size_t idx = 0; idx < N; ++idx) { - binary_broadcast_assign(idx, addto, lhs, rhs, out, lshape, rshape, oshape); - } -} - template void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, const TBlob& lhs, 
const TBlob& rhs, const TBlob& out) { - if (req == kNullOp) return; - size_t N = out.shape_.Size(); - binary_broadcast_compute(N, req == kAddTo, lhs.dptr(), rhs.dptr(), - out.dptr(), lhs.shape_.get(), rhs.shape_.get(), - out.shape_.get()); + mshadow::Shape oshape = out.shape_.get(); + mshadow::Shape lstride = mxnet_op::calc_stride(lhs.shape_.get()); + mshadow::Shape rstride = mxnet_op::calc_stride(rhs.shape_.get()); + mxnet_op::Kernel, cpu>:: + template LaunchEx(s, out.shape_.Size(), req, lstride, rstride, oshape, + lhs.dptr(), rhs.dptr(), out.dptr()); } template @@ -226,8 +306,8 @@ void seq_reduce_compute_extra_mem(const size_t N, const size_t M, const bool add const index_t* ws_dptr) { #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) for (index_t idx = 0; idx < static_cast(N); ++idx) { - Shape coord = unravel(idx, sshape); - index_t j = ravel(coord, bshape); + Shape coord = mxnet_op::unravel(idx, sshape); + index_t j = mxnet_op::ravel(coord, bshape); DType val, residual; Reducer::SetInitValue(val, residual); for (size_t k = 0; k < M; ++k) { @@ -284,8 +364,8 @@ void ReduceWithExtraMem(Stream* s, const TBlob& small, const OpReqType req, size_t N = small.shape_.Size(), M = rshape.Size(); #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) for (index_t k = 0; k < static_cast(M); k++) { - Shape coord = unravel(k, rshape); - ws_dptr[k] = dot(coord, rstride); + Shape coord = mxnet_op::unravel(k, rshape); + ws_dptr[k] = mxnet_op::dot(coord, rstride); } seq_reduce_compute_extra_mem( @@ -316,21 +396,21 @@ MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const const Shape& lhs_shape, const Shape& rhs_shape, const Shape& rstride, const Shape& lhs_stride, const Shape& rhs_stride) { - Shape coord = unravel(idx, small_shape); - const index_t idx_big0 = ravel(coord, big_shape); - const index_t idx_lhs0 = ravel(coord, lhs_shape0); - const index_t idx_rhs0 = ravel(coord, rhs_shape0); + Shape coord = mxnet_op::unravel(idx, small_shape); + const index_t idx_big0 = mxnet_op::ravel(coord, big_shape); + const index_t idx_lhs0 = mxnet_op::ravel(coord, lhs_shape0); + const index_t idx_rhs0 = mxnet_op::ravel(coord, rhs_shape0); DType val, residual; Reducer::SetInitValue(val, residual); for (size_t k = 0; k < M; ++k) { - Shape coord_big = unravel(k, rshape); - index_t idx_big = idx_big0 + dot(coord_big, rstride); + Shape coord_big = mxnet_op::unravel(k, rshape); + index_t idx_big = idx_big0 + mxnet_op::dot(coord_big, rstride); - Shape coord_lhs = unravel(k, lhs_shape); - index_t idx_lhs = idx_lhs0 + dot(coord_lhs, lhs_stride); + Shape coord_lhs = mxnet_op::unravel(k, lhs_shape); + index_t idx_lhs = idx_lhs0 + mxnet_op::dot(coord_lhs, lhs_stride); - Shape coord_rhs = unravel(k, rhs_shape); - index_t idx_rhs = idx_rhs0 + dot(coord_rhs, rhs_stride); + Shape coord_rhs = mxnet_op::unravel(k, rhs_shape); + index_t idx_rhs = idx_rhs0 + mxnet_op::dot(coord_rhs, rhs_stride); Reducer::Reduce(val, OP1::Map(big[idx_big], OP2::Map(lhs[idx_lhs], rhs[idx_rhs])), residual); } diff --git a/src/operator/tensor/broadcast_reduce_op.h b/src/operator/tensor/broadcast_reduce_op.h index 5eb0c41aa36c..8ca62b91e008 100644 --- a/src/operator/tensor/broadcast_reduce_op.h +++ b/src/operator/tensor/broadcast_reduce_op.h @@ -1449,7 +1449,7 @@ struct pick { const IType *idx, index_t M, int stride, mshadow::Shape bshape, mshadow::Shape sshape) { - using namespace broadcast; + using namespace mxnet_op; index_t j = static_cast(idx[i]); if 
(clip) { if (j <= 0) j = 0; @@ -1471,7 +1471,7 @@ struct pick_grad { const IType *idx, index_t M, int stride, mshadow::Shape bshape, mshadow::Shape sshape) { - using namespace broadcast; + using namespace mxnet_op; index_t j = static_cast(idx[i]); if (clip) { if (j <= 0) j = 0; diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index 70aa11e330ce..62011efb5462 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -187,152 +187,6 @@ inline int BinaryBroadcastShapeCompact(const mxnet::TShape& lshape, const mxnet: return j; } -namespace mxnet_op { -template -struct binary_broadcast_kernel { - /*! \brief Map function for binary_broadcast_kernel */ - template - MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, - const Shape &lstride, const Shape &rstride, - const Shape &oshape, IType *lhs, IType *rhs, - DType *out) { - Shape coord = unravel(base, oshape); - auto lidx = static_cast(dot(coord, lstride)); - auto ridx = static_cast(dot(coord, rstride)); - KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); - // starts from 1 to avoid extra inc at end of loop - for (index_t i = 1; i < length; ++i) { - inc(&coord, oshape, &lidx, lstride, &ridx, rstride); - // When tuning, don't actually run the op, since it's not going to be tuned against - // the actual op we'll eventually be using - KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs[lidx], rhs[ridx])); - } - } - - /*! \brief Map function for binary_broadcast_kernel */ - template - MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, - const Shape &lstride, const Shape &rstride, - const Shape &oshape, IType lhs, IType *rhs, - DType *out) { - Shape coord = unravel(base, oshape); - auto lidx = static_cast(dot(coord, lstride)); - auto ridx = static_cast(dot(coord, rstride)); - KERNEL_ASSIGN(out[base], req, OP::Map(lhs, rhs[ridx])); - // starts from 1 to avoid extra inc at end of loop - for (index_t i = 1; i < length; ++i) { - inc(&coord, oshape, &lidx, lstride, &ridx, rstride); - // When tuning, don't actually run the op, since it's not going to be tuned against - // the actual op we'll eventually be using - KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs, rhs[ridx])); - } - } - -#ifndef _WIN32 - /*! \brief Map function for binary_broadcast_kernel */ - template::value, int>::type = 0> - MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, - const Shape &lstride, const Shape &rstride, - const Shape &oshape, IType *lhs, DType *rhs, - DType *out) { - Shape coord = unravel(base, oshape); - auto lidx = static_cast(dot(coord, lstride)); - auto ridx = static_cast(dot(coord, rstride)); - KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); - // starts from 1 to avoid extra inc at end of loop - for (index_t i = 1; i < length; ++i) { - inc(&coord, oshape, &lidx, lstride, &ridx, rstride); - // When tuning, don't actually run the op, since it's not going to be tuned against - // the actual op we'll eventually be using - KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs[lidx], rhs[ridx])); - } - } - - /*! 
\brief Map function for binary_broadcast_kernel */ - template::value && - !std::is_pointer::value, int>::type = 0> - MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, - const Shape &lstride, const Shape &rstride, - const Shape &oshape, IType lhs, DType *rhs, - DType *out) { - Shape coord = unravel(base, oshape); - auto lidx = static_cast(dot(coord, lstride)); - auto ridx = static_cast(dot(coord, rstride)); - KERNEL_ASSIGN(out[base], req, OP::Map(lhs, rhs[ridx])); - // starts from 1 to avoid extra inc at end of loop - for (index_t i = 1; i < length; ++i) { - inc(&coord, oshape, &lidx, lstride, &ridx, rstride); - // When tuning, don't actually run the op, since it's not going to be tuned against - // the actual op we'll eventually be using - KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs, rhs[ridx])); - } - } -#endif -}; - -template -struct csr_dns_csr_broadcast_kernel { - /*! - * \brief Map function for broadcast between csr and 1D vector - * \param row global thread id/assigned row id - * \param csr_data ptr to data buffer of csr matrix - * \param csr_indices ptr to indices buffer of csr matrix - * \param csr_indptr ptr to indptr buffer of csr matrix - * \param dns ptr to data buffer of the dense vector - * \param out ptr to the data buffer of the result csr matrix - */ - template - MSHADOW_XINLINE static void Map(index_t row, const DType *csr_data, const CType *csr_indices, - const RType *csr_indptr, const DType *dns, DType *out) { - const nnvm::dim_t curr_row_i = csr_indptr[row]; - const nnvm::dim_t next_row_i = csr_indptr[row + 1]; - for (nnvm::dim_t iter = curr_row_i; iter < next_row_i; iter++) { - KERNEL_ASSIGN(out[iter], req, OP::Map(csr_data[iter], - (col_vec)? dns[row] : dns[csr_indices[iter]])); - } - } - - /*! - * \brief Map function for broadcast between csr and a scalar - * \param i global thread id - * \param csr_data ptr to data buffer of csr matrix - * \param scalar_ptr ptr to data buffer of the scalar tensor, only the 0-th element is used - * \param out ptr to the data buffer of output csr matrix - * \param nnz number of non-zero elements in input csr matrix - */ - template - MSHADOW_XINLINE static void Map(index_t i, const DType *csr_data, const DType* scalar_ptr, - DType *out, const nnvm::dim_t nnz) { - const DType scale = scalar_ptr[0]; - if (i < nnz) { - KERNEL_ASSIGN(out[i], req, OP::Map(csr_data[i], scale)); - } - } -}; - -template -struct csr_dns_map_kernel { - template - MSHADOW_XINLINE static void Map(index_t row, const DType *csr_data, const CType *csr_indices, - const RType *csr_indptr, DType *out, const nnvm::dim_t num_rows, - const nnvm::dim_t num_cols) { - if (row < num_rows) { - const nnvm::dim_t curr_row_i = csr_indptr[row]; - const nnvm::dim_t next_row_i = csr_indptr[row + 1]; - for (nnvm::dim_t iter = curr_row_i; iter < next_row_i; iter++) { - const nnvm::dim_t target = row * num_cols + csr_indices[iter]; - KERNEL_ASSIGN(out[target], req, - reverse ? 
OP::Map(out[target], csr_data[iter]) : - OP::Map(csr_data[iter], out[target])); - } - } - } -}; - -} // namespace mxnet_op - template void BinaryBroadcastIntCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -384,27 +238,10 @@ void BinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, } MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(ndim, NDim, { - int choice = dmlc::GetEnv("DEBUG_BCAST", 2); - if (choice != 0 && ctx.run_ctx.ctx.dev_type == kGPU) { - if (choice == 1) { broadcast::BinaryBroadcastComputeImpl(s, req[0], inputs[0].reshape(new_lshape), inputs[1].reshape(new_rshape), outputs[0].reshape(new_oshape)); - } else { - broadcast::BinaryBroadcastComputeImpl2(s, req[0], - inputs[0].reshape(new_lshape), - inputs[1].reshape(new_rshape), - outputs[0].reshape(new_oshape)); - } - } else { - mshadow::Shape oshape = new_oshape.get(); - mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); - mshadow::Shape rstride = mxnet_op::calc_stride(new_rshape.get()); - mxnet_op::Kernel, xpu>:: - template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, - inputs[0].dptr(), inputs[1].dptr(), outputs[0].dptr()); - } }); }); } From ff2243d7e3fb9429301013e8d8dd74727ad1256f Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 17 Mar 2020 20:28:08 -0700 Subject: [PATCH 23/37] Fix for numpy ops using stuff from broadcast --- src/operator/numpy/linalg/broadcast_reduce_customized-inl.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/operator/numpy/linalg/broadcast_reduce_customized-inl.h b/src/operator/numpy/linalg/broadcast_reduce_customized-inl.h index 2b5970d4f4ae..766fe208b2e7 100644 --- a/src/operator/numpy/linalg/broadcast_reduce_customized-inl.h +++ b/src/operator/numpy/linalg/broadcast_reduce_customized-inl.h @@ -31,6 +31,9 @@ namespace mxnet { namespace op { namespace broadcast { using namespace mshadow; +using mxnet_op::unravel; +using mxnet_op::ravel; +using mxnet_op::dot; template MSHADOW_XINLINE void seq_reduce_assign_wr(const index_t idx, const size_t M, const bool addto, From ecbdc6dbffd44c3dec03b31299906a9ee2c87793 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 18 Mar 2020 09:51:39 -0700 Subject: [PATCH 24/37] Fix --- src/operator/numpy/linalg/broadcast_reduce_customized-inl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/operator/numpy/linalg/broadcast_reduce_customized-inl.h b/src/operator/numpy/linalg/broadcast_reduce_customized-inl.h index 766fe208b2e7..0226df45f960 100644 --- a/src/operator/numpy/linalg/broadcast_reduce_customized-inl.h +++ b/src/operator/numpy/linalg/broadcast_reduce_customized-inl.h @@ -34,6 +34,7 @@ using namespace mshadow; using mxnet_op::unravel; using mxnet_op::ravel; using mxnet_op::dot; +using mxnet_op::unravel_dot; template MSHADOW_XINLINE void seq_reduce_assign_wr(const index_t idx, const size_t M, const bool addto, From 2592e534a1d095b2fcc54208f0d590c73cbb2880 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 17 Mar 2020 20:34:35 -0700 Subject: [PATCH 25/37] Fix lint --- src/operator/tensor/elemwise_binary_op.h | 12 ++++++++---- src/operator/tensor/elemwise_binary_op_basic.cu | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h index 5c56edf41f3f..b9396aee204e 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -161,15 +161,19 @@ class ElemwiseBinaryOp : public OpBase { (outputs[0].Size() + mxnet_op::DataType::kLanes 
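The KERNEL_ASSIGN(..., req, ...) calls in the kernels above, the assign(dst, addto, src) helper in the broadcast code, and the op_with_req wrappers in the lint fix that follows all encode the same contract: an OpReqType tells a kernel whether to overwrite its output (kWriteTo or kWriteInplace), accumulate into it (kAddTo), or do nothing (kNullOp). A minimal host-only sketch of that dispatch, using simplified stand-ins rather than the MXNet macros:

#include <cstdio>

enum OpReqType { kNullOp, kWriteTo, kWriteInplace, kAddTo };

// Simplified stand-in for KERNEL_ASSIGN: apply a computed value according to req.
template <typename DType>
void assign_req(DType* dst, OpReqType req, DType val) {
  switch (req) {
    case kNullOp:
      break;         // nothing requested
    case kAddTo:
      *dst += val;   // gradient-accumulation path
      break;
    default:
      *dst = val;    // kWriteTo / kWriteInplace
      break;
  }
}

int main() {
  float out[2] = {100.f, 100.f};
  assign_req(&out[0], kWriteTo, 5.f);
  assign_req(&out[1], kAddTo, 5.f);
  std::printf("%g %g\n", out[0], out[1]);  // prints: 5 105
  return 0;
}

In the real kernels the request is a template parameter (the Req produced by MXNET_ASSIGN_REQ_SWITCH), so this branch is resolved at compile time rather than per element.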
- 1) / mxnet_op::DataType::kLanes); DType * lgrad_dptr = outputs[0].dptr(); - mxnet_op::Kernel, Req>, cpu>::Launch( - s, size, lgrad_dptr, ograd_dptr, lhs_dptr, rhs_dptr);}); + mxnet_op::Kernel< + mxnet_op::op_with_req, Req>, cpu>::Launch( + s, size, lgrad_dptr, ograd_dptr, lhs_dptr, rhs_dptr); + }); MXNET_ASSIGN_REQ_SWITCH(req[1], Req, { const int size = static_cast( (outputs[1].Size() + mxnet_op::DataType::kLanes - 1) / mxnet_op::DataType::kLanes); DType * rgrad_dptr = outputs[1].dptr(); - mxnet_op::Kernel, Req>, cpu>::Launch( - s, size, rgrad_dptr, ograd_dptr, lhs_dptr, rhs_dptr);}); + mxnet_op::Kernel< + mxnet_op::op_with_req, Req>, cpu>::Launch( + s, size, rgrad_dptr, ograd_dptr, lhs_dptr, rhs_dptr); + }); }); } diff --git a/src/operator/tensor/elemwise_binary_op_basic.cu b/src/operator/tensor/elemwise_binary_op_basic.cu index c88b0300dd0c..b21b08d03217 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_op_basic.cu @@ -218,7 +218,7 @@ void ElemwiseBinaryOp::DnsCsrDnsOp(mshadow::Stream *s, } NNVM_REGISTER_OP(elemwise_add) -.set_attr("FCompute", ElemwiseBinaryOp::Compute) +.set_attr("FCompute", ElemwiseBinaryOp::Compute) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeEx); NNVM_REGISTER_OP(_grad_add) From 78d25a9d2f3f3bd330670f01d366c6535ea7b600 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 10 Apr 2020 14:47:52 -0700 Subject: [PATCH 26/37] Try to debug pinv numpy test --- tests/python/unittest/test_numpy_op.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 7f2532bfba5e..3224d85b744a 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -5651,6 +5651,10 @@ def check_pinv(x, a_np, rcond_np, hermitian, use_rcond): print(e) else: assert x.shape == x_expected.shape + print("a shape:", a_np.shape) + print("a: ", a_np) + print("actual: ", x.asnumpy()) + print("expected: ", x_expected) assert_almost_equal(x.asnumpy(), x_expected, rtol=rtol, atol=atol) shapes = [ From a801f8b3dc301899f15e89bbd3d365648d56694c Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 13 Apr 2020 18:55:29 -0700 Subject: [PATCH 27/37] Fix --- src/operator/tensor/broadcast_reduce-inl.cuh | 22 ++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh b/src/operator/tensor/broadcast_reduce-inl.cuh index 0599d0aec2aa..1edcf81b1e87 100644 --- a/src/operator/tensor/broadcast_reduce-inl.cuh +++ b/src/operator/tensor/broadcast_reduce-inl.cuh @@ -120,6 +120,8 @@ __global__ void VectorizedBinaryBroadcastSingleSideKernel( lloader.separate()[i]); } + printf("thread %d %d %d: %d %d %d %f %f %f\n", threadIdx.x, blockIdx.x, idx, lindex, rindex, i, (float)lloader.separate()[i], (float)rinput, (float)temp); + if (req == kAddTo) { storer.separate()[i] += temp; } else { @@ -165,6 +167,18 @@ class VectorizedBinaryBroadcastFwd { } }; +inline void PrintTensor(const TBlob& blob, const std::string& name) { + const index_t size = blob.shape_.Size(); + float* temp = new float[size]; + cudaMemcpy(temp, blob.dptr_, size * sizeof(float), cudaMemcpyDeviceToHost); + std::cout << name << std::endl; + for (int i = 0; i < size; ++i) { + std::cout << i << ": " << temp[i] << std::endl; + } + std::cout << "End: " << name << std::endl; + delete[] temp; +} + template void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, const TBlob& lhs, const TBlob& rhs, const TBlob& out) { @@ -175,6 +189,11 
@@ void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, Shape lstride = mxnet_op::calc_stride(lhs.shape_.get()); Shape rstride = mxnet_op::calc_stride(rhs.shape_.get()); + std::cout << "lshape: " << lhs.shape_ << std::endl; + std::cout << "rshape: " << rhs.shape_ << std::endl; + PrintTensor(lhs, "lhs"); + PrintTensor(rhs, "rhs"); + MXNET_ASSIGN_REQ_SWITCH(req, Req, { using LType = uint2; using Kernel = VectorizedBinaryBroadcastFwd; @@ -187,6 +206,8 @@ void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, param.stride[0] = lstride; param.stride[1] = rstride; param.oshape = out.shape_.get(); + param.size[0] = lhs.shape_.Size(); + param.size[1] = rhs.shape_.Size(); for (int i = ndim - 1; i >= 0; --i) { /* Find the first non-1 dimension @@ -204,6 +225,7 @@ void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, VectorizedKernelLauncher(N, s, param); }); + PrintTensor(out, "out"); } const int nthread_reduce = kMaxThreadsPerBlock; From 810f8c86d036c9dd44e095c08bedf1edf79b9daa Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 16 Apr 2020 16:00:09 -0700 Subject: [PATCH 28/37] Fix the vectorized broadcast implementation for misaligned input pointers --- src/common/cuda_vectorization.cuh | 55 ++++++--- src/operator/tensor/broadcast_reduce-inl.cuh | 106 +++++++++--------- src/operator/tensor/elemwise_binary_op.cuh | 21 ++-- .../tensor/elemwise_binary_scalar_op.cuh | 14 ++- src/operator/tensor/elemwise_sum.cu | 9 +- src/operator/tensor/elemwise_unary_op.cuh | 7 +- 6 files changed, 118 insertions(+), 94 deletions(-) diff --git a/src/common/cuda_vectorization.cuh b/src/common/cuda_vectorization.cuh index 0d04d9da0cea..33c4f6dbec43 100644 --- a/src/common/cuda_vectorization.cuh +++ b/src/common/cuda_vectorization.cuh @@ -75,6 +75,10 @@ class VectorizedAccessor { } } + MSHADOW_XINLINE int alignment() const { + return alignment_; + } + MSHADOW_XINLINE DType* separate() { return storage_.scratch_.separate; } @@ -92,7 +96,7 @@ class VectorizedAccessor { storage_.scratch_.aligned = aligned_ptr_[id]; } else { if (id > 0 && id < n_elems_ - 1) { - storage_.scratch_.aligned = aligned_ptr_[id]; + storage_.scratch_.aligned = aligned_ptr_[id]; } else { #pragma unroll for (int j = 0; j < storage_.nvec; ++j) { @@ -105,7 +109,6 @@ class VectorizedAccessor { } } } - }; template @@ -158,8 +161,9 @@ int CalcAlignment(const DType* ptr) { } template -Alignment CheckAlignment(const Params& params) { +Alignment CheckAlignment(const Params& params, const index_t lead_dim, const index_t other_dim) { int align = -1; + constexpr int nvec = sizeof(LType) / sizeof(DType); for (const DType* ptr : params.inputs) { int new_align = CalcAlignment(ptr); @@ -183,8 +187,17 @@ Alignment CheckAlignment(const Params& params) { } } - return align == 0 ? 
Alignment::SAME_ALIGNED - : Alignment::SAME_UNALIGNED; + if ((other_dim != 1) && + (lead_dim % nvec != 0)) { + return Alignment::DIFFERENT; + } + + if ((align == 0) && + (lead_dim % nvec == 0)) { + return Alignment::SAME_ALIGNED; + } else { + return Alignment::SAME_UNALIGNED; + } } constexpr int vectorized_kernel_thread_num = 512; @@ -192,29 +205,35 @@ constexpr int vectorized_kernel_thread_num = 512; } // namespace template -void VectorizedKernelLauncher(const index_t size, mshadow::Stream* s, typename Kernel::ParamType params) { +void VectorizedKernelLauncher(const index_t lead_dim, + const index_t other_dim, + mshadow::Stream* s, + typename Kernel::ParamType params) { static_assert(sizeof(LType) >= sizeof(DType), "Load type is smaller than operand type"); - if (size != 0) { + if (lead_dim * other_dim != 0) { cudaStream_t stream = mshadow::Stream::GetStream(s); - constexpr int nvec = sizeof(LType) / sizeof(DType); - VectorizedLoader l(params.inputs[0], size); - size_t num_elements = l.num_aligned_elements(); + VectorizedLoader l(params.inputs[0], lead_dim); + size_t num_elements = other_dim * l.num_aligned_elements(); constexpr int threads = vectorized_kernel_thread_num; constexpr int max_blocks = 65535; index_t blocks = std::min(static_cast((num_elements + threads - 1) / threads), max_blocks); - auto align = CheckAlignment(params); - if (align == Alignment::SAME_ALIGNED && (size % nvec == 0)) { - Kernel::template Launch(blocks, threads, stream, params, size); - } else { - if (align != Alignment::DIFFERENT) { - Kernel::template Launch(blocks, threads, stream, params, size); - } else { + auto align = CheckAlignment(params, lead_dim, other_dim); + switch (align) { + case Alignment::SAME_ALIGNED: + Kernel::template Launch(blocks, threads, stream, params, lead_dim, other_dim); + break; + case Alignment::SAME_UNALIGNED: + Kernel::template Launch(blocks, threads, stream, params, lead_dim, other_dim); + break; + case Alignment::DIFFERENT: { + const index_t size = lead_dim * other_dim; index_t blocks = std::min(static_cast((size + threads - 1) / threads), max_blocks); // If the pointers are aligned differently we cannot vectorize - Kernel::template Launch(blocks, threads, stream, params, size); + Kernel::template Launch(blocks, threads, stream, params, lead_dim, other_dim); + break; } } } diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh b/src/operator/tensor/broadcast_reduce-inl.cuh index 1edcf81b1e87..64adb0880346 100644 --- a/src/operator/tensor/broadcast_reduce-inl.cuh +++ b/src/operator/tensor/broadcast_reduce-inl.cuh @@ -18,10 +18,10 @@ */ /*! 
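In outline, CheckAlignment above sorts a launch into three cases: SAME_ALIGNED when every pointer sits on a sizeof(LType) boundary and each row holds a whole number of vectors, SAME_UNALIGNED when all pointers share the same non-zero offset, and DIFFERENT when the offsets disagree or a vector would straddle a row boundary, in which case the launcher falls back to a scalar instantiation with LType equal to DType. A standalone sketch of that classification with illustrative types, not the MXNet parameter structs:

#include <cstdint>
#include <cstdio>

enum class Alignment { SAME_ALIGNED, SAME_UNALIGNED, DIFFERENT };

// Classify pointer alignment relative to the vector load type LType.
template <typename LType, typename DType>
Alignment check_alignment(const DType* const* ptrs, int nptrs,
                          std::int64_t lead_dim, std::int64_t other_dim) {
  constexpr int nvec = sizeof(LType) / sizeof(DType);
  int align = -1;
  for (int i = 0; i < nptrs; ++i) {
    const int a = static_cast<int>(
        reinterpret_cast<std::uintptr_t>(ptrs[i]) % sizeof(LType));
    if (align == -1) {
      align = a;
    } else if (align != a) {
      return Alignment::DIFFERENT;  // mixed offsets: cannot vectorize safely
    }
  }
  // With more than one row, every row must contain a whole number of vectors,
  // otherwise a vector would straddle a row boundary.
  if (other_dim != 1 && lead_dim % nvec != 0) {
    return Alignment::DIFFERENT;
  }
  return (align == 0 && lead_dim % nvec == 0) ? Alignment::SAME_ALIGNED
                                              : Alignment::SAME_UNALIGNED;
}

int main() {
  alignas(8) float buf[16] = {};
  const float* same[2] = {buf, buf + 2};  // both offsets are 0 modulo 8 bytes
  const float* off[2]  = {buf, buf + 1};  // offsets 0 and 4 bytes disagree
  using LType = std::uint64_t;            // 8-byte loads, i.e. 2 floats per load
  std::printf("%d %d\n",
              static_cast<int>(check_alignment<LType>(same, 2, 16, 1)),
              static_cast<int>(check_alignment<LType>(off, 2, 16, 1)));
  return 0;
}

Here the matching pair prints 0 (SAME_ALIGNED) and the pair whose offsets differ by one float prints 2 (DIFFERENT), which is the case the launcher handles with the non-vectorized template.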
- * Copyright (c) 2015-2017 by Contributors + * Copyright (c) 2015-2020 by Contributors * \file broadcast_reduce-inl.cuh * \brief CUDA implementations for binary broadcast and reduce - * \author Antti-Pekka Hynninen + * \author Antti-Pekka Hynninen, Przemyslaw Tredak */ #ifndef MXNET_OPERATOR_TENSOR_BROADCAST_REDUCE_INL_CUH_ #define MXNET_OPERATOR_TENSOR_BROADCAST_REDUCE_INL_CUH_ @@ -31,8 +31,7 @@ using namespace mshadow::cuda; template struct VectorizedBinaryBroadcastParam { const DType* inputs[2]; - DType* outputs[2]; // Only the first one is used in the computation - // the other one is used for alignment checking + DType* outputs[1]; Shape stride[2]; Shape oshape; index_t size[2]; @@ -44,26 +43,33 @@ using common::cuda::VectorizedStorer; template __global__ void VectorizedBinaryBroadcastKernel( const VectorizedBinaryBroadcastParam param, - const index_t N) { + const index_t lead_dim, const index_t other_dim, + const index_t num_aligned_elements) { constexpr int nvec = sizeof(LType) / sizeof(DType); - const index_t M = N / nvec; + const index_t M = num_aligned_elements * other_dim; VectorizedLoader lloader(param.inputs[0], param.size[0]); VectorizedLoader rloader(param.inputs[1], param.size[1]); - VectorizedStorer storer(param.outputs[0], N); for (index_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < M; idx += gridDim.x * blockDim.x) { + const index_t row = idx / num_aligned_elements; + const index_t lead_dim_idx = idx - row * num_aligned_elements; + VectorizedStorer storer(param.outputs[0] + row * lead_dim, lead_dim); + index_t lindex, rindex; - unravel_dot(idx * nvec, param.oshape, + const index_t original_idx = max(lead_dim_idx * nvec - + lloader.alignment() + row * lead_dim, + 0); + unravel_dot(original_idx, param.oshape, param.stride[0], param.stride[1], &lindex, &rindex); - lloader.load(lindex / nvec, param.size[0]); - rloader.load(rindex / nvec, param.size[1]); + lloader.load((lindex + lloader.alignment()) / nvec, param.size[0]); + rloader.load((rindex + lloader.alignment()) / nvec, param.size[1]); if (req == kAddTo) { - storer.load(idx, N); + storer.load(lead_dim_idx, lead_dim); } #pragma unroll for (int i = 0; i < lloader.nvec(); ++i) { @@ -76,38 +82,44 @@ __global__ void VectorizedBinaryBroadcastKernel( storer.separate()[i] = temp; } } - storer.store(idx, N); + storer.store(lead_dim_idx, lead_dim); } } template __global__ void VectorizedBinaryBroadcastSingleSideKernel( const VectorizedBinaryBroadcastParam param, - const index_t N) { + const index_t lead_dim, const index_t other_dim, + const index_t num_aligned_elements) { constexpr int nvec = sizeof(LType) / sizeof(DType); - const index_t M = N / nvec; + const index_t M = num_aligned_elements * other_dim; constexpr int other_side = 1 - side; VectorizedLoader lloader(param.inputs[side], param.size[side]); - VectorizedStorer storer(param.outputs[0], N); for (index_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < M; idx += gridDim.x * blockDim.x) { - index_t lindex, rindex; - unravel_dot(idx * nvec, param.oshape, - param.stride[side], param.stride[other_side], - &lindex, &rindex); - lloader.load(lindex / nvec, param.size[side]); + const index_t row = idx / num_aligned_elements; + const index_t lead_dim_idx = idx - row * num_aligned_elements; + VectorizedStorer storer(param.outputs[0] + row * lead_dim, lead_dim); + const index_t original_idx = lead_dim_idx * nvec - + lloader.alignment() + row * lead_dim; + const index_t original_idx_clamped = max(0, original_idx); + const index_t lindex = 
mxnet_op::unravel_dot(original_idx_clamped, param.oshape, + param.stride[side]); + lloader.load((lindex + lloader.alignment()) / nvec, param.size[side]); if (req == kAddTo) { - storer.load(idx, N); + storer.load(lead_dim_idx, lead_dim); } #pragma unroll for (int i = 0; i < lloader.nvec(); ++i) { - if (i != 0) { - rindex = mxnet_op::unravel_dot(idx * nvec + i, param.oshape, param.stride[other_side]); - } + const index_t rindex = min(max(mxnet_op::unravel_dot(original_idx + i, + param.oshape, + param.stride[other_side]), + 0), + param.size[other_side] - 1); DType rinput = param.inputs[other_side][rindex]; DType temp; if (side == 0) { @@ -120,15 +132,13 @@ __global__ void VectorizedBinaryBroadcastSingleSideKernel( lloader.separate()[i]); } - printf("thread %d %d %d: %d %d %d %f %f %f\n", threadIdx.x, blockIdx.x, idx, lindex, rindex, i, (float)lloader.separate()[i], (float)rinput, (float)temp); - if (req == kAddTo) { storer.separate()[i] += temp; } else { storer.separate()[i] = temp; } } - storer.store(idx, N); + storer.store(lead_dim_idx, lead_dim); } } @@ -140,7 +150,8 @@ class VectorizedBinaryBroadcastFwd { template static void Launch(const index_t blocks, const index_t threads, cudaStream_t stream, - const ParamType params, const index_t N) { + const ParamType params, const index_t lead_dim, + const index_t other_dim) { int common_shape = 1; int first_different = -1; for (int i = ndim - 1; i >= 0; --i) { @@ -153,32 +164,26 @@ class VectorizedBinaryBroadcastFwd { } if (common_shape != 1) { + VectorizedLoader loader(params.inputs[0], lead_dim); + const index_t num_elements_per_row = loader.num_aligned_elements(); VectorizedBinaryBroadcastKernel - <<>>(params, N); + <<>>(params, lead_dim, other_dim, num_elements_per_row); } else { if (params.stride[0][first_different] == 0) { + VectorizedLoader loader(params.inputs[1], lead_dim); + const index_t num_elements_per_row = loader.num_aligned_elements(); VectorizedBinaryBroadcastSingleSideKernel - <<>>(params, N); + <<>>(params, lead_dim, other_dim, num_elements_per_row); } else { + VectorizedLoader loader(params.inputs[0], lead_dim); + const index_t num_elements_per_row = loader.num_aligned_elements(); VectorizedBinaryBroadcastSingleSideKernel - <<>>(params, N); + <<>>(params, lead_dim, other_dim, num_elements_per_row); } } } }; -inline void PrintTensor(const TBlob& blob, const std::string& name) { - const index_t size = blob.shape_.Size(); - float* temp = new float[size]; - cudaMemcpy(temp, blob.dptr_, size * sizeof(float), cudaMemcpyDeviceToHost); - std::cout << name << std::endl; - for (int i = 0; i < size; ++i) { - std::cout << i << ": " << temp[i] << std::endl; - } - std::cout << "End: " << name << std::endl; - delete[] temp; -} - template void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, const TBlob& lhs, const TBlob& rhs, const TBlob& out) { @@ -189,10 +194,6 @@ void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, Shape lstride = mxnet_op::calc_stride(lhs.shape_.get()); Shape rstride = mxnet_op::calc_stride(rhs.shape_.get()); - std::cout << "lshape: " << lhs.shape_ << std::endl; - std::cout << "rshape: " << rhs.shape_ << std::endl; - PrintTensor(lhs, "lhs"); - PrintTensor(rhs, "rhs"); MXNET_ASSIGN_REQ_SWITCH(req, Req, { using LType = uint2; @@ -209,23 +210,20 @@ void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, param.size[0] = lhs.shape_.Size(); param.size[1] = rhs.shape_.Size(); + index_t lead_dim = 1; for (int i = ndim - 1; i >= 0; --i) { /* Find the first non-1 dimension to check the alignment */ if 
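This index arithmetic is the core of the misalignment fix: in the unaligned path the kernel works row by row, each row of lead_dim elements is covered by num_aligned_elements vector chunks, and the first chunk of a row starts alignment() elements before the row itself, so the recovered element index has to be clamped before it is unravelled into coordinates. A small host-only illustration of that mapping with made-up sizes:

#include <algorithm>
#include <cstdio>

int main() {
  const int nvec = 4;       // elements carried per vector load
  const int alignment = 1;  // the row pointer sits 1 element past an aligned boundary
  const int lead_dim = 6;   // elements per row
  // Number of vector chunks needed to cover one (shifted) row.
  const int num_aligned_elements = (lead_dim + alignment + nvec - 1) / nvec;

  for (int row = 0; row < 2; ++row) {
    for (int chunk = 0; chunk < num_aligned_elements; ++chunk) {
      // Same formula as original_idx in the kernel: back the chunk up by the
      // alignment, clamp at the row start, then offset by the row.
      const int original_idx =
          std::max(chunk * nvec - alignment, 0) + row * lead_dim;
      std::printf("row %d, chunk %d -> first in-bounds element %d\n",
                  row, chunk, original_idx);
    }
  }
  return 0;
}

The min/max clamp around unravel_dot on the broadcast side plays the same role at the other edge: lanes of the first and last chunk that fall outside the row must not index past the tensor, matching the predicated edge handling inside the vectorized accessors.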
(param.oshape[i] != 1) { - param.outputs[1] = param.outputs[0] + param.oshape[i]; + lead_dim = param.oshape[i]; break; } - if (i == 0) { - /* All dimensions are 1 */ - param.outputs[1] = param.outputs[0]; - } } + const index_t other_dim = out.shape_.Size() / lead_dim; - VectorizedKernelLauncher(N, s, param); + VectorizedKernelLauncher(lead_dim, other_dim, s, param); }); - PrintTensor(out, "out"); } const int nthread_reduce = kMaxThreadsPerBlock; diff --git a/src/operator/tensor/elemwise_binary_op.cuh b/src/operator/tensor/elemwise_binary_op.cuh index 9ddb82a7ad53..0bb9fa636f45 100644 --- a/src/operator/tensor/elemwise_binary_op.cuh +++ b/src/operator/tensor/elemwise_binary_op.cuh @@ -191,9 +191,10 @@ class VectorizedBinaryFwd { template static void Launch(const index_t blocks, const index_t threads, cudaStream_t stream, - const ParamType params, const index_t N) { + const ParamType params, const index_t lead_dim, + const index_t /* other_dim */) { VectorizedBinaryKernelFwd - <<>>(params, N); + <<>>(params, lead_dim); } }; @@ -205,9 +206,10 @@ class VectorizedBinaryBwdUseNone { template static void Launch(const index_t blocks, const index_t threads, cudaStream_t stream, - const ParamType params, const index_t N) { + const ParamType params, const index_t lead_dim, + const index_t /* other_dim */) { VectorizedBinaryKernelBwdUseNone - <<>>(params, N); + <<>>(params, lead_dim); } }; @@ -219,9 +221,10 @@ class VectorizedBinaryBwdUseIn { template static void Launch(const index_t blocks, const index_t threads, cudaStream_t stream, - const ParamType params, const index_t N) { + const ParamType params, const index_t lead_dim, + const index_t /* other_dim */) { VectorizedBinaryKernelBwdUseIn - <<>>(params, N); + <<>>(params, lead_dim); } }; @@ -248,7 +251,7 @@ void ElemwiseBinaryOp::Compute_(const nnvm::NodeAttrs &attrs, params.inputs[1] = inputs[1].dptr(); params.outputs[0] = outputs[0].dptr(); - VectorizedKernelLauncher(size, s, params); + VectorizedKernelLauncher(size, 1, s, params); }); }); } @@ -275,7 +278,7 @@ void ElemwiseBinaryOp::BackwardUseNone_(const nnvm::NodeAttrs &attrs, params.outputs[0] = outputs[0].dptr(); params.outputs[1] = outputs[1].dptr(); - VectorizedKernelLauncher(size, s, params); + VectorizedKernelLauncher(size, 1, s, params); }); }); } @@ -305,7 +308,7 @@ void ElemwiseBinaryOp::BackwardUseIn_(const nnvm::NodeAttrs &attrs, params.outputs[0] = outputs[0].dptr(); params.outputs[1] = outputs[1].dptr(); - VectorizedKernelLauncher(size, s, params); + VectorizedKernelLauncher(size, 1, s, params); }); }); }); diff --git a/src/operator/tensor/elemwise_binary_scalar_op.cuh b/src/operator/tensor/elemwise_binary_scalar_op.cuh index 1de262360109..062c18767ac6 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.cuh +++ b/src/operator/tensor/elemwise_binary_scalar_op.cuh @@ -121,9 +121,10 @@ class VectorizedBinaryScalarFwd { template static void Launch(const index_t blocks, const index_t threads, cudaStream_t stream, - const ParamType params, const index_t N) { + const ParamType params, const index_t lead_dim, + const index_t /* other_dim */) { VectorizedBinaryScalarKernelFwd - <<>>(params, N); + <<>>(params, lead_dim); } }; @@ -135,9 +136,10 @@ class VectorizedBinaryScalarBwd { template static void Launch(const index_t blocks, const index_t threads, cudaStream_t stream, - const ParamType params, const index_t N) { + const ParamType params, const index_t lead_dim, + const index_t /* other_dim */) { VectorizedBinaryScalarKernelBwd - <<>>(params, N); + <<>>(params, lead_dim); } }; @@ 
-165,7 +167,7 @@ void BinaryScalarOp::Compute_(const nnvm::NodeAttrs &attrs, params.outputs[0] = outputs[0].dptr(); params.scalar = (DType)alpha; - VectorizedKernelLauncher(size, s, params); + VectorizedKernelLauncher(size, 1, s, params); }); }); } @@ -193,7 +195,7 @@ void BinaryScalarOp::Backward_(const nnvm::NodeAttrs &attrs, params.outputs[0] = outputs[0].dptr(); params.scalar = (DType)alpha; - VectorizedKernelLauncher(size, s, params); + VectorizedKernelLauncher(size, 1, s, params); }); }); } diff --git a/src/operator/tensor/elemwise_sum.cu b/src/operator/tensor/elemwise_sum.cu index 8947de2fb52e..352c74ea9445 100644 --- a/src/operator/tensor/elemwise_sum.cu +++ b/src/operator/tensor/elemwise_sum.cu @@ -89,9 +89,10 @@ class VectorizedElementwiseSumFwd { template static void Launch(const index_t blocks, const index_t threads, cudaStream_t stream, - const ParamType params, const index_t N) { + const ParamType params, const index_t lead_dim, + const index_t /* other_dim */) { VectorizedElementwiseSumKernel - <<>>(params, N); + <<>>(params, lead_dim); } }; @@ -116,7 +117,7 @@ void VectorizedElementwiseSum(const nnvm::NodeAttrs &attrs, params.inputs[j] = inputs[i + j].dptr(); } params.outputs[0] = outputs[0].dptr(); - VectorizedKernelLauncher(size, s, params); + VectorizedKernelLauncher(size, 1, s, params); } else { /* During subsequent launches we need to accumulate into the previous outputs @@ -128,7 +129,7 @@ void VectorizedElementwiseSum(const nnvm::NodeAttrs &attrs, params.inputs[j] = inputs[i + j].dptr(); } params.outputs[0] = outputs[0].dptr(); - VectorizedKernelLauncher(size, s, params); + VectorizedKernelLauncher(size, 1, s, params); } } }); diff --git a/src/operator/tensor/elemwise_unary_op.cuh b/src/operator/tensor/elemwise_unary_op.cuh index 5b8467f5c26c..8688a8b8ac66 100644 --- a/src/operator/tensor/elemwise_unary_op.cuh +++ b/src/operator/tensor/elemwise_unary_op.cuh @@ -86,9 +86,10 @@ class VectorizedUnaryScalarFwd { template static void Launch(const index_t blocks, const index_t threads, cudaStream_t stream, - const ParamType params, const index_t N) { + const ParamType params, const index_t lead_dim, + const index_t /* other_dim */) { VectorizedUnaryScalarKernelFwd - <<>>(params, N); + <<>>(params, lead_dim); } }; @@ -114,7 +115,7 @@ void UnaryOp::Compute_(const nnvm::NodeAttrs& attrs, params.inputs[0] = inputs[0].dptr(); params.outputs[0] = outputs[0].dptr(); - VectorizedKernelLauncher(size, s, params); + VectorizedKernelLauncher(size, 1, s, params); }); }); } From f5f5d3e8fef23fb1ae1e5e43330a00bc4cfe6825 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 16 Apr 2020 17:36:48 -0700 Subject: [PATCH 29/37] Added tests --- tests/python/unittest/test_operator.py | 43 ++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 230073aedf90..fa5fdefd1028 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -9937,6 +9937,49 @@ def test_elemwise_sum_for_gradient_accumulation(): assert stored_grad['write'] == stored_grad['add'] assert stored_grad['write'] == 2 * nrepeat +def test_elementwise_ops_on_misaligned_input(): + a = mx.nd.array([1,2,3,4], dtype='float16') + b = mx.nd.array([1,2,3,4], dtype='float16') + + c = a[1:3] + d = b[1:3] + # Note: testing just elemwise_add since all elemwise_ops + # share the implementation + mx.nd.elemwise_add(c, d, out=c) + mx.nd.waitall() + + a = mx.nd.array([1,2,3,4], dtype='float16') + b = 
mx.nd.array([1,2,3,4], dtype='float16') + + c = a[0:3] + d = b[0:3] + mx.nd.elemwise_add(c, d, out=c) + mx.nd.waitall() + assert a[3].asscalar() == 4.0 + +def test_broadcast_ops_on_misaligned_input(): + a = mx.nd.array([1,2,3,4,5,6,7,8]) + b = mx.nd.array([1,2,3,4,5,6,7,8]) + c = a[1:7].reshape((3,2)) + d = b[1:3] + e = mx.nd.arange(7) + f = e[1:].reshape((3,2)) + mx.nd.broadcast_add(c, d, out=f) + expected = np.array([[4,6],[6,8],[8,10]]) + mx.nd.waitall() + assert_almost_equal(f, expected) + + a = mx.nd.array([1,2,3,4,5,6,7,8]) + b = mx.nd.array([1,2,3,4,5,6,7,8]) + + c = a[1:7].reshape((3,2)) + d = b[1:4].reshape((3,1)) + e = mx.nd.arange(7) + f = e[1:].reshape((3,2)) + mx.nd.broadcast_add(c, d, out=f) + expected = np.array([[4,5],[7,8],[10,11]]) + mx.nd.waitall() + assert_almost_equal(f, expected) if __name__ == '__main__': import nose From 4120fe82ef564350c557f8182c1d5b6200dd11ac Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 16 Apr 2020 18:02:03 -0700 Subject: [PATCH 30/37] Added docs to cuda_vectorization.cuh --- src/common/cuda_vectorization.cuh | 47 +++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/src/common/cuda_vectorization.cuh b/src/common/cuda_vectorization.cuh index 33c4f6dbec43..7803afb901ab 100644 --- a/src/common/cuda_vectorization.cuh +++ b/src/common/cuda_vectorization.cuh @@ -36,6 +36,9 @@ namespace mxnet { namespace common { namespace cuda { +/* \brief Helper class that enables storing multiple values of type DType + as 1 value of type LType. +*/ template class VectorizedStorage { public: @@ -49,6 +52,11 @@ class VectorizedStorage { } scratch_; }; +/* \brief Helper class that enables accessing multiple values of type DType + as 1 value of type LType. Additional aligned template argument + allows performance optimizations if the pointer and the size of + the allocation is aligned to sizeof(LType) / sizeof(DType) elements. +*/ template class VectorizedAccessor { public: @@ -61,36 +69,44 @@ class VectorizedAccessor { int alignment_; index_t n_elems_; - MSHADOW_XINLINE VectorizedAccessor(DType* ptr, const index_t N) { + MSHADOW_XINLINE VectorizedAccessor(DType* ptr, const index_t size) { unaligned_ptr_ = ptr; if (aligned) { alignment_ = 0; aligned_ptr_ = reinterpret_cast(ptr); - n_elems_ = (N + storage_.nvec - 1) / storage_.nvec; + n_elems_ = (size + storage_.nvec - 1) / storage_.nvec; } else { size_t ptr_as_number = reinterpret_cast(ptr); alignment_ = (ptr_as_number % sizeof(LType)) / sizeof(DType); aligned_ptr_ = reinterpret_cast(ptr - alignment_); - n_elems_ = (N + alignment_ + storage_.nvec - 1) / storage_.nvec; + n_elems_ = (size + alignment_ + storage_.nvec - 1) / storage_.nvec; } } + /* \brief Alignment of the input pointer in elements. */ MSHADOW_XINLINE int alignment() const { return alignment_; } + /* \brief Access to separate elements. */ MSHADOW_XINLINE DType* separate() { return storage_.scratch_.separate; } + /* \brief Number of elements stored. */ MSHADOW_XINLINE constexpr int nvec() const { return storage_.nvec; } + /* \brief Number of aligned elements that span the entire input tensor. */ MSHADOW_XINLINE index_t num_aligned_elements() const { return n_elems_; } + /* \brief Load values from the input. + \param id Aligned index of the element. + \param N size of the tensor. 
+ */ MSHADOW_XINLINE void load(const index_t id, const index_t N) { if (aligned) { storage_.scratch_.aligned = aligned_ptr_[id]; @@ -111,6 +127,7 @@ class VectorizedAccessor { } }; +/* \brief Class used for vectorized read-only access. */ template class VectorizedLoader : public VectorizedAccessor { public: @@ -119,6 +136,7 @@ class VectorizedLoader : public VectorizedAccessor class VectorizedStorer : public VectorizedAccessor { public: @@ -126,6 +144,10 @@ class VectorizedStorer : public VectorizedAccessor { VectorizedAccessor(ptr, N) { } + /* \brief Store values to the output. + \param id Aligned index of the element. + \param N size of the tensor. + */ MSHADOW_XINLINE void store(const index_t id, const index_t N) { if (aligned) { this->aligned_ptr_[id] = this->storage_.scratch_.aligned; @@ -149,9 +171,9 @@ class VectorizedStorer : public VectorizedAccessor { namespace { enum class Alignment { - SAME_ALIGNED, - SAME_UNALIGNED, - DIFFERENT + SAME_ALIGNED, // All tensors aligned + SAME_UNALIGNED, // All tensors have the same misalignment + DIFFERENT // Tensors have different alignment }; template @@ -160,6 +182,11 @@ int CalcAlignment(const DType* ptr) { return ptr_as_number % sizeof(LType); } +/* \brief Check alignment of the inputs and outputs when cast to LType*. + \param params Structuce containing arrays with inputs' and outputs' pointers + \param lead_dim Leading dimension of the tensors. + \param other_dim The size of the other dimensions of the tensors. +*/ template Alignment CheckAlignment(const Params& params, const index_t lead_dim, const index_t other_dim) { int align = -1; @@ -204,6 +231,14 @@ constexpr int vectorized_kernel_thread_num = 512; } // namespace +/* \brief Helper launcher function for the vectorized kernels. Checks for alignment of the + input and output tensors and launches a proper template. + \param lead_dim Leading dimension of the tensors. + \param other_dim The size of the other dimensions. + \param s Stream which should be used for launching the kernel. + \param params Input parameters to the kernel. Needs to contain at least 2 arrays of DType*: + inputs and outputs, which contain input and output pointers. 
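Taken together, the classes documented above implement one idea: move nvec values of type DType through memory as a single LType transaction, work on them individually through the union, and fall back to predicated element-wise accesses only at misaligned edges. Below is a minimal CUDA sketch of just the aligned fast path; the kernel and its names are illustrative, it is not the MXNet implementation, and it does not handle misaligned pointers or partial vectors.

#include <cstdio>
#include <cuda_runtime.h>

template <typename DType, typename LType>
__global__ void scale_vectorized(DType* out, const DType* in, DType factor,
                                 int num_loads) {
  constexpr int nvec = sizeof(LType) / sizeof(DType);
  // Same trick as the vectorized storage: one wide value aliased with nvec narrow ones.
  union Pack {
    LType aligned;
    DType separate[nvec];
  };
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= num_loads) return;
  Pack p;
  p.aligned = reinterpret_cast<const LType*>(in)[tid];  // one wide load
#pragma unroll
  for (int i = 0; i < nvec; ++i) {
    p.separate[i] *= factor;                            // element-wise work
  }
  reinterpret_cast<LType*>(out)[tid] = p.aligned;       // one wide store
}

int main() {
  const int n = 8;  // multiple of nvec; cudaMallocManaged pointers are well aligned
  float *in = nullptr, *out = nullptr;
  cudaMallocManaged(&in, n * sizeof(float));
  cudaMallocManaged(&out, n * sizeof(float));
  for (int i = 0; i < n; ++i) in[i] = static_cast<float>(i);
  // float2 carries two floats per transaction, so n/2 threads cover the tensor.
  scale_vectorized<float, float2><<<1, n / 2>>>(out, in, 2.0f, n / 2);
  cudaDeviceSynchronize();
  for (int i = 0; i < n; ++i) std::printf("%g ", out[i]);
  std::printf("\n");  // prints: 0 2 4 6 8 10 12 14
  cudaFree(in);
  cudaFree(out);
  return 0;
}

The unaligned case is what the rest of the accessor adds on top: the pointer is backed up to the previous LType boundary, num_aligned_elements() can grow by one chunk, and load() and store() predicate the edge lanes against the real tensor bounds.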
+*/ template void VectorizedKernelLauncher(const index_t lead_dim, const index_t other_dim, From c1a734a7cfaf1d0c08566d62ab955c08f45a3695 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 16 Apr 2020 20:59:08 -0700 Subject: [PATCH 31/37] Another fix for broadcast and fix INT64 compilation --- src/operator/tensor/broadcast_reduce-inl.cuh | 12 ++-- tests/python/unittest/test_operator.py | 75 ++++++++++++++------ 2 files changed, 62 insertions(+), 25 deletions(-) diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh b/src/operator/tensor/broadcast_reduce-inl.cuh index 64adb0880346..8de7e5b489dc 100644 --- a/src/operator/tensor/broadcast_reduce-inl.cuh +++ b/src/operator/tensor/broadcast_reduce-inl.cuh @@ -59,9 +59,9 @@ __global__ void VectorizedBinaryBroadcastKernel( VectorizedStorer storer(param.outputs[0] + row * lead_dim, lead_dim); index_t lindex, rindex; - const index_t original_idx = max(lead_dim_idx * nvec - - lloader.alignment() + row * lead_dim, - 0); + const index_t original_idx = max(lead_dim_idx * nvec - lloader.alignment(), + static_cast(0)) + + row * lead_dim; unravel_dot(original_idx, param.oshape, param.stride[0], param.stride[1], &lindex, &rindex); @@ -105,7 +105,9 @@ __global__ void VectorizedBinaryBroadcastSingleSideKernel( VectorizedStorer storer(param.outputs[0] + row * lead_dim, lead_dim); const index_t original_idx = lead_dim_idx * nvec - lloader.alignment() + row * lead_dim; - const index_t original_idx_clamped = max(0, original_idx); + const index_t original_idx_clamped = max(lead_dim_idx * nvec - lloader.alignment(), + static_cast(0)) + + row * lead_dim; const index_t lindex = mxnet_op::unravel_dot(original_idx_clamped, param.oshape, param.stride[side]); lloader.load((lindex + lloader.alignment()) / nvec, param.size[side]); @@ -118,7 +120,7 @@ __global__ void VectorizedBinaryBroadcastSingleSideKernel( const index_t rindex = min(max(mxnet_op::unravel_dot(original_idx + i, param.oshape, param.stride[other_side]), - 0), + static_cast(0)), param.size[other_side] - 1); DType rinput = param.inputs[other_side][rindex]; DType temp; diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index fa5fdefd1028..b6f81e2f9e48 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -9937,6 +9937,7 @@ def test_elemwise_sum_for_gradient_accumulation(): assert stored_grad['write'] == stored_grad['add'] assert stored_grad['write'] == 2 * nrepeat +@with_seed() def test_elementwise_ops_on_misaligned_input(): a = mx.nd.array([1,2,3,4], dtype='float16') b = mx.nd.array([1,2,3,4], dtype='float16') @@ -9957,29 +9958,63 @@ def test_elementwise_ops_on_misaligned_input(): mx.nd.waitall() assert a[3].asscalar() == 4.0 +@with_seed() def test_broadcast_ops_on_misaligned_input(): - a = mx.nd.array([1,2,3,4,5,6,7,8]) - b = mx.nd.array([1,2,3,4,5,6,7,8]) - c = a[1:7].reshape((3,2)) - d = b[1:3] - e = mx.nd.arange(7) - f = e[1:].reshape((3,2)) - mx.nd.broadcast_add(c, d, out=f) - expected = np.array([[4,6],[6,8],[8,10]]) - mx.nd.waitall() - assert_almost_equal(f, expected) + dtypes = ['float16', 'float32', 'float64'] + lead_dims = [2,4,6,10] - a = mx.nd.array([1,2,3,4,5,6,7,8]) - b = mx.nd.array([1,2,3,4,5,6,7,8]) + for dtype in dtypes: + for lead_dim in lead_dims: + for both_ways in [False, True]: + shape = list(rand_shape_2d()) + [lead_dim] + small_shape = [shape[0], 1, lead_dim] + if both_ways: + # Broadcast in both ways [1, K, L] x [M, 1, L] + big_shape = [1, shape[1], lead_dim] + else: + big_shape = shape + 
size = np.product(shape) + small_size = np.product(small_shape) + big_size = np.product(big_shape) + a = mx.nd.arange(5000) + b = mx.nd.arange(5000) + e = mx.nd.arange(5000) + c = a[1:big_size + 1].reshape(big_shape) + d = b[1:small_size + 1].reshape(small_shape) + f = e[1:size + 1].reshape(shape) + mx.nd.broadcast_add(c, d, out=f) + expected = c.asnumpy() + d.asnumpy() + mx.nd.waitall() + assert_almost_equal(f, expected) + +@with_seed() +def test_broadcast_ops_on_misaligned_input_oneside(): + dtypes = ['float16', 'float32', 'float64'] + lead_dims = [2,4,6,10] - c = a[1:7].reshape((3,2)) - d = b[1:4].reshape((3,1)) - e = mx.nd.arange(7) - f = e[1:].reshape((3,2)) - mx.nd.broadcast_add(c, d, out=f) - expected = np.array([[4,5],[7,8],[10,11]]) - mx.nd.waitall() - assert_almost_equal(f, expected) + for dtype in dtypes: + for lead_dim in lead_dims: + for both_ways in [False, True]: + shape = list(rand_shape_2d()) + [lead_dim] + small_shape = [shape[0], shape[1], 1] + if both_ways: + # Broadcast in both ways [1, K, L] x [M, 1, 1] + big_shape = [1, shape[1], lead_dim] + else: + big_shape = shape + size = np.product(shape) + small_size = np.product(small_shape) + big_size = np.product(big_shape) + a = mx.nd.arange(5000) + b = mx.nd.arange(5000) + e = mx.nd.arange(5000) + c = a[1:big_size + 1].reshape(big_shape) + d = b[1:small_size + 1].reshape(small_shape) + f = e[1:size + 1].reshape(shape) + mx.nd.broadcast_add(c, d, out=f) + expected = c.asnumpy() + d.asnumpy() + mx.nd.waitall() + assert_almost_equal(f, expected) if __name__ == '__main__': import nose From de6125e61c310167e4243a6f41b3d037e636ff8f Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 16 Apr 2020 23:06:57 -0700 Subject: [PATCH 32/37] Optimize for aligned=true --- src/operator/tensor/broadcast_reduce-inl.cuh | 95 ++++++++++++++------ 1 file changed, 67 insertions(+), 28 deletions(-) diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh b/src/operator/tensor/broadcast_reduce-inl.cuh index 8de7e5b489dc..790f1afb857c 100644 --- a/src/operator/tensor/broadcast_reduce-inl.cuh +++ b/src/operator/tensor/broadcast_reduce-inl.cuh @@ -47,6 +47,7 @@ __global__ void VectorizedBinaryBroadcastKernel( const index_t num_aligned_elements) { constexpr int nvec = sizeof(LType) / sizeof(DType); const index_t M = num_aligned_elements * other_dim; + const index_t N = lead_dim * other_dim; VectorizedLoader lloader(param.inputs[0], param.size[0]); VectorizedLoader rloader(param.inputs[1], param.size[1]); @@ -54,22 +55,41 @@ __global__ void VectorizedBinaryBroadcastKernel( for (index_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < M; idx += gridDim.x * blockDim.x) { - const index_t row = idx / num_aligned_elements; - const index_t lead_dim_idx = idx - row * num_aligned_elements; - VectorizedStorer storer(param.outputs[0] + row * lead_dim, lead_dim); - - index_t lindex, rindex; - const index_t original_idx = max(lead_dim_idx * nvec - lloader.alignment(), - static_cast(0)) + - row * lead_dim; - unravel_dot(original_idx, param.oshape, - param.stride[0], param.stride[1], - &lindex, &rindex); - lloader.load((lindex + lloader.alignment()) / nvec, param.size[0]); - rloader.load((rindex + lloader.alignment()) / nvec, param.size[1]); + DType * current_output_pointer; + index_t output_size; + index_t output_idx; + if (aligned) { + // Simplified case + index_t lindex, rindex; + unravel_dot(idx * nvec, param.oshape, + param.stride[0], param.stride[1], + &lindex, &rindex); + lloader.load(lindex / nvec, param.size[0]); + rloader.load(rindex / nvec, 
param.size[1]); + current_output_pointer = param.outputs[0]; + output_size = N; + output_idx = idx; + } else { + const index_t row = idx / num_aligned_elements; + const index_t lead_dim_idx = idx - row * num_aligned_elements; + + index_t lindex, rindex; + const index_t original_idx = max(lead_dim_idx * nvec - lloader.alignment(), + static_cast(0)) + + row * lead_dim; + unravel_dot(original_idx, param.oshape, + param.stride[0], param.stride[1], + &lindex, &rindex); + lloader.load((lindex + lloader.alignment()) / nvec, param.size[0]); + rloader.load((rindex + lloader.alignment()) / nvec, param.size[1]); + current_output_pointer = param.outputs[0] + row * lead_dim; + output_size = lead_dim; + output_idx = lead_dim_idx; + } + VectorizedStorer storer(current_output_pointer, output_size); if (req == kAddTo) { - storer.load(lead_dim_idx, lead_dim); + storer.load(output_idx, output_size); } #pragma unroll for (int i = 0; i < lloader.nvec(); ++i) { @@ -82,7 +102,7 @@ __global__ void VectorizedBinaryBroadcastKernel( storer.separate()[i] = temp; } } - storer.store(lead_dim_idx, lead_dim); + storer.store(output_idx, output_size); } } @@ -93,6 +113,7 @@ __global__ void VectorizedBinaryBroadcastSingleSideKernel( const index_t num_aligned_elements) { constexpr int nvec = sizeof(LType) / sizeof(DType); const index_t M = num_aligned_elements * other_dim; + const index_t N = lead_dim * other_dim; constexpr int other_side = 1 - side; VectorizedLoader lloader(param.inputs[side], param.size[side]); @@ -100,20 +121,38 @@ __global__ void VectorizedBinaryBroadcastSingleSideKernel( for (index_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < M; idx += gridDim.x * blockDim.x) { - const index_t row = idx / num_aligned_elements; - const index_t lead_dim_idx = idx - row * num_aligned_elements; - VectorizedStorer storer(param.outputs[0] + row * lead_dim, lead_dim); - const index_t original_idx = lead_dim_idx * nvec - - lloader.alignment() + row * lead_dim; - const index_t original_idx_clamped = max(lead_dim_idx * nvec - lloader.alignment(), - static_cast(0)) + - row * lead_dim; - const index_t lindex = mxnet_op::unravel_dot(original_idx_clamped, param.oshape, - param.stride[side]); - lloader.load((lindex + lloader.alignment()) / nvec, param.size[side]); + index_t original_idx; + DType * current_output_pointer; + index_t output_size; + index_t output_idx; + if (aligned) { + //Simplified case + original_idx = idx * nvec; + const index_t lindex = mxnet_op::unravel_dot(original_idx, param.oshape, + param.stride[side]); + lloader.load(lindex / nvec, param.size[side]); + current_output_pointer = param.outputs[0]; + output_size = N; + output_idx = idx; + } else { + const index_t row = idx / num_aligned_elements; + const index_t lead_dim_idx = idx - row * num_aligned_elements; + original_idx = lead_dim_idx * nvec - + lloader.alignment() + row * lead_dim; + const index_t original_idx_clamped = max(lead_dim_idx * nvec - lloader.alignment(), + static_cast(0)) + + row * lead_dim; + const index_t lindex = mxnet_op::unravel_dot(original_idx_clamped, param.oshape, + param.stride[side]); + lloader.load((lindex + lloader.alignment()) / nvec, param.size[side]); + current_output_pointer = param.outputs[0] + row * lead_dim; + output_size = lead_dim; + output_idx = lead_dim_idx; + } + VectorizedStorer storer(current_output_pointer, output_size); if (req == kAddTo) { - storer.load(lead_dim_idx, lead_dim); + storer.load(output_idx, output_size); } #pragma unroll for (int i = 0; i < lloader.nvec(); ++i) { @@ -140,7 +179,7 @@ __global__ 
void VectorizedBinaryBroadcastSingleSideKernel( storer.separate()[i] = temp; } } - storer.store(lead_dim_idx, lead_dim); + storer.store(output_idx, output_size); } } From 9129ba21ae34d36cd7f55881ea5e75f41345bcb3 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 16 Apr 2020 23:46:45 -0700 Subject: [PATCH 33/37] 1 more addition to test --- tests/python/unittest/test_operator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index b6f81e2f9e48..61e385b3f4f8 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -9961,7 +9961,7 @@ def test_elementwise_ops_on_misaligned_input(): @with_seed() def test_broadcast_ops_on_misaligned_input(): dtypes = ['float16', 'float32', 'float64'] - lead_dims = [2,4,6,10] + lead_dims = [2,3,4,6,10] for dtype in dtypes: for lead_dim in lead_dims: @@ -9990,7 +9990,7 @@ def test_broadcast_ops_on_misaligned_input(): @with_seed() def test_broadcast_ops_on_misaligned_input_oneside(): dtypes = ['float16', 'float32', 'float64'] - lead_dims = [2,4,6,10] + lead_dims = [2,3,4,6,10] for dtype in dtypes: for lead_dim in lead_dims: From 43f4b4e32d988e9049a74a018237e24b39758b12 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 16 Apr 2020 23:53:44 -0700 Subject: [PATCH 34/37] Reverting the change to Numpy op test --- tests/python/unittest/test_numpy_op.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 6e80c59d9e78..4adf382802d0 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -5851,10 +5851,6 @@ def check_pinv(x, a_np, rcond_np, hermitian, use_rcond): print(e) else: assert x.shape == x_expected.shape - print("a shape:", a_np.shape) - print("a: ", a_np) - print("actual: ", x.asnumpy()) - print("expected: ", x_expected) assert_almost_equal(x.asnumpy(), x_expected, rtol=rtol, atol=atol) shapes = [ From 1af684c507dd5b2c7ab7ffe89d21799320e3d9c6 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 17 Apr 2020 09:07:55 -0700 Subject: [PATCH 35/37] Trying mcmodel=medium to fix the failure in CMake static build --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1ca92ff19a93..c0a0d53ee314 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -176,6 +176,9 @@ else() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_C_FLAGS}") + if(USE_CUDA) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcmodel=medium") + endif() endif() if(NOT mxnet_LINKER_LIBS) From 31c84b1a55dff7846acf5e40cc473f3d07346178 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 17 Apr 2020 10:58:40 -0700 Subject: [PATCH 36/37] Revert "Trying mcmodel=medium to fix the failure in CMake static build" This reverts commit 1af684c507dd5b2c7ab7ffe89d21799320e3d9c6. 
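
A note on the lead_dims change in patch 33 above: adding 3 to the swept lead dimensions matters because, together with the a[1:...] slices in the tests, an odd innermost dimension varies the alignment of row starts relative to the 8-byte vector load width, so both the aligned fast path and the unaligned branch (the code using lloader.alignment() above) actually get exercised. Below is a minimal, standalone C++ sketch of the kind of alignment offset these tests provoke; it is illustrative only, with std::uint64_t standing in for the 8-byte load type, and is not code taken from the patch.

#include <cstdint>
#include <cstdio>

// Offset, in DType elements, of ptr past the previous LType (vector load)
// boundary; 0 means the pointer can be read with aligned LType loads.
// Standalone re-derivation for illustration, not the patch's own code.
template <typename DType, typename LType>
int alignment_offset(const DType* ptr) {
  const std::uintptr_t p = reinterpret_cast<std::uintptr_t>(ptr);
  return static_cast<int>((p % sizeof(LType)) / sizeof(DType));
}

int main() {
  alignas(8) float data[16] = {};
  // data + 0 is aligned for 8-byte loads; data + 1 models an a[1:...] slice.
  std::printf("%d %d\n",
              alignment_offset<float, std::uint64_t>(data),       // prints 0
              alignment_offset<float, std::uint64_t>(data + 1));  // prints 1
  return 0;
}

Slicing an NDArray by one element shifts its data pointer by sizeof(DType), which is exactly the kind of nonzero offset this computes, and an odd lead dimension then makes that offset differ from row to row.
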
--- CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c0a0d53ee314..1ca92ff19a93 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -176,9 +176,6 @@ else() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_C_FLAGS}") - if(USE_CUDA) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcmodel=medium") - endif() endif() if(NOT mxnet_LINKER_LIBS) From b9d1760c8b321f9baa945f0ce6a9219b374fa887 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 17 Apr 2020 11:54:49 -0700 Subject: [PATCH 37/37] Limiting the PR to just elementwise ops --- .../linalg/broadcast_reduce_customized-inl.h | 4 - src/operator/numpy/np_diff-inl.h | 4 +- src/operator/tensor/broadcast_reduce-inl.cuh | 300 +++--------------- src/operator/tensor/broadcast_reduce-inl.h | 268 ++++++---------- src/operator/tensor/broadcast_reduce_op.h | 4 +- .../tensor/elemwise_binary_broadcast_op.h | 157 ++++++++- 6 files changed, 299 insertions(+), 438 deletions(-) diff --git a/src/operator/numpy/linalg/broadcast_reduce_customized-inl.h b/src/operator/numpy/linalg/broadcast_reduce_customized-inl.h index 0226df45f960..2b5970d4f4ae 100644 --- a/src/operator/numpy/linalg/broadcast_reduce_customized-inl.h +++ b/src/operator/numpy/linalg/broadcast_reduce_customized-inl.h @@ -31,10 +31,6 @@ namespace mxnet { namespace op { namespace broadcast { using namespace mshadow; -using mxnet_op::unravel; -using mxnet_op::ravel; -using mxnet_op::dot; -using mxnet_op::unravel_dot; template MSHADOW_XINLINE void seq_reduce_assign_wr(const index_t idx, const size_t M, const bool addto, diff --git a/src/operator/numpy/np_diff-inl.h b/src/operator/numpy/np_diff-inl.h index 3d80e2d941c8..8a8bc558962a 100644 --- a/src/operator/numpy/np_diff-inl.h +++ b/src/operator/numpy/np_diff-inl.h @@ -73,7 +73,7 @@ struct diff_forward { const int stride, const mshadow::Shape oshape, const mshadow::Shape ishape) { - using namespace mxnet_op; + using namespace broadcast; // j represent the memory index of the corresponding input entry int j = ravel(unravel(i, oshape), ishape); @@ -145,7 +145,7 @@ struct diff_backward { const int stride, const int axis, const mshadow::Shape oshape, const mshadow::Shape ishape) { - using namespace mxnet_op; + using namespace broadcast; if (n == 0) { igrad[i] = ograd[i]; return; diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh b/src/operator/tensor/broadcast_reduce-inl.cuh index 790f1afb857c..379443dc1688 100644 --- a/src/operator/tensor/broadcast_reduce-inl.cuh +++ b/src/operator/tensor/broadcast_reduce-inl.cuh @@ -18,253 +18,57 @@ */ /*! 
- * Copyright (c) 2015-2020 by Contributors + * Copyright (c) 2015-2017 by Contributors * \file broadcast_reduce-inl.cuh * \brief CUDA implementations for binary broadcast and reduce - * \author Antti-Pekka Hynninen, Przemyslaw Tredak + * \author Antti-Pekka Hynninen */ #ifndef MXNET_OPERATOR_TENSOR_BROADCAST_REDUCE_INL_CUH_ #define MXNET_OPERATOR_TENSOR_BROADCAST_REDUCE_INL_CUH_ using namespace mshadow::cuda; -template -struct VectorizedBinaryBroadcastParam { - const DType* inputs[2]; - DType* outputs[1]; - Shape stride[2]; - Shape oshape; - index_t size[2]; -}; - -using common::cuda::VectorizedLoader; -using common::cuda::VectorizedStorer; - -template -__global__ void VectorizedBinaryBroadcastKernel( - const VectorizedBinaryBroadcastParam param, - const index_t lead_dim, const index_t other_dim, - const index_t num_aligned_elements) { - constexpr int nvec = sizeof(LType) / sizeof(DType); - const index_t M = num_aligned_elements * other_dim; - const index_t N = lead_dim * other_dim; - - VectorizedLoader lloader(param.inputs[0], param.size[0]); - VectorizedLoader rloader(param.inputs[1], param.size[1]); - - for (index_t idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < M; - idx += gridDim.x * blockDim.x) { - DType * current_output_pointer; - index_t output_size; - index_t output_idx; - if (aligned) { - // Simplified case - index_t lindex, rindex; - unravel_dot(idx * nvec, param.oshape, - param.stride[0], param.stride[1], - &lindex, &rindex); - lloader.load(lindex / nvec, param.size[0]); - rloader.load(rindex / nvec, param.size[1]); - current_output_pointer = param.outputs[0]; - output_size = N; - output_idx = idx; - } else { - const index_t row = idx / num_aligned_elements; - const index_t lead_dim_idx = idx - row * num_aligned_elements; - - index_t lindex, rindex; - const index_t original_idx = max(lead_dim_idx * nvec - lloader.alignment(), - static_cast(0)) + - row * lead_dim; - unravel_dot(original_idx, param.oshape, - param.stride[0], param.stride[1], - &lindex, &rindex); - lloader.load((lindex + lloader.alignment()) / nvec, param.size[0]); - rloader.load((rindex + lloader.alignment()) / nvec, param.size[1]); - current_output_pointer = param.outputs[0] + row * lead_dim; - output_size = lead_dim; - output_idx = lead_dim_idx; +template +__launch_bounds__(kMaxThreadsPerBlock) +__global__ void binary_broadcast_kernel(const int N, const bool addto, + const DType* __restrict lhs, + const DType* __restrict rhs, DType *out, + const Shape lstride, const Shape rstride, + const Shape oshape) { + for (int idx = blockIdx.x * blockDim.x * unroll + threadIdx.x; idx < N; + idx += blockDim.x * gridDim.x * unroll) + { + int j[unroll]; + int k[unroll]; + DType val[unroll]; + #pragma unroll + for (int i=0;i < unroll;i++) { + unravel_dot(idx + i*blockDim.x, oshape, lstride, rstride, &j[i], &k[i]); + val[i] = OP::Map(lhs[j[i]], rhs[k[i]]); } - VectorizedStorer storer(current_output_pointer, output_size); - - if (req == kAddTo) { - storer.load(output_idx, output_size); + #pragma unroll + for (int i=0;i < unroll;i++) { + if (idx + i*blockDim.x < N) assign(&out[idx + i*blockDim.x], addto, val[i]); } -#pragma unroll - for (int i = 0; i < lloader.nvec(); ++i) { - DType temp = OP::Map(lloader.separate()[i], - rloader.separate()[i]); - if (req == kAddTo) { - storer.separate()[i] += temp; - } else { - storer.separate()[i] = temp; - } - } - storer.store(output_idx, output_size); } } -template -__global__ void VectorizedBinaryBroadcastSingleSideKernel( - const VectorizedBinaryBroadcastParam param, - const index_t 
lead_dim, const index_t other_dim, - const index_t num_aligned_elements) { - constexpr int nvec = sizeof(LType) / sizeof(DType); - const index_t M = num_aligned_elements * other_dim; - const index_t N = lead_dim * other_dim; - constexpr int other_side = 1 - side; - - VectorizedLoader lloader(param.inputs[side], param.size[side]); - - for (index_t idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < M; - idx += gridDim.x * blockDim.x) { - index_t original_idx; - DType * current_output_pointer; - index_t output_size; - index_t output_idx; - if (aligned) { - //Simplified case - original_idx = idx * nvec; - const index_t lindex = mxnet_op::unravel_dot(original_idx, param.oshape, - param.stride[side]); - lloader.load(lindex / nvec, param.size[side]); - current_output_pointer = param.outputs[0]; - output_size = N; - output_idx = idx; - } else { - const index_t row = idx / num_aligned_elements; - const index_t lead_dim_idx = idx - row * num_aligned_elements; - original_idx = lead_dim_idx * nvec - - lloader.alignment() + row * lead_dim; - const index_t original_idx_clamped = max(lead_dim_idx * nvec - lloader.alignment(), - static_cast(0)) + - row * lead_dim; - const index_t lindex = mxnet_op::unravel_dot(original_idx_clamped, param.oshape, - param.stride[side]); - lloader.load((lindex + lloader.alignment()) / nvec, param.size[side]); - current_output_pointer = param.outputs[0] + row * lead_dim; - output_size = lead_dim; - output_idx = lead_dim_idx; - } - VectorizedStorer storer(current_output_pointer, output_size); - - if (req == kAddTo) { - storer.load(output_idx, output_size); - } -#pragma unroll - for (int i = 0; i < lloader.nvec(); ++i) { - const index_t rindex = min(max(mxnet_op::unravel_dot(original_idx + i, - param.oshape, - param.stride[other_side]), - static_cast(0)), - param.size[other_side] - 1); - DType rinput = param.inputs[other_side][rindex]; - DType temp; - if (side == 0) { - // Left side is vectorized - temp = OP::Map(lloader.separate()[i], - rinput); - } else { - // Right side is vectorized - temp = OP::Map(rinput, - lloader.separate()[i]); - } - - if (req == kAddTo) { - storer.separate()[i] += temp; - } else { - storer.separate()[i] = temp; - } - } - storer.store(output_idx, output_size); - } -} - -template -class VectorizedBinaryBroadcastFwd { - public: - using ParamType = VectorizedBinaryBroadcastParam; - - template - static void Launch(const index_t blocks, const index_t threads, - cudaStream_t stream, - const ParamType params, const index_t lead_dim, - const index_t other_dim) { - int common_shape = 1; - int first_different = -1; - for (int i = ndim - 1; i >= 0; --i) { - if (params.stride[0][i] == params.stride[1][i]) { - common_shape *= params.oshape[i]; - } else { - first_different = i; - break; - } - } - - if (common_shape != 1) { - VectorizedLoader loader(params.inputs[0], lead_dim); - const index_t num_elements_per_row = loader.num_aligned_elements(); - VectorizedBinaryBroadcastKernel - <<>>(params, lead_dim, other_dim, num_elements_per_row); - } else { - if (params.stride[0][first_different] == 0) { - VectorizedLoader loader(params.inputs[1], lead_dim); - const index_t num_elements_per_row = loader.num_aligned_elements(); - VectorizedBinaryBroadcastSingleSideKernel - <<>>(params, lead_dim, other_dim, num_elements_per_row); - } else { - VectorizedLoader loader(params.inputs[0], lead_dim); - const index_t num_elements_per_row = loader.num_aligned_elements(); - VectorizedBinaryBroadcastSingleSideKernel - <<>>(params, lead_dim, other_dim, num_elements_per_row); - } - } - } 
-}; - template void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, const TBlob& lhs, const TBlob& rhs, const TBlob& out) { - using common::cuda::VectorizedKernelLauncher; if (req == kNullOp) return; cudaStream_t stream = Stream::GetStream(s); - const index_t N = out.shape_.Size(); - - Shape lstride = mxnet_op::calc_stride(lhs.shape_.get()); - Shape rstride = mxnet_op::calc_stride(rhs.shape_.get()); - - MXNET_ASSIGN_REQ_SWITCH(req, Req, { - using LType = uint2; - using Kernel = VectorizedBinaryBroadcastFwd; - - typename Kernel::ParamType param; - - param.inputs[0] = lhs.dptr(); - param.inputs[1] = rhs.dptr(); - param.outputs[0] = out.dptr(); - param.stride[0] = lstride; - param.stride[1] = rstride; - param.oshape = out.shape_.get(); - param.size[0] = lhs.shape_.Size(); - param.size[1] = rhs.shape_.Size(); - - index_t lead_dim = 1; - for (int i = ndim - 1; i >= 0; --i) { - /* Find the first non-1 dimension - to check the alignment - */ - if (param.oshape[i] != 1) { - lead_dim = param.oshape[i]; - break; - } - } - const index_t other_dim = out.shape_.Size() / lead_dim; - - VectorizedKernelLauncher(lead_dim, other_dim, s, param); - }); + int N = out.shape_.Size(); + const int warpSize = 32; + const int unroll = 2; + int nthread = std::min(kMaxThreadsPerBlock, ((N + warpSize - 1)/warpSize)*warpSize ); + int ngrid = std::min(kBaseGridNum, (N + nthread*unroll - 1) / (nthread*unroll)); + Shape lstride = calc_stride(lhs.shape_.get()); + Shape rstride = calc_stride(rhs.shape_.get()); + binary_broadcast_kernel<<>>( + N, req == kAddTo, lhs.dptr(), rhs.dptr(), out.dptr(), lstride, rstride, + out.shape_.get()); } const int nthread_reduce = kMaxThreadsPerBlock; @@ -288,8 +92,8 @@ __global__ void reduce_kernel(const int N, const int M, const bool addto, const int Mend = (int)((uint64_t)M*(uint64_t)(m0 + 1)/(uint64_t)Mnext); for (int idx0 = blockIdx.x*bx; idx0 < N; idx0 += bx*gridDim.x) { int idx = idx0 + tidx; - Shape coord = mxnet_op::unravel(idx, small_shape); - int idx_big0 = mxnet_op::ravel(coord, big_shape0); + Shape coord = unravel(idx, small_shape); + int idx_big0 = ravel(coord, big_shape0); AType val, residual; Reducer::SetInitValue(val, residual); @@ -298,7 +102,7 @@ __global__ void reduce_kernel(const int N, const int M, const bool addto, int idx_big[unroll]; #pragma unroll for (int u=0;u < unroll;u++) { - idx_big[u] = idx_big0 + mxnet_op::unravel_dot(k + u*by, big_shape, big_stride); + idx_big[u] = idx_big0 + unravel_dot(k + u*by, big_shape, big_stride); } DType tmp[unroll]; #pragma unroll @@ -371,10 +175,10 @@ __global__ void reduce_kernel(const int N, const int M, const bool addto, const int Mend = (int)((uint64_t)M*(uint64_t)(m0 + 1)/(uint64_t)Mnext); for (int idx0 = blockIdx.x*bx; idx0 < N; idx0 += bx*gridDim.x) { int idx = idx0 + tidx; - Shape coord = mxnet_op::unravel(idx, small_shape); - int idx_big0 = mxnet_op::ravel(coord, big_shape0); - int idx_lhs0 = mxnet_op::ravel(coord, lhs_shape0); - int idx_rhs0 = mxnet_op::ravel(coord, rhs_shape0); + Shape coord = unravel(idx, small_shape); + int idx_big0 = ravel(coord, big_shape0); + int idx_lhs0 = ravel(coord, lhs_shape0); + int idx_rhs0 = ravel(coord, rhs_shape0); DType val, residual; Reducer::SetInitValue(val, residual); @@ -385,9 +189,9 @@ __global__ void reduce_kernel(const int N, const int M, const bool addto, int idx_rhs[unroll]; #pragma unroll for (int u=0;u < unroll;u++) { - idx_big[u] = idx_big0 + mxnet_op::unravel_dot(k + u*by, big_shape, big_stride); - idx_lhs[u] = idx_lhs0 + mxnet_op::unravel_dot(k + u*by, lhs_shape, 
lhs_stride); - idx_rhs[u] = idx_rhs0 + mxnet_op::unravel_dot(k + u*by, rhs_shape, rhs_stride); + idx_big[u] = idx_big0 + unravel_dot(k + u*by, big_shape, big_stride); + idx_lhs[u] = idx_lhs0 + unravel_dot(k + u*by, lhs_shape, lhs_stride); + idx_rhs[u] = idx_rhs0 + unravel_dot(k + u*by, rhs_shape, rhs_stride); } DType tmp[unroll]; #pragma unroll @@ -463,8 +267,8 @@ __global__ void reduce_kernel_M1(const int N, const bool addto, const DType* __restrict big, OType *small, const Shape bshape, const Shape sshape) { for (int idx = threadIdx.x + blockIdx.x*blockDim.x; idx < N; idx += blockDim.x*gridDim.x) { - Shape coord = mxnet_op::unravel(idx, sshape); - int j = mxnet_op::ravel(coord, bshape); + Shape coord = unravel(idx, sshape); + int j = ravel(coord, bshape); AType val, residual; Reducer::SetInitValue(val, residual); Reducer::Reduce(val, AType(OP::Map(big[j])), residual); @@ -485,10 +289,10 @@ __global__ void reduce_kernel_M1(const int N, const bool addto, const Shape rhs_shape, const Shape small_shape) { for (int idx = threadIdx.x + blockIdx.x*blockDim.x; idx < N; idx += blockDim.x*gridDim.x) { - Shape coord = mxnet_op::unravel(idx, small_shape); - int idx_big = mxnet_op::ravel(coord, big_shape); - int idx_lhs = mxnet_op::ravel(coord, lhs_shape); - int idx_rhs = mxnet_op::ravel(coord, rhs_shape); + Shape coord = unravel(idx, small_shape); + int idx_big = ravel(coord, big_shape); + int idx_lhs = ravel(coord, lhs_shape); + int idx_rhs = ravel(coord, rhs_shape); DType val, residual; Reducer::SetInitValue(val, residual); Reducer::Reduce(val, OP1::Map(big[idx_big], OP2::Map(lhs[idx_lhs], rhs[idx_rhs])), residual); @@ -856,16 +660,16 @@ void Reduce(Stream *s, const TBlob& small, const OpReqType req, } template -size_t ReduceWorkspaceSize(Stream *s, const ::mxnet::TShape& small, const OpReqType req, - const ::mxnet::TShape& big) { +size_t ReduceWorkspaceSize(Stream *s, const mxnet::TShape& small, const OpReqType req, + const mxnet::TShape& big) { if (req == kNullOp) return 0; ReduceImplConfig config = ConfigureReduceImpl(small, big, nullptr, nullptr); return config.workspace_size; } template -size_t ReduceWorkspaceSize(Stream *s, const ::mxnet::TShape& small, const OpReqType req, - const ::mxnet::TShape& big, const ::mxnet::TShape& lhs, const ::mxnet::TShape& rhs) { +size_t ReduceWorkspaceSize(Stream *s, const mxnet::TShape& small, const OpReqType req, + const mxnet::TShape& big, const mxnet::TShape& lhs, const mxnet::TShape& rhs) { if (req == kNullOp) return 0; ReduceImplConfig config = ConfigureReduceImpl(small, big, &lhs, &rhs); return config.workspace_size; diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index 6570ea9e9dd5..841fbcd28a68 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -31,165 +31,27 @@ #include #include #include "../mshadow_op.h" -#include "../mxnet_op.h" #include "../operator_common.h" -#if MXNET_USE_CUDA -#include "../../common/cuda_vectorization.cuh" -#endif namespace mxnet { namespace op { -namespace mxnet_op { -template -struct binary_broadcast_kernel { - /*! 
\brief Map function for binary_broadcast_kernel */ - template - MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, - const Shape &lstride, const Shape &rstride, - const Shape &oshape, IType *lhs, IType *rhs, - DType *out) { - Shape coord = unravel(base, oshape); - auto lidx = static_cast(dot(coord, lstride)); - auto ridx = static_cast(dot(coord, rstride)); - KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); - // starts from 1 to avoid extra inc at end of loop - for (index_t i = 1; i < length; ++i) { - inc(&coord, oshape, &lidx, lstride, &ridx, rstride); - // When tuning, don't actually run the op, since it's not going to be tuned against - // the actual op we'll eventually be using - KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs[lidx], rhs[ridx])); - } - } - - /*! \brief Map function for binary_broadcast_kernel */ - template - MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, - const Shape &lstride, const Shape &rstride, - const Shape &oshape, IType lhs, IType *rhs, - DType *out) { - Shape coord = unravel(base, oshape); - auto lidx = static_cast(dot(coord, lstride)); - auto ridx = static_cast(dot(coord, rstride)); - KERNEL_ASSIGN(out[base], req, OP::Map(lhs, rhs[ridx])); - // starts from 1 to avoid extra inc at end of loop - for (index_t i = 1; i < length; ++i) { - inc(&coord, oshape, &lidx, lstride, &ridx, rstride); - // When tuning, don't actually run the op, since it's not going to be tuned against - // the actual op we'll eventually be using - KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs, rhs[ridx])); - } - } - -#ifndef _WIN32 - /*! \brief Map function for binary_broadcast_kernel */ - template::value, int>::type = 0> - MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, - const Shape &lstride, const Shape &rstride, - const Shape &oshape, IType *lhs, DType *rhs, - DType *out) { - Shape coord = unravel(base, oshape); - auto lidx = static_cast(dot(coord, lstride)); - auto ridx = static_cast(dot(coord, rstride)); - KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); - // starts from 1 to avoid extra inc at end of loop - for (index_t i = 1; i < length; ++i) { - inc(&coord, oshape, &lidx, lstride, &ridx, rstride); - // When tuning, don't actually run the op, since it's not going to be tuned against - // the actual op we'll eventually be using - KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs[lidx], rhs[ridx])); - } - } - - /*! \brief Map function for binary_broadcast_kernel */ - template::value && - !std::is_pointer::value, int>::type = 0> - MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, - const Shape &lstride, const Shape &rstride, - const Shape &oshape, IType lhs, DType *rhs, - DType *out) { - Shape coord = unravel(base, oshape); - auto lidx = static_cast(dot(coord, lstride)); - auto ridx = static_cast(dot(coord, rstride)); - KERNEL_ASSIGN(out[base], req, OP::Map(lhs, rhs[ridx])); - // starts from 1 to avoid extra inc at end of loop - for (index_t i = 1; i < length; ++i) { - inc(&coord, oshape, &lidx, lstride, &ridx, rstride); - // When tuning, don't actually run the op, since it's not going to be tuned against - // the actual op we'll eventually be using - KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs, rhs[ridx])); - } - } -#endif -}; - -template -struct csr_dns_csr_broadcast_kernel { - /*! 
- * \brief Map function for broadcast between csr and 1D vector - * \param row global thread id/assigned row id - * \param csr_data ptr to data buffer of csr matrix - * \param csr_indices ptr to indices buffer of csr matrix - * \param csr_indptr ptr to indptr buffer of csr matrix - * \param dns ptr to data buffer of the dense vector - * \param out ptr to the data buffer of the result csr matrix - */ - template - MSHADOW_XINLINE static void Map(index_t row, const DType *csr_data, const CType *csr_indices, - const RType *csr_indptr, const DType *dns, DType *out) { - const nnvm::dim_t curr_row_i = csr_indptr[row]; - const nnvm::dim_t next_row_i = csr_indptr[row + 1]; - for (nnvm::dim_t iter = curr_row_i; iter < next_row_i; iter++) { - KERNEL_ASSIGN(out[iter], req, OP::Map(csr_data[iter], - (col_vec)? dns[row] : dns[csr_indices[iter]])); - } - } - - /*! - * \brief Map function for broadcast between csr and a scalar - * \param i global thread id - * \param csr_data ptr to data buffer of csr matrix - * \param scalar_ptr ptr to data buffer of the scalar tensor, only the 0-th element is used - * \param out ptr to the data buffer of output csr matrix - * \param nnz number of non-zero elements in input csr matrix - */ - template - MSHADOW_XINLINE static void Map(index_t i, const DType *csr_data, const DType* scalar_ptr, - DType *out, const nnvm::dim_t nnz) { - const DType scale = scalar_ptr[0]; - if (i < nnz) { - KERNEL_ASSIGN(out[i], req, OP::Map(csr_data[i], scale)); - } - } -}; - -template -struct csr_dns_map_kernel { - template - MSHADOW_XINLINE static void Map(index_t row, const DType *csr_data, const CType *csr_indices, - const RType *csr_indptr, DType *out, const nnvm::dim_t num_rows, - const nnvm::dim_t num_cols) { - if (row < num_rows) { - const nnvm::dim_t curr_row_i = csr_indptr[row]; - const nnvm::dim_t next_row_i = csr_indptr[row + 1]; - for (nnvm::dim_t iter = curr_row_i; iter < next_row_i; iter++) { - const nnvm::dim_t target = row * num_cols + csr_indices[iter]; - KERNEL_ASSIGN(out[target], req, - reverse ? OP::Map(out[target], csr_data[iter]) : - OP::Map(csr_data[iter], out[target])); - } - } - } -}; - -} // namespace mxnet_op - namespace broadcast { using namespace mshadow; const int MAX_DIM = 5; +template +MSHADOW_XINLINE Shape calc_stride(const Shape& shape) { + Shape stride; + index_t cumprod = 1; + #pragma unroll + for (int i = ndim - 1; i >= 0; --i) { + stride[i] = (shape[i] > 1) ? 
cumprod : 0; + cumprod *= shape[i]; + } + return stride; +} + template MSHADOW_XINLINE void unravel_dot(const index_t idx, const Shape& shape, const Shape& stridej, const Shape& stridek, index_t* j, index_t* k) { @@ -205,6 +67,28 @@ MSHADOW_XINLINE void unravel_dot(const index_t idx, const Shape& shape, } } +template +MSHADOW_XINLINE Shape unravel(const index_t idx, const Shape& shape) { + Shape ret; + #pragma unroll + for (index_t i = ndim-1, j = idx; i >=0; --i) { + auto tmp = j / shape[i]; + ret[i] = j - tmp*shape[i]; + j = tmp; + } + return ret; +} + +template +MSHADOW_XINLINE index_t ravel(const Shape& coord, const Shape& shape) { + index_t ret = 0; + #pragma unroll + for (index_t i = 0; i < ndim; ++i) { + ret = ret * shape[i] + (shape[i] > 1) * coord[i]; + } + return ret; +} + template MSHADOW_XINLINE int diff(const Shape& small, const Shape& big, @@ -230,6 +114,28 @@ MSHADOW_XINLINE int diff(const Shape& small, return mdim; } +template +MSHADOW_XINLINE index_t unravel_dot(const index_t idx, const Shape& shape, + const Shape& stride) { + index_t ret = 0; + #pragma unroll + for (index_t i = ndim-1, j = idx; i >=0; --i) { + auto tmp = j / shape[i]; + ret += (j - tmp*shape[i])*stride[i]; + j = tmp; + } + return ret; +} + +template +MSHADOW_XINLINE index_t dot(const Shape& coord, const Shape& stride) { + index_t ret = 0; + #pragma unroll + for (int i = 0; i < ndim; ++i) + ret += coord[i] * stride[i]; + return ret; +} + template MSHADOW_XINLINE void assign(DType* dst, const bool addto, const DType src) { if (addto) { @@ -245,9 +151,9 @@ MSHADOW_XINLINE void binary_broadcast_assign(const index_t idx, const bool addto const DType* __restrict rhs, DType* out, const Shape& lshape, const Shape& rshape, const Shape& oshape) { - const Shape coord = mxnet_op::unravel(idx, oshape); - const index_t j = mxnet_op::ravel(coord, lshape); - const index_t k = mxnet_op::ravel(coord, rshape); + const Shape coord = unravel(idx, oshape); + const index_t j = ravel(coord, lshape); + const index_t k = ravel(coord, rshape); assign(&out[idx], addto, OP::Map(lhs[j], rhs[k])); } @@ -256,13 +162,13 @@ MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const const DType* __restrict big, OType *small, const Shape& bshape, const Shape& sshape, const Shape& rshape, const Shape& rstride) { - Shape coord = mxnet_op::unravel(idx, sshape); - index_t j = mxnet_op::ravel(coord, bshape); + Shape coord = unravel(idx, sshape); + index_t j = ravel(coord, bshape); AType val, residual; Reducer::SetInitValue(val, residual); for (size_t k = 0; k < M; ++k) { - coord = mxnet_op::unravel(k, rshape); - Reducer::Reduce(val, AType(OP::Map(big[j + mxnet_op::dot(coord, rstride)])), residual); + coord = unravel(k, rshape); + Reducer::Reduce(val, AType(OP::Map(big[j + dot(coord, rstride)])), residual); } Reducer::Finalize(val, residual); assign(&small[idx], addto, OType(val)); @@ -273,15 +179,23 @@ MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const #else +template +void binary_broadcast_compute(const size_t N, const bool addto, const DType *lhs, + const DType *rhs, DType *out, const Shape lshape, + const Shape rshape, const Shape oshape) { + for (size_t idx = 0; idx < N; ++idx) { + binary_broadcast_assign(idx, addto, lhs, rhs, out, lshape, rshape, oshape); + } +} + template void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, const TBlob& lhs, const TBlob& rhs, const TBlob& out) { - mshadow::Shape oshape = out.shape_.get(); - mshadow::Shape lstride = 
mxnet_op::calc_stride(lhs.shape_.get()); - mshadow::Shape rstride = mxnet_op::calc_stride(rhs.shape_.get()); - mxnet_op::Kernel, cpu>:: - template LaunchEx(s, out.shape_.Size(), req, lstride, rstride, oshape, - lhs.dptr(), rhs.dptr(), out.dptr()); + if (req == kNullOp) return; + size_t N = out.shape_.Size(); + binary_broadcast_compute(N, req == kAddTo, lhs.dptr(), rhs.dptr(), + out.dptr(), lhs.shape_.get(), rhs.shape_.get(), + out.shape_.get()); } template @@ -306,8 +220,8 @@ void seq_reduce_compute_extra_mem(const size_t N, const size_t M, const bool add const index_t* ws_dptr) { #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) for (index_t idx = 0; idx < static_cast(N); ++idx) { - Shape coord = mxnet_op::unravel(idx, sshape); - index_t j = mxnet_op::ravel(coord, bshape); + Shape coord = unravel(idx, sshape); + index_t j = ravel(coord, bshape); DType val, residual; Reducer::SetInitValue(val, residual); for (size_t k = 0; k < M; ++k) { @@ -364,8 +278,8 @@ void ReduceWithExtraMem(Stream* s, const TBlob& small, const OpReqType req, size_t N = small.shape_.Size(), M = rshape.Size(); #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) for (index_t k = 0; k < static_cast(M); k++) { - Shape coord = mxnet_op::unravel(k, rshape); - ws_dptr[k] = mxnet_op::dot(coord, rstride); + Shape coord = unravel(k, rshape); + ws_dptr[k] = dot(coord, rstride); } seq_reduce_compute_extra_mem( @@ -396,21 +310,21 @@ MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const const Shape& lhs_shape, const Shape& rhs_shape, const Shape& rstride, const Shape& lhs_stride, const Shape& rhs_stride) { - Shape coord = mxnet_op::unravel(idx, small_shape); - const index_t idx_big0 = mxnet_op::ravel(coord, big_shape); - const index_t idx_lhs0 = mxnet_op::ravel(coord, lhs_shape0); - const index_t idx_rhs0 = mxnet_op::ravel(coord, rhs_shape0); + Shape coord = unravel(idx, small_shape); + const index_t idx_big0 = ravel(coord, big_shape); + const index_t idx_lhs0 = ravel(coord, lhs_shape0); + const index_t idx_rhs0 = ravel(coord, rhs_shape0); DType val, residual; Reducer::SetInitValue(val, residual); for (size_t k = 0; k < M; ++k) { - Shape coord_big = mxnet_op::unravel(k, rshape); - index_t idx_big = idx_big0 + mxnet_op::dot(coord_big, rstride); + Shape coord_big = unravel(k, rshape); + index_t idx_big = idx_big0 + dot(coord_big, rstride); - Shape coord_lhs = mxnet_op::unravel(k, lhs_shape); - index_t idx_lhs = idx_lhs0 + mxnet_op::dot(coord_lhs, lhs_stride); + Shape coord_lhs = unravel(k, lhs_shape); + index_t idx_lhs = idx_lhs0 + dot(coord_lhs, lhs_stride); - Shape coord_rhs = mxnet_op::unravel(k, rhs_shape); - index_t idx_rhs = idx_rhs0 + mxnet_op::dot(coord_rhs, rhs_stride); + Shape coord_rhs = unravel(k, rhs_shape); + index_t idx_rhs = idx_rhs0 + dot(coord_rhs, rhs_stride); Reducer::Reduce(val, OP1::Map(big[idx_big], OP2::Map(lhs[idx_lhs], rhs[idx_rhs])), residual); } diff --git a/src/operator/tensor/broadcast_reduce_op.h b/src/operator/tensor/broadcast_reduce_op.h index c3331b095660..03aa8b932dff 100644 --- a/src/operator/tensor/broadcast_reduce_op.h +++ b/src/operator/tensor/broadcast_reduce_op.h @@ -1461,7 +1461,7 @@ struct pick { const IType *idx, index_t M, int stride, mshadow::Shape bshape, mshadow::Shape sshape) { - using namespace mxnet_op; + using namespace broadcast; index_t j = static_cast(idx[i]); if (clip) { if (j <= 0) j = 0; @@ -1483,7 +1483,7 @@ struct pick_grad { const IType *idx, index_t M, int stride, 
mshadow::Shape bshape, mshadow::Shape sshape) { - using namespace mxnet_op; + using namespace broadcast; index_t j = static_cast(idx[i]); if (clip) { if (j <= 0) j = 0; diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index 62011efb5462..ffd0f123070a 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -183,10 +183,155 @@ inline int BinaryBroadcastShapeCompact(const mxnet::TShape& lshape, const mxnet: } else { LOG(FATAL) << "Too many broadcast dimensions with operands " << lshape << " " << rshape; } - return j; } +namespace mxnet_op { +template +struct binary_broadcast_kernel { + /*! \brief Map function for binary_broadcast_kernel */ + template + MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, + const Shape &lstride, const Shape &rstride, + const Shape &oshape, IType *lhs, IType *rhs, + DType *out) { + Shape coord = unravel(base, oshape); + auto lidx = static_cast(dot(coord, lstride)); + auto ridx = static_cast(dot(coord, rstride)); + KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); + // starts from 1 to avoid extra inc at end of loop + for (index_t i = 1; i < length; ++i) { + inc(&coord, oshape, &lidx, lstride, &ridx, rstride); + // When tuning, don't actually run the op, since it's not going to be tuned against + // the actual op we'll eventually be using + KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs[lidx], rhs[ridx])); + } + } + + /*! \brief Map function for binary_broadcast_kernel */ + template + MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, + const Shape &lstride, const Shape &rstride, + const Shape &oshape, IType lhs, IType *rhs, + DType *out) { + Shape coord = unravel(base, oshape); + auto lidx = static_cast(dot(coord, lstride)); + auto ridx = static_cast(dot(coord, rstride)); + KERNEL_ASSIGN(out[base], req, OP::Map(lhs, rhs[ridx])); + // starts from 1 to avoid extra inc at end of loop + for (index_t i = 1; i < length; ++i) { + inc(&coord, oshape, &lidx, lstride, &ridx, rstride); + // When tuning, don't actually run the op, since it's not going to be tuned against + // the actual op we'll eventually be using + KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs, rhs[ridx])); + } + } + +#ifndef _WIN32 + /*! \brief Map function for binary_broadcast_kernel */ + template::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, + const Shape &lstride, const Shape &rstride, + const Shape &oshape, IType *lhs, DType *rhs, + DType *out) { + Shape coord = unravel(base, oshape); + auto lidx = static_cast(dot(coord, lstride)); + auto ridx = static_cast(dot(coord, rstride)); + KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); + // starts from 1 to avoid extra inc at end of loop + for (index_t i = 1; i < length; ++i) { + inc(&coord, oshape, &lidx, lstride, &ridx, rstride); + // When tuning, don't actually run the op, since it's not going to be tuned against + // the actual op we'll eventually be using + KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs[lidx], rhs[ridx])); + } + } + + /*! 
\brief Map function for binary_broadcast_kernel */ + template::value && + !std::is_pointer::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, + const Shape &lstride, const Shape &rstride, + const Shape &oshape, IType lhs, DType *rhs, + DType *out) { + Shape coord = unravel(base, oshape); + auto lidx = static_cast(dot(coord, lstride)); + auto ridx = static_cast(dot(coord, rstride)); + KERNEL_ASSIGN(out[base], req, OP::Map(lhs, rhs[ridx])); + // starts from 1 to avoid extra inc at end of loop + for (index_t i = 1; i < length; ++i) { + inc(&coord, oshape, &lidx, lstride, &ridx, rstride); + // When tuning, don't actually run the op, since it's not going to be tuned against + // the actual op we'll eventually be using + KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs, rhs[ridx])); + } + } +#endif +}; + +template +struct csr_dns_csr_broadcast_kernel { + /*! + * \brief Map function for broadcast between csr and 1D vector + * \param row global thread id/assigned row id + * \param csr_data ptr to data buffer of csr matrix + * \param csr_indices ptr to indices buffer of csr matrix + * \param csr_indptr ptr to indptr buffer of csr matrix + * \param dns ptr to data buffer of the dense vector + * \param out ptr to the data buffer of the result csr matrix + */ + template + MSHADOW_XINLINE static void Map(index_t row, const DType *csr_data, const CType *csr_indices, + const RType *csr_indptr, const DType *dns, DType *out) { + const nnvm::dim_t curr_row_i = csr_indptr[row]; + const nnvm::dim_t next_row_i = csr_indptr[row + 1]; + for (nnvm::dim_t iter = curr_row_i; iter < next_row_i; iter++) { + KERNEL_ASSIGN(out[iter], req, OP::Map(csr_data[iter], + (col_vec)? dns[row] : dns[csr_indices[iter]])); + } + } + + /*! + * \brief Map function for broadcast between csr and a scalar + * \param i global thread id + * \param csr_data ptr to data buffer of csr matrix + * \param scalar_ptr ptr to data buffer of the scalar tensor, only the 0-th element is used + * \param out ptr to the data buffer of output csr matrix + * \param nnz number of non-zero elements in input csr matrix + */ + template + MSHADOW_XINLINE static void Map(index_t i, const DType *csr_data, const DType* scalar_ptr, + DType *out, const nnvm::dim_t nnz) { + const DType scale = scalar_ptr[0]; + if (i < nnz) { + KERNEL_ASSIGN(out[i], req, OP::Map(csr_data[i], scale)); + } + } +}; + +template +struct csr_dns_map_kernel { + template + MSHADOW_XINLINE static void Map(index_t row, const DType *csr_data, const CType *csr_indices, + const RType *csr_indptr, DType *out, const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + if (row < num_rows) { + const nnvm::dim_t curr_row_i = csr_indptr[row]; + const nnvm::dim_t next_row_i = csr_indptr[row + 1]; + for (nnvm::dim_t iter = curr_row_i; iter < next_row_i; iter++) { + const nnvm::dim_t target = row * num_cols + csr_indices[iter]; + KERNEL_ASSIGN(out[target], req, + reverse ? 
OP::Map(out[target], csr_data[iter]) : + OP::Map(csr_data[iter], out[target])); + } + } + } +}; + +} // namespace mxnet_op + template void BinaryBroadcastIntCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -238,10 +383,12 @@ void BinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, } MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(ndim, NDim, { - broadcast::BinaryBroadcastComputeImpl(s, req[0], - inputs[0].reshape(new_lshape), - inputs[1].reshape(new_rshape), - outputs[0].reshape(new_oshape)); + mshadow::Shape oshape = new_oshape.get(); + mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); + mshadow::Shape rstride = mxnet_op::calc_stride(new_rshape.get()); + mxnet_op::Kernel, xpu>:: + template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, + inputs[0].dptr(), inputs[1].dptr(), outputs[0].dptr()); }); }); }
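
For readers following the final state of patch 37: the generic path above relies entirely on stride arithmetic, where every broadcast (size-1) axis gets stride 0, so that an output coordinate dotted with an input's strides lands on the element being broadcast. The following is a minimal standalone C++ sketch of that calc_stride / unravel / dot round trip, written against plain std::array rather than mshadow::Shape, for illustration only.

#include <array>
#include <cstdio>

constexpr int kNDim = 3;
using Shape = std::array<long, kNDim>;

// Stride of 0 on size-1 axes makes dot() ignore that coordinate (broadcast).
Shape calc_stride(const Shape& shape) {
  Shape stride{};
  long cumprod = 1;
  for (int i = kNDim - 1; i >= 0; --i) {
    stride[i] = (shape[i] > 1) ? cumprod : 0;
    cumprod *= shape[i];
  }
  return stride;
}

// Convert a flat output index into a coordinate in oshape.
Shape unravel(long idx, const Shape& shape) {
  Shape coord{};
  for (int i = kNDim - 1; i >= 0; --i) {
    coord[i] = idx % shape[i];
    idx /= shape[i];
  }
  return coord;
}

long dot(const Shape& coord, const Shape& stride) {
  long ret = 0;
  for (int i = 0; i < kNDim; ++i) ret += coord[i] * stride[i];
  return ret;
}

int main() {
  // out[2][3][4] = lhs[2][3][4] op rhs[2][1][1], broadcasting the last two axes.
  Shape oshape{2, 3, 4}, rshape{2, 1, 1};
  Shape rstride = calc_stride(rshape);
  for (long idx = 0; idx < 2 * 3 * 4; ++idx) {
    long ridx = dot(unravel(idx, oshape), rstride);
    std::printf("out[%ld] uses rhs[%ld]\n", idx, ridx);  // rhs index is idx / 12
  }
  return 0;
}

The binary_broadcast_kernel above avoids redoing the unravel for every element: it unravels once per segment and then steps the coordinate and both input indices together via inc(), which is why the loop in its Map functions starts at 1.
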