From 2512e2a8672afcccb7d0f430a507af0342799c0a Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Wed, 16 Sep 2015 14:09:27 -0600 Subject: [PATCH] Add rrelu, dropout --- src/operator/cudnn_convolution-inl.h | 36 +++-- src/operator/dropout-inl.h | 192 +++++++++++++++++++++++++++ src/operator/dropout.cc | 32 +++++ src/operator/dropout.cu | 19 +++ src/operator/leaky_relu-inl.h | 93 +++++++++++-- tests/python/train/test_conv.py | 2 +- 6 files changed, 342 insertions(+), 32 deletions(-) create mode 100644 src/operator/dropout-inl.h create mode 100644 src/operator/dropout.cc create mode 100644 src/operator/dropout.cu diff --git a/src/operator/cudnn_convolution-inl.h b/src/operator/cudnn_convolution-inl.h index ad0324a811eb..57c90d241f8f 100644 --- a/src/operator/cudnn_convolution-inl.h +++ b/src/operator/cudnn_convolution-inl.h @@ -55,7 +55,7 @@ class CuDNNConvolutionOp : public Operator { Init(s, in_data, out_data); } Tensor workspace = ctx.requested[kTempSpace].get_space( - mshadow::Shape1(workspace_), s); + mshadow::Shape1(forward_workspace_), s); CHECK_EQ(cudnnConvolutionForward(s->dnn_handle_, &alpha, in_desc_, @@ -65,7 +65,7 @@ class CuDNNConvolutionOp : public Operator { conv_desc_, algo_, workspace.dptr_, - workspace_size_, + forward_workspace_byte_, &beta, out_desc_, out.dptr_), CUDNN_STATUS_SUCCESS); @@ -106,7 +106,7 @@ class CuDNNConvolutionOp : public Operator { Tensor data = in_data[kData].get(s); Tensor gdata = in_grad[kData].get(s); Tensor workspace = ctx.requested[kTempSpace].get_space( - mshadow::Shape1(workspace_), s); + mshadow::Shape1(backward_workspace_), s); if (!param_.no_bias) { Tensor gbias = in_grad[kBias].get(s); CHECK_EQ(cudnnConvolutionBackwardBias(s->dnn_handle_, @@ -126,7 +126,7 @@ class CuDNNConvolutionOp : public Operator { conv_desc_, back_algo_w_, workspace.dptr_, - workspace_size_, + backward_workspace_byte_, &beta, filter_desc_, gwmat.dptr_), CUDNN_STATUS_SUCCESS); @@ -139,7 +139,7 @@ class CuDNNConvolutionOp : public Operator { conv_desc_, back_algo_, workspace.dptr_, - workspace_size_, + backward_workspace_byte_, &beta, in_desc_, gdata.dptr_), CUDNN_STATUS_SUCCESS); @@ -155,7 +155,7 @@ class CuDNNConvolutionOp : public Operator { CHECK_EQ(out_data.size(), 1); if (!init_cudnn_) { init_cudnn_ = true; - size_t workspace = static_cast(param_.workspace * sizeof(real_t)); + size_t workspace_byte = static_cast(param_.workspace * sizeof(real_t)); size_t back_size = 0; size_t back_size_w = 0; Tensor data = in_data[kData].get(s); @@ -210,7 +210,7 @@ class CuDNNConvolutionOp : public Operator { conv_desc_, out_desc_, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, - workspace, + workspace_byte, &algo_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnGetConvolutionBackwardFilterAlgorithm(s->dnn_handle_, in_desc_, @@ -218,7 +218,7 @@ class CuDNNConvolutionOp : public Operator { conv_desc_, filter_desc_, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, - workspace, + workspace_byte, &back_algo_w_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnGetConvolutionBackwardDataAlgorithm(s->dnn_handle_, filter_desc_, @@ -226,7 +226,7 @@ class CuDNNConvolutionOp : public Operator { conv_desc_, in_desc_, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, - workspace, + workspace_byte, &back_algo_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnGetConvolutionBackwardDataWorkspaceSize(s->dnn_handle_, filter_desc_, @@ -242,26 +242,24 @@ class CuDNNConvolutionOp : public Operator { filter_desc_, back_algo_w_, &back_size_w), CUDNN_STATUS_SUCCESS); - back_size = std::max(back_size, back_size_w); + backward_workspace_byte_ = std::max(back_size, 
back_size_w); CHECK_EQ(cudnnGetConvolutionForwardWorkspaceSize(s->dnn_handle_, in_desc_, filter_desc_, conv_desc_, out_desc_, algo_, - &workspace), CUDNN_STATUS_SUCCESS); - workspace = std::max(workspace, back_size); - CHECK_GE(param_.workspace * sizeof(real_t), workspace + sizeof(real_t)) - << "\nMinimum workspace: " << workspace << "\n" - << "Given: " << param_.workspace * sizeof(real_t); - workspace_ = workspace / sizeof(real_t) + 1; - workspace_size_ = workspace_ * sizeof(real_t); + &forward_workspace_byte_), CUDNN_STATUS_SUCCESS); + forward_workspace_ = forward_workspace_byte_ / sizeof(real_t) + 1; + backward_workspace_ = backward_workspace_byte_ / sizeof(real_t) + 1; } } bool init_cudnn_; - size_t workspace_; - size_t workspace_size_; + size_t forward_workspace_; + size_t backward_workspace_; + size_t forward_workspace_byte_; + size_t backward_workspace_byte_; cudnnDataType_t dtype_; cudnnTensorDescriptor_t in_desc_; cudnnTensorDescriptor_t out_desc_; diff --git a/src/operator/dropout-inl.h b/src/operator/dropout-inl.h new file mode 100644 index 000000000000..675694a1cca1 --- /dev/null +++ b/src/operator/dropout-inl.h @@ -0,0 +1,192 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file dropout-inl.h + * \brief + * \author Bing Xu +*/ + +#ifndef MXNET_OPERATOR_DROPOUT_INL_H_ +#define MXNET_OPERATOR_DROPOUT_INL_H_ +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" +#include "./mshadow_op.h" + +enum DropoutOpInputs {kData}; +enum DropoutOpOutputs {kOut, kMask}; +enum DropoutOpForwardResource {kRandom}; + +namespace mxnet { +namespace op { + +struct DropoutParam : public dmlc::Parameter { + float p; + DMLC_DECLARE_PARAMETER(DropoutParam) { + DMLC_DECLARE_FIELD(p).set_default(0.5) + .set_range(0, 1) + .describe("Fraction of the input that gets dropped out at training time"); + } +}; // struct DropoutParam + +template +class DropoutOp : public Operator { + public: + explicit DropoutOp(DropoutParam param) { + this->pkeep_ = 1.0f - param.p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), 1); + if (ctx.is_train) { + CHECK_EQ(out_data.size(), 2); + } + Stream *s = ctx.get_stream(); + Tensor data, out, mask; + if (in_data[kData].ndim() == 2) { + uint32_t ds[] = {in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1}; + TShape dshape(ds, ds + 4); + data = in_data[kData].get_with_shape(dshape, s); + out = out_data[kOut].get_with_shape(dshape, s); + if (ctx.is_train) { + mask = out_data[kMask].get_with_shape(dshape, s); + } + } else { + data = in_data[kData].get(s); + out = out_data[kOut].get(s); + if (ctx.is_train) { + mask = out_data[kMask].get(s); + } + } + if (ctx.is_train) { + Random *prnd = ctx.requested[kRandom].get_random(s); + mask = F(prnd->uniform(mask.shape_), pkeep_) * (1.0f / pkeep_); + Assign(out, req[kOut], data * mask); + } else { + Assign(out, req[kOut], data + 0.0f); + } + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1); + CHECK_EQ(in_grad.size(), 1); + Stream *s = ctx.get_stream(); + Tensor grad, gdata, mask; + if (out_grad[kOut].ndim() == 2) 
{ + uint32_t ds[] = {out_grad[kOut].shape_[0], out_grad[kOut].shape_[1], 1, 1}; + TShape dshape(ds, ds + 4); + gdata = in_grad[kData].get_with_shape(dshape, s); + grad = out_grad[kOut].get_with_shape(dshape, s); + mask = out_data[kMask].get_with_shape(dshape, s); + } else { + grad = out_grad[kOut].get(s); + gdata = in_grad[kData].get(s); + mask = out_data[kMask].get(s); + } + Assign(gdata, req[kData], grad * mask); + } + + private: + real_t pkeep_; +}; // class DropoutOp + + +template +Operator *CreateOp(DropoutParam param); + +#if DMLC_USE_CXX11 +class DropoutProp : public OperatorProperty { + public: + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 1); + const TShape &dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + out_shape->clear(); + out_shape->push_back(dshape); + out_shape->push_back(dshape); + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new DropoutProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "Dropout"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {out_grad[kOut], out_data[kMask]}; + } + + std::vector > BackwardInplaceOption( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &in_grad) const override { + return {{out_grad[kOut], in_grad[kData]}}; + } + + std::vector > ForwardInplaceOption( + const std::vector &in_data, + const std::vector &out_data) const override { + return {{in_data[kData], out_data[kOut]}}; + } + + std::vector ForwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kRandom}; + } + + int NumVisibleOutputs() const override { + return 1; + } + + int NumOutputs() const override { + return 2; + } + + std::vector ListOutputs() const override { + return {"output", "mask"}; + } + + Operator* CreateOperator(Context ctx) const; + + private: + DropoutParam param_; +}; // class DropoutProp +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_DROPOUT_INL_H_ + diff --git a/src/operator/dropout.cc b/src/operator/dropout.cc new file mode 100644 index 000000000000..506d7ef544b7 --- /dev/null +++ b/src/operator/dropout.cc @@ -0,0 +1,32 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file dropout.cc + * \brief + * \author Bing Xu +*/ + +#include "./dropout-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator *CreateOp(DropoutParam param) { + return new DropoutOp(param); +} + +// DO_BIND_DISPATCH comes from operator_common.h +Operator *DropoutProp::CreateOperator(Context ctx) const { + DO_BIND_DISPATCH(CreateOp, param_); +} + +DMLC_REGISTER_PARAMETER(DropoutParam); + +MXNET_REGISTER_OP_PROPERTY(Dropout, DropoutProp) +.describe("Apply dropout to input") +.add_argument("data", "Symbol", "Input data to dropout.") +.add_arguments(DropoutParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet + + diff --git a/src/operator/dropout.cu b/src/operator/dropout.cu new file mode 100644 index 000000000000..f0c1da8dbd95 --- /dev/null +++ b/src/operator/dropout.cu @@ -0,0 +1,19 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file dropout.cc + * \brief + * \author Bing Xu +*/ + +#include "./dropout-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator *CreateOp(DropoutParam param) { + return new DropoutOp(param); +} +} // namespace op +} // namespace mxnet + + diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h index 9a635d86e722..ba5a874213cf 100644 --- a/src/operator/leaky_relu-inl.h +++ b/src/operator/leaky_relu-inl.h @@ -22,8 +22,9 @@ namespace mxnet { namespace op { enum LeakyReLUOpInputs {kData, kGamma}; -enum LeakyReLUOpOutputs {kOut}; +enum LeakyReLUOpOutputs {kOut, kMask}; enum LeakyReLUOpType {kLeakyReLU, kPReLU, kRReLU}; +enum LeakyReLUOpResource {kRandom}; struct LeakyReLUParam : public dmlc::Parameter { // use int for enumeration @@ -68,19 +69,25 @@ class LeakyReLUOp : public Operator { using namespace mshadow::expr; size_t expected = param_.act_type == kPReLU ? 2 : 1; CHECK_EQ(in_data.size(), expected); - CHECK_EQ(out_data.size(), 1); Stream *s = ctx.get_stream(); Tensor data; Tensor out; + Tensor mask; Tensor weight; if (in_data[kData].ndim() == 2) { uint32_t ds[] = {in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1}; TShape dshape(ds, ds + 4); data = in_data[kData].get_with_shape(dshape, s); out = out_data[kOut].get_with_shape(dshape, s); + if (param_.act_type == kRReLU) { + mask = out_data[kMask].get_with_shape(dshape, s); + } } else { data = in_data[kData].get(s); out = out_data[kOut].get(s); + if (param_.act_type == kRReLU) { + mask = out_data[kMask].get(s); + } } switch (param_.act_type) { case kLeakyReLU: { @@ -93,7 +100,15 @@ class LeakyReLUOp : public Operator { break; } case kRReLU: { - LOG(FATAL) << "Not implmented"; + if (ctx.is_train) { + Random* prnd = ctx.requested[kRandom].get_random(s); + mask = prnd->uniform(mask.shape_); + mask = mask * (param_.upper_bound - param_.lower_bound) + param_.lower_bound; + Assign(out, req[kOut], F(data, mask)); + } else { + const float slope = (param_.lower_bound + param_.upper_bound) / 2.0f; + Assign(out, req[kOut], F(data, slope)); + } break; } default: @@ -110,40 +125,56 @@ class LeakyReLUOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; + // TODO(bing): double check size_t expected = param_.act_type == kPReLU ? 
2 : 1; CHECK_EQ(out_grad.size(), 1); CHECK_EQ(req.size(), expected); CHECK_EQ(in_data.size(), expected); Stream *s = ctx.get_stream(); - Tensor data, gdata; + Tensor output; + Tensor data; + Tensor gdata; Tensor grad; + Tensor mask; Tensor weight; Tensor grad_weight; if (in_data[kData].ndim() == 2) { uint32_t ds[] = {in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1}; TShape dshape(ds, ds + 4); - data = in_data[kData].get_with_shape(dshape, s); grad = out_grad[kOut].get_with_shape(dshape, s); gdata = in_grad[kData].get_with_shape(dshape, s); + output = out_data[kOut].get_with_shape(dshape, s); + if (param_.act_type == kRReLU) { + mask = out_data[kMask].get_with_shape(dshape, s); + } + if (param_.act_type == kPReLU) { + data = in_data[kData].get_with_shape(dshape, s); + } } else { - data = in_data[kData].get(s); grad = out_grad[kOut].get(s); gdata = in_grad[kData].get(s); + output = out_data[kOut].get(s); + if (param_.act_type == kRReLU) { + mask = out_data[kMask].get(s); + } + if (param_.act_type == kPReLU) { + data = in_data[kData].get(s); + } } switch (param_.act_type) { case kLeakyReLU: { - Assign(gdata, req[kData], F(data, param_.slope) * grad); + Assign(gdata, req[kData], F(output, param_.slope) * grad); break; } case kPReLU: { weight = in_data[kGamma].get(s); grad_weight = in_grad[kGamma].get(s); grad_weight = sumall_except_dim<1>(F(data) * grad); - gdata = F(data, broadcast<1>(weight, data.shape_)) * grad; + gdata = F(output, broadcast<1>(weight, data.shape_)) * grad; break; } case kRReLU: { - LOG(FATAL) << "Not implmented"; + Assign(gdata, req[kData], F(output, mask) * grad); break; } default: @@ -181,6 +212,9 @@ class LeakyReLUProp : public OperatorProperty { } out_shape->clear(); out_shape->push_back(dshape); + if (param_.act_type == kRReLU) { + out_shape->push_back(dshape); + } return true; } @@ -200,9 +234,11 @@ class LeakyReLUProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data) const override { if (param_.act_type == kPReLU) { - return {out_grad[kOut], in_data[kData], in_data[kGamma]}; + return {out_grad[kOut], out_data[kOut], in_data[kData], in_data[kGamma]}; + } else if (param_.act_type == kRReLU) { + return {out_grad[kOut], out_data[kMask], out_data[kOut]}; } else { - return {out_grad[kOut], in_data[kData]}; + return {out_grad[kOut], out_data[kData]}; } } @@ -217,7 +253,11 @@ class LeakyReLUProp : public OperatorProperty { std::vector > ForwardInplaceOption( const std::vector &in_data, const std::vector &out_data) const override { - return {}; + if (param_.act_type == kPReLU) { + return {}; + } else { + return {{in_data[kData], out_data[kOut]}}; + } } std::vector ListArguments() const override { @@ -228,6 +268,35 @@ class LeakyReLUProp : public OperatorProperty { } } + std::vector ListOutputs() const override { + if (param_.act_type == kRReLU) { + return {"output", "mask"}; + } else { + return {"output"}; + } + } + + int NumOutputs() const override { + if (param_.act_type == kRReLU) { + return 2; + } else { + return 1; + } + } + + int NumVisibleOutputs() const override { + return 1; + } + + virtual std::vector ForwardResource( + const std::vector &in_shape) const { + if (param_.act_type == kRReLU) { + return {ResourceRequest::kRandom}; + } else { + return std::vector(); + } + } + Operator* CreateOperator(Context ctx) const; private: diff --git a/tests/python/train/test_conv.py b/tests/python/train/test_conv.py index f5f19982dd9f..f7f0f1acb043 100644 --- a/tests/python/train/test_conv.py +++ b/tests/python/train/test_conv.py @@ 
-53,7 +53,7 @@ def CalAcc(out, label):
 executor = softmax.bind(mx.cpu(), arg_narrays, grad_narrays, 'write', aux_narrays)
 # update
-print executor.debug_str()
+print(executor.debug_str())
 out_narray = executor.outputs[0]
 grad_narray = mx.nd.empty(out_narray.shape)
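
Usage sketch (not part of the patch): the snippet below is a minimal illustration of how the dropout and randomized leaky ReLU operators introduced above might be composed from the Python symbol API once the patch is applied. The parameter names p, lower_bound, and upper_bound come from DropoutParam and LeakyReLUParam in this diff; the symbol name LeakyReLU, the act_type string 'rrelu', and the bound values shown are assumptions based on the existing leaky_relu registration, which is not part of this patch.

# Minimal sketch of composing the new operators from Python; assumes the
# Dropout and LeakyReLU symbols are registered with the parameter names
# declared in DropoutParam / LeakyReLUParam (p, lower_bound, upper_bound)
# and that 'rrelu' is the act_type string mapped to kRReLU.
import mxnet as mx

data = mx.symbol.Variable('data')
fc1 = mx.symbol.FullyConnected(data=data, num_hidden=128, name='fc1')
# Randomized leaky ReLU: during training the negative-part slope is drawn
# uniformly from [lower_bound, upper_bound]; at inference the mean slope
# (lower_bound + upper_bound) / 2 is used, matching the Forward() code above.
act1 = mx.symbol.LeakyReLU(data=fc1, act_type='rrelu',
                           lower_bound=0.125, upper_bound=0.334, name='rrelu1')
# Inverted dropout: keep each activation with probability 1 - p and scale the
# kept values by 1 / (1 - p) during training; identity mapping at test time.
drop1 = mx.symbol.Dropout(data=act1, p=0.5, name='drop1')
fc2 = mx.symbol.FullyConnected(data=drop1, num_hidden=10, name='fc2')
net = mx.symbol.Softmax(data=fc2, name='softmax')

Note that both operators request a kRandom resource for the training-time mask, and only the first output ("output") is user-visible; the "mask" output is internal, since NumVisibleOutputs() returns 1 in both property classes.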