From 45fafc23e7a3c95bbc5486295e010320b5643f80 Mon Sep 17 00:00:00 2001
From: Pedro Larroy <928489+larroy@users.noreply.github.com>
Date: Tue, 4 Dec 2018 19:02:31 +0100
Subject: [PATCH] [v1.4.x][MXNET-1234] Port of #13409 to the release branch
 (#13487)

* Provide a failing test for ReLU activation shape inference bug

* Fix Activation backward shape inference

fixes: #13333

* Add softsign Activation to test_gluon.py

* Don't disable MKLDNN
---
 src/operator/elemwise_op_common.h     | 20 ++++---
 src/operator/nn/activation-inl.h      | 12 ++---
 src/operator/nn/activation.cc         | 77 +++++++++++++++++----------
 src/operator/nn/activation.cu         | 30 ++++++-----
 tests/cpp/operator/activation_perf.cc | 26 ++++++---
 tests/python/unittest/test_gluon.py   | 12 ++---
 6 files changed, 109 insertions(+), 68 deletions(-)

diff --git a/src/operator/elemwise_op_common.h b/src/operator/elemwise_op_common.h
index cf44da699156..faac69269f00 100644
--- a/src/operator/elemwise_op_common.h
+++ b/src/operator/elemwise_op_common.h
@@ -128,29 +128,33 @@ inline bool ElemwiseAttr(const nnvm::NodeAttrs& attrs,
   if (n_out != -1)
     out_size = static_cast<size_t>(n_out);
 
-  auto deduce = [&](std::vector<AttrType> *vec, size_t size, const char *name) {
+  CHECK_LE(in_size, in_attrs->size());
+  CHECK_LE(out_size, out_attrs->size());
+  auto deduce = [&](const std::vector<AttrType>& vec, size_t size, const char *name) {
       for (size_t i = 0; i < size; ++i) {
-        CHECK(assign(&dattr, (*vec)[i]))
+        CHECK(assign(&dattr, vec.at(i)))
          << "Incompatible attr in node " << attrs.name << " at " << i << "-th "
          << name << ": " << "expected " << attr_string(dattr)
-          << ", got " << attr_string((*vec)[i]);
+          << ", got " << attr_string(vec.at(i));
      }
    };
-  deduce(in_attrs, in_size, "input");
-  if (reverse_infer) deduce(out_attrs, out_size, "output");
+  deduce(*in_attrs, in_size, "input");
+  if (reverse_infer)
+    deduce(*out_attrs, out_size, "output");
 
   auto write = [&](std::vector<AttrType> *vec, size_t size, const char *name) {
      for (size_t i = 0; i < size; ++i) {
-        CHECK(assign(&(*vec)[i], dattr))
+        CHECK(assign(&(vec->at(i)), dattr))
          << "Incompatible attr in node " << attrs.name << " at " << i << "-th "
          << name << ": " << "expected " << attr_string(dattr)
-          << ", got " << attr_string((*vec)[i]);
+          << ", got " << attr_string(vec->at(i));
      }
    };
   write(in_attrs, in_size, "input");
   write(out_attrs, out_size, "output");
 
-  if (is_none(dattr)) return false;
+  if (is_none(dattr))
+    return false;
   return true;
 }
 
diff --git a/src/operator/nn/activation-inl.h b/src/operator/nn/activation-inl.h
index 2705177f951d..1d8e4c2b6cda 100644
--- a/src/operator/nn/activation-inl.h
+++ b/src/operator/nn/activation-inl.h
@@ -48,6 +48,9 @@ enum ActivationOpInputs {kData};
 enum ActivationOpOutputs {kOut};
 enum ActivationOpResource {kTempSpace};
 enum ActivationOpType {kReLU, kSigmoid, kTanh, kSoftReLU, kSoftSign};
+
+// Get the number of inputs to the gradient depending on the activation type
+int GradNumInputs(int act_type);
 }  // activation
 
 struct ActivationParam : public dmlc::Parameter<ActivationParam> {
@@ -199,13 +202,8 @@ void ActivationGradCompute(const nnvm::NodeAttrs& attrs,
                            const std::vector<OpReqType>& req,
                            const std::vector<TBlob>& outputs) {
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-#if (MXNET_USE_CUDNN == 1 || MXNET_USE_MKLDNN == 1)
-  bool relu = param.act_type == activation::kReLU;
-  CHECK_EQ(inputs.size(), relu ? 2U : 3U);
-#else
-  bool softsign = param.act_type == activation::kSoftSign;
-  CHECK_EQ(inputs.size(), softsign ? 3U : 2U);
-#endif
+  const int act_type = param.act_type;
+  CHECK_EQ(inputs.size(), activation::GradNumInputs(act_type));
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
   ActivationGradComputeImpl<xpu>(attrs, ctx, inputs, req, outputs);
diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc
index ba44ebd4ed4d..1348b4efc81b 100644
--- a/src/operator/nn/activation.cc
+++ b/src/operator/nn/activation.cc
@@ -30,13 +30,34 @@
 #if MXNET_USE_MKLDNN == 1
 #include "./mkldnn/mkldnn_base-inl.h"
 #include "./mkldnn/mkldnn_ops-inl.h"
-#endif  // MXNET_USE_MKLDNN
+#endif  // MXNET_USE_MKLDNN == 1
 #include "../operator_common.h"
 #include "../../common/utils.h"
 
 namespace mxnet {
 namespace op {
 
+namespace activation {
+
+int GradNumInputs(int act_type) {
+  // check activation.cu \sa ActivationGradCompute
+  switch (act_type) {
+    case kReLU:
+      return 2;
+    case kSoftReLU:
+    case kSoftSign:
+    case kTanh:
+    case kSigmoid:
+      return 3;
+    default:
+      CHECK(false) << "missing activation type";
+  }
+  // unreachable
+  return -1;
+}
+
+}  // namespace activation
+
 DMLC_REGISTER_PARAMETER(ActivationParam);
 
 // This will determine the order of the inputs for backward computation.
@@ -44,24 +65,28 @@ struct ActivationGrad {
   const char *op_name;
   std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
                                           const std::vector<nnvm::NodeEntry>& ograds) const {
+    // ograds, output...
     std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
     heads.emplace_back(nnvm::NodeEntry{n, activation::kOut, 0});
 
     const NodeAttrs& attrs = n->attrs;
+    using namespace activation;
     int act_type = dmlc::get<ActivationParam>(attrs.parsed).act_type;
-    if (act_type == activation::kSoftSign) {
-      // for softsign need the inputs to compute the activation.
-      heads.push_back(n->inputs[activation::kData]);
-    }
-
-#if (MXNET_USE_CUDNN == 1 || MXNET_USE_MKLDNN == 1)
     // for ReLU, no need to pass input data. This enables inplace optimization during the
     // forward pass.
-    if (act_type != activation::kReLU &&
-        act_type != activation::kSoftSign) {
-      heads.push_back(n->inputs[activation::kData]);
+    // check activation.cu \sa ActivationGradCompute
+    switch (act_type) {
+      case kReLU:
+        break;
+      case kSoftReLU:
+      case kSoftSign:
+      case kTanh:
+      case kSigmoid:
+        heads.push_back(n->inputs[activation::kData]);
+        break;
+      default:
+        CHECK(false) << "missing activation type";
     }
-#endif
     return MakeGradNode(op_name, n, heads, n->attrs.dict);
   }
 };
@@ -89,14 +114,14 @@ void ActivationGradComputeExCPU(const nnvm::NodeAttrs& attrs,
                                 const std::vector<OpReqType>& req,
                                 const std::vector<NDArray>& outputs) {
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-  bool relu = param.act_type == activation::kReLU;
-  CHECK_EQ(inputs.size(), relu ? 2U : 3U);
+  CHECK_EQ(inputs.size(), activation::GradNumInputs(param.act_type));
   if (SupportMKLDNN(inputs[0])) {
     MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
     // XXX: for y = relu(x), y is passed as "in_data" to Backward()
-    MKLDNNActivationBackward(attrs, ctx, inputs[0], relu ? inputs[1] : inputs[2], req[0],
+    const bool relu = param.act_type == activation::kReLU;
+    MKLDNNActivationBackward(attrs, ctx, inputs.at(0), relu ? inputs.at(1) : inputs.at(2), req[0],
                              outputs[0]);
-    MKLDNN_OPCHECK_RUN(ActivationGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+    MKLDNN_OPCHECK_RUN(ActivationGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
     return;
   }
   FallBackCompute(ActivationGradComputeImpl<cpu>, attrs, ctx, inputs, req, outputs);
@@ -122,16 +147,12 @@ inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs,
                                           std::vector<int> *in_attrs,
                                           std::vector<int> *out_attrs) {
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-  if (param.act_type != activation::kReLU) {
-    CHECK_EQ(in_attrs->size(), 3U);
-  } else {
-    // for ReLU activation, the backward pass only needs ograd and output
-    CHECK_EQ(in_attrs->size(), 2U);
-  }
+  CHECK_EQ(in_attrs->size(), activation::GradNumInputs(param.act_type));
   return MKLDNNStorageType(attrs, dev_mask, SupportMKLDNNAct(param),
                            dispatch_mode, in_attrs, out_attrs);
 }
-#endif
+#endif  // MXNET_USE_MKLDNN == 1
+
 
 MXNET_OPERATOR_REGISTER_UNARY(Activation)
 .describe(R"code(Applies an activation function element-wise to the input.
@@ -163,18 +184,16 @@ The following activation functions are supported:
 
 NNVM_REGISTER_OP(_backward_Activation)
 .set_num_inputs([](const nnvm::NodeAttrs& attrs) {
-    int act_type = dmlc::get<ActivationParam>(attrs.parsed).act_type;
-    // for ReLU activation, the backward pass only needs ograd and output
-    if (act_type == activation::kReLU) return 2;
-    return 3;
-  })
+  const int act_type = dmlc::get<ActivationParam>(attrs.parsed).act_type;
+  return activation::GradNumInputs(act_type);
+})
 .set_num_outputs(1)
 .set_attr<nnvm::TIsBackward>("TIsBackward", true)
 #if MXNET_USE_MKLDNN == 1
 .set_attr<FInferStorageType>("FInferStorageType", BackwardActStorageType)
 #endif
-.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<3, 1>)
-.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<3, 1>)
+.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<-1, 1>)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<-1, 1>)
 .set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
   return std::vector<std::pair<int, int> >{{0, 0}};
 })
diff --git a/src/operator/nn/activation.cu b/src/operator/nn/activation.cu
index 8892cc34f710..ec7db844b100 100644
--- a/src/operator/nn/activation.cu
+++ b/src/operator/nn/activation.cu
@@ -54,12 +54,13 @@ void ActivationCompute(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(inputs.size(), 1U);
   CHECK_EQ(outputs.size(), 1U);
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  const int act_type = param.act_type;
 
   // SoftReLU and kSoftSign are both not supported by CUDNN yet
-  if (param.act_type == activation::kSoftReLU) {
+  if (act_type == activation::kSoftReLU) {
     ActivationForward<gpu, mshadow_op::softrelu, mshadow_op::softrelu_grad>(ctx,
       inputs[0], req[0], outputs[0]);
-  } else if (param.act_type == activation::kSoftSign) {
+  } else if (act_type == activation::kSoftSign) {
     ActivationForward<gpu, mshadow_op::softsign, mshadow_op::softsign_grad>(ctx,
       inputs[0], req[0], outputs[0]);
   } else {
@@ -76,23 +77,28 @@ void ActivationGradCompute(const nnvm::NodeAttrs& attrs,
                                const std::vector<OpReqType>& req,
                                const std::vector<TBlob>& outputs) {
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-  bool relu = param.act_type == activation::kReLU;
-  CHECK_EQ(inputs.size(), relu ? 2U : 3U);
+  const int act_type = param.act_type;
+  CHECK_EQ(inputs.size(), activation::GradNumInputs(act_type));
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
 
   // both SoftReLU and SoftSign not supported by CUDNN yet
-  if (param.act_type == activation::kSoftReLU) {
+  if (act_type == activation::kSoftReLU) {
     ActivationBackward<gpu, mshadow_op::softrelu, mshadow_op::softrelu_grad>(
-        ctx, inputs[0], inputs[1], req[0], outputs[0]);
-  } else if (param.act_type == activation::kSoftSign) {
+        ctx, inputs.at(0), inputs.at(1), req[0], outputs[0]);
+  } else if (act_type == activation::kSoftSign) {
     ActivationBackward<gpu, mshadow_op::softsign, mshadow_op::softsign_grad>(
-        ctx, inputs[0], inputs[2], req[0], outputs[0]);
-  } else {
-    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+        ctx, inputs.at(0), inputs.at(2), req[0], outputs[0]);
+  } else if (act_type == activation::kReLU) {
+    MSHADOW_REAL_TYPE_SWITCH(inputs.at(0).type_flag_, DType, {
       // XXX: for y = relu(x), y is passed as "in_data" to Backward()
-      get_cudnn_op<DType>(param).Backward(ctx, inputs[0], relu ? inputs[1] : inputs[2],
-                                          inputs[1], req[0], outputs[0]);
+      get_cudnn_op<DType>(param).Backward(ctx, inputs.at(0), inputs.at(1),
+                                          inputs.at(1), req[0], outputs[0]);
+    });
+  } else {
+    MSHADOW_REAL_TYPE_SWITCH(inputs.at(0).type_flag_, DType, {
+      get_cudnn_op<DType>(param).Backward(ctx, inputs.at(0), inputs.at(2),
+                                          inputs.at(1), req[0], outputs[0]);
     });
   }
 }
diff --git a/tests/cpp/operator/activation_perf.cc b/tests/cpp/operator/activation_perf.cc
index 1bd8ca89c9f5..bba8a3ec5722 100644
--- a/tests/cpp/operator/activation_perf.cc
+++ b/tests/cpp/operator/activation_perf.cc
@@ -38,13 +38,27 @@ const kwargs_t basic_activation_args = { };
  * \brief Generic bidirectional sanity test
  */
 TEST(ACTIVATION_PERF, ExecuteBidirectional) {
+  using namespace std;
   TShape shape({5, 5});
-  kwargs_t kwargs = basic_activation_args;
-  kwargs.push_back({"act_type", "tanh"});
-
-  test::op::CoreOperatorRunner<float> runner;
-  runner.RunBidirectional(false, { shape }, test::op::CoreOpExecutor<float>::ArgsWithOpName(
-      kwargs, "Activation", "_backward_Activation"), 1);
+  vector<string> activations = {
+    "relu",
+    "sigmoid",
+    "tanh",
+    "softrelu",
+    "softsign"
+  };
+  for (const string& activation : activations) {
+    kwargs_t activation_args = {{"act_type", activation}};
+    test::op::CoreOperatorRunner<float> runner;
+    runner.RunBidirectional(false, { shape }, test::op::CoreOpExecutor<float>::ArgsWithOpName(
+        activation_args, "Activation", "_backward_Activation"), 1);
+  }
+  for (const string& activation : activations) {
+    kwargs_t activation_args = {{"act_type", activation}};
+    test::op::CoreOperatorRunner<float> runner;
+    runner.RunBidirectional(true, { shape }, test::op::CoreOpExecutor<float>::ArgsWithOpName(
+        activation_args, "Activation", "_backward_Activation"), 1);
+  }
 }
 
 /*!
diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py
index 3049674821c9..abe6b136fe0c 100644
--- a/tests/python/unittest/test_gluon.py
+++ b/tests/python/unittest/test_gluon.py
@@ -2411,7 +2411,7 @@ def hybrid_forward(self, F, x):
             x_reshape = x.reshape(self.reshape)
             out = self.act(x_reshape)
             return out
-    acts = ["relu", "sigmoid", "tanh", "softrelu"]
+    acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"]
     for act in acts:
         x = mx.nd.random.uniform(-1, 1, shape=(4, 16, 32, 32))
         shape = (4, 32, 32, -1)
@@ -2433,7 +2433,7 @@ def hybrid_forward(self, F, x):
             out = self.act(x_slice)
             return out
 
-    acts = ["relu", "sigmoid", "tanh", "softrelu"]
+    acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"]
     for act in acts:
         x = mx.nd.random.uniform(-1, 1, shape=(8, 32, 64, 64))
         slice = [(0, 16, 32, 32), (4, 32, 64, 64)]
@@ -2457,7 +2457,7 @@ def hybrid_forward(self, F, x):
             y_reshape = y.reshape(self.reshape[1])
             out = self.act1(y_reshape)
             return out
-    acts = ["relu", "sigmoid", "tanh", "softrelu"]
+    acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"]
     for idx0, act0 in enumerate(acts):
         for idx1, act1 in enumerate(acts):
             if idx1 == idx0:
@@ -2484,7 +2484,7 @@ def hybrid_forward(self, F, x):
             y_slice = y.slice(begin=self.slice[1][0], end=self.slice[1][1])
             out = self.act1(y_slice)
             return out
-    acts = ["relu", "sigmoid", "tanh", "softrelu"]
+    acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"]
     for idx0, act0 in enumerate(acts):
         for idx1, act1 in enumerate(acts):
             if idx1 == idx0:
@@ -2512,7 +2512,7 @@ def hybrid_forward(self, F, x):
             y_slice = y.slice(begin=self.slice[0], end=self.slice[1])
             out = self.act1(y_slice)
             return out
-    acts = ["relu", "sigmoid", "tanh", "softrelu"]
+    acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"]
    for idx0, act0 in enumerate(acts):
        for idx1, act1 in enumerate(acts):
            if idx1 == idx0:
@@ -2541,7 +2541,7 @@ def hybrid_forward(self, F, x):
             y_reshape = y.reshape(self.reshape)
             out = self.act1(y_reshape)
             return out
-    acts = ["relu", "sigmoid", "tanh", "softrelu"]
+    acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"]
     for idx0, act0 in enumerate(acts):
         for idx1, act1 in enumerate(acts):
             if idx1 == idx0:
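
The snippet below is not part of the patch; it is a minimal Python sketch of the behaviour the fix restores. After this change, _backward_Activation declares a variable number of inputs (2 for relu, 3 for the other activation types, see GradNumInputs) and infers shapes with ElemwiseShape<-1, 1>, so the backward pass of every supported Activation type, including softsign, should run end to end:

    # Illustrative sketch only -- not included in the patch.
    import mxnet as mx
    from mxnet import autograd, gluon

    for act in ["relu", "sigmoid", "tanh", "softrelu", "softsign"]:
        layer = gluon.nn.Activation(act)
        layer.initialize()
        x = mx.nd.random.uniform(-1, 1, shape=(4, 16, 32, 32))
        x.attach_grad()
        with autograd.record():
            y = layer(x)
        y.backward()                      # exercises _backward_Activation
        assert x.grad.shape == x.shape    # gradient shape matches the input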